diff --git a/.github/workflows/github_actions.yml b/.github/workflows/github_actions.yml index 43cbd9ee3..b007278a9 100644 --- a/.github/workflows/github_actions.yml +++ b/.github/workflows/github_actions.yml @@ -16,7 +16,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-20.04, macos-11] + os: [ubuntu-latest, macos-latest] experimental: [false] mpi_impl: - openmpi @@ -28,7 +28,7 @@ jobs: - armci - mpi-pr f77: - - gfortran + - gfortran-12 cc: - clang - gcc @@ -52,7 +52,7 @@ jobs: armci_network: sockets config_opts: --enable-i4 f77: ifort - cc: icc + cc: icx oneapi: /opt/intel/oneapi - os: ubuntu-latest experimental: true @@ -61,7 +61,7 @@ jobs: config_opts: --enable-i4 --without-blas --enable-cxx --disable-f77 f77: gfortran cc: gcc - - os: macos-11 + - os: macos-13 experimental: true mpi_impl: mpich armci_network: mpi-pr @@ -83,19 +83,19 @@ jobs: config_opts: "--disable-f77 --enable-cxx" f77: gfortran-10 cc: gcc-10 - - os: macos-12 + - os: macos-13 experimental: true mpi_impl: mpich armci_network: mpi-ts config_opts: "--disable-static --enable-shared" - f77: gfortran + f77: gfortran-13 cc: clang - - os: macos-12 + - os: macos-13 experimental: true use_cmake: "Y" mpi_impl: mpich armci_network: mpi-pr - f77: gfortran + f77: gfortran-13 cc: clang - os: ubuntu-20.04 experimental: true @@ -119,6 +119,8 @@ jobs: cc: gcc oneapi: /opt/intel/oneapi exclude: + - armci_network: ofi + os: macos-latest - armci_network: mpi-pr mpi_impl: openmpi - armci_network: mpi3 @@ -138,12 +140,12 @@ jobs: continue-on-error: ${{ matrix.experimental }} steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 40 - name: Cache install steps id: ga-cache-install - uses: actions/cache@v3 + uses: actions/cache@v4 env: cache-name: cache-install-steps with: @@ -157,7 +159,16 @@ jobs: case "${{ matrix.os }}" in ubuntu*|jessie|stretch|buster) sudo apt-get update -q -y - if [[ "$F77" == "gfortran-11" ]] || [[ "$CC" == "gcc-11" ]]; then sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test && sudo apt-get -y install gcc-11 gfortran-11 g++-11; fi + echo F77 is "$F77" + if [[ "$F77" =~ gfortran-[0-9][0-9] ]] || [[ "$CC" =~ gcc-[0-9][0-9] ]]; then + if [[ "$CC" =~ gcc-[0-9][0-9] ]]; then + version=$(echo "$CC" | cut -d - -f 2 ) + fi + if [[ "$F77" =~ gfortran-[0-9][0-9] ]]; then + version=$(echo "$F77" | cut -d - -f 2 ) + fi + sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test && sudo apt-get -y install gcc-$version gfortran-$version g++-$version + fi sudo apt-get install -q -y gfortran ;; macos*) @@ -165,7 +176,7 @@ jobs: brew install gcc coreutils automake || true ;; esac - if [[ "$F77" == "ifort" ]] || [[ "$CC" == "icc" ]] ; then ./travis/install-intel.sh; source ${{ matrix.oneapi }}/setvars.sh --force; fi + if [[ "$F77" == "ifort" ]] || [[ "$CC" == "icc" ]] || [[ "$CC" == "icx" ]]; then ./travis/install-intel.sh; source ${{ matrix.oneapi }}/setvars.sh --force; fi echo F77 is `which "$F77"` echo F77 compiler version `"$F77" -v` - name: before_install @@ -174,12 +185,13 @@ jobs: pwd ls -lart mkdir -p $GITHUB_WORKSPACE/apps - if [[ "$F77" == "ifort" ]] || [[ "$CC" == "icc" ]] ; then source ${{ matrix.oneapi }}/setvars.sh --force; fi - sh ./travis/install-autotools.sh $GITHUB_WORKSPACE/apps + AUTO_CC="$CC" + if [[ "$F77" == "ifort" ]] || [[ "$CC" == "icc" ]] || [[ "$CC" == "icx" ]]; then source ${{ matrix.oneapi }}/setvars.sh --force; AUTO_CC=cc; fi + CC="$AUTO_CC" sh ./travis/install-autotools.sh $GITHUB_WORKSPACE/apps - name: install run: | export CI_ROOT=$GITHUB_WORKSPACE/apps - if [[ "$F77" == "ifort" ]] || [[ "$CC" == "icc" ]] ; then source ${{ matrix.oneapi }}/setvars.sh --force; fi + if [[ "$F77" == "ifort" ]] || [[ "$CC" == "icc" ]] || [[ "$CC" == "icx" ]]; then source ${{ matrix.oneapi }}/setvars.sh --force; fi ./travis/install-mpi.sh $CI_ROOT $MPI_IMPL if [[ "$PORT" == "ofi" ]]; then ./travis/install-libfabric.sh $CI_ROOT; else true; fi if [[ "$PORT" == "armci" ]]; then ./travis/install-armci-mpi.sh $CI_ROOT; else true; fi @@ -187,7 +199,7 @@ jobs: - name: compile and test global arrays if: ${{ success() }} run: | - if [[ "$F77" == "ifort" ]] || [[ "$CC" == "icc" ]] ; then source ${{ matrix.oneapi }}/setvars.sh --force; fi + if [[ "$F77" == "ifort" ]] || [[ "$CC" == "icc" ]] || [[ "$CC" == "icx" ]] ; then source ${{ matrix.oneapi }}/setvars.sh --force; fi ./travis/build-run.sh $GITHUB_WORKSPACE/apps $PORT $MPI_IMPL $USE_CMAKE $F77 - name: after_failure if: ${{ failure() }} diff --git a/CMakeLists.txt b/CMakeLists.txt index b90b063a0..62c5084a4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -82,8 +82,15 @@ if(ENABLE_BLAS) endif() endif() -if(MSVC AND ENABLE_FORTRAN) - message(FATAL_ERROR "MSVC build needs ENABLE_FORTRAN=OFF") +if(ENABLE_FORTRAN) + if(MSVC) + message(FATAL_ERROR "MSVC build needs ENABLE_FORTRAN=OFF") + endif() + + find_program(ga_m4_cmd NAMES gm4 m4) + if(NOT ga_m4_cmd) + message(FATAL_ERROR "m4 command not found. GNU M4 is required.") + endif() endif() if(ENABLE_PROFILING) @@ -141,7 +148,20 @@ include(${PROJECT_SOURCE_DIR}/cmake/ga-checks.cmake) if (ENABLE_FORTRAN) include( FortranCInterface ) - FortranCInterface_HEADER(${CMAKE_CURRENT_BINARY_DIR}/f2c_cmake.h MACRO_NAMESPACE F77_FUNC_) + FortranCInterface_HEADER(${CMAKE_CURRENT_BINARY_DIR}/f2c_cmake.ignore MACRO_NAMESPACE F77_FUNC_) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/f2c_cmake.h + COMMAND ${CMAKE_COMMAND} -D INPUT:PATH="${CMAKE_CURRENT_BINARY_DIR}/f2c_cmake.ignore" -D OUTPUT:PATH="${CMAKE_CURRENT_BINARY_DIR}/f2c_cmake.h" -P ${PROJECT_SOURCE_DIR}/tools/config_fh_from_h.cmake + DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/f2c_cmake.ignore + ) + add_custom_target( + GenerateF2C_CH ALL + DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/f2c_cmake.h + ) + set_source_files_properties( + ${CMAKE_CURRENT_BINARY_DIR}/f2c_cmake.h + PROPERTIES GENERATED TRUE + ) else() CONFIGURE_FILE( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/f2c_dummy.h.in ${CMAKE_CURRENT_BINARY_DIR}/f2c_cmake.h ) @@ -155,7 +175,6 @@ endif() # hardwire various GA configuration parameters. Use convention that parameters # are defined and set to 0 if not used set (CYGWIN 0) -set (DECOSF 0) if (CMAKE_SYSTEM_NAME STREQUAL Linux) set(LINUX 1) @@ -186,10 +205,6 @@ set (USE_MALLOC 0) if(ENABLE_PROFILING) set (GA_PROFILING 1) endif() -if(USE_CRAYSHASTA) - set(__CRAYXE 1) - # list(APPEND GA_EXTRA_LIBS pmi2) -endif() # hardwire ARMCI configuration options set (HAVE_ARMCI_GROUP_COMM 1) @@ -217,7 +232,7 @@ if (ENABLE_FORTRAN) set (FORTRAN_I8_FLAG -fdefault-integer-8) set (FORTRAN_I4_FLAG "") elseif (CMAKE_Fortran_COMPILER_ID MATCHES "Flang") - message(STATUS "Using Flang compiler settings") + message(STATUS "Using Flang compiler settings for flang, flang-new, armflang") set (F77_GETARG_DECLS "external GETARG") set (FORTRAN_I8_FLAG -fdefault-integer-8) set (FORTRAN_I4_FLAG "") @@ -362,6 +377,15 @@ if(ENABLE_BLAS) #include_directories(${BLAS_INCLUDE_DIRS}) endif() +if(ENABLE_FORTRAN) + set(_CHK_FORTRAN_COMPILERS "GNU" "Intel" "LLVMFlang" "Flang") + list(FIND _CHK_FORTRAN_COMPILERS ${CMAKE_Fortran_COMPILER_ID} index) + if(index GREATER -1) + find_package( StandardFortran ) + list(APPEND GA_EXTRA_LIBS ${STANDARDFORTRAN_LIBRARIES}) + endif() +endif() + if(ENABLE_PROFILING) set(ga_profiler $) endif() @@ -393,6 +417,7 @@ target_include_directories(ga $ $ ) + if(ENABLE_BLAS) target_include_directories(ga INTERFACE @@ -441,6 +466,12 @@ if(ENABLE_TESTS) add_subdirectory(global/examples) endif() +foreach (bip ${GA_HEADER_PATHS}) + target_include_directories(ga + INTERFACE + $ + ) +endforeach() # -------------------------------------- # Global Arrays installation @@ -506,7 +537,7 @@ endif() list(REMOVE_AT CMAKE_MODULE_PATH 0) list(REMOVE_AT CMAKE_MODULE_PATH 0) -# export(EXPORT globalarrays-targets -# FILE ${CMAKE_CURRENT_BINARY_DIR}/globalarrays-targets.cmake -# NAMESPACE GlobalArrays::) - +# Export build tree +export(EXPORT globalarrays-targets + NAMESPACE GlobalArrays:: + FILE "${PROJECT_BINARY_DIR}/globalarrays-targets.cmake") diff --git a/Makefile.am b/Makefile.am index 8cdf10186..db429d117 100644 --- a/Makefile.am +++ b/Makefile.am @@ -412,9 +412,6 @@ libga_la_SOURCES += global/src/ga_diag_seq.F if ENABLE_EISPACK libga_la_SOURCES += global/src/rsg.F endif -if CRAY_XT_NETWORKS -libga_la_SOURCES += global/src/ga_dgemmf.F -endif # CRAY_XT_NETWORKS if ENABLE_PEIGS libga_la_SOURCES += global/src/ga_diag.F endif # ENABLE_PEIGS @@ -1607,19 +1604,6 @@ endif # MSG_COMMS_MPI EXTRA_DIST += tcgmsg/tcgmsg-mpi/README -############################################################################## -# armci -# if TCGMSG is coming from ARMCI -if ARMCI_NETWORK_ARMCI -else -if MSG_COMMS_TCGMSG4 -AM_CPPFLAGS += -I$(top_srcdir)/armci/tcgmsg -endif -if MSG_COMMS_TCGMSG5 -AM_CPPFLAGS += -I$(top_srcdir)/armci/tcgmsg -endif -endif - ############################################################################## # armci if ARMCI_NETWORK_ARMCI @@ -1650,18 +1634,7 @@ else if ARMCI_NETWORK_ARMCI else -if ARMCI_NETWORK_PORTALS -AM_CPPFLAGS += -I$(top_srcdir)/armci/src-portals -AM_CPPFLAGS += -I$(top_srcdir)/armci/tcgmsg -else -if ARMCI_NETWORK_GEMINI -AM_CPPFLAGS += -I$(top_srcdir)/armci/src-gemini -AM_CPPFLAGS += -I$(top_srcdir)/armci/tcgmsg -else AM_CPPFLAGS += -I$(top_srcdir)/armci/src/include -AM_CPPFLAGS += -I$(top_srcdir)/armci/tcgmsg -endif -endif endif endif @@ -1750,17 +1723,6 @@ CP__v_ = $(CP__v_$(AM_DEFAULT_VERBOSITY)) CP__v_0 = @echo " CP " $@; .PHONY: pfiles clean-pfiles -if MSG_COMMS_TCGMSG4 -pfiles: $(check_PROGRAMS) - for p in $(check_PROGRAMS); do \ - echo "`whoami` `hostname` $(NPROCS) `pwd`/$$p /tmp" > $$p.p; \ - done -check: pfiles -clean-pfiles: - find . -name '*.p' -exec rm {} \; ; -clean-local: clean-pfiles -MAYBE_PFILES = pfiles -endif # MSG_COMMS_TCGMSG4 ############################################################################## # test suite @@ -1805,13 +1767,6 @@ XFAIL_TESTS = XFAIL_TESTS += $(SERIAL_TESTS_XFAIL) XFAIL_TESTS += $(PARALLEL_TESTS_XFAIL) -if MSG_COMMS_TCGMSG4 -LOG_COMPILER = \ -maybe_tcgexec=`if echo "$(SERIAL_TESTS)" | $(GREP) "$$p" > /dev/null; then echo ""; else echo "$(TCGEXEC)"; fi`; eval $$maybe_tcgexec -else -if MSG_COMMS_TCGMSG5 -LOG_COMPILER = tst="$$dir$$f -np $(NPROCS)"; eval -else if CROSS_COMPILING LOG_COMPILER = \ maybe_mpiexec=`if echo "$(SERIAL_TESTS)" | $(GREP) "$$p" > /dev/null; then echo "$(MPIEXEC)" | $(SED) 's/%NP%/1/'; else echo "$(MPIEXEC)" | $(SED) 's/%NP%/$(NPROCS)/'; fi`; eval $$maybe_mpiexec @@ -1825,8 +1780,6 @@ LOG_COMPILER = \ maybe_mpiexec=`if echo "$(SERIAL_TESTS)" | $(GREP) "$$p" > /dev/null; then echo ""; else if echo "$(MPIEXEC)" | $(GREP) "%NP%" > /dev/null; then echo "$(MPIEXEC)" | $(SED) 's/%NP%/$(NPROCS)/'; else echo "$(MPIEXEC)"; fi; fi`; eval $$maybe_mpiexec endif # COMEX_NETWORK_MPI_PR endif # CROSS_COMPILING -endif # MSG_COMMS_TCGMSG5 -endif # MSG_COMMS_TCGMSG4 .PHONY: check-travis check-travis: $(all-am) $(check_LTLIBRARIES) $(TRAVIS_TESTS) $(MAYBE_PFILES) diff --git a/README.md b/README.md index f32e628a7..11fbbc0c7 100644 --- a/README.md +++ b/README.md @@ -1,84 +1,29 @@ # GLOBAL ARRAYS -Travis: [![Build Status](https://travis-ci.org/GlobalArrays/ga.svg?branch=master)](https://travis-ci.org/GlobalArrays/ga) -GH develop branch: [![Build Status](https://img.shields.io/endpoint.svg?url=https%3A%2F%2Factions-badge.atrox.dev%2FGlobalArrays%2Fga%2Fbadge%3Fref%3Ddevelop&style=flat)](https://actions-badge.atrox.dev/GlobalArrays/ga/goto?ref=develop) GH master branch: [![Build Status](https://img.shields.io/endpoint.svg?url=https%3A%2F%2Factions-badge.atrox.dev%2FGlobalArrays%2Fga%2Fbadge%3Fref%3Dmaster&style=flat)](https://actions-badge.atrox.dev/GlobalArrays/ga/goto?ref=master) +[![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](LICENSE) +[![Documentation Status](https://readthedocs.org/projects/globalarrays/badge/?version=latest)](https://globalarrays.readthedocs.io/en/latest/?badge=latest) +[![GitHub Downloads](https://img.shields.io/github/downloads/GlobalArrays/ga/total)](https://github.com/GlobalArrays/ga/releases) +[![CI](https://github.com/GlobalArrays/ga/actions/workflows/github_actions.yml/badge.svg)](https://github.com/GlobalArrays/ga/actions?query=workflow:GlobalArrays_CI) ## Table of Contents -* [DISCLAIMER](#disclaimer) -* [ACKNOWLEDGMENT](#acknowledgment) -* [GETTING STARTED](#getting-started) -* [QUESTIONS/HELP/SUPPORT/BUG-REPORT](#questionshelpsupportbug-report) -* [WHERE IS THE DOCUMENTATION?](#where-is-the-documentation) +* [ACKNOWLEDGMENTS](#acknowledgment) * [ABOUT THIS SOFTWARE](#about-this-software) -* [HOW TO BUILD THE PACKAGE?](#how-to-build-the-package) - * [autotools build](#autotools-build) - * [Configuration Options](#configuration-options) - * [Selecting the Underlying One-Sided Runtime](#selecting-the-underlying-one-sided-runtime) - * [How to Use Progress Ranks](#how-to-use-progress-ranks) - * [How to Use Progress Threads](#how-to-use-progress-threads) - * [How to Use the New Default Two Sided Port](#how-to-use-the-new-default-two-sided-port) - * [Full List of Runtimes](#full-list-of-runtimes) - * [Other Options](#other-options) - * [Special Notes for BLAS](#special-notes-for-blas) - * [Cross-Compilation Issues](#cross-compilation-issues) - * [Cray XT](#cray-xt) - * [Compiler Selection](#compiler-selection) - * [After Configuration](#after-configuration) - * [Test Programs](#test-programs) - * [Test Suite](#test-suite) - * [Performance Tuning](#performance-tuning) - * [CMake](#cmake) - -## DISCLAIMER - -[back to top] - -This material was prepared as an account of work sponsored by an agency of the United States Government. Neither the United States Government nor the United States Department of Energy, nor Battelle, nor any of their employees, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT, SOFTWARE, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS. - -## ACKNOWLEDGMENT +* [WHERE IS THE DOCUMENTATION?](#where-is-the-documentation) +* [QUESTIONS/HELP/SUPPORT/BUG-REPORT](#questionshelpsupportbug-report) -[back to top] +## ACKNOWLEDGMENTS This software and its documentation were produced with United States Government support under Contract Number DE-AC06-76RLO-1830 awarded by the United States Department of Energy. The United States Government retains a paid-up non-exclusive, irrevocable worldwide license to reproduce, prepare derivative works, perform publicly and display publicly by or for the US Government, including the right to distribute to other US Government contractors. -The primary current source of funding for development of GA is the Exascale Computing Project. -https://exascaleproject.org/ - -## GETTING STARTED - -[back to top] - -If the `configure` script is not present, run `./autogen.sh`. It will install the necessary versions of autoconf, automake, and libtool into an `autotools` subdirectory and then run `autoreconf` automatically to generate the `configure` script. - -The command:: - - ./configure && make && make install - -should compile the static GA library (libga.a) to use MPI two-sided communication primitives and install headers and libraries to /usr/local/include and /usr/local/lib, respectively. - -Please refer to the INSTALL file for generic build instructions. That is a good place to start if you are new to using "configure; make; make install" types of builds. Detailed instructions are covered later in this file. - -## QUESTIONS/HELP/SUPPORT/BUG-REPORT - -[back to top] - -Please submit issues to our [GitHub issue tracker](https://github.com/GlobalArrays/ga/issues). We use Google Groups to host [our discussion forum](https://groups.google.com/forum/#!forum/hpctools). - -## WHERE IS THE DOCUMENTATION? - -[back to top] - -The [GA webpage](http://hpc.pnl.gov/globalarrays/) has the most current versions of the Fortran and C documentation and the User's Manual in the HTML format. +The most recent source of funding for development of GA is the [Exascale Computing Project](https://exascaleproject.org). ## ABOUT THIS SOFTWARE -[back to top] - -This directory contains the Global Arrays (GA), Communications Runtime for Exascale (ComEx) run-time library, Aggregate Remote Memory Copy Interface (ARMCI) run-time library, Memory Allocator (MA), parallel I/O libraries (DRA,EAF,SF), TCGMSG, and TCGMSG-MPI packages bundled together. - Global Arrays is a portable Non-Uniform Memory Access (NUMA) shared-memory programming environment for distributed and shared memory computers. It augments the message-passing model by providing a shared-memory like access to distributed dense arrays. This is also known as the Partitioned Global Address Space (PGAS) model. +This library contains the Global Arrays (GA), Communications Runtime for Exascale (ComEx) run-time library, Aggregate Remote Memory Copy Interface (ARMCI) run-time library, Memory Allocator (MA), parallel I/O libraries (DRA,EAF,SF), TCGMSG, and TCGMSG-MPI packages bundled together. + ARMCI provides one-sided remote memory operations used by GA. ComEx is a successor to ARMCI and provides an ARMCI-compatible interface. New parallel runtime development takes place within ComEx including the MPI-only runtimes. @@ -97,392 +42,11 @@ MA is a dynamic memory allocator/manager for Fortran and C programs. GA++ is a C++ binding for global arrays. -See file 'COPYRIGHT' for copying conditions. -See file 'INSTALL' for compilation and installation instructions (generic). -See file 'CHANGELOG.md' for a list of major changes in the current release. - -## HOW TO BUILD THE PACKAGE? - -### autotools build - -[back to top] - -Please refer to the INSTALL file for generic build instructions. That is a good place to start if you are new to using "configure; make; make install" types of builds. The following will cover platform-specific considerations as well as the various optional features of GA. Customizations to the GA build via the configure script are discussed next. - -#### Configuration Options - -[back to top] - -There are many options available when configuring GA. Although configure can be safely run within this distributions' root folder, we recommend performing an out-of-source (aka VPATH) build. This will cleanly separate the generated Makefiles and compiled object files and libraries from the source code. This will allow, for example, one build using MPI two-sided versus another build using OpenIB for the communication layer to use the same source tree e.g.: -``` -mkdir bld_mpi_ts && cd bld_mpi_ts && ../configure -mkdir bld_mpi_openib && cd bld_mpi_openib && ../configure --with-openib -``` -Regardless of your choice to perform a VPATH build, the following should hopefully elucidate the myriad options to configure. Only the options requiring additional details are documented here. `./configure --help` will certainly list more options in addition to limited documentation. - -##### Selecting the Underlying One-Sided Runtime - -[back to top] - -This software contains a number of communication runtime implementations which directly use MPI instead of a native communication library, e.g., OpenIB verbs, Cray DMAPP. The basis of all of our MPI-based ports is the use of the MPI two-sided primitives (MPI_Send, MPI_Recv) to implement our ComEx/ARMCI one-sided protocols. The primary benefit of these ports is that Global Arrays and its user applications will now run on any platform where MPI is supported. - -The recommended port is MPI-1 with progress ranks `--with-mpi-pr`. However, there are some caveats which must be mentioned in order to use the new MPI ports. - -###### How to Use Progress Ranks - -[back to top] - -Your application code must not rely on MPI_COMM_WORLD directly. Instead, you must duplicate the MPI communicator that the GA library returns to you in place of any world communicator. Example code follows: - -Fortran77: -```fortranfixed - program main - implicit none -#include “mpi.fh" -#include "global.fh" -#include "ga-mpi.fh" - integer comm - integer ierr - call mpi_init(ierr) - call ga_initialize() - call ga_mpi_comm(comm) -! use the returned comm as ususal - call ga_terminate() - call mpi_finalize(ierr) - end -``` -C/C++: -```C -#include -#include "ga.h" -#include "ga-mpi.h" -int main(int argc, char **argv) { - MPI_Comm comm; - MPI_Init(&argc,&argv); - GA_Initialize(); - comm = GA_MPI_Comm(); - GA_Terminate(); - MPI_Finalize(); - return 0; -} -``` - -###### How to Use Progress Threads - -[back to top] - -This port uses MPI_Init_thread() internally with a threading level of MPI_THREAD_MULTIPLE. It will create one progress thread per compute node. It is advised to undersubscribe your compute nodes by one core. Your application code can remain unchanged unless you call MPI_Init() in your application code, in which case GA will detect the lower MPI threading level and abort with an error. - -###### How to Use the New Default Two Sided Port - -[back to top] - -The MPI two-sided port is fully compatible with the MPI-1 standard. However, your application code will require additional GA_Sync() calls prior to and after any MPI function calls. This effectively splits user application code into blocks/epochs/phases of MPI code and GA code. Not doing so will likely cause your application to hang since our two sided port can only make communication progress inside of a GA function call. - -Any application code which only makes GA function calls can remain unchanged. - -###### Full List of Runtimes - -[back to top] - -``` - --with-armci[=ARG] select armci network as external; path to external - ARMCI library - --with-cray-shmem[=ARG] select armci network as Cray XT shmem - --with-dmapp[=ARG] select armci network as (Comex) Cray DMAPP - --with-gemini[=ARG] select armci network as Cray XE Gemini using - libonesided - --with-lapi[=ARG] select armci network as IBM LAPI - --with-mpi-mt[=ARG] select armci network as (Comex) MPI-2 - multi-threading - --with-mpi-pt[=ARG] select armci network as (Comex) MPI-2 - multi-threading with progress thread - --with-mpi-pr[=ARG] select armci network as (Comex) MPI-1 two-sided with - progress rank - --with-mpi-spawn[=ARG] select armci network as MPI-2 dynamic process mgmt - --with-mpi-ts[=ARG] select armci network as (Comex) MPI-1 two-sided - --with-mpi3[=ARG] select armci network as (Comex) MPI-3 one-sided - --with-ofa[=ARG] select armci network as (Comex) Infiniband OpenIB - --with-ofi[=ARG] select armci network as (Comex) OFI - --with-openib[=ARG] select armci network as Infiniband OpenIB - --with-portals4[=ARG] select armci network as (Comex) Portals4 - --with-portals[=ARG] select armci network as Cray XT portals - --with-sockets[=ARG] select armci network as Ethernet TCP/IP -``` - -##### Other Options - -[back to top] - -``` ---disable-f77 Disable Fortran code. This used to be the old - GA_C_CORE or NOFORT environment variables which - enabled the C++ bindings. However, it is severely - broken. There are certain cases where Fortran code is - required but this will not inhibit the building of the - C++ bindings. In the future we may be able to - eliminate the need for the Fortran compiler/linker. - Use at your own risk (of missing symbols at link-time.) ---enable-cxx Build C++ interface. This will require the C++ linker - to locate the Fortran libraries (handled - automatically) but user C++ code will require the same - considerations (C++ linker, Fortran libraries.) ---disable-opt Don't use hard-coded optimization flags. GA is a - highly-optimized piece of software. There are certain - optimization levels or flags that are known to break - the software. If you experience mysterious faults, - consider rebuilding without optimization by using this - option. ---enable-sysv Enable System V Shared Memory. ---enable-peigs Enable Parallel Eigensystem Solver interface. This - will build the stubs required to call into the peigs - library (external). ---enable-checkpoint Enable checkpointing. Untested. For use with old - X-based visualization tool. ---enable-profile Enable profiling. Not sure what this does, sorry. ---enable-trace Enable tracing. Not sure what this does, sorry. ---enable-underscoring Force single underscore for all external Fortran - symbols. Usually, configure is able to detect the name - mangling scheme of the detected Fortran compiler and - will default to using what is detected. This includes - any variation of zero, one, or two underscores or - whether UPPERCASE or lowercase symbols are used. If - you want to force a single underscore which was the - default of older GA builds, use this option. - Otherwise, you can use the FFLAGS environment variable - to override the Fortran compiler's or platform's - defaults e.g. configure FFLAGS=-fno-underscoring. ---enable-i4 Use 4 bytes for Fortran INTEGER size. Otherwise, the - default INTEGER size is set to the results of the C - sizeof(void*) operator. ---enable-i8 Use 8 bytes for Fortran INTEGER size. Otherwise, the - default INTEGER size is set to the results of the C - sizeof(void*) operator. ---enable-shared Build shared libraries [default=no]. Useful, for - example, if you plan on wrapping GA with an - interpreted language such as Python. Otherwise, some - systems only support static libraries (or vice versa) - but static libraries are the default. -``` -For most of the external software packages an optional argument is allowed -(represented as ARG below.) **ARG can be omitted** or can be one or more -whitespace-separated directories, linker or preprocessor directives. For example:: -``` - --with-mpi="/path/to/mpi -lmylib -I/mydir" - --with-mpi=/path/to/mpi/base - --with-mpi=-lmpich -``` -The messaging libraries supported include MPI, TCGMSG, and TCGMSG over MPI. If you omit their respective `--with-` option, MPI is the default. GA can be built to work with MPI or TCGMSG. Since the TCGMSG package is small (comparing to portable MPI implementations) and compiles fast, it is still bundled with the GA package. -``` ---with-mpi=ARG Select MPI as the messaging library (default). If you - omit ARG, we attempt to locate the MPI compiler - wrappers. If you supply anything for ARG, we will - parse ARG as indicated above. ---with-tcgmsg Select TCGMSG as the messaging library; if - --with-mpi is also specified then TCGMSG over MPI is - used. ---with-blas=ARG Use external BLAS library; attempt to detect - sizeof(INTEGER) used to compile BLAS; if not found, an - internal BLAS is built ---with-blas4=ARG Use external BLAS library compiled with - sizeof(INTEGER)==4 ---with-blas8=ARG Use external BLAS library compiled with - sizeof(INTEGER)==8 ---with-lapack=ARG Use external LAPACK library. If not found, an internal - one is built. ---with-scalapack=ARG Use external ScaLAPACK library. -``` - -There are some influential environment variables as documented in `configure --help`, however there are a few that are special to GA. -``` -- F77_INT_FLAG - Fortran compiler flag to set the default INTEGER size. We know about certain - Fortran flags that set the default INTEGER size, but there will certainly be - some new (or old) ones that we don't know about. If the configure test to - determine the correct flag fails, please try setting this variable and - rerunning configure. - -- F2C_HIDDEN_STRING_LENGTH_AFTER_ARGS - If cross compiling, set to either "yes" (default) or "no" (after string). - For compatibility between Fortran and C, a Fortran subroutine written in C - that takes a character string must take an additional argument (one per - character string) indicating the length of the string. This 'hidden' - argument appears either immediately after the string in the argument list - or after all other arguments to the function. This is compiler dependent. We - attempt to detect this behavior automatically, but in the case of - cross-compiled systems it may be necessary to specify the less usual after - string convention the gaf2c/testarg program crashes. -``` - -#### Special Notes for BLAS - -[back to top] - -BLAS, being a Fortran library, can be compiled with a default INTEGER size of 4 or a promoted INTEGER size of 8. Experience has shown us that most of the time the default size of INTEGER used is 4. In some cases, however, you may have an external BLAS library which is using 8-byte INTEGERs. In order to correctly interface with an external BLAS library, GA must know the size of INTEGER used by the BLAS library. - -configure has the following BLAS-related options: `--with-blas`, `--with-blas4`, and `--with-blas8`. The latter two will force the INTEGER size to 4- or 8-bytes, respectively. The first option, `--with-blas`, defaults to 4-byte INTEGERS *however* in the two special cases of using ACML or MKL, it is possible to detect 8-byte INTEGERs automatically. As documented in the ACML manual, if the path to the library has `_int64` then 8-byte INTEGERs are used. As documented in the MKL manual, if the library is `ilp64`, then 8-byte INTEGERs are used. - -You may always override `--with-blas` by specifying the INTEGER size using one of the two more specific options. - -#### Cross-Compilation Issues - -[back to top] - -Certain platforms cross-compile from a login node for a compute node, or one might choose to cross-compile for other reasons. Cross-compiling requires the use of the `--host` option to configure which indicates to configure that certain run-time tests should not be executed. See INSTALL for details on use of the `--host` option. - -Two of our target platforms are known to require cross-compilation, Cray XT and IBM Blue Gene. - -##### Cray XT - -[back to top] - -It has been noted that configure still succeeds without the use of the --host flag. If you experience problems without --host, we recommend -``` - configure --host=x86_64-unknown-linux-gnu -``` -And if that doesn't work (cross-compilation is not detected) you must then *force* cross-compilation using both `--host` and `--build` together: -``` - configure --host=x86_64-unknown-linux-gnu --build=x86_64-unknown-linux-gnu -``` -Alternatively, you can just tell configure directly. -``` - configure cross_compiling=yes -``` - -#### Compiler Selection - -[back to top] - -Unless otherwise noted you can try to overwrite the default compiler names detected by configure by defining F77, CC, and CXX for Fortran (77), C, and C++ compilers, respectively. Or when using the MPI compilers MPIF77, MPICC, and MPICXX for MPI Fortran (77), C, and C++ compilers, respectively: -``` - configure F77=f90 CC=gcc - configure MPIF77=mpif90 MPICC=mpicc -``` -Although you can change the compiler at make-time it will likely fail. Many platform-specific compiler flags are detected at configure-time based on the compiler selection. If changing compilers, we recommend rerunning configure as above. - -#### After Configuration - -[back to top] - -By this point we assume you have successfully run configure either from the base distribution directory or from a separate build directory (aka VPATH build.) You are now ready to run 'make'. You can optionally run parallel make using the "-j" option which significantly speeds up the build. If using the MPI compiler wrappers, occasionally using "-j" will cause build failures because the MPI compiler wrapper creates a temporary symlink to the mpif.h header. In that case, you won't be able to use the "-j" option. Further, the influential environment variables used at configure-time can be overridden at make-time in case problems are encountered. For example:: -``` - ./configure CFLAGS=-Wimplicit - ... - make CFLAGS="-Wimplicit -g -O0" -``` -One particularly influential make variable is "V" which controls the verbosity of the make output. This variable corresponds to the `--disable-silent-rules/--enable-silent-riles` configure-time option, but we recommend the make-time variable: -``` - make V=0 (configure --enable-silent-rules) - make V=1 (configure --disable-silent-rules) -``` - -#### Test Programs - -[back to top] - -Running "make checkprogs" will build most test and example programs. Note that not all tests are built -- some tests depend on certain features being detected or enabled during configure. These programs are not intented to be examples of good GA coding practices because they often include private headers. However, they help us debug or time our GA library. - -#### Test Suite - -[back to top] - -Running "make check" will build most test and example programs (See "make checkprogs" notes above) in addition to running the test suite. The test suite runs both the serial and parallel tests. The test suite must know how to launch the parallel tests via the MPIEXEC variable. Please read your MPI flavor's documentation on how to launch, or if using TCGMSG you will use the "parallel" tool. For example, the following is the command to launch the test suite when compiled with OpenMPI: -``` - make check MPIEXEC="mpiexec -np 4" -``` -All tests have a per-test log file containing the output of the test. So if the test is global/testing/test.x, the log file would be global/testing/test.log. The output of failed tests is collected in the top-level log summary test-suite.log. - -The test suite will recurse into the ComEx directory and run the ComEx test suite first. If the ComEx test suite fails, the GA test suite will not run (the assumption here is that you should fix bugs in the dependent library first.) To run only the GA test suite, type "make check-ga" with the appropriate MPIEXEC variable. - -#### Performance Tuning - -[back to top] - -Setting an environment variable MA_USE_ARMCI_MEM forces MA library to use -ARMCI memory, communication via which can be faster on networks like GM, VIA -and InfiniBand. - -## CMake - -[back to top] - -The CMake build only supports the MPI-based runtimes so GA can only be built using MPI two-sided, MPI progress ranks, MPI thread multiple, MPI progress threads and MPI-3 (MPI RMA) runtimes. We recommend using MPI two-sided/MPI progress ranks based approach. - -### Dependencies -* CMake (v3.18+) -* MPI -* BLAS / LAPACK (Optional) - -### The following options are supported: - -* `ENABLE_CXX` [Default:ON] -* `ENABLE_FORTRAN` [Default:ON] -* `ENABLE_TESTS` Build GA testsuite. [Default:ON] -* `GA_RUNTIME` [Default: MPI_2SIDED] Options are - * MPI_2SIDED (Default) use simple MPI-2 sided runtime - * MPI_PROGRESS_RANK Use progress ranks runtime - * MPI_MULTITHREADED Use thread multiple runtime - * MPI_PROGRESS_THREAD Use progress thread runtime - * MPI_RMA Use MPI RMA based runtime. -* `ENABLE_SYSV` Enable System V Shared Memory -* `ENABLE_PROFILING` Build GA operation profiler. Does not work when using Clang compilers. [Default:OFF] -* `GA_EXTRA_LIBS` Specify additional libraries or linker options when building GA. -* `GCCROOT` Specify root of GCC installation. Only required when building with Clang compilers. -* `ENABLE_BLAS` Use an external BLAS library. [Default:ON] - * `Note`: Setting `ENABLE_BLAS` to `OFF` builds internal (netlib) `BLAS`. - * Only `IntelMKL`, `IBMESSL`, `BLIS`, `OpenBLAS`, `ReferenceBLAS`(Netlib) are supported. - * Need to provide the following cmake options if ENABLE_BLAS=ON - * `LINALG_VENDOR`: Should be one of `IntelMKL`, `IBMESSL`, `BLIS`, `OpenBLAS`, `ReferenceBLAS`(Netlib) [Default: `BLIS`] - * `LINALG_PREFIX`: Specify root of the LinAlg libraries installation. If the various libraries are in different locations, one needs to set - `BLAS_PREFIX`, `LAPACK_PREFIX`, `ScaLAPACK_PREFIX` individually. These three options are set to the `LINALG_PREFIX` provided by default unless explicitly set otherwise. - * `LINALG_THREAD_LAYER`: Options are `openmp` (default), `sequential` for `IntelMKL` and `smp` (default) for `IBMESSL`. Does not apply to other BLAS libraries. - * `LINALG_REQUIRED_COMPONENTS`: Options are `lp64` or `ilp64`. [Default:lp64] - * `LINALG_OPTIONAL_COMPONENTS`: `sycl` [Default:none] - * `ENABLE_SCALAPACK`: To enable ScaLAPACK discovery. -* `[OPTIONAL]` CTEST options for handling different types of job launchers and their parameters. - * `GA_JOB_LAUNCH_CMD`: `mpirun` - * `GA_JOB_LAUNCH_ARGS`: `"-n 5"` -#### The following options are standard CMake parameters. More information about them can be found in the CMake documentation. - -* `CMAKE_INSTALL_PREFIX` Specify the install location for GA. -* `CMAKE_BUILD_TYPE` [Default:RELEASE] The options are: - * RELWITHDEBINFO This will be compiled in a release mode but with debugger information (-g) included - * RELEASE Compiled in release mode and no debugger information is included in the code - * DEBUG Compiled with internal debugger information -* `BUILD_SHARED_LIBS` Build GA as a shared library. [Default:OFF] - -#### If there is a missing feature that you would like to be added to the CMake build, please submit a feature request to our [GitHub issue tracker](https://github.com/GlobalArrays/ga/issues). - -- Sample CMake invocation for Linux/MAC users - - - A minimal invocation with defaults for all options: - ``` - CC=gcc CXX=g++ FC=gfortran cmake -DCMAKE_INSTALL_PREFIX=$HOME/ga_install - ``` - - - A more complete invocation that shows most options: - ``` - CC=gcc CXX=g++ FC=gfortran cmake -DCMAKE_INSTALL_PREFIX=$HOME/ga_install \ - -DGA_RUNTIME=MPI_PROGRESS_RANK \ - -DENABLE_BLAS=ON -DLINALG_VENDOR=IntelMKL -DLINALG_PREFIX=/opt/intel/mkl \ - -DENABLE_TESTS=ON -DENABLE_CXX=ON -DENABLE_FORTRAN=ON -DENABLE_PROFILING=OFF - ``` - - -- Sample CMake invocation for Windows users. - * `ENABLE_FORTRAN=OFF` option is needed for Windows build. - * We do not recommend using the `BLAS` and `PROFILING` options as they are not tested for Windows builds. +## WHERE IS THE DOCUMENTATION? - ``` - cmake ^ - -D ENABLE_FORTRAN=OFF ^ - -D GA_RUNTIME=MPI_2SIDED ^ - -D CMAKE_INSTALL_PREFIX:PATH="\my\GA\install\path" ^ - .. - cmake --build . --config Release - cmake --build . --config Release --target install - ``` +The [GA manual](https://globalarrays.readthedocs.io) contains all the documentation. +The API reference can be found [here](https://hpc.pnl.gov/globalarrays/userinterface.shtml) -#### Known issues: The CMake build currently does not work with IBM XL compilers. +## QUESTIONS/HELP/SUPPORT/BUG-REPORT -[back to top]: #table-of-contents +Please submit issues to our [GitHub issue tracker](https://github.com/GlobalArrays/ga/issues). We use Google Groups to host [our discussion forum](https://groups.google.com/forum/#!forum/hpctools). diff --git a/armci/Makefile.am b/armci/Makefile.am index a6d0ad63e..9e585d437 100644 --- a/armci/Makefile.am +++ b/armci/Makefile.am @@ -110,23 +110,9 @@ flags: bin_SCRIPTS += tools/armci-config CLEANFILES += $(bin_SCRIPTS) -############################################################################## -# src-portals -# -if ARMCI_NETWORK_PORTALS -include $(top_srcdir)/src-portals/Makefile.inc - -############################################################################## -# src-gemini -# -else # !ARMCI_NETWORK_PORTALS -if ARMCI_NETWORK_GEMINI -include $(top_srcdir)/src-gemini/Makefile.inc - ############################################################################## # src # -else # !ARMCI_NETWORK_GEMINI libarmci_la_SOURCES += src/collectives/message.c libarmci_la_SOURCES += src/common/aggregate.c libarmci_la_SOURCES += src/common/armci.c @@ -140,10 +126,8 @@ libarmci_la_SOURCES += src/ft/armci_storage.h libarmci_la_SOURCES += src/include/acc.h libarmci_la_SOURCES += src/include/armcip.h libarmci_la_SOURCES += src/include/asm-ppc.h -libarmci_la_SOURCES += src/include/atomic_ops_ia64.h libarmci_la_SOURCES += src/include/atomics-i386.h libarmci_la_SOURCES += src/include/copy.h -libarmci_la_SOURCES += src/include/fujitsu-vpp.h libarmci_la_SOURCES += src/include/kr_malloc.h libarmci_la_SOURCES += src/include/locks.h libarmci_la_SOURCES += src/include/memlock.h @@ -170,18 +154,6 @@ libarmci_la_SOURCES += src/xfer/vector.c if MSG_COMMS_MPI libarmci_la_SOURCES += src/common/groups.c endif -if ARMCI_NETWORK_CRAY_SHMEM -AM_CPPFLAGS += -I$(top_srcdir)/src/devices/cray-shmem -libarmci_la_SOURCES += src/memory/shmalloc.c -endif -if ARMCI_NETWORK_LAPI -AM_CPPFLAGS += -I$(top_srcdir)/src/devices/lapi -libarmci_la_SOURCES += src/common/async.c -libarmci_la_SOURCES += src/common/request.c -libarmci_la_SOURCES += src/devices/lapi/lapi.c -libarmci_la_SOURCES += src/devices/lapi/lapidefs.h -libarmci_la_SOURCES += src/memory/buffers.c -endif if ARMCI_NETWORK_MPI_MT AM_CPPFLAGS += -I$(top_srcdir)/src/devices/mpi-mt libarmci_la_SOURCES += src/common/ds-shared.c @@ -254,27 +226,11 @@ libarmci_la_SOURCES += src/memory/shmlimit.c endif endif endif -if NB_NONCONT -libarmci_la_SOURCES += src/common/noncont.c -endif + if THREAD_SAFE libarmci_la_SOURCES += src/common/utils.c libarmci_la_SOURCES += src/util/threads.c endif -if NEC -libarmci_la_SOURCES += src/locks/tas-sx.s - -src/locks/tas-sx.o: src/locks/tas-sx.s - $(AM_V_CCAS) \ - $(CCAS) $(AM_CCASFLAGS) $(CCASFLAGS) \ - -c -o src/locks/tas-sx.o $(top_srcdir)/src/locks/tas-sx.s - -src/locks/tas-sx.lo: src/locks/tas-sx.s - $(AM_V_CCAS) \ - $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ - $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(AM_CCASFLAGS) \ - $(CCASFLAGS) -c -o src/locks/tas-sx.lo $(top_srcdir)/src/locks/tas-sx.s -endif include_HEADERS += src/include/armci.h include_HEADERS += src/include/gpc.h @@ -282,9 +238,6 @@ include_HEADERS += src/include/message.h AM_CPPFLAGS += -I$(top_srcdir)/src/include -endif # ARMCI_NETWORK_GEMINI -endif # ARMCI_NETWORK_PORTALS - ############################################################################## # profiling # @@ -303,111 +256,6 @@ endif include_HEADERS += src/include/parmci.h -############################################################################## -# tcgmsg -# -EXTRA_DIST += tcgmsg/README - -############################################################################## -# tcgmsg/ipcv4.0 -# -if MSG_COMMS_TCGMSG4 -AM_CPPFLAGS += -I$(top_srcdir)/tcgmsg/ipcv4.0 - -libarmci_la_SOURCES += tcgmsg/ipcv4.0/brdcst.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/checkbyte.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/cluster.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/cluster.h -libarmci_la_SOURCES += tcgmsg/ipcv4.0/defglobals.h -libarmci_la_SOURCES += tcgmsg/ipcv4.0/drand48.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/error.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/evlog.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/evlog.h -libarmci_la_SOURCES += tcgmsg/ipcv4.0/evon.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/globalop.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/llog.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/mdtob.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/mdtoi.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/mitob.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/mitod.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/mtime.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/niceftn.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/nnodes.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/nodeid.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/nxtval.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/pbegin.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/pbeginf.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/pfilecopy.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/sema.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/sema.h -libarmci_la_SOURCES += tcgmsg/ipcv4.0/setdbg.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/shmem.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/signals.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/signals.h -libarmci_la_SOURCES += tcgmsg/ipcv4.0/snd.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/sndrcv.h -libarmci_la_SOURCES += tcgmsg/ipcv4.0/sndrcvP.h -libarmci_la_SOURCES += tcgmsg/ipcv4.0/sockets.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/srftoc.h -libarmci_la_SOURCES += tcgmsg/ipcv4.0/srmover.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/stats.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/synch.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/usleep.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/waitall.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/waitcom.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/xdrstuff.c -libarmci_la_SOURCES += tcgmsg/ipcv4.0/xdrstuff.h - -bin_PROGRAMS += parallel - -parallel_SOURCES = tcgmsg/ipcv4.0/parallel.c - -endif # MSG_COMMS_TCGMSG4 - -############################################################################## -# tcgmsg/ipcv5.0 -# -if MSG_COMMS_TCGMSG5 -AM_CPPFLAGS += -I$(top_srcdir)/tcgmsg/ipcv5.0 - -libarmci_la_SOURCES += tcgmsg/ipcv5.0/drand48.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/evlog.h -libarmci_la_SOURCES += tcgmsg/ipcv5.0/globalop.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/llog.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/mdtob.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/mdtoi.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/misc.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/mitob.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/mitod.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/mtime.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/niceftn.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/nnodes.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/nodeid.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/pbegin.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/pfilecopy.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/queues.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/signals.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/sndrcv.h -libarmci_la_SOURCES += tcgmsg/ipcv5.0/snd_rcv_probe.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/srftoc.h -libarmci_la_SOURCES += tcgmsg/ipcv5.0/tcgmsgP.h -libarmci_la_SOURCES += tcgmsg/ipcv5.0/usleep.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/waitall.c -if ARMCI_NETWORK_LAPI -libarmci_la_SOURCES += tcgmsg/ipcv5.0/async_send_lapi.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/lapi_putget.c -else -libarmci_la_SOURCES += tcgmsg/ipcv5.0/async_send.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/busy.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/error.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/nxtval.shm.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/pbeginf.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/shmem.c -libarmci_la_SOURCES += tcgmsg/ipcv5.0/synch.c -endif # ARMCI_NETWORK_LAPI - -endif # MSG_COMMS_TCGMSG5 - ############################################################################## # testing # @@ -444,13 +292,7 @@ endif if THREAD_SAFE check_PROGRAMS += testing/test_mt endif -if ARMCI_NETWORK_PORTALS -check_PROGRAMS += testing/clone -check_PROGRAMS += testing/fork -check_PROGRAMS += testing/origptl -check_PROGRAMS += testing/ptltest -check_PROGRAMS += testing/te -endif + if HAVE_ARMCI_STRIDE_INFO_INIT check_PROGRAMS += testing/testitr endif @@ -504,35 +346,24 @@ endif if THREAD_SAFE ARMCI_PARALLEL_TESTS += testing/test_mt$(EXEEXT) endif -if ARMCI_NETWORK_PORTALS -#ARMCI_PARALLEL_TESTS += testing/clone$(EXEEXT) -#ARMCI_PARALLEL_TESTS += testing/fork$(EXEEXT) -#ARMCI_PARALLEL_TESTS += testing/origptl$(EXEEXT) -#ARMCI_PARALLEL_TESTS += testing/ptltest$(EXEEXT) -#ARMCI_PARALLEL_TESTS += testing/te$(EXEEXT) -endif + if HAVE_ARMCI_STRIDE_INFO_INIT ARMCI_SERIAL_TESTS += testing/testitr$(EXEEXT) endif -testing_clone_SOURCES = testing/clone.c $(atsrc) -testing_fork_SOURCES = testing/fork.c $(atsrc) testing_fttest_SOURCES = testing/fttest.c $(atsrc) testing_gpctest_SOURCES = testing/gpctest.c $(atsrc) testing_ipctest_SOURCES = testing/ipctest.c $(atsrc) testing_msgcheck_SOURCES = testing/msgcheck.c $(atsrc) -testing_origptl_SOURCES = testing/origptl.c $(atsrc) testing_perf_aggr_SOURCES = testing/perf_aggr.c $(atsrc) testing_perf_nb_SOURCES = testing/perf_nb.c $(atsrc) testing_perf_strided_SOURCES= testing/perf_strided.c $(atsrc) testing_perf_SOURCES = testing/perf.c $(atsrc) testing_perf2_SOURCES = testing/perf2.c $(atsrc) -testing_ptltest_SOURCES = testing/ptltest.c $(atsrc) testing_shmclean_SOURCES = testing/shmclean.c $(atsrc) testing_shmtest_SOURCES = testing/shmtest.c $(atsrc) testing_simple_SOURCES = testing/simple.c $(atsrc) testing_simplelock_SOURCES = testing/simplelock.c $(atsrc) -testing_te_SOURCES = testing/te.c $(atsrc) testing_test2_SOURCES = testing/test2.c $(atsrc) testing_test_groups_SOURCES = testing/test_groups.c $(atsrc) testing_testitr_SOURCES = testing/testitr.c $(atsrc) @@ -751,16 +582,6 @@ ARMCI_X86COPY_AS__v_ = $(ARMCI_X86COPY_AS__v_$(AM_DEFAULT_VERBOSITY)) ARMCI_X86COPY_AS__v_0 = @echo " GCC " $@; .PHONY: pfiles clean-pfiles -if MSG_COMMS_TCGMSG4 -pfiles: $(check_PROGRAMS) - for p in $(check_PROGRAMS); do \ - echo "`whoami` `hostname` $(NPROCS) `pwd`/$$p /tmp" > $$p.p; \ - done -check: pfiles -clean-pfiles: - find . -name '*.p' -exec rm {} \; ; -clean-local: clean-pfiles -endif # MSG_COMMS_TCGMSG4 ############################################################################## # test suite @@ -796,21 +617,12 @@ XFAIL_TESTS = XFAIL_TESTS += $(SERIAL_TESTS_XFAIL) XFAIL_TESTS += $(PARALLEL_TESTS_XFAIL) -if MSG_COMMS_TCGMSG4 -LOG_COMPILER = \ -maybe_parexec=`if echo "$(SERIAL_TESTS)" | $(GREP) "$$p" > /dev/null; then echo ""; else echo "$(TCGEXEC)"; fi`; eval $$maybe_parexec -else -if MSG_COMMS_TCGMSG5 -LOG_COMPILER = tst="$$dir$$f -np $(NPROCS)"; eval -else if CROSS_COMPILING maybe_mpiexec=`if echo "$(SERIAL_TESTS)" | $(GREP) "$$p" > /dev/null; then echo "$(MPIEXEC)" | $(SED) 's/%NP%/1/'; else echo "$(MPIEXEC)" | $(SED) 's/%NP%/$(NPROCS)/'; fi`; eval $$maybe_mpiexec else LOG_COMPILER = \ maybe_mpiexec=`if echo "$(SERIAL_TESTS)" | $(GREP) "$$p" > /dev/null; then echo ""; else if echo "$(MPIEXEC)" | $(GREP) "%NP%" > /dev/null; then echo "$(MPIEXEC)" | $(SED) 's/%NP%/$(NPROCS)/'; else echo "$(MPIEXEC)"; fi; fi`; eval $$maybe_mpiexec endif # CROSS_COMPILING -endif # MSG_COMMS_TCGMSG5 -endif # MSG_COMMS_TCGMSG4 .PHONY: check-gaf2c check-gaf2c: $(all-am) $(check_LTLIBRARIES) $(GAF2C_TESTS) $(MAYBE_PFILES) diff --git a/armci/README b/armci/README index 242a4f098..7883c498e 100644 --- a/armci/README +++ b/armci/README @@ -58,7 +58,6 @@ Index ----- 1. Supported Platforms 2. General Settings -3. Building ARMCI on SGI. 4. Building ARMCI on IBM. 5. Building ARMCI on CRAY. 6. Building ARMCI on other platforms @@ -66,11 +65,8 @@ Index Supported Platforms ------------------- -- leadership class machines: Cray XE6, Cray XTs, IBM Blue Gene/L, IBM Blue - Gene /P -- shared-memory systems: SUN Solaris, SGI, SGI Altix, IBM, Linux, DEC, HP, - Cray SV1, Cray X1, and Windows NT/95/2000 -- distributed-memory systems: Cray T3E, IBM SP(TARGET=LAPI), FUJITSU VX/VPP. +- leadership class machines: Cray XT/XE/XK/XC, IBM Blue Gene/Q. +- shared-memory systems: SUN Solaris, IBM, Linux - clusters of workstations (InfiniBand, sockets) configure options @@ -131,21 +127,14 @@ likely need to specify the optional ARG pointing to the necessary directories and/or libraries. sockets is the default ARMCI network if nothing else is specified. ---with-bgml=ARG select armci network as IBM BG/L ---with-cray-shmem=ARG select armci network as Cray XT shmem ---with-dcmf=ARG select armci network as IBM BG/P Deep Computing - Message Framework ---with-lapi=ARG select armci network as IBM LAPI --with-mpi-spawn=ARG select armci network as MPI-2 dynamic process mgmt --with-openib=ARG select armci network as InfiniBand OpenIB ---with-portals=ARG select armci network as Cray XT portals --with-sockets=ARG select armci network as Ethernet TCP/IP (default) --enable-autodetect attempt to locate ARMCI network besides sockets SOCKETS is the assumed default for clusters connected with Ethernet. This protocol might also work on other networks however, the performance might be -sub-optimal and on Myrinet it could even hang (GM does not work with fork and -the standard version of ARMCI uses fork). +sub-optimal. Cross-Compilation Issues ------------------------ @@ -172,16 +161,6 @@ And if that doesn't work (cross-compilation is not detected) you must then configure --host=x86_64-unknown-linux-gnu --build=x86_64-unknown-linux-gnu -BlueGene/P -++++++++++ - -Currently the only way to detect the BGP platform and compile correctly is to -use:: - - configure --host=powerpc-bgp-linux - -The rest of the configure options apply as usual e.g. --with-dcmf in this case. - Compiler Selection ------------------ @@ -247,66 +226,6 @@ All tests have a per-test log file containing the output of the test. So if the test is testing/test.x, the log file would be testing/test.log. The output of failed tests is collected in the top-level log summary test-suite.log. -ANCIENT WISDOM -============== - -Building on SGI ---------------- - -For running on SGI machines running the irix os, three target settings are -available: - -- TARGET=SGI generates a MIPS-4 64-bit code with 32-bit address space when - compiling on any R8000 based machines and a 32 bit MPIS-2 code on any - non-R8000 machines. -- Use TARGET=SGI64 For generating a 64 bit code with 64-bit address space. -- TARGET=SGI_N32 generates a 32bit code with a 32bit address space. - -By default, SGI_N32 generates a MIPS3 code and SGI64 generates a MIPS4 code. - -There is a possibility of conflict between the SGI's implementation of MPI -(but not others, MPICH for example) and ARMCI in their use of the SGI specific -inter-processor communication facility called arena. - -Building on IBM ---------------- - -Running on IBM without LAPI -+++++++++++++++++++++++++++ - -On IBM's running AIX, target can be set to IBM or IBM64 to run 32/64 bit -versions of the code. - -Running on the IBM-SP -+++++++++++++++++++++ - -TARGET on IBM-SP can be set to LAPI (LAPI64 for 64 bit object). POE -environment variable settings for the parallel environment PSSP 3.1: - -- ARMCI applications like any other LAPI-based codes must define - MP_MSG_API=lapi or MP_MSG_API=mpi,lapi (when using ARMCI and MPI) -- The LAPI-based implementation of ARMCI cannot be used on the very old SP-2 - systems because LAPI did not support the TB2 switch used in those models. - If in doubt which switch you got use odmget command: odmget -q name=css0 - CuDv -- For AIX versions 4.3.1 and later, environment variable AIXTHREAD_SCOPE=S - must be set to assure correct operation of LAPI (IBM should do it in PSSP - by default). -- Under AIX 4.3.3 and later an additional environment variable is - required(RT_GRQ=ON) to restore the original thread scheduling that LAPI - relies on. - -Building on CRAY ----------------- - -- TARGET environment variable is also used by cc on CRAY. It has to be set to - CRAY-SV1 on SV1, CRAY-YMP on YMP, CRAY-T3E on T3E. ARMCI on CRAY'S hence - uses the same values to this environment variable as cc requires. - -- On CRAY-T3E, ARMCI can be run with either of the CRAY Message Passing - Libraries(PVM and MPI). For more information on running with PVM look at - docs/README.PVM. If running with PVM, MSG_COMMS has to be set to PVM. - Building on other platforms --------------------------- @@ -342,30 +261,6 @@ For example to make SHMMAX= 2GB, add either of the lines to /etc/system:: After rebooting, you should be able to take advantage of the increased shared memory limits. -Compaq/DEC -++++++++++ - -Tru64 is another example of an OS with a pitifully small size of the shared -memory region limit. Here are instruction on how to modify shared memory max -segment size to 256MB on the Tru64 UNIX Version 4.0F: - -1. create a file called /etc/sysconfig.shmmax:: - - cat > /etc/sysconfig.shmmax << EOF ipc: shm-max = 268435456 EOF - - You can check if the file created is OK by typing:: - - /sbin/sysconfigdb -l -t /etc/sysconfig.shmmax - -2. Modify kernel values:: - - sysconfigdb -a -f /etc/sysconfig.shmmax ipc - -3. Reboot -4. To check new values:: - - /sbin/sysconfig -q ipc|egrep shm-max - HP-UX +++++ diff --git a/armci/configure.ac b/armci/configure.ac index 307d4f55c..fab6f03d4 100644 --- a/armci/configure.ac +++ b/armci/configure.ac @@ -42,18 +42,8 @@ AS_IF([test "$ARMCI_TOP_BUILDDIR" != "$ARMCI_TOP_SRCDIR"], # MPI compiler wrappers instead of the standard compilers. GA_MSG_COMMS -# Hack to enable NEW_MALLOC feature -AC_ARG_ENABLE([portals-new-malloc], - [AS_HELP_STRING([--enable-portals-new-malloc], - [add -DNEW_MALLOC to CPPFLAGS])]) -AS_IF([test "x$enable_portals_new_malloc" = xyes], - [AC_DEFINE([NEW_MALLOC], [1], [for portals, enable new malloc])]) -AM_CONDITIONAL([PORTALS_ENABLE_NEW_MALLOC], - [test "x$enable_portals_new_malloc" = xyes]) - ARMCI_ENABLE_GPC ARMCI_ENABLE_GROUP -ARMCI_ENABLE_NB_NONCONT ARMCI_ENABLE_PROFILING ARMCI_SHMMAX GA_DISABLE_MPI_TESTS @@ -100,7 +90,7 @@ AC_HEADER_DIRENT AC_HEADER_STDBOOL AC_HEADER_STDC AC_HEADER_SYS_WAIT -GA_CHECK_HEADERS([assert.h c_asm.h errno.h fcntl.h float.h ia64/sys/inline.h malloc.h math.h memory.h mpp/shmem.h netdb.h netinet/in.h netinet/tcp.h process.h setjmp.h signal.h stdarg.h stdint.h stdio.h stdlib.h string.h strings.h sys/types.h sys/atomic_op.h sys/errno.h sys/file.h sys/ipc.h sys/mman.h sys/param.h sys/sem.h sys/shm.h sys/socket.h sys/stat.h sys/syscall.h sys/systemcfg.h sys/time.h sys/uio.h sys/wait.h time.h unistd.h windows.h winsock.h rpc/rpc.h rpc/types.h rpc/xdr.h], +GA_CHECK_HEADERS([assert.h c_asm.h errno.h fcntl.h float.h malloc.h math.h memory.h mpp/shmem.h netdb.h netinet/in.h netinet/tcp.h process.h setjmp.h signal.h stdarg.h stdint.h stdio.h stdlib.h string.h strings.h sys/types.h sys/atomic_op.h sys/errno.h sys/file.h sys/ipc.h sys/mman.h sys/param.h sys/sem.h sys/shm.h sys/socket.h sys/stat.h sys/syscall.h sys/systemcfg.h sys/time.h sys/uio.h sys/wait.h time.h unistd.h windows.h winsock.h rpc/rpc.h rpc/types.h rpc/xdr.h], [], [], [@%:@ifdef HAVE_RPC_TYPES_H @%:@include @@ -219,7 +209,6 @@ AC_MSG_NOTICE AC_MSG_NOTICE([Assembler]) AC_MSG_NOTICE -GA_AS AM_PROG_AS ARMCI_AS_NO_I386ASM @@ -296,7 +285,7 @@ AM_CONDITIONAL([ENABLE_SHARED], [test x$enable_shared = xyes]) AS_IF([test "x$ga_cv_target" = xLINUX64], [AS_CASE([$host_cpu], - [x86_64|ppc64|ia64], + [x86_64|ppc64], [AC_DEFINE([NEED_MEM_SYNC], [1], [Creates memfenc macro])])]) AS_IF([test "x$host_cpu" = xPWR4], [AC_DEFINE([NEED_MEM_SYNC], [1], [Creates memfenc macro])]) @@ -310,8 +299,6 @@ AC_SUBST([NPROCS]) AC_ARG_VAR([MPIEXEC], [how to run parallel tests if built with MPI e.g. "mpiexec -np %NP%"]) AS_CASE([$ga_msg_comms], - [TCGMSG], [TCGEXEC="`pwd`/parallel.x"], - [TCGMSG5], [TCGEXEC=], [MPI], [AS_IF([test "x$MPIEXEC" = x], [AC_PATH_PROGS([MPIEXEC], [mpirun mpiexec]) MPIEXEC="$MPIEXEC -n %NP%"])]) diff --git a/armci/doc/README.PVM b/armci/doc/README.PVM deleted file mode 100644 index 4eafbc5a5..000000000 --- a/armci/doc/README.PVM +++ /dev/null @@ -1,41 +0,0 @@ - -Regarding to the group used in PVM, a global variable is defined -in armci.h and a extra optional function, ARMCI_PVM_Init, -can be used to pass a PVM group to ARMCI. - -On CrayT3E: the default group is the global group which is (char *)NULL - It is the only working group. - -On Workstations: the default group is "mp_working_group". User can set - the group name by calling the ARMCI_PVM_init (defined - in message.c) and passing the group name to the library. - -Differencies between Cray and Unix Workstations - -1. Linker libraries: - On Cray: -lpvm3 - On Unix Workstations: -lgpvm3 -lpvm3 - also need the group lib - -2. Groups - On Cray: There is a default global group, which can be referred to - (char *)NULL, or PVMALL) - On Unix Workstations: There is no default group. - -3. Obtaining PE number. - On Cray: pvm_get_PE(get_mytid()) to obtain their own PE number. - On Unix Workstations: pvm_getinst(group, mytid) standard. - -4. Running the program - On Cray: PVM daemon should not be started. - The pvm_spawn is not called. The number of process is - controlled at compile time, or by using the run - command mpprun. - mpprun -n 4 test.x - So there is no need to initialize the PVM in the test.c - On Unix Workstations: Must run PVM daemon first - % pvm and then quit - Call pvm_spawn to creat other processes. - In test.c, there is a initialization routine and the - format to run is test.x - diff --git a/armci/doc/README.myrinet b/armci/doc/README.myrinet deleted file mode 100644 index 39cc34162..000000000 --- a/armci/doc/README.myrinet +++ /dev/null @@ -1,120 +0,0 @@ -The following instructions describe how to setup Myrinet and build -ARMCI to run over Myrinet using the Myricom GM interface and MPICH/GM. - -Notes: - - any other message-passing library is not supported - - This code was tested and is supported only on Linux Intel & Sparc - and Solaris/sparc platforms. If you have other configuration - please try if it works, otherwise please send a message to - - -1. Installation of Myrinet and the GM library (must have root privilege) - - * To build gm on linux smp nodes, run configure with the following - options. For details, refer to README-linux. - --enable-linux-smp - --enable-linux-modversions - then run make - - * Install the driver, - cd binary - ./GM_INSTAll - - * Run mapper - cd sbin - mapper active.args - - * Check the configuration - gm_board_info - - It lists the driver information and routing table. - -2. Setting up the environment - - * Note: We strongly recommend using GM version 1.2 or higher - * ARMCI on top of GM requires the following environment variables to - be set (Examples are taken on colony cluster of PNNL, using csh) - - setenv TARGET LINUX - setenv GM_HOME /home/myrinet/gm-1.2/gm - setenv GM_INCLUDE $GM_HOME/include - setenv GM_LIB $GM_HOME/lib - setenv ARMCI_NETWORK GM - - * For MPICH-GM the following additional environment variables need to be set - (again using examples of colony cluster of PNNL and csh) - - setenv MPI_LOC /home/myrinet/mpich-1.2/mpich-gm - setenv MPI_LIB $MPI_LOC/build/LINUX/ch_gm/lib - setenv MPI_INCLUDE $MPI_LOC/include - set path = ($MPI_LOC/bin $MPI_LOC/build/LINUX/ch_gm/bin $path) - setenv LIBMPI -lmpich - - However, if you prefer to use MPICH compiler wrappers mpicc, mpif77 etc - these definitions are not needed. Also, since the default name for - the MPI library assumed by armci is -lmpi, you would need to - set LIBMPI="" or "-lmpich". - - -- without PBS - - set a $HOME/.gmpi/conf file like this: - # .gmpi/conf file begin - # first the number of nodes in the file - 8 - # the list of (node,port) that make the MPI World - nb26.colony.emsl.pnnl.gov 2 - nb26.colony.emsl.pnnl.gov 4 - nb27.colony.emsl.pnnl.gov 2 - nb27.colony.emsl.pnnl.gov 4 - nb28.colony.emsl.pnnl.gov 2 - nb28.colony.emsl.pnnl.gov 4 - nb29.colony.emsl.pnnl.gov 2 - nb29.colony.emsl.pnnl.gov 4 - # .gmpi/conf file end - - -- with PBS - - node files are created dynamically, but should have the same - format as the default .gmpi/conf - - IT IS REQUIRED THAT MPI TASKS RUNNING ON THE SAME SMP NODE - BE NUMBERED CONSECUTIVELY - - GM Port Usage: - - The current ARMCI implementation uses two GM ports. - If STATIC_PORTS is defined, they are called - ARMCI_GM_SERVER_RCV_PORT - ARMCI_GM_SERVER_SND_PORT - The default port numbers are 5 and 6, set in the file armci/src/myrinet.h. - Users can choose different ports by editing the definition, depending on - the system settings and at least not conflicting with the ports used by MPI. - - If STATIC_PORTS is NOT defined, ARMCI will search for first two available - ports on up to two myrinet boards. - -3. Test run - - Go to armci/src and make test.x - - Using mpich-gm: run it as regular mpi programs (make sure the mpirun - is come from mpich-gm. many systems have multiple mpi installed) - - mpirun -np #procs test.x - - This will read the default .gmpi/conf - -4. Contact info and bug report - email: parsoft-support@emsl.pnl.gov - -5. Other issues: - -GM does not allow fork. This system call is normally used on Unix -in ARMCI to test how big a shared memory segment can be allocated. -This test is diabled under Myrinet/GM. It means that ARMCI can -use only shared memory segments predefined _SHMMAX in shmem.c -for a particular platform. If you increase that limit in the system, -_SHMMAX must be accordingly modiefied and ARMCI recompiled. - -The current port or ARMCI to GM is not fully optimized yet. - diff --git a/armci/examples/features/aggregation/simple/simple.c b/armci/examples/features/aggregation/simple/simple.c index 9412f459a..62786a0ad 100644 --- a/armci/examples/features/aggregation/simple/simple.c +++ b/armci/examples/features/aggregation/simple/simple.c @@ -61,12 +61,7 @@ #define MAXPROC 128 #define TIMES 100 -#ifdef CRAY -# define ELEMS 800 -#else # define ELEMS 200 -#endif - /***************************** macros ************************/ #define COPY(src, dst, bytes) memcpy((dst),(src),(bytes)) @@ -77,43 +72,6 @@ /***************************** global data *******************/ int me, nproc; void* work[MAXPROC]; /* work array for propagating addresses */ - - - -#ifdef MSG_COMMS_PVM -void pvm_init(int argc, char *argv[]) -{ - int mytid, mygid, ctid[MAXPROC]; - int np, i; - - mytid = pvm_mytid(); - if((argc != 2) && (argc != 1)) goto usage; - if(argc == 1) np = 1; - if(argc == 2) - if((np = atoi(argv[1])) < 1) goto usage; - if(np > MAXPROC) goto usage; - - mygid = pvm_joingroup(MPGROUP); - - if(np > 1) - if (mygid == 0) - i = pvm_spawn(argv[0], argv+1, 0, "", np-1, ctid); - - while(pvm_gsize(MPGROUP) < np) sleep(1); - - /* sync */ - pvm_barrier(MPGROUP, np); - - printf("PVM initialization done!\n"); - - return; - -usage: - fprintf(stderr, "usage: %s \n", argv[0]); - pvm_exit(); - exit(-1); -} -#endif void create_array(void *a[], int elem_size, int ndim, int dims[]) { diff --git a/armci/examples/features/aggregation/sparse_matvecmul/sparse_matvecmul.c b/armci/examples/features/aggregation/sparse_matvecmul/sparse_matvecmul.c index fc226d645..80723722a 100644 --- a/armci/examples/features/aggregation/sparse_matvecmul/sparse_matvecmul.c +++ b/armci/examples/features/aggregation/sparse_matvecmul/sparse_matvecmul.c @@ -61,11 +61,7 @@ #define MAXPROC 128 #define TIMES 100 -#ifdef CRAY -# define ELEMS 800 -#else # define ELEMS 200 -#endif /***************************** macros ************************/ #define COPY(src, dst, bytes) memcpy((dst),(src),(bytes)) @@ -82,41 +78,6 @@ short int fortran_indexing=0; static int proc_row_list[MAXPROC];/*no of rows owned by each process - accumulated*/ static int proc_nz_list[MAXPROC]; /*no of non-zeros owned by each process */ -#ifdef MSG_COMMS_PVM -void pvm_init(int argc, char *argv[]) -{ - int mytid, mygid, ctid[MAXPROC]; - int np, i; - - mytid = pvm_mytid(); - if((argc != 2) && (argc != 1)) goto usage; - if(argc == 1) np = 1; - if(argc == 2) - if((np = atoi(argv[1])) < 1) goto usage; - if(np > MAXPROC) goto usage; - - mygid = pvm_joingroup(MPGROUP); - - if(np > 1) - if (mygid == 0) - i = pvm_spawn(argv[0], argv+1, 0, "", np-1, ctid); - - while(pvm_gsize(MPGROUP) < np) sleep(1); - - /* sync */ - pvm_barrier(MPGROUP, np); - - printf("PVM initialization done!\n"); - - return; - -usage: - fprintf(stderr, "usage: %s \n", argv[0]); - pvm_exit(); - exit(-1); -} -#endif - void create_array(void *a[], int elem_size, int ndim, int dims[]) { int bytes=elem_size, i, rc; diff --git a/armci/examples/features/gpc/hashtable/GPCHashmap.cc b/armci/examples/features/gpc/hashtable/GPCHashmap.cc index b599527c5..2336b312e 100644 --- a/armci/examples/features/gpc/hashtable/GPCHashmap.cc +++ b/armci/examples/features/gpc/hashtable/GPCHashmap.cc @@ -21,18 +21,6 @@ using std::endl; #define ARMCI_ENABLE_GPC_CALLS #include "gpc.h" -/***************************** macros ************************/ -extern "C" { -# if defined(__ia64) -# if defined(__GNUC__) && !defined (__INTEL_COMPILER) -# define MEM_FENCE __asm__ __volatile__ ("mf" ::: "memory"); -# else /* Intel Compiler */ - extern void _armci_ia64_mb(); -# define MEM_FENCE _armci_ia64_mb(); -# endif -# endif -} - #include "Hash_common.h" #include "GPCHashmap.h" #include "Util.h" diff --git a/armci/examples/features/notification/simple/testnotify.c b/armci/examples/features/notification/simple/testnotify.c index ac97e0898..26703b0aa 100644 --- a/armci/examples/features/notification/simple/testnotify.c +++ b/armci/examples/features/notification/simple/testnotify.c @@ -63,12 +63,7 @@ #define MAXPROC 128 #define TIMES 100 -#ifdef CRAY -# define ELEMS 800 -#else # define ELEMS 200 -#endif - /***************************** macros ************************/ diff --git a/armci/src-gemini/Makefile.inc b/armci/src-gemini/Makefile.inc deleted file mode 100644 index 9a94be51a..000000000 --- a/armci/src-gemini/Makefile.inc +++ /dev/null @@ -1,61 +0,0 @@ -libarmci_la_SOURCES += src-gemini/acc.h -libarmci_la_SOURCES += src-gemini/aggregate.c -libarmci_la_SOURCES += src-gemini/armci.c -libarmci_la_SOURCES += src-gemini/armci-onesided.c -libarmci_la_SOURCES += src-gemini/armci-onesided.h -libarmci_la_SOURCES += src-gemini/armcip.h -libarmci_la_SOURCES += src-gemini/atomics-i386.h -libarmci_la_SOURCES += src-gemini/buffers.c -libarmci_la_SOURCES += src-gemini/caccumulate.c -libarmci_la_SOURCES += src-gemini/ccopy.c -libarmci_la_SOURCES += src-gemini/clusterinfo.c -libarmci_la_SOURCES += src-gemini/copy.h -libarmci_la_SOURCES += src-gemini/ds-shared.c -libarmci_la_SOURCES += src-gemini/fence.c -libarmci_la_SOURCES += src-gemini/groups.c -libarmci_la_SOURCES += src-gemini/kr_malloc.c -libarmci_la_SOURCES += src-gemini/kr_malloc.h -libarmci_la_SOURCES += src-gemini/locks.c -libarmci_la_SOURCES += src-gemini/locks.h -libarmci_la_SOURCES += src-gemini/memlock.c -libarmci_la_SOURCES += src-gemini/memlock.h -libarmci_la_SOURCES += src-gemini/memory.c -libarmci_la_SOURCES += src-gemini/message.c -libarmci_la_SOURCES += src-gemini/mutex.c -libarmci_la_SOURCES += src-gemini/pack.c -libarmci_la_SOURCES += src-gemini/pendbufs.h -libarmci_la_SOURCES += src-gemini/request.c -libarmci_la_SOURCES += src-gemini/request.h -libarmci_la_SOURCES += src-gemini/rmw.c -libarmci_la_SOURCES += src-gemini/rtinfo.c -libarmci_la_SOURCES += src-gemini/semaphores.c -libarmci_la_SOURCES += src-gemini/semaphores.h -libarmci_la_SOURCES += src-gemini/shmalloc.h -libarmci_la_SOURCES += src-gemini/shmem.c -libarmci_la_SOURCES += src-gemini/armci_shmem.h -libarmci_la_SOURCES += src-gemini/shmlimit.c -libarmci_la_SOURCES += src-gemini/shmlimit.h -libarmci_la_SOURCES += src-gemini/signaltrap.c -libarmci_la_SOURCES += src-gemini/signaltrap.h -libarmci_la_SOURCES += src-gemini/sockets.h -libarmci_la_SOURCES += src-gemini/spawn.c -libarmci_la_SOURCES += src-gemini/spinlock.h -libarmci_la_SOURCES += src-gemini/strided.c -libarmci_la_SOURCES += src-gemini/utils.h -libarmci_la_SOURCES += src-gemini/vector.c -if ARMCI_ENABLE_GPC_CALLS -libarmci_la_SOURCES += src-gemini/gpc.c -endif -if THREAD_SAFE -libarmci_la_SOURCES += src-gemini/threads.c -libarmci_la_SOURCES += src-gemini/utils.c -endif - -include_HEADERS += src-gemini/armci.h -include_HEADERS += src-gemini/gpc.h -include_HEADERS += src-gemini/message.h - -AM_CPPFLAGS += -I$(top_srcdir)/src-gemini -AM_CPPFLAGS += -I$(top_srcdir)/src/include -LDADD += -lnumatoolkit -LDADD += -lonesided diff --git a/armci/src-gemini/acc.c b/armci/src-gemini/acc.c deleted file mode 100644 index bc1ec3dfa..000000000 --- a/armci/src-gemini/acc.c +++ /dev/null @@ -1,171 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: acc.c,v 1.13 2006-09-13 23:43:36 andriy Exp $ */ - -#if defined(__crayx1) -#else -#define restrict -#endif - -void L_ACCUMULATE_2D(long* restrict alpha, int* restrict rows, - int* restrict cols, long* restrict a, - int* restrict lda, long* restrict b, int* restrict ldb) -{ -int i,j; - -#ifdef __crayx1 -#pragma _CRI concurrent -#endif - - for(j=0;j< *cols; j++){ - long * restrict aa = a + j* *lda; - long * restrict bb = b + j* *ldb; - for(i=0;i< *rows; i++) - aa[i] += *alpha * bb[i]; - } -} - -void L_ACCUMULATE_1D(long * restrict alpha, long * restrict a, long * restrict b, - int * restrict rows) -{ -int i; - for(i=0;i< *rows; i++) - a[i] += *alpha * b[i]; -} - - -#if defined(CRAY_T3E) || defined(CATAMOUNT) -void F_ACCUMULATE_2D_(float* alpha, int* rows, int* cols, float* a, - int* lda, float* b, int* ldb) -{ -int i,j; - for(j=0;j< *cols; j++){ - float *aa = a + j* *lda; - float *bb = b + j* *ldb; - for(i=0;i< *rows; i++) - aa[i] += *alpha * bb[i]; - } -} -#endif - -void RA_ACCUMULATE_2D_(long* alpha, int* rows, int* cols, long* a, - int* lda, long* b, int* ldb) -{ -int i,j; - for(j=0;j< *cols; j++){ - long *aa = a + j* *lda; - long *bb = b + j* *ldb; - for(i=0;i< *rows; i++) - aa[i] ^= bb[i]; - } -} - -#ifdef NOFORT - -typedef struct { - float imag; - float real; -} cmpl_t; - -typedef struct { - double imag; - double real; -} dcmpl_t; - -void I_ACCUMULATE_2D(int* alpha, int* rows, int* cols, int* a, - int* lda, int* b, int* ldb) -{ -int i,j; - - for(j=0;j< *cols; j++){ - int *aa = a + j* *lda; - int *bb = b + j* *ldb; - for(i=0;i< *rows; i++) - aa[i] += *alpha * bb[i]; - } -} - -#if !defined(CRAY_T3E) && !defined(CATAMOUNT) -void F_ACCUMULATE_2D(float* alpha, int* rows, int* cols, float* a, - int* lda, float* b, int* ldb) -{ -int i,j; - for(j=0;j< *cols; j++){ - float *aa = a + j* *lda; - float *bb = b + j* *ldb; - for(i=0;i< *rows; i++) - aa[i] += *alpha * bb[i]; - } -} -#endif - -void D_ACCUMULATE_2D(double* alpha, int* rows, int* cols, double* a, - int* lda, double* b, int* ldb) -{ -int i,j; - - for(j=0;j< *cols; j++){ - double *aa = a + j* *lda; - double *bb = b + j* *ldb; - for(i=0;i< *rows; i++) - aa[i] += *alpha * bb[i]; - } -} - - -void C_ACCUMULATE_2D(cmpl_t* alpha, int* rows, int* cols, cmpl_t* a, - int* lda, cmpl_t* b, int* ldb) -{ -int i,j; - - for(j=0;j< *cols; j++){ - cmpl_t *aa = a + j* *lda; - cmpl_t *bb = b + j* *ldb; - for(i=0;i< *rows; i++){ - aa[i].real += alpha->real * bb[i].real - alpha->imag * bb[i].imag; - aa[i].imag += alpha->imag * bb[i].real + alpha->real * bb[i].imag; - } - } -} - - -void Z_ACCUMULATE_2D(dcmpl_t* alpha, int* rows, int* cols, dcmpl_t* a, - int* lda, dcmpl_t* b, int* ldb) -{ -int i,j; - - - for(j=0;j< *cols; j++){ - dcmpl_t *aa = a + j* *lda; - dcmpl_t *bb = b + j* *ldb; - for(i=0;i< *rows; i++){ - aa[i].real += alpha->real * bb[i].real - alpha->imag * bb[i].imag; - aa[i].imag += alpha->imag * bb[i].real + alpha->real * bb[i].imag; - } - } -} - -void FORT_DADD(int *n, double *x, double *work){ -int i; - for(i=0;i<*n;i++) - x[i] = x[i] + work[i]; -} -void FORT_DADD2(int *n, double *x, double *work, double *work2){ -int i; - for(i=0;i<*n;i++) - x[i] = work[i] + work2[i]; -} -void FORT_DMULT(int *n, double *x, double *work){ -int i; - for(i=0;i<*n;i++) - x[i] = x[i]*work[i]; -} -void FORT_DMULT2(int *n, double *x, double *work, double *work2){ -int i; - for(i=0;i<*n;i++) - x[i] = work[i]*work2[i]; -} - -#endif diff --git a/armci/src-gemini/acc.h b/armci/src-gemini/acc.h deleted file mode 100644 index da636d96f..000000000 --- a/armci/src-gemini/acc.h +++ /dev/null @@ -1,179 +0,0 @@ -#ifndef _ACC_H_ -#define _ACC_H_ - -typedef struct { - float real; - float imag; -} complex_t; - -typedef struct { - double real; - double imag; -} dcomplex_t; - -void c_d_accumulate_1d_(const double* const restrict alpha, - double* const restrict A, - const double* const restrict B, - const int* const restrict rows); -void c_f_accumulate_1d_(const float* const restrict alpha, - float* const restrict A, - const float* const restrict B, - const int* const restrict rows); -void c_c_accumulate_1d_(const complex_t* const restrict alpha, - complex_t* const restrict A, - const complex_t* const restrict B, - const int* const restrict rows); -void c_z_accumulate_1d_(const dcomplex_t* const restrict alpha, - dcomplex_t* const restrict A, - const dcomplex_t* const restrict B, - const int* const restrict rows); -void c_i_accumulate_1d_(const int* const restrict alpha, - int* const restrict A, - const int* const restrict B, - const int* const restrict rows); -void c_l_accumulate_1d_(const long* const restrict alpha, - long* const restrict A, - const long* const restrict B, - const int* const restrict rows); -void c_ll_accumulate_1d_(const long long* const restrict alpha, - long long* const restrict A, - const long long* const restrict B, - const int* const restrict rows); - -void c_d_accumulate_2d_(const double* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - double* const restrict A, - const int* const restrict ald, - const double* const restrict B, - const int* const restrict bld); -void c_f_accumulate_2d_(const float* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - float* const restrict A, - const int* const restrict ald, - const float* const restrict B, - const int* const restrict bld); -void c_c_accumulate_2d_(const complex_t* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - complex_t* const restrict A, - const int* const restrict ald, - const complex_t* const restrict B, - const int* const restrict bld); -void c_z_accumulate_2d_(const dcomplex_t* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - dcomplex_t* const restrict A, - const int* const restrict ald, - const dcomplex_t* const restrict B, - const int* const restrict bld); -void c_i_accumulate_2d_(const int* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - int* const restrict A, - const int* const restrict ald, - const int* const restrict B, - const int* const restrict bld); -void c_l_accumulate_2d_(const long* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - long* const restrict A, - const int* const restrict ald, - const long* const restrict B, - const int* const restrict bld); -void c_ll_accumulate_2d_(const long long* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - long long* const restrict A, - const int* const restrict ald, - const long long* const restrict B, - const int* const restrict bld); - -void c_d_accumulate_2d_u_(const double* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - double* const restrict A, - const int* const restrict ald, - const double* const restrict B, - const int* const restrict bld); -void c_f_accumulate_2d_u_(const float* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - float* const restrict A, - const int* const restrict ald, - const float* const restrict B, - const int* const restrict bld); -void c_c_accumulate_2d_u_(const complex_t* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - complex_t* const restrict A, - const int* const restrict ald, - const complex_t* const restrict B, - const int* const restrict bld); -void c_z_accumulate_2d_u_(const dcomplex_t* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - dcomplex_t* const restrict A, - const int* const restrict ald, - const dcomplex_t* const restrict B, - const int* const restrict bld); -void c_i_accumulate_2d_u_(const int* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - int* const restrict A, - const int* const restrict ald, - const int* const restrict B, - const int* const restrict bld); -void c_l_accumulate_2d_u_(const long* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - long* const restrict A, - const int* const restrict ald, - const long* const restrict B, - const int* const restrict bld); -void c_ll_accumulate_2d_u_(const long long* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - long long* const restrict A, - const int* const restrict ald, - const long long* const restrict B, - const int* const restrict bld); - -void RA_ACCUMULATE_2D_(long*, int*, int*, long*, int*, long*, int*); - -void c_dadd_(const int* const restrict n, - double* const restrict x, - const double* const restrict work); -void c_dadd2_(const int* const restrict n, - double* const restrict x, - const double* const restrict work, - const double* const restrict work2); -void c_dmult_(const int* const restrict n, - double* const restrict x, - const double* const restrict work); -void c_dmult2_(const int* const restrict n, - double* const restrict x, - const double* const restrict work, - const double* const restrict work2); - -#define I_ACCUMULATE_1D c_i_accumulate_1d_ -#define L_ACCUMULATE_1D c_l_accumulate_1d_ -#define LL_ACCUMULATE_1D c_ll_accumulate_1d_ -#define D_ACCUMULATE_1D c_d_accumulate_1d_ -#define C_ACCUMULATE_1D c_c_accumulate_1d_ -#define Z_ACCUMULATE_1D c_z_accumulate_1d_ -#define F_ACCUMULATE_1D c_f_accumulate_1d_ -#define I_ACCUMULATE_2D c_i_accumulate_2d_ -#define L_ACCUMULATE_2D c_l_accumulate_2d_ -#define LL_ACCUMULATE_2D c_ll_accumulate_2d_ -#define D_ACCUMULATE_2D c_d_accumulate_2d_ -#define C_ACCUMULATE_2D c_c_accumulate_2d_ -#define Z_ACCUMULATE_2D c_z_accumulate_2d_ -#define F_ACCUMULATE_2D c_f_accumulate_2d_ -#define FORT_DADD c_dadd_ -#define FORT_DADD2 c_dadd2_ -#define FORT_DMULT c_dmult_ -#define FORT_DMULT2 c_dmult2_ - -#endif /* _ACC_H_ */ diff --git a/armci/src-gemini/aggregate.c b/armci/src-gemini/aggregate.c deleted file mode 100644 index f945d75af..000000000 --- a/armci/src-gemini/aggregate.c +++ /dev/null @@ -1,351 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/** $Id: aggregate.c,v 1.6 2003-10-22 22:12:14 d3h325 Exp $ - * Aggregate Put/Get requests - */ - -#include "armcip.h" -#include /* memcpy */ -#include - -#define _MAX_AGG_BUFFERS 32 /* Maximum # of aggregation buffers available*/ -#define _MAX_AGG_BUFSIZE 2048 /* size of each buffer. should be < 2^15 */ -#define _MAX_PTRS 256 /* < 2^15, as it is "short int" in agg_req_t */ -#define _MAX_AGG_HANDLE _MAX_AGG_BUFFERS /* Max # of aggregation handles */ - -/* aggregate request handle */ -typedef struct { - unsigned int tag; /* non-blocking request tag */ - short int proc; /* remote process id */ - short int request_len ; /* number of requests */ - short int ptr_array_len; /* pointer length for this request */ - short int buf_pos_end; /* position of buffer (from right end) */ - armci_giov_t *darr; /* giov vectors */ -}agg_req_t; -static agg_req_t *aggr[_MAX_AGG_HANDLE]; /* aggregate request handle */ - - -/* data structure for dynamic buffer management */ -typedef struct { - int size; /* represents the size of the list (not linked list) */ - int index[_MAX_AGG_HANDLE]; -} agg_list_t; -static agg_list_t ulist, alist;/*in-use & available aggr buffer index list*/ - - -/* aggregation buffer */ -static char agg_buf[_MAX_AGG_BUFFERS][_MAX_AGG_BUFSIZE]; -/* aggregation buffer to store the pointers */ -static void* agg_src_ptr[_MAX_AGG_BUFFERS][_MAX_PTRS]; -static void* agg_dst_ptr[_MAX_AGG_BUFFERS][_MAX_PTRS]; - -/** - * --------------------------------------------------------------------- - * fill descriptor from this side (left to right) - * ---> - * _______________________________________________ - * | | | |. . . . . . . . . . | | | | - * |__|__|__|_____________________________|__|__|__| - * - * <--- - * fill src and dst pointer (arrays) in this direction - * (right to left) - * - * Once they are about to cross each other (implies buffer is full), - * complete the data transfer. - * --------------------------------------------------------------------- - */ - -#define AGG_INIT_NB_HANDLE(op_type, p, nb_handle) \ - if(nb_handle->proc < 0) { \ - nb_handle->tag = GET_NEXT_NBTAG(); \ - nb_handle->op = op_type; \ - nb_handle->proc = p; \ - nb_handle->bufid= NB_NONE; \ - } \ - else if(nb_handle->op != op_type) \ - armci_die("ARMCI_NbXXX: AGG_INIT_NB_HANDLE(): Aggregate Failed, Invalid non-blocking handle", nb_handle->op); \ - else if(nb_handle->proc != p) \ - armci_die("ARMCI_NbXXX: AGG_INIT_NB_HANDLE(): Aggregate Failed, Invalid non-blocking handle", p) - - -/* initialize/set the fields in the buffer*/ -#define _armci_agg_set_buffer(index, tag, proc, len) { \ - aggr[(index)]->tag = (tag); \ - aggr[(index)]->proc = (proc); \ - aggr[(index)]->request_len = (len); \ - ulist.index[ulist.size++] = (index);/* add the new index to the in-use list and increment it's size*/ \ -} - -/* get the index of the aggregation buffer to be used */ -static int _armci_agg_get_bufferid(armci_ihdl_t nb_handle) { - int i, index, tag = nb_handle->tag, proc = nb_handle->proc; - - /* check if there is an entry for this handle in the existing list*/ - for(i=ulist.size-1; i>=0; i--) { - index = ulist.index[i]; - if(aggr[index]->tag == tag && aggr[index]->proc == proc) - return index; - } - - /* else it is a new handle, so get a aggr buffer from either - of the lists. ???? don't throw exception here */ - if(ulist.size >= _MAX_AGG_BUFFERS && alist.size == 0) - armci_die("_armci_agg_get_index: Too many outstanding aggregation requests\n", ulist.size); - - /*If there is a buffer in readily available list,use it*/ - if(alist.size > 0) index = alist.index[--alist.size]; - else { /* else use/get a buffer from the main list */ - index = ulist.size; - - /* allocate memory for aggregate request handle */ - aggr[index] = (agg_req_t *)agg_buf[index]; - - aggr[index]->request_len = 0; - aggr[index]->ptr_array_len = 0; - aggr[index]->buf_pos_end = _MAX_AGG_BUFSIZE; - - /* allocate memory for giov vector field in aggregate request handler */ - aggr[index]->darr = (armci_giov_t *)(agg_buf[index]+sizeof(agg_req_t)); - } - - _armci_agg_set_buffer(index, tag, proc, 0); - return index; -} - -static void _armci_agg_update_lists(int index) { - int i; - /* remove that index from the in-use list and bring the last element - in the in-use list to the position of the removed one. */ - for(i=0; irequest_len; /* index of giov descriptor */ - bytes_remaining = aggr[index]->buf_pos_end - - (sizeof(agg_req_t) + aggr[index]->request_len*sizeof(armci_giov_t)); - - /* extra bytes required to store registered put data */ - if(is_registered_put) bytes_needed = bytes; - - /* if (byte-)sizes are equal, use previously created descriptor - else get a new descriptor */ - if( rid && bytes==aggr[index]->darr[rid-1].bytes) --rid; - else { get_new_descr=1; bytes_needed += sizeof(armci_giov_t); } - - /* If buffer is full, then complete data transfer. After completion, - if still ptr array_len is greater than maximum limit(_MAX_PTRS), - then do it by parts. Determine new ptr_array_len that fits buffer */ - if( (bytes_needed > bytes_remaining) || - (_MAX_PTRS - aggr[index]->ptr_array_len < *ptr_array_len)) { - armci_agg_complete(nb_handle, SET); - rid = 0; get_new_descr=1; - if(*ptr_array_len > _MAX_PTRS) *ptr_array_len = _MAX_PTRS; - } - - /* if new descriptor, allocate memory for src_ptr & dst_ptr arrays */ - if(get_new_descr) { - int i = aggr[index]->ptr_array_len; - aggr[index]->darr[rid].src_ptr_array = (void **)&agg_src_ptr[index][i]; - aggr[index]->darr[rid].dst_ptr_array = (void **)&agg_dst_ptr[index][i]; - aggr[index]->darr[rid].ptr_array_len = 0; - aggr[index]->request_len++; - } - - /* store registered put data */ - if(is_registered_put) { - aggr[index]->buf_pos_end -= bytes; - memcpy(&((char *)aggr[index])[aggr[index]->buf_pos_end], - *((char **)registered_put_data), bytes); - *(char **)registered_put_data = (char *)&((char *)aggr[index])[aggr[index]->buf_pos_end]; - } - - aggr[index]->ptr_array_len += *ptr_array_len; - return (&aggr[index]->darr[rid]); -} - -int armci_agg_save_descriptor(void *src, void *dst, int bytes, int proc, int op, - int is_registered_put, armci_ihdl_t nb_handle) { - - int one=1, idx; - armci_giov_t * darr; - - /* set up the handle if it is a new aggregation request */ - AGG_INIT_NB_HANDLE(op, proc, nb_handle); - - darr = _armci_agg_get_descriptor(&one, bytes, nb_handle, - is_registered_put, &src); - idx = darr->ptr_array_len; - - darr->src_ptr_array[idx] = src; - darr->dst_ptr_array[idx] = dst; - darr->bytes = bytes; - darr->ptr_array_len += 1; - - fflush(stdout); - return 0; -} - - -int armci_agg_save_giov_descriptor(armci_giov_t dscr[], int len, int proc, - int op, armci_ihdl_t nb_handle) { - int i, j, k, idx, bytes, ptr_array_len; - armci_giov_t * darr; - - /* set up the handle if it is a new aggregation request */ - AGG_INIT_NB_HANDLE(op, proc, nb_handle); - - for(i=0; iptr_array_len; - - for(j=idx; jsrc_ptr_array[j] = dscr[i].src_ptr_array[k]; - darr->dst_ptr_array[j] = dscr[i].dst_ptr_array[k]; - } - darr->bytes = dscr[i].bytes; - darr->ptr_array_len += ptr_array_len; - - ptr_array_len = dscr[i].ptr_array_len - ptr_array_len; - if(ptr_array_len <0) armci_die("agg_save_giov_descr failed", 0L); - } while(k < darr[i].ptr_array_len); - } - return 0; -} - -int armci_agg_save_strided_descriptor(void *src_ptr, int src_stride_arr[], - void* dst_ptr, int dst_stride_arr[], - int count[], int stride_levels, int proc, - int op, armci_ihdl_t nb_handle) { - - int i, j, k, idx, ptr_array_len=1, total1D=1, num1D=0; - int offset1, offset2, factor[MAX_STRIDE_LEVEL]; - armci_giov_t * darr; - - /* set up the handle if it is a new aggregation request */ - AGG_INIT_NB_HANDLE(op, proc, nb_handle); - - for(i=1; i<=stride_levels; i++) { - total1D *= count[i]; - factor[i-1]=0; - } - ptr_array_len = total1D; - - do { - darr=_armci_agg_get_descriptor(&ptr_array_len,count[0],nb_handle,0,0); - idx = darr->ptr_array_len; - - /* converting stride into giov vector */ - for(i=idx; isrc_ptr_array[i] = (char *)src_ptr + offset1; - darr->dst_ptr_array[i] = (char *)dst_ptr + offset2; - ++factor[0]; - ++num1D; - for(j=1; jbytes = count[0]; - darr->ptr_array_len += ptr_array_len; - ptr_array_len = total1D - ptr_array_len; - if(ptr_array_len <0) armci_die("agg_save_strided_descr failed", 0L); - } while(num1D < total1D); - - return 0; -} - - -void armci_agg_complete(armci_ihdl_t nb_handle, int condition) { - int i, index=0, rc; - - /* get the buffer index for this handle */ - for(i=ulist.size-1; i>=0; i--) { - index = ulist.index[i]; - if(aggr[index]->tag == nb_handle->tag && - aggr[index]->proc == nb_handle->proc) - break; - } - if(i<0) return; /* implies this handle has no requests at all */ - -#if 0 - printf("%d: Aggregation Complete to remote process %d (%d:%d requests)\n", - armci_me, nb_handle->proc, index, aggr[index]->request_len); -#endif - - /* complete the data transfer. NOTE: in LAPI, Non-blocking calls - (followed by wait) performs better than blocking put/get */ - if(aggr[index]->request_len) { - switch(nb_handle->op) { -#ifdef LAPI - armci_hdl_t usr_hdl; - case PUT: - ARMCI_INIT_HANDLE(&usr_hdl); - if((rc=PARMCI_NbPutV(aggr[index]->darr, aggr[index]->request_len, - nb_handle->proc, (armci_hdl_t*)&usr_hdl))) - ARMCI_Error("armci_agg_complete: nbputv failed",rc); - PARMCI_Wait((armci_hdl_t*)&usr_hdl); - break; - case GET: - ARMCI_INIT_HANDLE(&usr_hdl); - if((rc=PARMCI_NbGetV(aggr[index]->darr, aggr[index]->request_len, - nb_handle->proc, (armci_hdl_t*)&usr_hdl))) - ARMCI_Error("armci_agg_complete: nbgetv failed",rc); - PARMCI_Wait((armci_hdl_t*)&usr_hdl); - break; -#else - case PUT: - if((rc=PARMCI_PutV(aggr[index]->darr, aggr[index]->request_len, - nb_handle->proc))) - ARMCI_Error("armci_agg_complete: putv failed",rc); - break; - case GET: - if((rc=PARMCI_GetV(aggr[index]->darr, aggr[index]->request_len, - nb_handle->proc))) - ARMCI_Error("armci_agg_complete: getv failed",rc); - break; -#endif - } - } - - /* setting request length to zero, as the requests are completed */ - aggr[index]->request_len = 0; - aggr[index]->ptr_array_len = 0; - aggr[index]->buf_pos_end = _MAX_AGG_BUFSIZE; - - /* If armci_agg_complete() is called PARMCI_Wait(), then unset nb_handle*/ - if(condition==UNSET) { - nb_handle->proc = -1; - _armci_agg_update_lists(index); - } -} - diff --git a/armci/src-gemini/armci-onesided.c b/armci/src-gemini/armci-onesided.c deleted file mode 100644 index 549fd34da..000000000 --- a/armci/src-gemini/armci-onesided.c +++ /dev/null @@ -1,642 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include "armcip.h" -int armci_onesided_ds_handler(void *); - - -#ifdef ARMCI_REGISTER_SHMEM -typedef struct { - void *base_ptr; - void *serv_ptr; - size_t size; - int islocal; - int valid; -} aptl_reginfo_t; - -typedef struct { - aptl_reginfo_t reginfo[MAX_MEM_REGIONS]; - int reg_count; -} rem_meminfo_t; - -static rem_meminfo_t *_rem_meminfo; -static aptl_reginfo_t *_tmp_rem_reginfo; -#define IN_REGION(_ptr__,_reg__) ((_reg__.valid) && (_ptr__)>=(_reg__.serv_ptr) \ - && (_ptr__) <= ( (char *)(_reg__.serv_ptr)+_reg__.size)) -#endif - -static cos_mdesc_t _send_mdesc, _recv_mdesc; -static cos_mdesc_t *send_mdesc = NULL; -static cos_mdesc_t *recv_mdesc = NULL; - -int armci_onesided_direct_get_enabled = 1; -int armci_onesided_direct_put_enabled = 1; - -cos_desc_t __global_1sided_direct_comm_desc; -cos_desc_t __global_1sided_direct_get_comm_desc; - -// linked-list to hold mdh arrays for all ARMCI_Malloc calls -remote_mdh_node_t *remote_mdh_base_node = NULL; - -char **client_buf_ptrs; - -int -armci_onesided_init() -{ - int i; - cos_parameters_t cos_params; - - cos_params.options = ONESIDED_DS_PER_NUMA; - cos_params.nDataServers = 1; - cos_params.maxDescriptors = ARMCI_MAX_DESCRIPTORS*10; - cos_params.maxRequestSize = ARMCI_MAX_REQUEST_SIZE; - cos_params.dsHandlerFunc = armci_onesided_ds_handler; - - bzero(&__global_1sided_direct_comm_desc,sizeof(cos_desc_t)); - - // check to make sure things are properly sized - if(armci_me == 0) { - // ARMCI_ONESIDED_SIZEOF_IREQ is defined in armci.h - if(sizeof(armci_ireq_t) != ARMCI_ONESIDED_SIZEOF_IREQ) { - printf("ARMCI_ONESIDED_SIZEOF_IREQ is not sized correctly.\n"); - printf("ARMCI_ONESIDED_SIZEOF_IREQ = %d\nsizeof(armci_ireq_t) = %d\n", - ARMCI_ONESIDED_SIZEOF_IREQ,sizeof(armci_ireq_t)); - abort(); - } - } - - // initialize libonesided - COS_Init( &cos_params ); - - // initialize armci memory - # ifdef ARMCI_REGISTER_SHMEM - _rem_meminfo = (rem_meminfo_t *)calloc(armci_nproc,sizeof(rem_meminfo_t)); - _tmp_rem_reginfo = (aptl_reginfo_t *)malloc(sizeof(aptl_reginfo_t)*armci_nproc); - if( _rem_meminfo==NULL || _tmp_rem_reginfo ==NULL) { - armci_die("malloc failed in init_portals",0); - } - if(armci_me == 0) { - printf("sizeof(rem_meminfo_t)=%ld\n",sizeof(rem_meminfo_t)); - } - # endif - client_buf_ptrs = (char **) calloc(armci_nproc,sizeof(char *)); - assert(client_buf_ptrs); - armci_msg_barrier(); - _armci_buf_init(); - - // each armci buffer has a cos_request_t associated with it - // initialize that cos_request_t now - // moved into the above _armci_buf_init routine - // for(i=0; idscrlen + msginfo->datalen; - - // print_data(msginfo); - cpReqInit(remote_node, req); - cpPrePostRecv(buffer, length, req); - cpCopyLocalDataMDesc(req, &msginfo->tag.response_mdesc); - if(length > ARMCI_MAX_REQUEST_SIZE) length = sizeof(request_header_t); - cpReqSend(msginfo, length, req); - // cpReqWait(req); // required until a new fence operation is created -} - - - -void -print_data(void* buf) -{ - request_header_t *msginfo = (request_header_t *) buf; - char *buffer = (char *) buf; - buffer += sizeof(request_header_t) + msginfo->dscrlen; - - int ndouble = msginfo->datalen/8; - double *data = (double *) buffer; - - printf("%d: [0]=%lf; [%d]=%lf; from=%d; to=%d\n",armci_me,data[0],ndouble-1,data[ndouble-1],msginfo->from, msginfo->to); -} - - - -static void -armci_onesided_recv(void* buffer, request_header_t *msginfo, int remote_node, cos_request_t *req) -{ - size_t length = sizeof(request_header_t) + msginfo->dscrlen; - size_t reg_len = length + msginfo->datalen; - - cpReqInit(remote_node, req); - cpPrePostRecv(buffer, reg_len, req); - cpCopyLocalDataMDesc(req, &msginfo->tag.response_mdesc); - if(length > ARMCI_MAX_REQUEST_SIZE) length = sizeof(request_header_t); - cpReqSend(msginfo, length, req); -} - - - -static void -armci_onesided_oper(void* buffer, request_header_t *msginfo, int remote_node, cos_request_t *req) -{ - size_t length = sizeof(request_header_t); - - cpReqInit(remote_node, req); - cpPrePostRecv(buffer, length, req); - cpCopyLocalDataMDesc(req, &msginfo->tag.response_mdesc); - cpReqSend(msginfo, length, req); -} - - - -static void -armci_onesided_rmw(void *buffer, request_header_t *msginfo, int remote_node, cos_request_t *req) -{ - size_t length = sizeof(request_header_t) + msginfo->dscrlen + msginfo->datalen; - - cpReqInit(remote_node, req); - cpPrePostRecv(buffer, msginfo->datalen, req); - cpCopyLocalDataMDesc(req, &msginfo->tag.response_mdesc); - cpReqSend(msginfo, length, req); -} - -extern _buf_ackresp_t *_buf_ackresp_first,*_buf_ackresp_cur; - - -#if defined CRAY_REGISTER_ARMCI_MALLOC && HAVE_ONESIDED_FADD -void -armci_onesided_fadd(void *ploc, void *prem, int extra, int proc) -{ - onesided_hnd_t cp_hnd; - cos_desc_t comm_desc; - cos_mdesc_t local_mdh, remote_mdh, *mdh = NULL; - - cpGetOnesidedHandle(&cp_hnd); - armci_onesided_search_remote_mdh_list(prem, proc, &remote_mdh); - onesided_mem_register(cp_hnd, ploc, sizeof(long), NULL, &local_mdh); - onesided_desc_init(cp_hnd, &local_mdh, &remote_mdh, 0, &comm_desc); - onesided_fadd(extra, &comm_desc); - onesided_wait(&comm_desc); -} -#endif - -int -armci_send_req_msg(int proc, void *buf, int bytes, int tag) -{ - int cluster = armci_clus_id(proc); - int serv = armci_clus_info[cluster].master; - char *buffer = (char *) buf; - request_header_t *msginfo = (request_header_t *) buf; - - # ifdef ARMCI_LIMIT_REMOTE_REQUESTS_BY_NODE - _armci_buf_ensure_one_outstanding_op_per_node(buf,cluster); - # endif - - # ifdef SPECIAL_PUT_OPERATION_BROKEN_WHEN_INITIATED_FROM_USER_BUFFER - // ensure any outstanding onesided direct operations have finished - int state = __global_1sided_direct_comm_desc.state; - onesided_wait(&__global_1sided_direct_comm_desc); - if(state) cpMemDeregister(&__global_1sided_direct_comm_desc.local_mdesc); - # endif - - BUF_INFO_T *bufinfo=_armci_buf_to_bufinfo(msginfo); - _buf_ackresp_t *ar = &bufinfo->ar; - cos_request_t *req = &ar->req; - - if(msginfo->operation == PUT || ARMCI_ACC(msginfo->operation)) { - armci_onesided_send(buffer, msginfo, cluster, req); - } - - else if(msginfo->operation == GET) { - // move the buffer shift into the data server handler - // buffer = (char *) buf; - // buffer += sizeof(request_header_t); - // buffer += msginfo->dscrlen; - armci_onesided_recv(buffer, msginfo, cluster, req); - } - - else if(msginfo->operation == ACK) { - armci_onesided_oper(buffer, msginfo, cluster, req); -#if HAVE_ONESIDED_MEM_HTFLUSH - onesided_mem_htflush(cluster); -#endif - } - - else if(msginfo->operation == ARMCI_SWAP || msginfo->operation == ARMCI_SWAP_LONG || - msginfo->operation == ARMCI_FETCH_AND_ADD || - msginfo->operation == ARMCI_FETCH_AND_ADD_LONG) { - buffer = (char *) buf; - buffer += sizeof(request_header_t); - buffer += msginfo->dscrlen; - armci_onesided_rmw(buffer, msginfo, cluster, req); - } - - else { - cosError("armci_send_req_msg: operation not supported",msginfo->operation); - } - - - // this had to be included in the portals version or shit would go down ... not sure y! - // for now, we'll leave it in and see what happens later when we remote it - # if 1 - ar->val = ar->valc = 0; - if(ar==_buf_ackresp_first)_buf_ackresp_first=ar->next; - if(ar->next!=NULL){ - ar->next->previous=ar->previous; - } - if(ar->previous!=NULL){ - ar->previous->next=ar->next; - if(_buf_ackresp_cur==ar)_buf_ackresp_cur=ar->previous; - } - if(_buf_ackresp_cur==ar)_buf_ackresp_cur=NULL; - ar->previous=ar->next=NULL; - # endif - - return 0; -} - - - -char * -armci_ReadFromDirect(int proc, request_header_t *msginfo, int len) -{ - // this is a CP funciton - BUF_INFO_T *bufinfo = _armci_buf_to_bufinfo(msginfo); - cos_request_t *req = &bufinfo->ar.req; - cpReqWait(req); - - // return pointer to data - char *ret = (char *) msginfo; - ret += sizeof(request_header_t); - ret += msginfo->dscrlen; - return ret; -} - - - -void -armci_WriteToDirect(int proc, request_header_t *msginfo, void *buf) -{ - // this is a DS function - cos_desc_t resp_desc; - cos_mdesc_t *resp_mdesc = &msginfo->tag.response_mdesc; - dsDescInit(resp_mdesc, &resp_desc); - resp_desc.event_type = EVENT_LOCAL | EVENT_REMOTE; - if(send_mdesc == NULL) { - send_mdesc = &_send_mdesc; - dsMemRegister(MessageSndBuffer, sizeof(double)*MSG_BUFLEN_DBL, send_mdesc); - } - memcpy(&resp_desc.local_mdesc, send_mdesc, sizeof(cos_mdesc_t)); - resp_desc.local_mdesc.addr = (uint64_t) buf; - resp_desc.local_mdesc.length = (uint64_t) msginfo->datalen; - // cosPut(buf, msginfo->datalen, &resp_desc); - cosPutWithDesc(&resp_desc); - dsDescWait(&resp_desc); -} - - - -int armci_onesided_ds_handler(void *buffer) -{ - size_t length = 0; - cos_desc_t get_desc; - cos_mdesc_t *mdesc = NULL; - void *buffer_to_data_server = buffer; - request_header_t *request = (request_header_t *) buffer; - if(request->operation == PUT || ARMCI_ACC(request->operation)) { - length = sizeof(request_header_t) + request->dscrlen + request->datalen; - if(length > ARMCI_MAX_REQUEST_SIZE) { - char *get_buffer = (char *) MessageRcvBuffer; - if(recv_mdesc == NULL) { - recv_mdesc = &_recv_mdesc; - dsMemRegister(MessageRcvBuffer, sizeof(double)*MSG_BUFLEN_DBL, recv_mdesc); - } - mdesc = &request->tag.response_mdesc; - dsDescInit(mdesc, &get_desc); - get_desc.event_type = EVENT_LOCAL; - memcpy(&get_desc.local_mdesc, recv_mdesc, sizeof(cos_mdesc_t)); - get_desc.local_mdesc.length = length; - assert(length <= sizeof(double)*MSG_BUFLEN_DBL); - cosGetWithDesc(&get_desc); - dsDescWait(&get_desc); - buffer_to_data_server = (void *) get_buffer; - } - } - else if(request->operation == GET) { - length = sizeof(request_header_t) + request->dscrlen; - if(length > ARMCI_MAX_REQUEST_SIZE) { - // printf("[ds %d]: boom - rz fetch of get dscr\n",armci_me); - char *get_buffer = (char *) MessageRcvBuffer; - if(recv_mdesc == NULL) { - recv_mdesc = &_recv_mdesc; - dsMemRegister(MessageRcvBuffer, sizeof(double)*MSG_BUFLEN_DBL, recv_mdesc); - } - mdesc = &request->tag.response_mdesc; - dsDescInit(mdesc, &get_desc); - get_desc.event_type = EVENT_LOCAL; - memcpy(&get_desc.local_mdesc, recv_mdesc, sizeof(cos_mdesc_t)); - get_desc.local_mdesc.length = length; - assert(length <= sizeof(double)*MSG_BUFLEN_DBL); - cosGetWithDesc(&get_desc); - dsDescWait(&get_desc); - buffer_to_data_server = (void *) get_buffer; - } - // regardless of rendez-vous or eager protocols - // we have to shift the buffer and data lengths in the response_mdesc tag - request = (request_header_t *) buffer_to_data_server; - char *rbuf = (char *) request->tag.response_mdesc.addr; - rbuf += length; - request->tag.response_mdesc.addr = (uint64_t) rbuf; - request->tag.response_mdesc.length -= length; - } - - if(request->operation == 0) { - printf("%d [ds] possible zeroed buffer problem\n",armci_me); - abort(); - } - - armci_data_server(buffer_to_data_server); -} - - - -void -armci_rcv_req(void *mesg,void *phdr,void *pdescr,void *pdata,int *buflen) -{ -int i,na; -char *a; -double *tmp; - - request_header_t *msginfo = (request_header_t *)mesg; - - ARMCI_PR_SDBG("enter",msginfo->operation); - *(void **) phdr = msginfo; - - if(0) { - printf("%d [ds]: got %d req (hdrlen=%d dscrlen=%d datalen=%d %d) from %d\n", - armci_me, msginfo->operation, sizeof(request_header_t), msginfo->dscrlen, - msginfo->datalen, msginfo->bytes,msginfo->from); - fflush(stdout); - } - /* we leave room for msginfo on the client side */ - *buflen = MSG_BUFLEN - sizeof(request_header_t); - - - if(send_mdesc == NULL) { - send_mdesc = &_send_mdesc; - dsMemRegister(MessageSndBuffer, sizeof(double)*MSG_BUFLEN_DBL, send_mdesc); - } - - // printf("%d [ds] oper=%d; bytes=%d\n",armci_me,msginfo->operation,msginfo->bytes); - if(msginfo->bytes) { - *(void **) pdescr = msginfo+1; - *(void **) pdata = msginfo->dscrlen + (char*)(msginfo+1); - - if(msginfo->operation == GET) { - // the descriptor will exists after the request header - // but there will be no data buffer - // use the MessageRcvBuffer - *(void**) pdata = MessageSndBuffer; -// printf("%s (server) overriding pdata in rcv_req\n",Portals_ID()); - if(send_mdesc == NULL) { - send_mdesc = &_send_mdesc; - dsMemRegister(MessageSndBuffer, sizeof(double)*MSG_BUFLEN_DBL, send_mdesc); - // printf("send_mdesc registered\n"); - // fflush(stdout); - } - } - } - else { - // printf("%d [ds]: hit this\n",armci_me); - *(void**) pdescr = NULL; - *(void**) pdata = MessageRcvBuffer; - if(recv_mdesc == NULL) { - recv_mdesc = &_recv_mdesc; - dsMemRegister(MessageRcvBuffer, sizeof(double)*MSG_BUFLEN_DBL, recv_mdesc); - } - } - ARMCI_PR_SDBG("exit",msginfo->operation); -} - - - -void -armci_server_send_ack(request_header_t *msginfo) -{ - // this is a DS function - cos_desc_t resp_desc; - cos_mdesc_t *resp_mdesc = &msginfo->tag.response_mdesc; - dsDescInit(resp_mdesc, &resp_desc); - resp_desc.event_type = EVENT_LOCAL | EVENT_REMOTE; - cosPut(NULL, 0, &resp_desc); - dsDescWait(&resp_desc); -} - - - -void -x_buf_wait_ack(request_header_t *msginfo, BUF_INFO_T *bufinfo) -{ - armci_die("x_buf_wait_ack not implemented",911); -} - - - -void -x_net_send_ack(request_header_t *msginfo, int proc, void *dst, void *src) -{ - armci_die("x_net_send_ack not implemented",911); -} - - - -long -x_net_offset(char *buf, int proc) -{ - armci_die("x_net_offset not implemented",911); - # if 0 - ARMCI_PR_DBG("enter",_rem_meminfo[proc].reg_count); - if(DEBUG_COMM) { - printf("\n%d:%s:buf=%p",armci_me,__FUNCTION__,buf);fflush(stdout); - } - for(i=0;i<_rem_meminfo[proc].reg_count;i++) { - if(IN_REGION(buf,_rem_meminfo[proc].reginfo[i])) { - return((long)((char *)_rem_meminfo[proc].reginfo[i].serv_ptr-(char *)_rem_meminfo[proc].reginfo[i].base_ptr)); - } - } - ARMCI_PR_DBG("exit",0); - # endif - return 0; -} - - -// currently our list of remote mdhs appears that it can get several entries with various -// lengths. we should scan the mdh list first to see if an entry exists in the list -// if so, that could be an indication that the remote list entry function is not working -// properly, or that a different type of armci_free call is being used to by pass the -// removal of the mdh entry. either way, we need to examine these occurences. -void -armci_onesided_append_remote_mdh_list(void* tgt_ptr, int proc, cos_mdesc_t *ret_mdh) -{ - -} - -void -armci_onesided_search_remote_mdh_list(void* tgt_ptr, int proc, cos_mdesc_t *ret_mdh) -{ - int node = armci_clus_id(proc); - uint64_t length; - uint64_t rem_addr; - uint64_t tgt_addr = (uint64_t) tgt_ptr; - remote_mdh_node_t *ll = remote_mdh_base_node; - const cos_mdesc_t *mdh = NULL; - - // search the link-list for remote address and return the - while(ll) { - // if we are in this routine, we are doing a direct onesided operations on a chuck of local - // memory that was registered by the master process on this node. typically, an armci operation - // would work directly off the virtual address of that data as attached by the current process; - // however, because we are going to do a UGNI operation targetted at the MDH registered by the - // armci_master rank on this node, we have to translate the virtual address on this rank to the - // virtual address on armci_master. this means we have to find the mdh by searching the ptrs - // array and not the mdhs[*].addr values - if(SAMECLUSNODE(proc) && armci_me != armci_master) { - rem_addr = (uint64_t) ll->ptrs[proc]; - } else { - rem_addr = (uint64_t) ll->mdhs[proc].addr; - } - length = ll->mdhs[proc].length; - if(tgt_addr >= rem_addr && tgt_addr < (rem_addr+length) /* check length of msg */) { - mdh = &ll->mdhs[proc]; - break; - } - ll = ll->next; - } - - // if remote mdh not found - if(mdh == NULL) { - printf("[cp %d]: warning - could not locate remote mdh for a direct put.\n",armci_me); - printf("[cp %d]: searching for tgt_ptr=%p on node=%d / proc=%d\n",armci_me,tgt_ptr,node,proc); - ll = remote_mdh_base_node; - while(ll) { - rem_addr = (uint64_t) ll->ptrs[proc]; - length = ll->mdhs[proc].length; - printf("[cp %d]: ll->ptrs[proc]=%p; ll->mdhs[node].length=%ld\n",armci_me,ll->ptrs[proc], length); - ll = ll->next; - } - abort(); - } - - // setup return mdh - // on the remote side the node master is the only rank that registers the "shared" memmory. however, - // shmat doesn't guarantee that all ranks on the node share the same starting virtual address. that - // is why we have to calculate the offset from the starting address on the node master based on the - // actual virutal addresses on the remote rank. - memcpy(ret_mdh, mdh, sizeof(cos_mdesc_t)); - // ret_mdh->addr += (tgt_addr-rem_addr); - // if(ret_mdh->addr != tgt_addr) { - // printf("%d: ret_mdh->addr=%ld; tgt_addr=%ld\n",armci_me,ret_mdh->addr, tgt_addr); - // fflush(stdout); - // } - - // if we are targeting a rank on the node for a direct operation, we need to translate the address - // if not, then we can use the tgt_addr as passed in - if(SAMECLUSNODE(proc) && armci_me != armci_master) { - ret_mdh->addr += (tgt_addr-rem_addr); - } else { - ret_mdh->addr = tgt_addr; - } -} - -void -armci_onesided_remove_from_remote_mdh_list(void *tgt_ptr) -{ - cos_comm_t info; - cos_mdesc_t *mdh = NULL; - onesided_hnd_t cp_hnd; - int node = armci_clus_id(armci_me); - long total_bytes; - remote_mdh_node_t *rm_ll, *ll = remote_mdh_base_node; - - NTK_MPI_GetComm(ARMCI_COMM_WORLD, &info); - - // get the onesided v2.0 api handle for the compute process - cpGetOnesidedHandle(&cp_hnd); - - // find mdh - while(ll) { - if(tgt_ptr == ll->ptrs[armci_me]) { - mdh = &ll->mdhs[armci_me]; - break; - } - ll = ll->next; - } - - // ensure we have a valid mdh - if(mdh == NULL) abort(); - - // sum the total bytes allocated on the node - MPI_Allreduce(&mdh->length, &total_bytes, 1, MPI_LONG, MPI_SUM, info.numa_comm); - - // node master only - if(info.numa_me == 0 && total_bytes) { - - // deregister memory - onesided_mem_deregister(cp_hnd, mdh); - // cpMemDeregister(mdh); - } - - // free mdhs - free(ll->mdhs); - ll->mdhs = NULL; - - // update linked-list - rm_ll = ll; - if(rm_ll == remote_mdh_base_node) remote_mdh_base_node = rm_ll->next; - else { - ll = remote_mdh_base_node; - while(ll->next != rm_ll) ll = ll->next; - assert(ll->next == rm_ll); - ll->next = rm_ll->next; - } - free(rm_ll); -} - - -void ARMCI_INIT_HANDLE(void *hdl) -{ - bzero(hdl, ARMCI_ONESIDED_SIZEOF_IREQ); -} - - -void armci_direct_on() -{ - armci_onesided_direct_get_enabled = 1; - armci_onesided_direct_put_enabled = 1; -} - -void armci_direct_off() -{ - armci_onesided_direct_get_enabled = 0; - armci_onesided_direct_put_enabled = 0; -} - -void armci_direct_on_() { armci_direct_on(); } -void armci_direct_off_() { armci_direct_off(); } diff --git a/armci/src-gemini/armci-onesided.h b/armci/src-gemini/armci-onesided.h deleted file mode 100644 index 6892d555f..000000000 --- a/armci/src-gemini/armci-onesided.h +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef __ARMCI_ONESIDED_H__ -#define __ARMCI_ONESIDED_H__ - -#include "onesided.h" - -#define NUM_SERV_BUFS 1 -#define MAX_MEM_REGIONS 30 - -#define ARMCI_BUF_SIZE 262144 -#define ARMCI_SMALL_BUF_SIZE 2048 - -#define ARMCI_MAX_BUFS 4 -#define ARMCI_MAX_SMALL_BUFS 8 - -#define ARMCI_MAX_DESCRIPTORS (ARMCI_MAX_BUFS+ARMCI_MAX_SMALL_BUFS) -#define ARMCI_MAX_REQUEST_SIZE ARMCI_SMALL_BUF_SIZE - -/* - There is a problem with ga_transpose when CRAY_REGISTER_ARMCI_MALLOC - is defined. - - The fix is a special hook in ga_transpose to turn off the direct puts - during the transpose. This may indicate a race condition in the - transpose code or a problem with the direct fencing. -*/ -#define CRAY_REGISTER_ARMCI_MALLOC -#define ARMCI_LIMIT_REMOTE_REQUESTS_BY_NODE_TURNED_OFF -#define MAX_OUTSTANDING_ONESIDED_GETS 64 - -#define ARMCI_ONESIDED_GETS_USES_NBGETS - -/* typedefs */ - -typedef struct armci_onesided_msg_tag_s { - int msgid; - cos_mdesc_t response_mdesc; -} armci_onesided_msg_tag_t; - - -typedef struct remote_mdh_node { - void **ptrs; - cos_mdesc_t *mdhs; - struct remote_mdh_node *next; -} remote_mdh_node_t; - -// linked-list of remote mdhs -// a new node is created on each ARMCI_Malloc operation -// not an ideal scenario -- perhaps use Abhinav's new dreg routines -// to store this data ... it would require two pieces of info ... -// remote target and remote virtual addr ... and return the mdh -// for now: manually look up the mdh in ARMCI_GetS and ARMCI_PutS -extern remote_mdh_node_t *remote_mdh_base_node; - -/* functions */ -int armci_onesided_init(); -void armci_transport_cleanup(); -void armci_rcv_req(void *,void *,void *,void *,int *); - -void print_data(void *); - -void armci_onesided_search_remote_mdh_list(void* tgt_ptr, int proc, cos_mdesc_t *mdh); -void armci_onesided_remove_from_remote_mdh_list(void *tgt_ptr); - -#if defined CRAY_REGISTER_ARMCI_MALLOC && HAVE_ONESIDED_FADD -void armci_onesided_fadd(void *ploc, void *prem, int extra, int proc); -#endif - -extern int armci_onesided_direct_get_enabled; -extern int armci_onesided_direct_put_enabled; -extern cos_desc_t __global_1sided_direct_comm_desc; -extern cos_desc_t __global_1sided_direct_get_comm_desc; - - -/* set up internals */ - -#ifdef MAX_BUFS -#error "MAX_BUFS should not be defined yet" -#else -#define MAX_BUFS ARMCI_MAX_BUFS -#endif - -#ifdef MAX_SMALL_BUFS -#error "MAX_SMALL_BUFS should not be defined yet" -#else -#define MAX_SMALL_BUFS ARMCI_MAX_SMALL_BUFS -#endif - -#ifdef MSG_BUFLEN_DBL -#error "MSG_BUFLEN_DBL should not be defined yet" -#else -#define MSG_BUFLEN_DBL ARMCI_BUF_SIZE -#endif - - -/* for buffers */ - -extern char **client_buf_ptrs; -#define BUF_ALLOCATE armci_portals_client_buf_allocate -//define BUF_EXTRA_FIELD_T comp_desc* -//define INIT_SEND_BUF(_field,_snd,_rcv) _snd=1;_rcv=1;_field=NULL -#define GET_SEND_BUFFER _armci_buf_get -#define FREE_SEND_BUFFER _armci_buf_release - -//define CLEAR_SEND_BUF_FIELD(_field,_snd,_rcv,_to,_op) if((_op==UNLOCK || _op==PUT || ARMCI_ACC(_op)) && _field!=NULL)x_buf_wait_ack((request_header_t *)((void **)&(_field)+1),((char *)&(_field)-sizeof(BUF_INFO_T)));_field=NULL; -//define TEST_SEND_BUF_FIELD(_field,_snd,_rcv,_to,_op,_ret) - -#define CLEAR_SEND_BUF_FIELD(_field,_snd,_rcv,_to,_op) -#define TEST_SEND_BUF_FIELD(_field,_snd,_rcv,_to,_op,_ret) - -#define COMPLETE_HANDLE _armci_buf_complete_nb_request - -//define NB_CMPL_T comp_desc* -#if 0 -#define ARMCI_NB_WAIT(_cntr) if(_cntr){\ - int rc;\ - if(nb_handle->tag)\ - if(nb_handle->tag==_cntr->tag)\ - rc = armci_client_complete(0,nb_handle->proc,nb_handle->tag,_cntr);\ -} else{\ -printf("\n%d:wait null ctr\n",armci_me);} -#endif -#define ARMCI_NB_WAIT(_cntr) - - - -#endif diff --git a/armci/src-gemini/armci.c b/armci/src-gemini/armci.c deleted file mode 100644 index d7e760199..000000000 --- a/armci/src-gemini/armci.c +++ /dev/null @@ -1,1030 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: armci.c,v 1.114.2.17 2007-08-30 22:58:18 manoj Exp $ */ - -/* DISCLAIMER - * - * This material was prepared as an account of work sponsored by an - * agency of the United States Government. Neither the United States - * Government nor the United States Department of Energy, nor Battelle, - * nor any of their employees, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR - * ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY, - * COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT, - * SOFTWARE, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT - * INFRINGE PRIVATELY OWNED RIGHTS. - * - * - * ACKNOWLEDGMENT - * - * This software and its documentation were produced with United States - * Government support under Contract Number DE-AC06-76RLO-1830 awarded by - * the United States Department of Energy. The United States Government - * retains a paid-up non-exclusive, irrevocable worldwide license to - * reproduce, prepare derivative works, perform publicly and display - * publicly by or for the US Government, including the right to - * distribute to other US Government contractors. - */ - -#define EXTERN -/*#define PRINT_BT*/ -#define _GNU_SOURCE -#include -#include -#include -#if defined(CRAY) && !defined(__crayx1) -# include -# include -# include -#endif -#ifdef LAPI -# include "lapidefs.h" -#endif -#include -#include "armcip.h" -#include "copy.h" -#include "memlock.h" -#include "armci_shmem.h" -#include "signaltrap.h" - -#ifdef ARMCIX -#include "x/armcix.h" -#endif -#ifdef BGML -#include "bgml.h" -#include -#include "bgmldefs.h" -extern void armci_msg_barrier(void); -#endif - -#ifdef CRAY_SHMEM -# ifdef CRAY_XT -# include -# else -# include -# endif -#endif - -#include - -/* global variables */ -int armci_me, armci_Sme, armci_nproc; -int armci_clus_me, armci_nclus, armci_master; -int armci_clus_first, armci_clus_last; -int *_armci_argc=NULL; -char ***_armci_argv=NULL; -int _armci_initialized_args=0; -int _armci_initialized=0; -int _armci_terminating =0; -thread_id_t armci_usr_tid; -armci_ireq_t armci_inb_handle[ARMCI_MAX_IMPLICIT];/*implicit non-blocking handle*/ -#ifndef HITACHI -double armci_internal_buffer[BUFSIZE_DBL]; -#endif -#if defined(SYSV) || defined(WIN32) || defined(MMAP) || defined(HITACHI) || defined(CATAMOUNT) || defined(BGML) -# include "locks.h" - lockset_t lockid; -#endif - -int* armci_prot_switch_fence=NULL; -int armci_prot_switch_preproc = -1; -int armci_prot_switch_preop = -1; - -#ifdef BGML -/* void armci_allocate_locks(); */ - void armci_init_memlock(); -#endif - -#ifdef LIBELAN_ATOMICS -ELAN_ATOMIC *a; -#warning "Enabling new atomics" -#endif - -typedef struct{ - int sent; - int received; - int waited; -}armci_notify_t; - -armci_notify_t **_armci_notify_arr; - -void ARMCI_Cleanup() -{ -#if defined(DATA_SERVER) -#if defined(LIBONESIDED) - dsTurnOff(); -#else - if(armci_nclus >1){ - armci_wait_for_server(); - } -#endif -#endif - -#if (defined(SYSV) || defined(WIN32) || defined(MMAP))&& !defined(HITACHI) - Delete_All_Regions(); - if(armci_nproc>1) -#if !defined(LAPI) - DeleteLocks(lockid); -#endif - -#ifndef WIN32 - ARMCI_RestoreSignals(); -#endif - -#endif - armci_transport_cleanup(); - -} - -int armci_getbufsize() -{ - return(BUFSIZE); -} - -void armci_notify_init() -{ - int rc,bytes=sizeof(armci_notify_t)*armci_nproc; - -#ifdef DOELAN4 - armci_elan_notify_init(); - return; -#endif - - _armci_notify_arr= - (armci_notify_t**)malloc(armci_nproc*sizeof(armci_notify_t*)); - if(!_armci_notify_arr)armci_die("armci_notify_ini:malloc failed",armci_nproc); - - if((rc=PARMCI_Malloc((void **)_armci_notify_arr, bytes))) - armci_die(" armci_notify_init: armci_malloc failed",bytes); - bzero(_armci_notify_arr[armci_me], bytes); -} - - -static void armci_perror_msg() -{ - char perr_str[80]; - if(!errno)return; - sprintf(perr_str,"Last System Error Message from Task %d:",armci_me); - perror(perr_str); -} - -static void armci_abort(int code) -{ - abort(); -#if !defined(BGML) - armci_perror_msg(); -#endif - ARMCI_Cleanup(); - /* data server process cannot use message-passing library to abort - * it simply exits, parent will get SIGCHLD and abort the program - */ -#if defined(DATA_SERVER) - if(armci_me<0)_exit(1); - else -#endif - armci_msg_abort(code); -} - - -void armci_die(char *msg, int code) -{ - void *bt[100]; - - if(_armci_terminating)return; - else _armci_terminating=1; - - if(SERVER_CONTEXT){ - fprintf(stdout,"%d(s):%s: %d\n",armci_me, msg, code); fflush(stdout); - // fprintf(stderr,"%d(s):%s: %d\n",armci_me, msg, code); - }else{ - fprintf(stdout,"%d:%s: %d\n",armci_me, msg, code); fflush(stdout); - //fprintf(stderr,"%d:%s: %d\n",armci_me, msg, code); - } - -#ifdef PRINT_BT - backtrace_symbols_fd(bt, backtrace(bt, 100), 2); -#endif - - armci_abort(code); -} - - -void armci_die2(char *msg, int code1, int code2) -{ - void *bt[100]; - - if(_armci_terminating)return; - else _armci_terminating=1; - - if(SERVER_CONTEXT){ - fprintf(stdout,"%d(s):%s: (%d,%d)\n",armci_me,msg,code1,code2); - fflush(stdout); - fprintf(stderr,"%d(s):%s: (%d,%d)\n",armci_me,msg,code1,code2); - }else{ - fprintf(stdout,"%d:%s: (%d,%d)\n",armci_me,msg,code1,code2); - fflush(stdout); - fprintf(stderr,"%d:%s: (%d,%d)\n",armci_me,msg,code1,code2); - } -#ifdef PRINT_BT - backtrace_symbols_fd(bt, backtrace(bt, 100), 2); -#endif - armci_abort(code1); -} - - -void ARMCI_Error(char *msg, int code) -{ - armci_die(msg,code); -} - - -void armci_allocate_locks() -{ - /* note that if ELAN_ACC is defined the scope of locks is limited to SMP */ -#if !defined(CRAY_SHMEM) && (defined(HITACHI) || defined(CATAMOUNT) || \ - (defined(QUADRICS) && defined(_ELAN_LOCK_H) && !defined(ELAN_ACC))) - armcill_allocate_locks(NUM_LOCKS); -#elif (defined(SYSV) || defined(WIN32) || defined(MMAP)) && !defined(HITACHI) - if(armci_nproc == 1)return; -# if defined(SPINLOCK) || defined(PMUTEX) || defined(PSPIN) - CreateInitLocks(NUM_LOCKS, &lockid); -# else - if(armci_master==armci_me)CreateInitLocks(NUM_LOCKS, &lockid); - armci_msg_clus_brdcst(&lockid, sizeof(lockid)); - if(armci_master != armci_me)InitLocks(NUM_LOCKS, lockid); -# endif -#endif -} - - -void ARMCI_Set_shm_limit(unsigned long shmemlimit) -{ -#if (defined(SYSV) || defined(WIN32) || defined(MMAP)) && !defined(HITACHI) -#define EXTRASHM 1024 /* extra shmem used internally in ARMCI */ -unsigned long limit; - limit = shmemlimit + EXTRASHM; - armci_set_shmem_limit_per_core(limit); -#endif -} - - - -/*\ allocate and initialize memory locking data structure -\*/ -void armci_init_memlock() -{ - int bytes = MAX_SLOTS*sizeof(memlock_t); - int rc, msize_per_proc=bytes; - -#ifdef MEMLOCK_SHMEM_FLAG - /* last proc on node allocates memlock flag in shmem */ - if(armci_clus_last == armci_me) bytes += sizeof(int); -#endif - - memlock_table_array = malloc(armci_nproc*sizeof(void*)); - if(!memlock_table_array) armci_die("malloc failed for ARMCI lock array",0); - - rc = PARMCI_Malloc(memlock_table_array, bytes); - if(rc) armci_die("failed to allocate ARMCI memlock array",rc); - - armci_msg_barrier(); - - bzero(memlock_table_array[armci_me],bytes); - -#ifdef BGML - bgml_init_locks ((void *) memlock_table_array[armci_me]); -#elif ARMCIX - ARMCIX_init_memlock ((memlock_t *) memlock_table_array[armci_me]); -#endif - - -#ifdef MEMLOCK_SHMEM_FLAG - /* armci_use_memlock_table is a pointer to local memory variable=1 - * we overwrite the pointer with address of shared memory variable - * armci_use_memlock_table and initialize it >0 - */ - armci_use_memlock_table = (int*) (msize_per_proc + - (char*) memlock_table_array[armci_clus_last]); - - /* printf("%d: last=%d bytes=%d ptr =(%d, %d)\n", - armci_me,armci_clus_last,bytes,armci_use_memlock_table, - memlock_table_array[armci_clus_last]); fflush(stdout); */ - - if(armci_clus_last == armci_me) *armci_use_memlock_table =1+armci_me; - -#endif - - *armci_use_memlock_table = 0; - armci_msg_barrier(); -} - - -#if defined(SYSV) || defined(WIN32) -static void armci_check_shmmax() -{ - long mylimit, limit; - mylimit = limit = (long) armci_max_region(); - armci_msg_bcast_scope(SCOPE_MASTERS, &limit, sizeof(long), 0); - if(mylimit != limit){ - printf("%d:Shared mem limit in ARMCI is %ld bytes on node %s vs %ld on %s\n", - armci_me,mylimit<<10,armci_clus_info[armci_clus_me].hostname, - limit<<10, armci_clus_info[0].hostname); - fflush(stdout); sleep(1); - armci_die("All nodes must have the same SHMMAX limit if NO_SHM is not defined",0); - } -} -#endif - -extern void armci_region_shm_malloc(void *ptr_arr[], size_t bytes); - - -void ARMCI_NetInit() -{ - /*armci_portals_net_init();*/ -} - -int PARMCI_Init_args(int *argc, char ***argv) -{ - armci_msg_init(argc,argv); - - _armci_argc = argc; - _armci_argv = argv; - _armci_initialized_args=1; - PARMCI_Init(); -} - - -extern void *sbrk(intptr_t); -extern void code_summary(); - -int _armci_init(MPI_Comm comm) -{ - caddr_t atbeginbrval = (caddr_t)sbrk(0); - if(_armci_initialized>0) return 0; -#ifdef NEW_MALLOC - mallopt(M_MMAP_MAX, 0); - mallopt(M_TRIM_THRESHOLD, -1); -#endif - - armci_msg_init(NULL, NULL); - - armci_nproc = armci_msg_nproc(); - armci_me = armci_msg_me(); - armci_usr_tid = THREAD_ID_SELF(); /*remember the main user thread id */ - armci_init_clusinfo(); - armci_prot_switch_fence = malloc(sizeof(int*)*armci_nproc); - assert(armci_prot_switch_fence !=NULL); - # ifdef LIBONESIDED - armci_onesided_init(); - # endif -#ifdef MSG_COMMS_MPI - armci_group_init(); -#endif -#ifndef NEW_MALLOC - armci_krmalloc_init_localmem(); -#endif -#if defined(SYSV) || defined(WIN32) || defined(MMAP) - if(ARMCI_Uses_shm() ) { - armci_shmem_init(); - } -#endif - armci_allocate_locks(); - armci_init_fence(); -#if ARMCI_ENABLE_GPC_CALLS - gpc_init_signals(); -#endif - armci_msg_barrier(); - armci_init_memlock(); /* allocate data struct for locking memory areas */ - armci_msg_barrier(); - //if(armci_me == 0) code_summary(); - armci_msg_barrier(); - armci_msg_gop_init(); - _armci_initialized++; - return 0; -} - - -int PARMCI_Init() -{ - return _armci_init(MPI_COMM_WORLD); -} - - -int PARMCI_Init_mpi_comm(MPI_Comm comm) -{ - return _armci_init(comm); -} - - -void PARMCI_Finalize() -{ - if(!_armci_initialized)return; - _armci_initialized--; - if(_armci_initialized)return; - - _armci_terminating =1; - armci_msg_barrier(); - if(armci_me==armci_master) ARMCI_ParentRestoreSignals(); - -#ifdef PORTALS - request_header_t msg; - portals_ds_req_t req; - ptl_process_id_t dsid = portals_id_map[armci_me]; - msg.operation = QUIT; - - if(armci_me == armci_master) { - portalsBlockingRemoteOperationToNode(&msg,sizeof(request_header_t),armci_clus_me); - } - - armci_msg_barrier(); - portals_cp_finalize(); - -#else - - ARMCI_Cleanup(); - armci_msg_barrier(); - armci_group_finalize(); - free(armci_prot_switch_fence); -#endif -#ifdef MSG_COMMS_MPI - MPI_Comm_free(&ARMCI_COMM_WORLD); /*JD: free at last*/ -#endif -} - - -/* Indicates whether ARMCI_Init or ARMCI_Init_args has been called. */ -int PARMCI_Initialized() -{ - return (_armci_initialized > 0) ? 1 : 0; -} - - -#if !(defined(SYSV) || defined(WIN32)) -void ARMCI_Set_shmem_limit(unsigned long shmemlimit) -{ - /* not applicable here - * aborting would make user's life harder - */ -} -#endif - - - -void ARMCI_Copy(void *src, void *dst, int n) -{ - armci_copy(src,dst,n); -} - -extern void cpu_yield(); -void armci_util_wait_int(volatile int *p, int val, int maxspin) -{ -int count=0; -extern void cpu_yield(); - while(*p != val) - if((++count)proc); - - if(direct) { - return(success); - } - - if(nb_handle) { - - if(nb_handle->onesided_direct) { - for(i=0; icomm_desc[i].state) { - onesided_wait(&nb_handle->comm_desc[i]); - cpMemDeregister(&nb_handle->comm_desc[i].local_mdesc); - } - } - __asm__ __volatile__ ("mfence" ::: "memory"); - __asm__ __volatile__ ("sfence" ::: "memory"); - ARMCI_INIT_HANDLE(nb_handle); - return(success); - } - - if(nb_handle->agg_flag) { - armci_agg_complete(nb_handle, UNSET); - return (success); - } - - if(nb_handle->tag!=0 && nb_handle->bufid==NB_NONE) { - ARMCI_NB_WAIT(nb_handle->cmpl_info); - __asm__ __volatile__ ("mfence" ::: "memory"); - __asm__ __volatile__ ("sfence" ::: "memory"); - return(success); - } - - # ifdef COMPLETE_HANDLE - COMPLETE_HANDLE(nb_handle->bufid,nb_handle->tag,(&success)); - # endif - } - - __asm__ __volatile__ ("mfence" ::: "memory"); - __asm__ __volatile__ ("sfence" ::: "memory"); - return(success); -} - -/** - * implicit handle - */ -static char hdl_flag[ARMCI_MAX_IMPLICIT]; -static int impcount=0; -armci_ihdl_t armci_set_implicit_handle (int op, int proc) { - - int i=impcount%ARMCI_MAX_IMPLICIT; - if(hdl_flag[i]=='1') - PARMCI_Wait((armci_hdl_t*)&armci_inb_handle[i]); - -#ifdef BGML - armci_inb_handle[i].count=0; -#endif - armci_inb_handle[i].tag = GET_NEXT_NBTAG(); - armci_inb_handle[i].op = op; - armci_inb_handle[i].proc = proc; - armci_inb_handle[i].bufid = NB_NONE; - armci_inb_handle[i].agg_flag = 0; - hdl_flag[i]='1'; - ++impcount; - return &armci_inb_handle[i]; -} - - -/* wait for all non-blocking operations to finish */ -int PARMCI_WaitAll (void) { -#ifdef BGML - BGML_WaitAll(); -#elif ARMCIX - ARMCIX_WaitAll (); -#else - int i; - if(impcount) { - for(i=0; iagg_flag = 1; - ((armci_ihdl_t)(nb_handle))->proc = -1; -} - -void ARMCI_UNSET_AGGREGATE_HANDLE(armci_hdl_t* nb_handle) { - ((armci_ihdl_t)(nb_handle))->agg_flag = 0; - ((armci_ihdl_t)(nb_handle))->proc = -1; -} - -int parmci_notify(int proc) -{ -#ifdef DOELAN4 - if(proc==armci_me){ - return 0; - } -#endif -#if defined(GM) || (defined(DOELAN4) && defined(ELAN_ACC)) - { - extern int armci_inotify_proc(int); - return(armci_inotify_proc(proc)); - } -#else - armci_notify_t *pnotify = _armci_notify_arr[armci_me]+proc; - pnotify->sent++; -# ifdef MEM_FENCE - if(SAMECLUSNODE(proc)) MEM_FENCE; -# endif - PARMCI_Put(&pnotify->sent,&(_armci_notify_arr[proc]+armci_me)->received, - sizeof(pnotify->sent),proc); - return(pnotify->sent); -#endif -} - - -/*\ blocks until received count becomes >= waited count - * return received count and store waited count in *pval -\*/ -int parmci_notify_wait(int proc,int *pval) -{ - int retval; -#ifdef DOELAN4 - if(proc==armci_me){ -#ifdef MEM_FENCE - MEM_FENCE; -#endif - return 0; - } -#endif - -#if defined(GM) || (defined(DOELAN4) && defined(ELAN_ACC)) - { - extern int armci_inotify_wait(int,int*); - retval=armci_inotify_wait(proc,pval); - } -#else - { - long loop=0; - armci_notify_t *pnotify = _armci_notify_arr[armci_me]+proc; - pnotify->waited++; - while( pnotify->waited > pnotify->received) { - if(++loop == 1000) { loop=0;cpu_yield(); } - armci_util_spin(loop, pnotify); - } - *pval = pnotify->waited; - retval=pnotify->received; - } -#endif - - return retval; -} - -long armci_util_long_getval(long* p) -{ - return *p; -} - -int armci_util_int_getval(int* p) -{ - return *p; -} - - -int PARMCI_Test(armci_hdl_t *usr_hdl) -{ -armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl; -int success=0; -#ifdef BGML - success=(int)nb_handle->count; -#else -int direct=SAMECLUSNODE(nb_handle->proc); - if(direct)return(success); - if(nb_handle) { - if(nb_handle->agg_flag) { - armci_die("test for aggregate handle not yet implemented\n",0); - } - } - if(nb_handle){ -# ifdef ARMCI_NB_TEST - if(nb_handle->tag==0){ - ARMCI_NB_TEST(nb_handle->cmpl_info,&success); - return(success); - } -# ifdef LAPI - if(nb_handle->tag!=0 && nb_handle->bufid==NB_NONE){ - ARMCI_NB_TEST(nb_handle->cmpl_info,&success); - return(success); - } -# endif -# endif -# ifdef TEST_HANDLE - TEST_HANDLE(nb_handle->bufid,nb_handle->tag,(&success)); -# endif - } -#endif - return(success); -} - -#ifdef DO_CKPT -void ARMCI_Ckpt_create_ds(armci_ckpt_ds_t *ckptds, int count) -{ - armci_create_ckptds(ckptds,count); -} - -int ARMCI_Ckpt_init(char *filename, ARMCI_Group *grp, int savestack, int saveheap, armci_ckpt_ds_t *ckptds) -{ -int rid; - rid = armci_icheckpoint_init(filename,grp,savestack,saveheap,ckptds); - return(rid); -} - -int ARMCI_Ckpt(int rid) -{ - return(armci_icheckpoint(rid)); -} - -void ARMCI_Ckpt_Recover(int rid, int iamreplacement) -{ - armci_irecover(rid, iamreplacement); -} -void ARMCI_Ckpt_finalize(int rid) -{ - armci_icheckpoint_finalize(rid); -} -#endif -#if ARMCI_ENABLE_GPC_CALLS -int armci_gpc(int hndl, int proc, void *hdr, int hlen, void *data, int dlen, - void *rhdr, int rhlen, void *rdata, int rdlen, - armci_hdl_t* nbh) { -armci_ihdl_t nb_handle = (armci_ihdl_t)nbh; -armci_giov_t darr[2]; /* = {{&rhdr, &rhdr, 1, rhlen}, {&rdata, &rdata, 1, rdlen}};*/ -gpc_send_t send; -char *ptr; - - /* initialize giov */ - darr[0].src_ptr_array = &rhdr; - darr[0].dst_ptr_array = &rhdr; - darr[0].ptr_array_len = 1; - darr[0].bytes = rhlen; - - darr[1].src_ptr_array = &rdata; - darr[1].dst_ptr_array = &rdata; - darr[1].ptr_array_len = 1; - darr[1].bytes = rdlen; - - -/* if(hlen<0 || hlen>=ARMCI_Gpc_get_hlen()) */ -/* return FAIL2; */ -/* if(rhlen<0 || rhlen>=ARMCI_Gpc_get_hlen()) */ -/* return FAIL2; */ -/* if(dlen<0 || dlen>=ARMCI_Gpc_get_dlen()) */ -/* return FAIL2; */ -/* if(rdlen<0 || rdlen>=ARMCI_Gpc_get_dlen()) */ -/* return FAIL2; */ - - if(hlen>0 && hdr==NULL) - return FAIL3; - if(rhlen>0 && rhdr==NULL) - return FAIL3; - if(dlen>0 && data==NULL) - return FAIL3; - if(rdlen>0 && rdata==NULL) - return FAIL3; - - if(proc<0 || proc >= armci_nproc) - return FAIL4; - - send.hndl = hndl; - send.hlen = hlen; - send.dlen = dlen; - send.hdr = hdr; - send.data = data; - - if(nb_handle){ - nb_handle->tag = GET_NEXT_NBTAG(); - nb_handle->op = GET; - nb_handle->proc= proc; - nb_handle->bufid=NB_NONE; - } - else { - ORDER(GET,proc); /*ensure ordering */ - nb_handle = NULL; - } - -#if defined(LAPI) || defined(GM) || defined(VAPI) || defined(QUADRICS) - if(armci_rem_gpc(GET, darr, 2, &send, proc, 1, nb_handle)) -#endif - return FAIL2; - return 0; -} - -int armci_sameclusnode(int proc) { - return SAMECLUSNODE(proc); -} -#endif - -void _armci_init_handle(armci_hdl_t *hdl) -{ - ((double *)((hdl)->data))[0]=0; - ((double *)((hdl)->data))[1]=0; -} - -static inline int val_to_char(int v) -{ - if (v >= 0 && v < 10) - return '0' + v; - else if (v >= 10 && v < 16) - return ('a' - 10) + v; - else - return -1; -} -static const char *nexttoken(const char *q, int sep) -{ - if (q) - q = strchr(q, sep); - if (q) - q++; - return q; -} - -#ifdef PORTALS_UNRESOLVED -int cstr_to_cpuset(cpu_set_t * mask, const char *str) -{ -const char *p, *q; -q = str; - CPU_ZERO(mask); - - while (p = q, q = nexttoken(q, ','), p) { - unsigned int a; /* beginning of range */ - unsigned int b; /* end of range */ - unsigned int s; /* stride */ - const char *c1, *c2; - if (sscanf(p, "%u", &a) < 1) - return 1; - b = a; - s = 1; - c1 = nexttoken(p, '-'); - c2 = nexttoken(p, ','); - if (c1 != NULL && (c2 == NULL || c1 < c2)) { - if (sscanf(c1, "%u", &b) < 1) - return 1; - c1 = nexttoken(c1, ':'); - if (c1 != NULL && (c2 == NULL || c1 < c2)) - if (sscanf(c1, "%u", &s) < 1) { - return 1; - } - } - if (!(a <= b)) - return 1; - while (a <= b) { - CPU_SET(a, mask); - a += s; - } - } - return 0; -} - -char *cpuset_to_cstr(cpu_set_t * mask, char *str) -{ -int i; -char *ptr = str; -int entry_made = 0; - for (i = 0; i < CPU_SETSIZE; i++) { - if (CPU_ISSET(i, mask)) { - int j; - int run = 0; - entry_made = 1; - for (j = i + 1; j < CPU_SETSIZE; j++) { - if (CPU_ISSET(j, mask)) - run++; - else - break; - } - if (!run) - sprintf(ptr, "%d,", i); - else if (run == 1) { - sprintf(ptr, "%d,%d,", i, i + 1); - i++; - } else { - sprintf(ptr, "%d-%d,", i, i + run); - i += run; - } - while (*ptr != 0) - ptr++; - } - } - ptr -= entry_made; - *ptr = 0; - return str; -} - -char *cpuset_to_str(cpu_set_t * mask, char *str) -{ -int base; -char *ptr = str; -char *ret = 0; - for (base = CPU_SETSIZE - 4; base >= 0; base -= 4) { - char val = 0; - if (CPU_ISSET(base, mask)) - val |= 1; - if (CPU_ISSET(base + 1, mask)) - val |= 2; - if (CPU_ISSET(base + 2, mask)) - val |= 4; - if (CPU_ISSET(base + 3, mask)) - val |= 8; - if (!ret && val) - ret = ptr; - *ptr++ = val_to_char(val); - } - *ptr = 0; - return ret ? ret : ptr - 1; -} -#endif - - -long armci_cksm_copy(char *src, char *dst, size_t bytes) -{ -long sum = 0; -size_t count=bytes; - while( count > 1 ) { - sum += * (unsigned int *) src++; - count -= 4; - } - - if( count > 0 ){ - printf("\nblistering barnicles"); - sum += * (unsigned char *) src; - } - - while (sum>>32) - sum = (sum & 0xffffffff) + (sum >> 32); - return(~sum); -} - -void code_summary() { - printf("\nActive #defines that could affect ARMCI"); - printf("\n----------------------------------------"); -# ifdef ORNL_USE_DS_FOR_REMOTE_GETS - printf("\n#define ORNL_USE_DS_FOR_REMOTE_GETS"); -# endif - -# ifdef PORTALS_USE_RENDEZ_VOUS - printf("\n#define PORTALS_USE_RENDEZ_VOUS"); -# endif - -# ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - printf("\n#define PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE"); -# endif - -# ifdef PORTALS_AFFINITY - printf("\n#define PORTALS_AFFINITY"); -# endif - -/* -# ifdef CRAY_USE_MDMD_COPY - printf("\n#define CRAY_USE_MDMD_COPY"); -# endif -*/ - printf("\n----------------------------------------"); - printf("\nInfo @ armci/src/code_options.h"); - printf("\n----------------------------------------\n"); - -# ifdef PORTALS - portals_print_summary(); -# endif -} diff --git a/armci/src-gemini/armci.h b/armci/src-gemini/armci.h deleted file mode 100644 index 643240cb2..000000000 --- a/armci/src-gemini/armci.h +++ /dev/null @@ -1,403 +0,0 @@ -/*$id$*/ -/* ARMCI header file */ -#ifndef _ARMCI_H -#define _ARMCI_H - -/* for size_t */ -#include - -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif - -typedef unsigned long long u64Int; -typedef long long s64Int; - -extern int armci_sameclusnode(int proc); - -typedef struct { - void **src_ptr_array; - void **dst_ptr_array; - int ptr_array_len; - int bytes; -} armci_giov_t; -typedef long armci_size_t; -extern int armci_notify(int proc); -extern int armci_notify_wait(int proc,int *pval); -extern int ARMCI_Init(void); /* initialize ARMCI */ -extern int ARMCI_Init_mpi_comm(MPI_Comm comm); /* initialize ARMCI */ -extern int ARMCI_Init_args(int *argc, char ***argv); -extern void ARMCI_Barrier(void); /* ARMCI Barrier*/ - -extern int ARMCI_Put(void *src, void* dst, int bytes, int proc); -extern int ARMCI_Put_flag(void *src, void* dst,int bytes,int *f,int v,int proc); - -#define ARMCI_Put1(_s,_d,_b,_p) memcpy(_d,_s,_b), 0 - -extern int ARMCI_PutS( /* strided put */ - void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int count[], /* number of units at each stride level count[0]=bytes */ - int stride_levels, /* number of stride levels */ - int proc /* remote process(or) ID */ - ); - -extern int ARMCI_PutS_flag_dir( /* put with flag that uses direct put */ - void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int count[], /* number of segments at each stride - levels: count[0]=bytes*/ - int stride_levels, /* number of stride levels */ - int *flag, /* pointer to remote flag */ - int val, /* value to set flag upon completion of - data transfer */ - int proc /* remote process(or) ID */ - ); - -extern int ARMCI_PutS_flag( - void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int count[], /* number of segments at each stride - levels: count[0]=bytes*/ - int stride_levels, /* number of stride levels */ - int *flag, /* pointer to remote flag */ - int val, /* value to set flag upon completion of - data transfer */ - int proc /* remote process(or) ID */ - ); - -extern int ARMCI_Acc(int optype, void *scale, void *src, void *dst, int bytes, int proc); - -extern int ARMCI_AccS( /* strided accumulate */ - int optype, /* operation */ - void *scale, /* scale factor x += scale*y */ - void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int count[], /* number of units at each stride level count[0]=bytes */ - int stride_levels, /* number of stride levels */ - int proc /* remote process(or) ID */ - ); - - -extern int ARMCI_Get(void *src, void* dst, int bytes, int proc); - -extern int ARMCI_GetS( /* strided get */ - void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int count[], /* number of units at each stride level count[0]=bytes */ - int stride_levels, /* number of stride levels */ - int proc /* remote process(or) ID */ - ); - -extern int ARMCI_GetV( armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc /* remote process(or) ID */ - ); - -extern int ARMCI_PutV( armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc /* remote process(or) ID */ - ); - -extern int ARMCI_AccV( int op, /* operation code */ - void *scale, /* scaling factor for accumulate */ - armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc /* remote process(or) ID */ - ); - -extern int ARMCI_PutValueInt(int src, /* value in a register to put */ - void *dst, /* dest starting addr to put data */ - int proc /* remote process (or) ID */ - ); - -extern int ARMCI_PutValueLong(long src, /* value in a register to put */ - void *dst, /* dest starting addr to put data */ - int proc /* remote process (or) ID */ - ); - -extern int ARMCI_PutValueFloat(float src, /* value in a register to put */ - void *dst, /* dest starting addr to put data */ - int proc /* remote process (or) ID */ - ); - -extern int ARMCI_PutValueDouble(double src,/* value in a register to put */ - void *dst, /* dest starting addr to put data */ - int proc /* remote process (or) ID */ - ); - -extern int ARMCI_GetValueInt(void *src, int proc); -extern long ARMCI_GetValueLong(void *src, int proc); -extern float ARMCI_GetValueFloat(void *src, int proc); -extern double ARMCI_GetValueDouble(void *src, int proc); - - -extern int ARMCI_Malloc(void* ptr_arr[], armci_size_t bytes); -extern int ARMCI_Malloc_memdev(void* ptr_arr[], armci_size_t bytes, - const char *device); -extern int ARMCI_Free(void *ptr); -extern int ARMCI_Free_memdev(void *ptr); -extern void* ARMCI_Malloc_local(armci_size_t bytes); -extern int ARMCI_Free_local(void *ptr); -extern int ARMCI_Same_node(int proc); - -extern void ARMCI_Finalize(); /* terminate ARMCI */ -extern void ARMCI_Error(char *msg, int code); -extern void ARMCI_Fence(int proc); -extern void ARMCI_DoFence(int proc); -extern void ARMCI_AllFence(void); -extern int ARMCI_Rmw(int op, void *ploc, void *prem, int extra, int proc); -extern void ARMCI_Cleanup(void); -extern int ARMCI_Create_mutexes(int num); -extern int ARMCI_Destroy_mutexes(void); -extern void ARMCI_Lock(int mutex, int proc); -extern void ARMCI_Unlock(int mutex, int proc); -extern void ARMCI_Set_shm_limit(unsigned long shmemlimit); -extern int ARMCI_Uses_shm(); -extern void ARMCI_Copy(void *src, void *dst, int n); - -#define FAIL -1 -#define FAIL2 -2 -#define FAIL3 -3 -#define FAIL4 -4 -#define FAIL5 -5 -#define FAIL6 -6 -#define FAIL7 -7 -#define FAIL8 -8 - -#define ARMCI_SWAP 10 -#define ARMCI_SWAP_LONG 11 -#define ARMCI_FETCH_AND_ADD 12 -#define ARMCI_FETCH_AND_ADD_LONG 13 - -#define ARMCI_ACC_OFF 36 -#define ARMCI_ACC_INT (ARMCI_ACC_OFF + 1) -#define ARMCI_ACC_DBL (ARMCI_ACC_OFF + 2) -#define ARMCI_ACC_FLT (ARMCI_ACC_OFF + 3) -#define ARMCI_ACC_CPL (ARMCI_ACC_OFF + 4) -#define ARMCI_ACC_DCP (ARMCI_ACC_OFF + 5) -#define ARMCI_ACC_LNG (ARMCI_ACC_OFF + 6) -#define ARMCI_ACC_RA (ARMCI_ACC_OFF + 7) - -#define ARMCI_MAX_STRIDE_LEVEL 8 - -#ifdef BGML -#define ARMCI_CRITICAL_SECTION_ENTER() BGML_CriticalSection_enter(); -#define ARMCI_CRITICAL_SECTION_EXIT() BGML_CriticalSection_exit(); -#else -#define ARMCI_CRITICAL_SECTION_ENTER() -#define ARMCI_CRITICAL_SECTION_EXIT() -#endif - -/************ locality information **********************************************/ -typedef int armci_domain_t; -#define ARMCI_DOMAIN_SMP 0 /* SMP node domain for armci_domain_XXX calls */ -extern int armci_domain_nprocs(armci_domain_t domain, int id); -extern int armci_domain_id(armci_domain_t domain, int glob_proc_id); -extern int armci_domain_glob_proc_id(armci_domain_t domain, int id, int loc_proc_id); -extern int armci_domain_my_id(armci_domain_t domain); -extern int armci_domain_count(armci_domain_t domain); -extern int armci_domain_same_id(armci_domain_t domain, int proc); -extern int armci_smp_master(int); - - -/* PVM group - * On CrayT3E: the default group is the global group which is (char *)NULL - * It is the only working group. - * On Workstations: the default group is "mp_working_group". User can set - * the group name by calling the ARMCI_PVM_init (defined - * in message.c) and passing the group name to the library. - */ - -extern char *mp_group_name; - -/*********************stuff for non-blocking API******************************/ -/*\ the request structure for non-blocking api. - rmo: it appears that we need measure the size of armci_ireq_t and set the - size of armci_hdl_t accordingly. this really needs to be re-designed. -\*/ - -#define ARMCI_ONESIDED_SIZEOF_IREQ 21016 -typedef struct { - char data[ARMCI_ONESIDED_SIZEOF_IREQ]; -} armci_hdl_t; -#define armci_req_t armci_hdl_t - -typedef int ARMCI_Group; - -extern void ARMCI_Group_create(int n, int *pid_list, ARMCI_Group *group_out); -extern void ARMCI_Group_create_child(int n, int *pid_list, - ARMCI_Group *group_out, ARMCI_Group *group_parent); -extern void ARMCI_Group_free(ARMCI_Group *group); -extern int ARMCI_Group_rank(ARMCI_Group *group, int *rank); -extern void ARMCI_Group_size(ARMCI_Group *group, int *size); -extern void ARMCI_Group_set_default(ARMCI_Group *group); -extern void ARMCI_Group_get_default(ARMCI_Group *group_out); -extern void ARMCI_Group_get_world(ARMCI_Group *group_out); - -extern int ARMCI_Malloc_group(void *ptr_arr[], armci_size_t bytes,ARMCI_Group *group); -extern int ARMCI_Malloc_group_memdev(void *ptr_arr[], armci_size_t bytes, - ARMCI_Group *group, const char *device); -extern int ARMCI_Free_group(void *ptr, ARMCI_Group *group); - -extern int ARMCI_NbPut(void *src, void* dst, int bytes, int proc,armci_hdl_t* nb_handle); - -extern int ARMCI_NbPutS( /* strided put */ - void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int count[], /* number of units at each stride level count[0]=bytes */ - int stride_levels, /* number of stride levels */ - int proc, /* remote process(or) ID */ - armci_hdl_t* nb_handle /*armci_non-blocking request handle*/ - ); - -extern int ARMCI_NbAccS( /* strided accumulate */ - int optype, /* operation */ - void *scale, /* scale factor x += scale*y */ - void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int count[], /* number of units at each stride level count[0]=bytes */ - int stride_levels, /* number of stride levels */ - int proc, /* remote process(or) ID */ - armci_hdl_t* nb_handle /*armci_non-blocking request handle*/ - ); - -extern int ARMCI_NbGet(void *src, void* dst, int bytes, int proc,armci_hdl_t* nb_handle); - -extern int ARMCI_NbGetS( /* strided get */ - void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int count[], /* number of units at each stride level count[0]=bytes */ - int stride_levels, /* number of stride levels */ - int proc, /* remote process(or) ID */ - armci_hdl_t* nb_handler/*armci_non-blocking request handle*/ - ); - -extern int ARMCI_NbGetV( armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc, /* remote process(or) ID */ - armci_hdl_t* nb_handle /*armci_non-blocking request handle*/ - ); - -extern int ARMCI_NbPutV( armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc, /* remote process(or) ID */ - armci_hdl_t* nb_handle /*armci_non-blocking request handle*/ - ); - -extern int ARMCI_NbAccV( int op, /* operation code */ - void *scale, /* scaling factor for accumulate */ - armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc, /* remote process(or) ID */ - armci_hdl_t* nb_handle /*armci_non-blocking request handle*/ - ); - -extern int ARMCI_NbPutValueInt(int src, /* value in a register to put */ - void *dst, /* dest starting addr to put data */ - int proc, /* remote process (or) ID */ - armci_hdl_t* nb_handle /*armci_non-blocking - request handle */ - ); - -extern int ARMCI_NbPutValueLong(long src, /* value in a register to put */ - void *dst, /* dest starting addr to put data */ - int proc, /* remote process (or) ID */ - armci_hdl_t* nb_handle /*armci_non-blocking - request handle */ - ); - -extern int ARMCI_NbPutValueFloat(float src,/* value in a register to put */ - void *dst,/* dest starting addr to put data */ - int proc, /* remote process (or) ID */ - armci_hdl_t* nb_handle /*armci_non-blocking - request handle */ - ); - -extern int ARMCI_NbPutValueDouble(double src,/* value in a register to put */ - void *dst,/* dest starting addr to put data*/ - int proc, /* remote process (or) ID */ - armci_hdl_t* nb_handle /*armci_non-blocking - request handle */ - ); - -extern int ARMCI_Wait(armci_hdl_t* nb_handle); /*non-blocking request handle*/ - -extern int ARMCI_Test(armci_hdl_t* nb_handle); /*non-blocking request handle*/ - -extern int ARMCI_WaitAll (void); - -extern int ARMCI_WaitProc (int proc); - -extern void ARMCI_SET_AGGREGATE_HANDLE(armci_hdl_t* nb_handle); - -extern void ARMCI_UNSET_AGGREGATE_HANDLE(armci_hdl_t* nb_handle); - -extern void ARMCI_INIT_HANDLE(void *hdl); - - -/* -------------- ARMCI Non-collective memory allocator ------------- */ -typedef struct armci_meminfo_ds { - char * armci_addr; /* remote address of the creator which can be - used in ARMCI communication */ - char *addr; /* local address of creator which can be used in - to set SMP memoffset, armci_set_mem_offset() */ - size_t size; /* size of remote pid's segment (bytes) */ - int cpid; /* armci pid of creator */ - long idlist[64]; -} armci_meminfo_t; - -extern void ARMCI_Memget(size_t bytes, armci_meminfo_t *meminfo, int memflg); - -extern void* ARMCI_Memat(armci_meminfo_t *meminfo, long offset); - -extern void ARMCI_Memdt(armci_meminfo_t *meminfo, int memflg); - -extern void ARMCI_Memctl(armci_meminfo_t *meminfo); - -/* ------------------- ARMCI Checkpointing/Recovery ----------------- */ -#ifdef DO_CKPT -#define ARMCI_CKPT 0 -#define ARMCI_RESTART 1 -typedef struct { - void **ptr_arr; - size_t *sz; - int *saveonce; - int count; -}armci_ckpt_ds_t; -void ARMCI_Ckpt_create_ds(armci_ckpt_ds_t *ckptds, int count); -int ARMCI_Ckpt_init(char *filename, ARMCI_Group *grp, int savestack, int saveheap, armci_ckpt_ds_t *ckptds); -int ARMCI_Ckpt(int rid); -void ARMCI_Ckpt_finalize(int rid); -#define ARMCI_Restart_simulate armci_irecover -# ifdef MSG_COMMS_MPI - ARMCI_Group * ARMCI_Get_ft_group(); -# endif -#endif - -/* ------------------------------------------------------------------ */ - - -#if defined(__cplusplus) || defined(c_plusplus) -} -#endif - -#endif /* _ARMCI_H */ - - diff --git a/armci/src-gemini/armci_portals.c b/armci/src-gemini/armci_portals.c deleted file mode 100644 index 72f103325..000000000 --- a/armci/src-gemini/armci_portals.c +++ /dev/null @@ -1,2227 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - - /*$id:$*/ -#define _GNU_SOURCE -#include -#include -#include "armcip.h" -#include "message.h" -#include -#include -#include -#include -#include - -#define DEBUG_COMM 0 -#define DEBUG_INIT 0 -#define DEBUG_SERV 0 -#define PUT_LOCAL_ONLY_COMPLETION__ - -typedef struct arminfo{ - caddr_t ptr[MAX_DS]; - size_t size[MAX_DS]; - long serv_offs[MAX_DS]; - int cur_ds; -}rm_info_t; - -static rm_info_t *all_meminfo; - -static int client_md_count=0,serv_md_count=0; - -typedef struct arns{ - long data; - long data1; - struct arns *next; -} arnode; - -#ifdef ARMCI_CHECK_STATE -arnode * arlist_add(arnode **p, long i,long j) -{ - arnode *n = (arnode *)malloc(sizeof(arnode)); - if(n == NULL) - return NULL; - n->next = *p; - *p = n; - n->data = i; - n->data1 = j; - return *p; -} - -void arlist_remove(arnode **p) -{ - if(*p != NULL){ - arnode *n = *p; - *p = (*p)->next; - free(n); - } -} - -arnode **arlist_search(arnode **n, long i) -{ - while (*n != NULL){ - if ((*n)->data == i){ - return n; - } - n = &(*n)->next; - } - return NULL; -} - -void arlist_print(arnode *n) -{ - if (n == NULL){ - /*printf("arlist is empty\n");*/ - } - while (n != NULL){ - printf("%d:%d %d next=%d\n", armci_me,n->data,n->data1,(n->next==NULL)?0:1); - n = n->next; - } -} -#endif - -extern void armci_util_wait_int(volatile int *, int , int ); -extern void armci_util_wait_long(volatile long *, long, int ); - -int _armci_portals_server_ready=0; -int _armci_portals_client_ready=0; -int _armci_server_mutex_ready=0; -void *_armci_server_mutex_ptr = NULL; - -#ifdef ARMCI_REGISTER_SHMEM -typedef struct { - void *base_ptr; - void *serv_ptr; - size_t size; - int islocal; - int valid; -}aptl_reginfo_t; - -typedef struct { - aptl_reginfo_t reginfo[MAX_MEM_REGIONS]; - int reg_count; -} rem_meminfo_t; -#endif - -typedef struct serv_buf_t{ - ptl_handle_md_t md_h; - ptl_handle_me_t me_h; - ptl_md_t md; - char *buf; - char *bufend; -} serv_buf_t; - -char **client_buf_ptrs; -static int armci_server_terminating=0; - -serv_buf_t *serv_bufs; -long servackval=ARMCI_STAMP,*serv_ack_ptr=&servackval; -ptl_handle_md_t serv_ack_md_h,serv_response_md_h; - -static armci_portals_proc_t _armci_portals_proc_struct; -static armci_portals_serv_t _armci_portals_serv_struct; -static armci_portals_proc_t *portals = &_armci_portals_proc_struct; -static armci_portals_serv_t *serv_portals = &_armci_portals_serv_struct; -/*static */comp_desc _compdesc_array[NUM_COMP_DSCR]; - -static arnode *arn = NULL; - -#ifdef ARMCI_REGISTER_SHMEM -static rem_meminfo_t *_rem_meminfo; -static aptl_reginfo_t *_tmp_rem_reginfo; - -#define IN_REGION(_ptr__,_reg__) ((_reg__.valid) && (_ptr__)>=(_reg__.serv_ptr) \ - && (_ptr__) <= ( (char *)(_reg__.serv_ptr)+_reg__.size)) -#endif - -static int ptl_initialized = 0; -extern pid_t server_pid; - -ptl_ni_limits_t armci_ptl_nilimits; -ptl_ni_limits_t armci_ptl_Snilimits; - -void armci_portals_init_ptl() -{ -int rc; -int npes,i; - ARMCI_PR_DBG("enter",0); - - /*initialize data structures*/ - portals->ptl = ARMCI_PORTALS_PTL_NUMBER; /* our own ptl number */ - - rc=PtlNIInit(IFACE_FROM_BRIDGE_AND_NALID(PTL_BRIDGE_UK,PTL_IFACE_SS), - PTL_PID_ANY, NULL, &armci_ptl_nilimits, &(portals->ni_h)); - switch(rc) { - case PTL_OK: - /*printf("\n%d:ok for nii\n",armci_me);*/ - break; - case PTL_IFACE_DUP: - /*printf("\n%d:dup for nii\n",armci_me);*/ - break; - default: - printf( "PtlNIInit() failed %d error=%s\n",rc,ARMCI_NET_ERRTOSTR(rc) ); - exit(1); - } - - if((rc=PtlGetId(portals->ni_h,&portals->rank)) !=PTL_OK) { - printf("%s: PtlGetId failed: %d(%d)\n",__FUNCTION__, rc, server_pid); - exit(1); - } - ARMCI_PR_DBG("exit",0); -} - -static inline void init_serv_buf(serv_buf_t *tmp) -{ -int rc; -ptl_match_bits_t ignbits = 0xFFFFFFFFF00000FF; -ptl_match_bits_t mbits; -ptl_process_id_t match_id; -ptl_md_t *md_ptr,md; - - ARMCI_PR_DBG("enter",0); - tmp->md.user_ptr=tmp; - tmp->md.start=tmp->buf; - tmp->md.length=armci_nproc*NUM_SERV_BUFS*VBUF_DLEN; - tmp->md.eq_handle=portals->Seq_h; - tmp->md.max_size=0; - tmp->md.threshold=PTL_MD_THRESH_INF; - tmp->md.options=PTL_MD_OP_GET | PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE; - { - match_id.nid = PTL_NID_ANY; - match_id.pid = PTL_PID_ANY; - mbits = 16<<8; - - rc = PtlMEAttach(portals->Sni_h,portals->ptl,match_id,mbits,ignbits, - PTL_RETAIN,PTL_INS_AFTER,&(tmp->me_h)); - if (rc != PTL_OK) { - printf("(%d):PtlMEAttach: %s\n", portals->Srank,ARMCI_NET_ERRTOSTR(rc)); - armci_die("portals attach error isb",rc); - } - tmp->md.options=PTL_MD_OP_GET | PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE | PTL_MD_MANAGE_REMOTE; - - rc = PtlMDAttach((tmp->me_h),tmp->md,PTL_RETAIN,&(tmp->md_h)); - if (rc != PTL_OK) { - printf("%d:PtlMDAttach: %s %d\n", portals->Srank, ARMCI_NET_ERRTOSTR(rc),(serv_md_count+client_md_count) ); - exit(1); - } - serv_md_count++; - } - /*set up for sending acks */ - md_ptr = &(md); - md_ptr->start = serv_ack_ptr; - md_ptr->length = sizeof(long); - md_ptr->threshold = PTL_MD_THRESH_INF; - md_ptr->options = PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE; - md_ptr->user_ptr = NULL; - md_ptr->max_size = sizeof(long); - md_ptr->eq_handle = portals->Seq_h; - - rc = PtlMDBind(portals->Sni_h,md,PTL_RETAIN,&serv_ack_md_h); - if (rc != PTL_OK){ - fprintf(stderr, "%d:PtlMDBindxn: %s %d\n", portals->Srank.nid, - ARMCI_NET_ERRTOSTR(rc),(serv_md_count+client_md_count)); - armci_die("ptlmdbind failed",0); - } - serv_md_count++; - /*set up for sending response */ - md_ptr = &(md); - md_ptr->start = tmp->buf; - md_ptr->length = tmp->md.length; - md_ptr->threshold = PTL_MD_THRESH_INF; - md_ptr->options = PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE; - md_ptr->user_ptr = NULL; - md_ptr->max_size = tmp->md.length; - md_ptr->eq_handle = portals->Seq_h; - - rc = PtlMDBind(portals->Sni_h, md, PTL_RETAIN, &serv_response_md_h); - if (rc != PTL_OK){ - fprintf(stderr, "%d:PtlMDBindxn: %s %d\n", portals->Srank.nid, - ARMCI_NET_ERRTOSTR(rc),(serv_md_count+client_md_count)); - armci_die("ptlmdbind failed",0); - } - serv_md_count++; - ARMCI_PR_DBG("exit",0); -} - -void armci_portals_wait_for_client() -{ -int rc; -int *procidinfo; -extern armci_clus_t *armci_clus_info; -ptl_process_id_t *tmp; - ARMCI_PR_SDBG("enter",0); - //printf(" "); - armci_util_wait_int(&_armci_portals_client_ready,1,1000); - if((armci_me)!=armci_master){ - exit(0); - } - else{ - if(DEBUG_SERV){ - printf("\n%d:chosen one nid,pid=%d,%d\n",armci_me,portals->Srank.nid,portals->Srank.pid); - } - } - ARMCI_PR_SDBG("exit",0); -} - - -void armci_portals_prepare_server() -{ -int rc,i,j; - ARMCI_PR_SDBG("enter",0); - serv_bufs=(serv_buf_t *)malloc(sizeof(serv_buf_t)); - bzero(serv_bufs,sizeof(serv_buf_t)); - assert(serv_bufs); - serv_bufs->buf=(char *)malloc((NUM_SERV_BUFS*armci_nproc*VBUF_DLEN)); - bzero(serv_bufs->buf,(NUM_SERV_BUFS*armci_nproc*VBUF_DLEN)); - assert(serv_bufs->buf); - serv_bufs->bufend=(char *)serv_bufs->buf+(NUM_SERV_BUFS*armci_nproc*VBUF_DLEN); - rc = PtlEQAlloc(portals->Sni_h,4*(NUM_SERV_BUFS*armci_nproc),NULL, &(portals->Seq_h)); - if (rc != PTL_OK) { - printf("(%d):Ptleaalloc() failed: %s %d (%d)\n",portals->Srank, - ARMCI_NET_ERRTOSTR(rc),(NUM_SERV_BUFS*armci_nproc),rc); - armci_die("EQ Alloc failed",rc); - } - init_serv_buf(serv_bufs); - _armci_portals_server_ready=1; - ARMCI_PR_SDBG("exit",0); -} - - -void *armci_server_code(void *data) -{ -int rc,num_interface; - ARMCI_PR_SDBG("enter",0); - if(DEBUG_INIT) - printf("%d: in server after creating thread.\n",armci_me); - - rc = PtlInit(&num_interface); - if (rc != PTL_OK) { - printf("PtlInit() failed %d %s\n",rc, ARMCI_NET_ERRTOSTR(rc) ); - exit(1); - } - - rc=PtlNIInit(IFACE_FROM_BRIDGE_AND_NALID(PTL_BRIDGE_UK,PTL_IFACE_SS), - PTL_PID_ANY, NULL, &armci_ptl_Snilimits, &(portals->Sni_h)); - switch(rc) { - case PTL_OK: - //printf("\n(%d):ok for serv nii\n",armci_me); - break; - case PTL_IFACE_DUP: - //printf("\n(%d):dup for serv nii\n",armci_me); - break; - default: - printf( "PtlNIInit() serv failed %d error=%s\n",rc,ARMCI_NET_ERRTOSTR(rc) ); - exit(1); - } - - if((rc=PtlGetId(portals->Sni_h,&portals->Srank)) !=PTL_OK) { - printf("%s: PtlGetId failed: %d(%d)\n",__FUNCTION__, rc, server_pid); - exit(1); - } - /*printf("\n(%d):server nid=%d pid=%d\n",armci_me,portals->Srank.nid,portals->Srank.pid);*/ - - armci_portals_wait_for_client(); - armci_portals_prepare_server(); - - if(DEBUG_INIT) { - printf("(%d): connected to all computing processes\n",armci_me); - fflush(stdout); - } - armci_call_data_server(); - - armci_transport_cleanup(); - ARMCI_PR_SDBG("exit",0); - return(NULL); -} - - -void armci_client_connect_to_servers() -{ -int rc; -ptl_size_t offset_local = 0, offset_remote=0; -ptl_md_t md_local; -ptl_handle_md_t md_hdl_local; -ptl_process_id_t *tmp; -int *procidinfo; -int c_info; -int *flag,shmid; -void *addr; -char *buf; -extern int _armci_server_started; - ARMCI_PR_DBG("enter",0); - - _armci_portals_client_ready=1; - if(armci_me==armci_master){ - armci_util_wait_int(&_armci_portals_server_ready,1,1000); - } - - armci_msg_barrier(); - _armci_server_started=1; - - if(armci_me==armci_master){ - portals->servid_map[armci_clus_me].pid=portals->Srank.pid; - portals->servid_map[armci_clus_me].nid=portals->Srank.nid; - } - - armci_msg_gop_scope(SCOPE_ALL,portals->servid_map,(sizeof(ptl_process_id_t)*armci_nclus)/sizeof(int),"+",ARMCI_INT); - - ARMCI_PR_DBG("exit",0); -} - -static int check_meminfo(void *ptr, long size, int proc) -{ - for(int i=0;i=0) && (right>=size)) - return(i+1); - } - return 0; -} - - -static void add_meminfo(void *ptr, size_t size, int proc) -{ - if(check_meminfo(ptr,(long)size,proc)!=0)armci_die("repeat add request for dss",proc); - all_meminfo[proc].cur_ds++; - all_meminfo[proc].ptr[all_meminfo[proc].cur_ds]=ptr; - all_meminfo[proc].size[all_meminfo[proc].cur_ds]=size; -#ifdef DEBUG_MEM - printf("\n%d:%s:adding %p %ld %d at %d",armci_me,__FUNCTION__,ptr,size,proc,all_meminfo[proc].cur_ds); -#endif -} - - -typedef struct{ - void *ptr; - size_t size; - size_t serv_offs; -} meminfo_t; - - -void armci_exchange_meminfo(void *ptr, size_t size,size_t off) -{ - static meminfo_t exng[armci_nproc]; - bzero(exng,sizeof(meminfo_t)*armci_nproc); - exng[armci_me].ptr=ptr; exng[armci_me].size=size; exng[armci_me].serv_offs = off; - armci_msg_gop_scope(SCOPE_ALL,exng,(sizeof(meminfo_t)*armci_nproc)/sizeof(int),"+",ARMCI_INT); - for(int i=0;ibrval[portals->cur_ds]){ - ptl_md_t *md_ptr; - ptl_match_bits_t ignbits = 0xFFFFFFFFFFFFFF00; - ptl_process_id_t match_id; - int rc,cds=++portals->cur_ds; - if(cds>=MAX_DS)armci_die("increase MAX_CDS",cds); - portals->dsbase[cds]=portals->brval[cds-1]; - //portals->dsbase[cds]=sbrk(0); - ptr = portals->brval[cds] = br_val; - size = portals->dssizes[cds]=((caddr_t)portals->brval[cds] - portals->dsbase[cds]); - portals->serv_offs[cds] = serv_offset; - printf("\n%d:%s:base=%p brval=%p dslen=%ld %p end=%p",armci_me,__FUNCTION__,portals->dsbase[cds],br_val, - portals->dssizes[cds],portals->brval[cds],get_heap_bottom_addr()); - - md_ptr = &(portals->heap_md[cds]); - md_ptr->start = portals->dsbase[cds]; - md_ptr->length = portals->dssizes[cds]; - md_ptr->threshold = PTL_MD_THRESH_INF; - md_ptr->options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE; - md_ptr->user_ptr = NULL; - md_ptr->max_size = 0; - md_ptr->eq_handle = PTL_EQ_NONE; - - portals->heap_mb[cds]=cds+1; - - match_id.nid = PTL_NID_ANY; - match_id.pid = PTL_PID_ANY; - rc = PtlMEAttach(portals->ni_h,portals->ptl,match_id, - portals->heap_mb[cds], - ignbits, - PTL_RETAIN,PTL_INS_AFTER, - &(portals->heap_me_h[cds])); - if (rc != PTL_OK) { - printf("%d:PtlMEAttach: %s\n", portals->rank, ARMCI_NET_ERRTOSTR(rc) ); - armci_die("portals attach error reg",rc); - } - - rc = PtlMDAttach((portals->heap_me_h[cds]), - *md_ptr,PTL_RETAIN, - &(portals->heap_md_h[cds])); - if (rc != PTL_OK) { - printf("%d:PtlMDAttach: %s %d\n", portals->rank, ARMCI_NET_ERRTOSTR(rc),(client_md_count+serv_md_count)); - armci_die("portals attach error reg",rc); - } - } - else{ -#ifdef DEBUG_MEM_ - extern caddr_t _end; - printf("\n%d:%s:curds=%d brvalin=%p curbrval=%p _end=%p &_end=%p",armci_me,__FUNCTION__,portals->cur_ds,portals->brval[portals->cur_ds],br_val,_end,&_end); -#endif - } - armci_exchange_meminfo(ptr,size,serv_offset); -} - - -#ifndef PMI_SUCCESS -#define PMI_SUCCESS 0 -#endif - -static int *_client_servbuf_count; -int armci_init_portals(caddr_t atbeginbrval) -{ -#ifndef OLD_PORTALS_CODE - int i,rc,np,me; - ptl_process_id_t id; - ptl_process_id_t clone_id; - - MPI_Comm_size(ARMCI_COMM_WORLD,&np); - MPI_Comm_rank(ARMCI_COMM_WORLD,&me); - - if(armci_me != me) { - printf("[mpi %d]: armci_me=%d ... this is a problem\n",me,armci_me); - armci_die("mpi rank mismatch",911); - } - - portals_cp_init(); - - MPI_Barrier(ARMCI_COMM_WORLD); - - portals_ds_ready = 0; - if(armci_me == armci_master) { - portalsCloneDataServer( portals_ds_thread ); - portalsSpinLockOnInt( &portals_ds_ready,1,10000 ); - } - MPI_Barrier(ARMCI_COMM_WORLD); - - i=0; - if((rc=PMI_Initialized(&i))!=PMI_SUCCESS){ - printf("PMI_Initialized failed\n"); - } - - if(i==0){ - if((rc==PMI_Init(&i))!=PMI_SUCCESS){ - printf("MPI_Init failed (npes=%d)\n", armci_nproc); - } - } - - if((rc=PMI_CNOS_Get_nidpid_map(&portals_id_map))!=PMI_SUCCESS){ - printf("Getting proc map failed (npes=%d)\n", armci_nproc); - } - - /* create intra-node communicator */ - MPI_Barrier(ARMCI_COMM_WORLD); - portals_cp_init_throttle(armci_nclus); - MPI_Barrier(ARMCI_COMM_WORLD); - - /* stuff from old code ... */ - bzero(portals,sizeof(armci_portals_proc_t)); - // note: i got rid of this rem_meminfo stuff with the gemini version - // see that code to see how to remove it here - # ifdef ARMCI_REGISTER_SHMEM - _rem_meminfo = (rem_meminfo_t *)calloc(armci_nproc,sizeof(rem_meminfo_t)); - _tmp_rem_reginfo = (aptl_reginfo_t *)malloc(sizeof(aptl_reginfo_t)*armci_nproc); - if( _rem_meminfo==NULL || _tmp_rem_reginfo ==NULL) - armci_die("malloc failed in init_portals",0); - //if(armci_me == 0) { - // printf("sizeof(rem_meminfo_t)=%ld\n",sizeof(rem_meminfo_t)); - //} - # endif - # ifdef CRAY_USE_ARMCI_CLIENT_BUFFERS - client_buf_ptrs = (char **) calloc(armci_nproc,sizeof(char *)); - assert(client_buf_ptrs); - armci_msg_barrier(); - _armci_buf_init(); - # endif - /* end old stuff */ - - return 0; - -#else - -int num_interface; -int rc; -int npes,i; - ARMCI_PR_DBG("enter",0); - - bzero(portals,sizeof(armci_portals_proc_t)); - - _rem_meminfo = (rem_meminfo_t *)calloc(armci_nproc,sizeof(rem_meminfo_t)); - _tmp_rem_reginfo = (aptl_reginfo_t *)malloc(sizeof(aptl_reginfo_t)*armci_nproc); - if( _rem_meminfo==NULL || _tmp_rem_reginfo ==NULL) - armci_die("malloc failed in init_portals",0); - - portals->servid_map=(ptl_process_id_t *)calloc(armci_nclus,sizeof(ptl_process_id_t)); - if(portals->servid_map==NULL)armci_die("calloc of servidmap failed",0); - - rc = PtlInit(&num_interface); - if (rc != PTL_OK) { - printf("PtlInit() failed %d %s\n",rc, ARMCI_NET_ERRTOSTR(rc) ); - exit(1); - } - armci_portals_init_ptl(); - -#if 1 - i=0; - if((rc=PMI_Initialized(&i))!=PMI_SUCCESS){ - printf("PMI_Initialized failed\n"); - } - - if(i==0){ - if((rc==PMI_Init(&i))!=PMI_SUCCESS){ - printf("MPI_Init failed (npes=%d)\n", armci_nproc); - } - } - - if((rc=PMI_CNOS_Get_nidpid_map(&portals->procid_map))!=PMI_SUCCESS){ - printf("Getting proc map failed (npes=%d)\n", armci_nproc); - } - //printf(" "); -# else - portals->procid_map = (ptl_process_id_t *) calloc(armci_nproc,sizeof(ptl_process_id_t)); - portals->procid_map[armci_me]=portals->rank; - armci_msg_gop_scope(SCOPE_ALL,portals->procid_map,(sizeof(ptl_process_id_t)*armci_nproc)/sizeof(int),"+",ARMCI_INT); - //printf(" "); -#endif - - client_buf_ptrs = (char **) calloc(armci_nproc,sizeof(char *)); - assert(client_buf_ptrs); - armci_msg_barrier(); - - if(armci_me==armci_master)armci_create_server_process( armci_server_code ); - - rc = PtlEQAlloc(portals->ni_h,16*NUM_COMP_DSCR,NULL, &(portals->eq_h)); - if (rc != PTL_OK) { - printf("%d:Ptleqalloc() failed: %s (%d)\n", - armci_me, ARMCI_NET_ERRTOSTR(rc) , rc); - armci_die("EQ Alloc failed",rc); - } - /*printf("\n%d:client nid=%d pid=%d\n",armci_me,portals->rank.nid,portals->rank.pid);*/ - - _armci_buf_init(); - - for(i=0;ieq_h; - _compdesc_array[i].mem_dsc.max_size=0; - /*_compdesc_array[i].mem_dsc.threshold=PTL_MD_THRESH_INF;*/ - _compdesc_array[i].mem_dsc.threshold=2; - _compdesc_array[i].mem_dsc.options=PTL_MD_OP_GET | PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE; - } - - ptl_initialized = 1; - portals->free_comp_desc_index=0; - /*for(i=0;iprocid_map[i].nid,portals->procid_map[i].pid);*/ - _client_servbuf_count = calloc(armci_nclus,sizeof(int)); - armci_msg_barrier(); - armci_client_connect_to_servers(); - armci_msg_barrier(); - if(DEBUG_COMM){ - cpu_set_t mycpuid,new_mask; - char cid[8],*cidptr; - int rrr; - extern char * cpuset_to_cstr(cpu_set_t *mask, char *str); - rrr=sched_getaffinity(0, sizeof(mycpuid), &mycpuid); - if(rrr)perror("sched_getaffinity"); - cidptr = cpuset_to_cstr(&mycpuid,cid); - printf("%d:my affinity is to %s\n",armci_me,cid); - } -#ifdef NEW_MALLOC - /*post entire heap wildcard for direct communication*/ - { - ptl_md_t *md_ptr; - ptl_match_bits_t ignbits = 0xFFFFFFFFFFFFFF00; - ptl_process_id_t match_id; - portals->cur_ds = 0; - portals->dsbase[0]=get_heap_bottom_addr(); - //portals->brval[0] = sbrk(0); - portals->brval[0] = atbeginbrval; - portals->dssizes[0]=((caddr_t)portals->brval[0] - portals->dsbase[0]); - printf("\n%d:base=%p dslen=%ld %p",armci_me,portals->dsbase[0], - portals->dssizes[0],portals->brval[0]); - - md_ptr = &(portals->heap_md[0]); - md_ptr->start = portals->dsbase[0]; - md_ptr->length = portals->dssizes[0]; - md_ptr->threshold = PTL_MD_THRESH_INF; - md_ptr->options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE; - md_ptr->user_ptr = NULL; - md_ptr->max_size = 0; - md_ptr->eq_handle = PTL_EQ_NONE; - - portals->heap_mb[0]=1; - - match_id.nid = PTL_NID_ANY; - match_id.pid = PTL_PID_ANY; - rc = PtlMEAttach(portals->ni_h,portals->ptl,match_id, - portals->heap_mb[0], - ignbits, - PTL_RETAIN,PTL_INS_AFTER, - &(portals->heap_me_h[0])); - if (rc != PTL_OK) { - printf("%d:PtlMEAttach: %s\n", portals->rank, ARMCI_NET_ERRTOSTR(rc) ); - armci_die("portals attach error reg",rc); - } - - rc = PtlMDAttach((portals->heap_me_h[0]), - *md_ptr,PTL_RETAIN, - &(portals->heap_md_h[0])); - if (rc != PTL_OK) { - printf("%d:PtlMDAttach: %s %d\n", portals->rank, ARMCI_NET_ERRTOSTR(rc),(client_md_count+serv_md_count)); - armci_die("portals attach error reg",rc); - } - all_meminfo = (rm_info_t *)malloc(sizeof(rm_info_t)*armci_nproc); - all_meminfo[armci_me].cur_ds = -1; - armci_exchange_meminfo(portals->dsbase[0],portals->dssizes[0],0); - } -#endif - ARMCI_PR_DBG("exit",0); - return 0; -#endif -} - - -void armci_fini_portals() -{ - ARMCI_PR_DBG("enter",0); - if(DEBUG_INIT){ - printf("ENTERING ARMCI_FINI_PORTALS\n");fflush(stdout); - } -#ifdef ARMCI_CHECK_STATE - arlist_print(arn); -#endif - PtlNIFini(portals->ni_h); - /*PtlFini();*/ - if(DEBUG_INIT){ - printf("LEAVING ARMCI_FINI_PORTALS\n");fflush(stdout); - } - ARMCI_PR_DBG("exit",0); -} - - -void armci_pin_contig1(void *start,size_t bytes) -{ - -} - -#ifdef ARMCI_REGISTER_SHMEM -#ifndef NEW_MALLOC -void armci_register_req(void *start,int bytes, int ID) -{ -int rc; -ptl_md_t *md_ptr; -ptl_match_bits_t ignbits = 0xFFFFFFFFFFFFFF00; -ptl_process_id_t match_id; - - ARMCI_PR_DBG("enter",serv_portals->reg_count); - -#ifdef DEBUG_MEM - printf("\n(%d):armci_register_req start=%p bytes=%d\n", - armci_me,start,bytes);fflush(stdout); -#endif - - md_ptr = &(serv_portals->meminfo[serv_portals->reg_count].md); - md_ptr->start = start; - md_ptr->length = bytes; - md_ptr->threshold = PTL_MD_THRESH_INF; - md_ptr->options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE; - md_ptr->user_ptr = NULL; - md_ptr->max_size = 0; - md_ptr->eq_handle = PTL_EQ_NONE; - - serv_portals->meminfo[serv_portals->reg_count].mb=serv_portals->reg_count+1; - - match_id.nid = PTL_NID_ANY; - match_id.pid = PTL_PID_ANY; - rc = PtlMEAttach(portals->ni_h,portals->ptl,match_id, - serv_portals->meminfo[serv_portals->reg_count].mb, - ignbits, - PTL_RETAIN,PTL_INS_AFTER, - &(serv_portals->meminfo[serv_portals->reg_count].me_h)); - if (rc != PTL_OK) { - printf("%d:PtlMEAttach: %s\n", portals->rank, ARMCI_NET_ERRTOSTR(rc) ); - armci_die("portals attach error reg",rc); - } - - rc = PtlMDAttach((serv_portals->meminfo[serv_portals->reg_count].me_h), - *md_ptr,PTL_RETAIN, - &serv_portals->meminfo[serv_portals->reg_count].md_h); - if (rc != PTL_OK) { - printf("%d:PtlMDAttach: %s %d\n", portals->rank, ARMCI_NET_ERRTOSTR(rc),(client_md_count+serv_md_count)); - armci_die("portals attach error reg",rc); - } - client_md_count++; - serv_portals->reg_count++; - - ARMCI_PR_DBG("exit",serv_portals->reg_count); -} -#endif -#endif - -int armci_must_remotecomplete=1; -extern _buf_ackresp_t *_buf_ackresp_first,*_buf_ackresp_cur; -int x_net_wait_ackresp(_buf_ackresp_t *ar) -{ -int rc; -ptl_event_t ev_t; -ptl_event_t *ev=&ev_t; -comp_desc *temp_comp = NULL; -int loop=1; -int temp_proc; - ARMCI_PR_DBG("enter",0); - while(ar->val){ - ev->type=0; - if((rc = PtlEQWait(portals->eq_h, ev)) != PTL_OK){ - printf("%d:PtlEQWait(): %d %s\n", portals->rank,rc, - ARMCI_NET_ERRTOSTR(rc) ); - armci_die("EQWait problem",rc); - } - if (ev->ni_fail_type != PTL_NI_OK) { - temp_comp = (comp_desc *)ev->md.user_ptr; - printf("%d:NI sent %d in event %d,%d.\n", - armci_me,portals->rank.nid, portals->rank.pid, ev->ni_fail_type); - armci_die("event failure problem",temp_comp->dest_id); - } - if(DEBUG_COMM){ - printf("\n%d:net_wait_ackresp:done waiting type=%d\n",armci_me, - ev->type); - fflush(stdout); - } - if (ev->type == PTL_EVENT_SEND_END){ - if(DEBUG_COMM){ - printf("\n%d:net_wait_ackresp:event send end\n",armci_me); - fflush(stdout); - } - temp_comp = (comp_desc *)ev->md.user_ptr; - if(temp_comp->type==ARMCI_PORTALS_GETPUT || temp_comp->type==ARMCI_PORTALS_NBGETPUT){ - temp_comp->active=0; - temp_comp->tag=-1; - continue; - } - if(!armci_must_remotecomplete){ - if(temp_comp->type==ARMCI_PORTALS_PUT || temp_comp->type==ARMCI_PORTALS_NBPUT){ - temp_comp->active=0; - temp_comp->tag=-1; - } - else - continue; - } - else{ - temp_comp->active++; - continue; - } - - - } - - else if (ev->type == PTL_EVENT_REPLY_END){ - temp_comp = (comp_desc *)ev->md.user_ptr; - if(DEBUG_COMM){ - printf("\n%d:net_wait_ackresp:reply end tag=%d\n",armci_me,temp_comp->tag); - fflush(stdout); - } - temp_comp->active = 0; /*this was a get request, so we are done*/ - temp_comp->tag=-1; - continue; - } - else if (ev->type == PTL_EVENT_ACK){ - temp_comp = (comp_desc *)ev->md.user_ptr; - if(DEBUG_COMM){ - printf("\n%d:net_wait_ackresp:event ack tag=%d\n",armci_me,temp_comp->tag); - fflush(stdout); - } - temp_comp->active=0; - temp_comp->tag=-1; - portals->outstanding_puts--; - } - else if (ev->type==PTL_EVENT_PUT_END){ - _buf_ackresp_t *sweep=_buf_ackresp_first; - if(DEBUG_COMM){printf("\n%d:put end offset=%d",armci_me,ev->offset);fflush(stdout);} - if(ar->val==ev->offset){ - /*bingo!*/ - ar->val=0; - } - else{ - while(sweep!=NULL){ - if(sweep->val==ev->offset){ - sweep->val=0; - break; - } - sweep=sweep->next; - } - /*if(sweep==NULL)armci_die("server wrote data at unexpected offset",ev->offset);*/ - if(sweep==NULL){ - int y; - printf("%d:server wrote data at unexpected offset %d",armci_me,ev->offset);fflush(stdout); - abort(); -# ifdef ARMCI_CHECK_STATE - for(y=0;yservid_map[y].pid==ev->initiator.pid && portals->servid_map[y].nid==ev->initiator.nid)break; - assert(y!=armci_nclus); - arlist_print(arn); - armci_rem_state(y); -# endif - } - } - } - else - armci_die("in net_wait_ackresp unknown event",ev->type); - } - -# ifdef ARMCI_CHECK_STATE - arlist_remove(arlist_search(&arn, ar->valc)); -# endif - ar->valc=0; - if(ar==_buf_ackresp_first)_buf_ackresp_first=ar->next; - if(ar->next!=NULL){ - ar->next->previous=ar->previous; - } - if(ar->previous!=NULL){ - /*printf("\n%d:prev=%p %d %p %p\n",armci_me,ar->previous, ar->val,ar->next,ar);fflush(stdout);*/ - ar->previous->next=ar->next; - if(_buf_ackresp_cur==ar)_buf_ackresp_cur=ar->previous; - } - if(_buf_ackresp_cur==ar)_buf_ackresp_cur=NULL; - - ar->previous=ar->next=NULL; - - ARMCI_PR_DBG("exit",0); - - return rc; -} -int armci_client_complete(ptl_event_kind_t evt,int proc_id, int nb_tag, - comp_desc *cdesc) -{ -int rc; -ptl_event_t ev_t; -ptl_event_t *ev=&ev_t; -comp_desc *temp_comp = NULL; -int loop=1; -int temp_proc; - ARMCI_PR_DBG("enter",0); - if(DEBUG_COMM){ - printf("\n%d:enter:client_complete active=%d tag=%d %d\n",armci_me, - cdesc->active,cdesc->tag,nb_tag);fflush(stdout); - } - if(nb_tag>0){ - if(cdesc->tag!=nb_tag)return 0; - } - while(cdesc->active!=0){ - ev->type=0; - if((rc = PtlEQWait(portals->eq_h, ev)) != PTL_OK){ - printf("%d:PtlEQWait(): %d %s\n", portals->rank,rc, - ARMCI_NET_ERRTOSTR(rc) ); - armci_die("EQWait problem",rc); - } - if (ev->ni_fail_type != PTL_NI_OK) { - temp_comp = (comp_desc *)ev->md.user_ptr; - printf("%d:NI sent %d in event %d,%d.\n", - armci_me,portals->rank.nid, portals->rank.pid, ev->ni_fail_type); - armci_die("event failure problem",temp_comp->dest_id); - } - if(DEBUG_COMM){ - printf("\n%d:armci_client_complete:done waiting type=%d\n",armci_me, - ev->type); - fflush(stdout); - } - if(cdesc!=ev->md.user_ptr){ - /*printf("\n%d:expecting desc %p completing %p\n",armci_me,cdesc,ev->md.user_ptr);*/ - } - if (ev->type == PTL_EVENT_SEND_END){ - if(DEBUG_COMM){ - printf("\n%d:armci_client_complete:event send end\n",armci_me); - fflush(stdout); - } - temp_comp = (comp_desc *)ev->md.user_ptr; - if(temp_comp->type==ARMCI_PORTALS_GETPUT || temp_comp->type==ARMCI_PORTALS_NBGETPUT){ - temp_comp->active=0; - temp_comp->tag=-1; - continue; - } - if(!armci_must_remotecomplete){ - if(temp_comp->type==ARMCI_PORTALS_PUT || temp_comp->type==ARMCI_PORTALS_NBPUT){ - temp_comp->active=0; - temp_comp->tag=-1; - } - else - continue; - } - else{ - temp_comp->active++; - continue; - } - } - - else if (ev->type == PTL_EVENT_REPLY_END){ - temp_comp = (comp_desc *)ev->md.user_ptr; - if(DEBUG_COMM){ - printf("\n%d:client_send_complete:reply end tag=%d\n",armci_me,temp_comp->tag); - fflush(stdout); - } - temp_comp->active = 0; /*this was a get request, so we are done*/ - temp_comp->tag=-1; - continue; - } - else if (ev->type == PTL_EVENT_ACK){ - temp_comp = (comp_desc *)ev->md.user_ptr; - if(DEBUG_COMM){ - printf("\n%d:client_send_complete:event ack tag=%d\n",armci_me,temp_comp->tag); - fflush(stdout); - } - temp_comp->active=0; - temp_comp->tag=-1; - portals->outstanding_puts--; - } - else if (ev->type==PTL_EVENT_PUT_END){ - _buf_ackresp_t *ar=_buf_ackresp_first; - while(ar!=NULL){ - if(ar->val==ev->offset){ - ar->val=0; - break; - } - ar=ar->next; - } - if(ar==NULL)armci_die("server wrote data at unexpected offset",ev->offset); - if(DEBUG_COMM){printf("\n%d:put end offset=%d",armci_me,ev->offset);fflush(stdout);} - } - else - armci_die("in client_complete unknown event",ev->type); - } - if(DEBUG_COMM){ - printf("\n%d:exit:client_complete active=%d tag=%d %d\n",armci_me, - cdesc->active,cdesc->tag,nb_tag);fflush(stdout); - } - - ARMCI_PR_DBG("exit",0); - - return rc; -} - - -comp_desc * get_free_comp_desc(int * comp_id) -{ -comp_desc * c; -int rc = PTL_OK; - - ARMCI_PR_DBG("enter",0); - - c = &(_compdesc_array[portals->free_comp_desc_index]); - if(c->active!=0 && c->tag>0) - armci_client_complete(0,c->dest_id,c->tag,c); - else{ - /* - if(c->active!=0) - printf("\n%d:potential problem:active completion descriptor but tag=%d",armci_me,c->tag); - else - printf("\n%d:potential problem:active completion descriptor with tag=%d",armci_me,c->tag); - */ - } - if(!armci_must_remotecomplete){ - do{ - rc = PtlMDUnlink(c->mem_dsc_hndl); - }while(rc==PTL_MD_IN_USE); - } - - *comp_id = portals->free_comp_desc_index; - if(DEBUG_COMM){ - printf("\nthe value of comp_desc_id is %d\n",*comp_id); - fflush(stdout); - } - portals->free_comp_desc_index = (portals->free_comp_desc_index+1) % NUM_COMP_DSCR; - - ARMCI_PR_DBG("exit",0); - - return c; -} - - -void print_mem_desc(ptl_md_t * md) -{ - printf("%d:%p:start=%p length=%d threshold=%d max_size=%d options=%d eq_handle=%d\n",armci_me,md,md->start, md->length,md->threshold,md->max_size,md->options,md->eq_handle); - fflush(stdout); -} - - -#ifndef NEW_MALLOC -#if 0 -void armci_unregister_shmem(void *my_ptr, long size) -{ -int i=0,dst,found=0; -long id ; -long reg_size=0; -int reg_num = _rem_meminfo[armci_me].reg_count; -void *tptr; - - ARMCI_PR_DBG("enter",reg_num); -#ifdef DEBUG_MEM - printf("%d:%s:got size=%ld myptr %p\n",armci_me,__FUNCTION__,size,my_ptr); - fflush(stdout); -#endif - bzero(_tmp_rem_reginfo,sizeof(aptl_reginfo_t)*armci_nproc); - if(reg_num>=MAX_MEM_REGIONS) - armci_die("reg_num corrupted",reg_num); - for(i=0;i=MAX_MEM_REGIONS) - armci_die("reg_num corrupted",reg_num); - for(i=0;i=MAX_MEM_REGIONS-1){ - printf("\n%d:more than expected regions -- %d, increase MAX_MEM_REGIONS",armci_me,_rem_meminfo[i].reg_count++);fflush(stdout); - armci_die2("more than expected regions",_rem_meminfo[i].reg_count,MAX_MEM_REGIONS); - } - } -#ifdef DEBUG_MEM - printf("%d: regist id=%ld found=%d size=%ld reg_num=%d\n", - armci_me,id,found,reg_size,reg_num); - fflush(stdout); -#endif - ARMCI_PR_DBG("exit",0); -} - -void armci_register_shmem_grp(void *my_ptr, long size, long *idlist, long off, - void *sptr,ARMCI_Group *group) -{ -ARMCI_Group orig_group; - ARMCI_PR_DBG("enter",0); - ARMCI_Group_get_default(&orig_group); - ARMCI_Group_set_default(group); - armci_register_shmem(my_ptr,size,idlist,off,sptr); - ARMCI_Group_set_default(&orig_group); - ARMCI_PR_DBG("enter",0); -} -#endif -#endif // end #ifdef ARMCI_REGISTER_SHMEM - -static int _get_rem_servinfo(int serv,size_t bytes, size_t* offset) -{ -int i; - ARMCI_PR_DBG("enter",0); - i = 16<<8; - *offset=(armci_me*NUM_SERV_BUFS+_client_servbuf_count[serv])*VBUF_DLEN; - _client_servbuf_count[serv] = (_client_servbuf_count[serv]+1)%NUM_SERV_BUFS; - ARMCI_PR_DBG("exit",i); - return i; -} - -static int _get_rem_info(int proc, void *ptr,size_t bytes, size_t* offset) -{ -#ifdef ARMCI_REGISTER_SHMEM -int i; - ARMCI_PR_DBG("enter",0); -#ifdef NEW_MALLOC - i = check_meminfo(ptr,(long)bytes,proc); - if(i==0){ - printf("\n%d:ptr=%p bytes=%d proc=%d",armci_me,ptr,bytes,proc); - armci_die("region not found",proc); - } - *offset = (size_t)((caddr_t)ptr-(caddr_t)portals->dsbase[i-1]); - printf("\n%d:ptr=%p dsbase[0]=%p offs=%ld",armci_me,ptr,portals->dsbase[0],*offset);fflush(stdout); - if(*offset>=0){ - ARMCI_PR_DBG("exit A",(i+1)); - return(i); - } -#else -rem_meminfo_t *mem_info=&(_rem_meminfo[proc]); -aptl_reginfo_t *memreg = mem_info->reginfo; - for(i=0;ireg_count;i++){ - /*for now size is not verified*/ - if(DEBUG_COMM){ - printf("\n%d:proc=%d regcount=%d reg=%d base=%p size=%d end=%p checkptr=%p\n",armci_me,proc,mem_info->reg_count,i,memreg[i].base_ptr,memreg[i].size, ((char *)memreg[i].base_ptr+memreg[i].size), ptr);fflush(stdout); - } - if((memreg[i].valid) && ptr>= memreg[i].base_ptr && - ptr< ((char *)memreg[i].base_ptr+memreg[i].size)){ - *offset = ((char *)ptr-(char *)memreg[i].base_ptr); - ARMCI_PR_DBG("exit A",(i+1)); - return (i+1); - } - } -#endif - ARMCI_PR_DBG("exit B",i); - armci_die("_get_rem_info, rem memory region not found",bytes); -#else - printf("_get_rem_info called ... this shouldn't happen"); abort(); -#endif -} - -void armci_client_direct_get(ptl_process_id_t dest_proc, - ptl_size_t offset_remote, ptl_match_bits_t mb, size_t bytes, - ptl_md_t *md_local, - ptl_handle_md_t *md_hdl_local) -{ -int rc; -ptl_size_t offset_local = 0; - - ARMCI_PR_DBG("enter",0); - - if(DEBUG_COMM){ - printf("\n%d:armci_client_direct_get:BYTES = %d\n",armci_me,bytes); - printf("\n%d:offr=%ld offl=%ld\n",armci_me,offset_remote,offset_local); - fflush(stdout); - } - - rc = PtlMDBind(portals->ni_h,*md_local, PTL_UNLINK, md_hdl_local); - if (rc != PTL_OK){ - printf("%d:PtlMDBind: %s\n", portals->rank, ARMCI_NET_ERRTOSTR(rc) ); - armci_die("ptlmdbind get failed",0); - } - -#ifdef CRAY_USE_MDMD_COPY - if (dest_proc.nid == portals->rank.nid) { - rc = PtlMDMDCopy(*md_hdl_local, dest_proc, - portals->ptl, - 0, - mb, - offset_remote); - } else { -#endif - rc = PtlGetRegion(*md_hdl_local,offset_local,bytes,dest_proc, - portals->ptl, - 0, - mb, - offset_remote); -#ifdef CRAY_USE_MDMD_COPY - } -#endif - - if (rc != PTL_OK){ - printf("%d:PtlGetRegion: %s\n", portals->rank, ARMCI_NET_ERRTOSTR(rc) ); - armci_die("PtlGetRegion failed",0); - } - - if(DEBUG_COMM){ - printf("\n%d:issued get\n",armci_me);fflush(stdout); - } - - ARMCI_PR_DBG("exit",0); -} - -void armci_portals_get(int proc, void *src_buf, void *dst_buf, int bytes, - void** cptr,int tag) -{ -int rc; -ptl_size_t offset_local = 0, offset_remote=0; -ptl_md_t *md_local; -ptl_handle_md_t *md_hdl_local; -int rem_info; -comp_desc *cdesc; -ptl_process_id_t dest_proc; -int c_info; -int cluster = armci_clus_id(proc); - - ARMCI_PR_DBG("enter",0); - - /*first remote process information*/ - /*dest_proc.nid = portals->procid_map[proc].nid; - dest_proc.pid = portals->procid_map[proc].pid;*/ - dest_proc.nid = portals->servid_map[cluster].nid; - dest_proc.pid = portals->servid_map[cluster].pid; - - /*create local xfer info*/ - cdesc = get_free_comp_desc(&c_info); - md_local = &cdesc->mem_dsc; - md_hdl_local = &cdesc->mem_dsc_hndl; - md_local->length=bytes; - md_local->start=dst_buf; - md_local->user_ptr = (void *)cdesc; - md_local->options = PTL_MD_OP_GET | PTL_MD_EVENT_START_DISABLE; - - /*get remote info*/ - rem_info = _get_rem_info(proc,src_buf,bytes,&offset_remote); - - cdesc->dest_id = proc; - if (tag){ - *((comp_desc **)cptr) = cdesc; - cdesc->tag = tag; - cdesc->type = ARMCI_PORTALS_NBGET; - /*printf("\n%d:get tag=%d c_info=%d - * %p",armci_me,tag,c_info,cdesc);fflush(stdout);*/ - } - else{ - cdesc->tag = 0; - cdesc->type = ARMCI_PORTALS_GET; - } - - cdesc->active = 1; - armci_client_direct_get(dest_proc,offset_remote,(ptl_match_bits_t)rem_info, - bytes,md_local,md_hdl_local); - - if(!tag){ - armci_client_complete(0,proc,0,cdesc); /* check this later */ - } - - ARMCI_PR_DBG("exit",0); -} - - -void armci_client_nb_get(int proc, void *src_buf, int *src_stride_arr, - void *dst_buf, int *dst_stride_arr, int bytes, - void** cptr,int tag) -{ -} - -void armci_client_direct_send(ptl_process_id_t dest_proc, - ptl_size_t offset_remote, ptl_match_bits_t mb, size_t bytes, - ptl_md_t *md_local, - ptl_handle_md_t *md_hdl_local) -{ -int rc; -ptl_size_t offset_local = 0; - - ARMCI_PR_DBG("enter",0); - - if(DEBUG_COMM){ - printf("%d:armci_client_direct_send:BYTES = %d\n",armci_me,bytes); - printf("\n%d:offr=%ld offl=%ld\n",armci_me,offset_remote,offset_local); - fflush(stdout); - } - /*print_mem_desc(md_local);*/ - rc = PtlMDBind(portals->ni_h,*md_local, PTL_UNLINK, md_hdl_local); - if (rc != PTL_OK){ - fprintf(stderr, "%d:PtlMDBind: %s\n", portals->rank, - ARMCI_NET_ERRTOSTR(rc)); - armci_die("ptlmdbind send failed",0); - } - if(armci_must_remotecomplete){ - rc = PtlPutRegion(*md_hdl_local,offset_local,bytes, - PTL_ACK_REQ, - dest_proc,portals->ptl,0, mb,offset_remote, 0); - } - else{ - rc = PtlPutRegion(*md_hdl_local,offset_local,bytes, - PTL_NOACK_REQ, - dest_proc,portals->ptl,0, mb,offset_remote, 0); - } - - if (rc != PTL_OK){ - fprintf(stderr, "%d:PtlPutRegion: %s\n", portals->rank, - ARMCI_NET_ERRTOSTR(rc) ); - armci_die("PtlPutRegion failed",0); - } - - ARMCI_PR_DBG("exit",0); -} - - -void armci_portals_put(int proc, void *src_buf, void *dst_buf, int bytes, - void** cptr,int tag) -{ -int rc; -ptl_size_t offset_local = 0, offset_remote=0; -ptl_md_t *md_local; -ptl_handle_md_t *md_hdl_local; -int rem_info; -comp_desc *cdesc; -ptl_process_id_t dest_proc; -int c_info; -int cluster = armci_clus_id(proc); - - ARMCI_PR_DBG("enter",0); - - /*first process information*/ - dest_proc.nid = portals->servid_map[cluster].nid; - dest_proc.pid = portals->servid_map[cluster].pid; - /*dest_proc.nid = portals->procid_map[proc].nid; - dest_proc.pid = portals->procid_map[proc].pid;*/ - - /*create local xfer info*/ - cdesc = get_free_comp_desc(&c_info); - md_local = &cdesc->mem_dsc; - md_hdl_local = &cdesc->mem_dsc_hndl; - md_local->length=bytes; - md_local->start=src_buf; - md_local->user_ptr = (void *)cdesc; - md_local->options = PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE; - - /*get remote info*/ - rem_info = _get_rem_info(proc,dst_buf,bytes,&offset_remote); - - - if(DEBUG_COMM){ - printf("\n%d:offr=%ld offl=%ld\n",armci_me,offset_remote,offset_local); - } - - cdesc->dest_id = proc; - if (tag){ - *((comp_desc **)cptr) = cdesc; - cdesc->tag = tag; - cdesc->type = ARMCI_PORTALS_NBPUT; - /*printf("\n%d:put tag=%d c_info=%d - * %p",armci_me,tag,c_info,cdesc);fflush(stdout);*/ - } - else{ - cdesc->tag = 0; - cdesc->type = ARMCI_PORTALS_PUT; - } - - cdesc->active = 1; - - armci_client_direct_send(dest_proc,offset_remote,(ptl_match_bits_t)rem_info, - bytes,md_local,md_hdl_local); - - if(!tag){ - armci_client_complete(0,proc,0,cdesc); /* check this later */ - } - else - portals->outstanding_puts++; - - - ARMCI_PR_DBG("exit",0); - -} - -void armci_client_nb_send(int proc, void *src_buf, int *src_stride_arr, - void *dst_buf, int *dst_stride_arr, int bytes, - void** cptr,int tag) - -{ -} - -/*using non-blocking for multiple 1ds inside a 2d*/ -void armci_network_strided(int op, void* scale, int proc,void *src_ptr, - int src_stride_arr[], void* dst_ptr, int dst_stride_arr[], - int count[], int stride_levels, armci_ihdl_t nb_handle) -{ -int i, j,tag=0; -long idxs,idxd; /* index offset of current block position to ptr */ -int n1dim; /* number of 1 dim block */ -int bvalue_s[MAX_STRIDE_LEVEL], bunit[MAX_STRIDE_LEVEL]; -int bvalue_d[MAX_STRIDE_LEVEL]; -int bytes = count[0]; -void *sptr,*dptr; -NB_CMPL_T cptr; -ptl_process_id_t dest_proc; -ptl_size_t offset_remote; -comp_desc *cdesc; -int c_info; -ptl_md_t *md_local; -int rem_info; -int cluster = armci_clus_id(proc); - - ARMCI_PR_DBG("enter",0); - - printf("%s calling abort ... network_strided not implemented\n",Portals_ID()); - abort(); - - if(nb_handle)tag=nb_handle->tag; - - /*first remote process information*/ - dest_proc.nid = portals->servid_map[cluster].nid; - dest_proc.pid = portals->servid_map[cluster].pid; - /*dest_proc.nid = portals->procid_map[proc].nid; - dest_proc.pid = portals->procid_map[proc].pid;*/ - - rem_info = _get_rem_info(proc,(op==GET)?src_ptr:dst_ptr,bytes,&offset_remote); - - /* number of n-element of the first dimension */ - n1dim = 1; - for(i=1; i<=stride_levels; i++) - n1dim *= count[i]; - - /* calculate the destination indices */ - bvalue_s[0] = 0; bvalue_s[1] = 0; bunit[0] = 1; - bvalue_d[0] = 0; bvalue_d[1] = 0; bunit[1] = 1; - for(i=2; i<=stride_levels; i++) { - bvalue_s[i] = bvalue_d[i] = 0; - bunit[i] = bunit[i-1] * count[i-1]; - } - - if(ARMCI_ACC(op)){ /*for now die for acc*/ - /*lock here*/ -# ifdef ARMCI_CHECK_STATE - arlist_print(arn); - armci_rem_state(armci_clus_info[proc].master%armci_clus_info[0].nslave); -# endif - printf("\nSHOULD NOT DO NETWORK_STRIDED FOR ACCS \n",armci_me); - fflush(stdout); - armci_die("network_strided called for acc",proc); - } - - /*loop over #contig chunks*/ - for(i=0; i (count[j]-1)) bvalue_s[j] = 0; - if(bvalue_d[j] > (count[j]-1)) bvalue_d[j] = 0; - } - sptr = ((char *)src_ptr)+idxs; - dptr = ((char *)dst_ptr)+idxd; - cdesc = get_free_comp_desc(&c_info); - md_local = &cdesc->mem_dsc; - md_hdl_local = &cdesc->mem_dsc_hndl; - md_local->length=bytes; - md_local->start=(op==GET)?dptr:sptr; - md_local->user_ptr = (void *)cdesc; - cdesc->dest_id = proc; - cdesc->tag = tag; - - if(op==GET){ - md_local->options = PTL_MD_OP_GET | PTL_MD_EVENT_START_DISABLE; - cdesc->active = 1; - cdesc->type = ARMCI_PORTALS_NBGET; - /* - printf("\n%d:reminfo=%d off=%d idxs=%d idxd=%d",armci_me, rem_info, - offset_remote, idxs, idxd); - */ - armci_client_direct_get( dest_proc,offset_remote+idxs,rem_info, - bytes,md_local,md_hdl_local); - } - else if(op==PUT){ - cdesc->active = 1; - cdesc->type = ARMCI_PORTALS_NBPUT; - md_local->options = PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE; - armci_client_direct_send(dest_proc,offset_remote+idxd,rem_info, - bytes,md_local,md_hdl_local); - if(op==PUT)portals->outstanding_puts++; - } - else if(ARMCI_ACC(op)){ - assert(0); - } - else{ - ARMCI_PR_DBG("exit",0); - armci_die("in network_strided unknown opcode",op); - } - armci_client_complete(0,proc,tag,cdesc); - } - - if(ARMCI_ACC(op)){ - /*unlock here*/ - } - - if(nb_handle){ - /* completing the last call is sufficient, given ordering semantics*/ - nb_handle->tag=tag; - nb_handle->cmpl_info=cdesc; - } - else{ - /*completing the last call ensures everything before it is complete this - * is one of the main reasons why dataserver is necessary*/ - /*armci_client_complete(0,proc,tag,cdesc);*/ - } - ARMCI_PR_DBG("exit",0); -} - -void armci_client_direct_getput(ptl_process_id_t dest_proc, - ptl_size_t offset_remote, ptl_match_bits_t mb, size_t bytes, - ptl_md_t *md_local_get,ptl_md_t *md_local_put, - ptl_handle_md_t *md_hdl_local_get, ptl_handle_md_t - *md_hdl_local_put) -{ -int rc; -ptl_size_t offset_get = 0; -ptl_size_t offset_put = 0; - - ARMCI_PR_DBG("enter",0); - - if(DEBUG_COMM){ - printf("%d:armci_client_direct_getput:BYTES = %d\n",armci_me,bytes); - printf("\n%d:offr=%ld\n",armci_me,offset_remote);fflush(stdout); - } - - rc = PtlGetPutRegion(*md_hdl_local_get, offset_get, *md_hdl_local_put, - offset_put,bytes,dest_proc, portals->ptl,0,mb, - offset_remote,0); - if (rc != PTL_OK){ - printf("%d:PtlGetPutRegion: %s\n", portals->rank, ARMCI_NET_ERRTOSTR(rc) ); - armci_die("PtlGetPutRegion failed",0); - } - - ARMCI_PR_DBG("exit",0); - -} - - -long a_p_putfrom; -long a_p_getinto; - - -int armci_portals_rmw_(int op, int *ploc, int *prem, int extra, int proc) -{ - printf("error rmw"); - return(0); -} - -void armci_portals_shmalloc_allocate_mem(int num_lks) -{ -void **ptr_arr; -void *ptr; -armci_size_t bytes = 128; -int i; - - ARMCI_PR_DBG("enter",0); - ptr_arr = (void**)malloc(armci_nproc*sizeof(void*)); - if(!ptr_arr) armci_die("armci_shmalloc_get_offsets: malloc failed", 0); - bzero((char*)ptr_arr,armci_nproc*sizeof(void*)); - - PARMCI_Malloc(ptr_arr,bytes); - ARMCI_PR_DBG("exit",0); - - return; -} - - -void armci_wait_for_server() -{ - ARMCI_PR_DBG("enter",0); - armci_server_terminating=1; - armci_serv_quit(); - ARMCI_PR_DBG("exit",0); -} - -/*client buffers info*/ -void armci_portals_client_buf_info(char *buf, ptl_match_bits_t *mb, ptl_size_t *offset,int proc) -{ - ARMCI_PR_DBG("enter",0); - *mb = (1<<30); - *offset = buf-client_buf_ptrs[proc]; - if(DEBUG_SERV){printf("\n(%d):serv writing to ofset %d on %d\n",armci_me,*offset,proc);fflush(stdout);} - ARMCI_PR_DBG("exit",0); -} - -/*memory for client buffers*/ -char *armci_portals_client_buf_allocate(int bytes) -{ -void *ptr; -ptl_match_bits_t ignbits = 0xFFFFFFFF0FFFFFFF; -ptl_match_bits_t mbits = 1; -ptl_md_t *md_ptr,md; -ptl_process_id_t match_id; -ptl_handle_me_t me_h; -ptl_handle_md_t md_h; -int rc; - ARMCI_PR_DBG("enter",sizeof(ptl_match_bits_t)); - ptr = malloc(bytes); - bzero(ptr,bytes); - assert(ptr); - - mbits = (1<<30); - md_ptr = &(md); - md_ptr->start = ptr; - md_ptr->length = bytes; - md_ptr->threshold = PTL_MD_THRESH_INF; - md_ptr->options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_EVENT_START_DISABLE; - md_ptr->user_ptr = NULL; - md_ptr->max_size = 0; - /*logic that says, eq_h is now recieving data for the buffers, including acks! */ - md_ptr->eq_handle = portals->eq_h; - match_id.nid = PTL_NID_ANY; - match_id.pid = PTL_PID_ANY; - rc = PtlMEAttach(portals->ni_h,portals->ptl,match_id, - mbits,ignbits,PTL_RETAIN,PTL_INS_AFTER,&(me_h)); - if (rc != PTL_OK){ - printf("%d:PtlMEAttach: %s\n", portals->rank, ARMCI_NET_ERRTOSTR(rc) ); - armci_die("portals attach error2",rc); - } - rc = PtlMDAttach(me_h,md,PTL_RETAIN,&md_h); - if (rc != PTL_OK) { - printf("%d:PtlMDAttach: %s %d\n", portals->rank, ARMCI_NET_ERRTOSTR(rc),(client_md_count+serv_md_count) ); - armci_die("portals attach error CBA",rc); - } - client_md_count++; - - client_buf_ptrs[armci_me]=ptr; - armci_msg_barrier(); - armci_exchange_address(client_buf_ptrs,armci_nproc); - - ARMCI_PR_DBG("exit",0); - return(ptr); -} - -void armci_transport_cleanup() -{ - /*for i=0tomaxpendingclean*/ - ARMCI_PR_DBG("enter",0); - free(client_buf_ptrs); - ARMCI_PR_DBG("exit",0); -} - -void free_serv_bufs() -{ - if(serv_bufs) free(serv_bufs); -} - - -int armci_send_req_msg(int proc, void *buf, int bytes,int tag) -{ -#ifndef OLD_PORTALS_CODE - int cluster = armci_clus_id(proc); - int serv = armci_clus_info[cluster].master; - char *buffer = NULL; - request_header_t *msginfo = (request_header_t *) buf; - -// # ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - _armci_buf_ensure_one_outstanding_op_per_node(buf,cluster); -// # endif - - # ifdef PORTALS_USE_ARMCI_CLIENT_BUFFERS - BUF_INFO_T *bufinfo=_armci_buf_to_bufinfo(msginfo); - _buf_ackresp_t *ar = &bufinfo->ar; - portals_ds_req_t *req = &ar->req; - # endif - - if(msginfo->operation == PUT || ARMCI_ACC(msginfo->operation)) { - // printf("%s cp: sending packed put\n",Portals_ID()); - # ifdef PORTALS_USE_ARMCI_CLIENT_BUFFERS - portals_remote_nbput(buf, buf, cluster, req); - // portalsWaitOnRequest(req); - # else - portals_remote_put(buf, buf, cluster); - # endif - // printf("%s cp: finished packed put\n",Portals_ID()); - } - - else if(msginfo->operation == GET) { - buffer = (char *) buf; - buffer += sizeof(request_header_t); - buffer += msginfo->dscrlen; - // printf("%s cp: sending blocking get request\n",Portals_ID()); - # ifdef PORTALS_USE_ARMCI_CLIENT_BUFFERS - portals_remote_nbget(buffer, msginfo, cluster, req); - // portalsWaitOnRequest(req); - # else - portals_remote_get(buffer, msginfo, cluster); - # endif - // printf("%s cp: get request finished\n",Portals_ID()); - } - - else if(msginfo->operation == ACK) { - # ifdef PORTALS_USE_ARMCI_CLIENT_BUFFERS - portalsRemoteOperationToNode(buf, bytes, cluster, req); - // portalsWaitOnRequest(req); - # else - portalsBlockingRemoteOperationToNode(buf, bytes, cluster); - # endif - } - - else if(msginfo->operation == ARMCI_SWAP || msginfo->operation == ARMCI_SWAP_LONG || - msginfo->operation == ARMCI_FETCH_AND_ADD || msginfo->operation == ARMCI_FETCH_AND_ADD_LONG) { - buffer = (char *) buf; - buffer += sizeof(request_header_t); - buffer += msginfo->dscrlen; - portals_remote_rmw(buffer, msginfo, cluster, req); - # ifndef PORTALS_USE_ARMCI_CLIENT_BUFFERS - portalsWaitOnOperation(req); - # endif - } - - else { - printf("%s cp: msginfo->operation=%d not supported yet\n",Portals_ID(),msginfo->operation); - abort(); - } - - # ifdef PORTALS_USE_ARMCI_CLIENT_BUFFERS -/* for now, clear the ackresp structure because the call had to have been blocking - later, we will allow a modified x_net_wait_ackresp clear it */ - ar->val = ar->valc = 0; - if(ar==_buf_ackresp_first)_buf_ackresp_first=ar->next; - if(ar->next!=NULL){ - ar->next->previous=ar->previous; - } - if(ar->previous!=NULL){ - ar->previous->next=ar->next; - if(_buf_ackresp_cur==ar)_buf_ackresp_cur=ar->previous; - } - if(_buf_ackresp_cur==ar)_buf_ackresp_cur=NULL; - ar->previous=ar->next=NULL; - # endif - - return 0; - -#else - -int rc; -ptl_size_t offset_local = 0, offset_remote=0; -ptl_md_t *md_local; -ptl_handle_md_t *md_hdl_local; -int rem_info; -comp_desc *cdesc; -void *cptr; -ptl_process_id_t dest_proc; -int c_info; -int cluster = armci_clus_id(proc); -int serv = armci_clus_info[cluster].master; -request_header_t *msginfo = (request_header_t *)buf; - - ARMCI_PR_DBG("enter",0); - if(msginfo->operation==GET && msginfo->dscrlen<=msginfo->datalen){ - *(long *)((char *)(msginfo+1)+msginfo->datalen)=0; - } - - /*badbadbad*/ - msginfo->tag.ack_ptr=&(msginfo->tag.ack); - cptr = (void *)((double *)buf-1); - /*first process information*/ - dest_proc.nid = portals->servid_map[cluster].nid; - dest_proc.pid = portals->servid_map[cluster].pid; - /*create local xfer info*/ - cdesc = get_free_comp_desc(&c_info); - md_local = &cdesc->mem_dsc; - md_hdl_local = &cdesc->mem_dsc_hndl; - md_local->length=bytes; - md_local->start=buf; - md_local->user_ptr = (void *)cdesc; - md_local->options = PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE; - - /*get remote info*/ - rem_info = _get_rem_servinfo(cluster,(size_t)bytes,&offset_remote); - - if(DEBUG_COMM){ - printf("\n%d:offr=%ld offl=%ld\n",armci_me,offset_remote,offset_local); - } - - cdesc->dest_id = serv; - *((comp_desc **)cptr) = cdesc; - if(tag==0)tag=GET_NEXT_NBTAG(); - cdesc->tag = tag; - cdesc->type = ARMCI_PORTALS_NBPUT; - /*printf("\n%d:put tag=%d c_info=%d - * %p",armci_me,tag,c_info,cdesc);fflush(stdout);*/ - cdesc->active = 1; - - if(msginfo->operation==PUT || msginfo->operation == UNLOCK || ARMCI_ACC(msginfo->operation)){ - _buf_ackresp_cur->valc = _buf_ackresp_cur->val = (char *)msginfo->tag.ack_ptr-client_buf_ptrs[armci_me]; -# ifdef ARMCI_CHECK_STATE - arlist_add(&arn,_buf_ackresp_cur->val,msginfo->operation); -# endif - } - else { - _buf_ackresp_cur->valc = _buf_ackresp_cur->val = (char *)msginfo->tag.data_ptr-client_buf_ptrs[armci_me]; -# ifdef ARMCI_CHECK_STATE - arlist_add(&arn,_buf_ackresp_cur->val,msginfo->operation); -# endif - } - - if(DEBUG_COMM){printf("\n%d:registered %d in val to %d at %d %d\n",armci_me,_buf_ackresp_cur->val,serv,offset_remote,msginfo->operation);fflush(stdout);} - _armci_buf_ensure_pend_outstanding_op_per_node(buf,cluster); - armci_client_direct_send(dest_proc,offset_remote,(ptl_match_bits_t)rem_info, - bytes,md_local,md_hdl_local); - /*if(msginfo->operation==GET){ - BUF_INFO_T *info=((char *)msginfo-sizeof(BUF_EXTRA_FIELD_T)-sizeof(BUF_INFO_T)); - armci_client_complete(0,proc,cdesc->tag,cdesc); - }*/ - /*armci_client_complete(0,proc,cdesc->tag,cdesc);*/ - - portals->outstanding_puts++; - - ARMCI_PR_DBG("exit",0); - return 0; -#endif - -} - - -char *armci_ReadFromDirect(int proc, request_header_t *msginfo, int len) -{ -#ifndef OLD_PORTALS_CODE - # ifdef PORTALS_USE_ARMCI_CLIENT_BUFFERS - BUF_INFO_T *bufinfo = _armci_buf_to_bufinfo(msginfo); - portals_ds_req_t *req = &bufinfo->ar.req; - portalsWaitOnRequest(req); - # endif - char *ret = (char *) msginfo; - ret += sizeof(request_header_t); - ret += msginfo->dscrlen; - return ret; -#else -long *flag; -int loop; -BUF_INFO_T *bufinfo=_armci_buf_to_bufinfo(msginfo); - - ARMCI_PR_DBG("enter",0); - if(len) - flag = (long *)((char *)(msginfo+1)+len); - else - flag = (long *)((char *)(msginfo+1)+msginfo->datalen); - x_net_wait_ackresp(&(bufinfo->ar)); - - while(armci_util_long_getval(flag) != ARMCI_TAIL){ - loop++; - loop %=100000; - if(loop==0){ - if(DEBUG_COMM){ - printf("%d: client flag(%p)=%ld off=%d %d\n", - armci_me,flag,*flag,msginfo->datalen,*((int*)(msginfo+1))); - fflush(stdout); - } - } - } - *flag=0; - ARMCI_PR_DBG("exit",0); - return (msginfo+1); -#endif -} - -#ifdef ARMCI_CHECK_STATE -extern void sarlist_add(int,int,long); -#endif - -void armci_WriteToDirect(int proc, request_header_t* msginfo, void *buf) -{ -#ifndef OLD_PORTALS_CODE - ptl_size_t bytes = (ptl_size_t) msginfo->datalen; - ptl_event_t *ev = (ptl_event_t *) msginfo->tag.user_ptr; - portals_ds_send_put(buf, msginfo->datalen, ev->initiator, ev->hdr_data); - // you could do an assertion that the portals_id_map of proc == ev->initiator -#else -long *tail; -int bytes; -void *dst_addr = msginfo->tag.data_ptr; -ptl_match_bits_t ignbits = 0xFFFFFFFF0FFFFFFF; -ptl_match_bits_t mbits = 1; -ptl_md_t *md_ptr,md; -ptl_process_id_t match_id; -ptl_handle_me_t me_h; -ptl_size_t offst,localoffset; -int rc; - - /* set tail ack, make sure it is alligned */ - ARMCI_PR_SDBG("enter",0); - bytes = msginfo->datalen+sizeof(long); - if(!(buf>=serv_bufs->buf && bufbufend)){ - bcopy(buf,(msginfo+1),bytes); - buf=(msginfo+1); - } - tail = (long*)(buf + msginfo->datalen); - *tail = ARMCI_TAIL; - - armci_portals_client_buf_info((char *)dst_addr,&mbits,&offst,proc); - -# ifdef ARMCI_CHECK_STATE - sarlist_add(proc,msginfo->operation,offst); -# endif - - match_id.nid = portals->procid_map[proc].nid; - match_id.pid = portals->procid_map[proc].pid; - localoffset=(char *)buf-(char *)serv_bufs->buf; - if(DEBUG_COMM){ - printf("\n(%d):dst=%p,mbits=%d,localoffset=%d,offst=%d,proc=%d,nid=%d,pid=%d len=%d\n",armci_me, - dst_addr,mbits,localoffset,offst,proc,portals->procid_map[proc].nid, - portals->procid_map[proc].pid,bytes);fflush(stdout); - } - rc = PtlPutRegion(serv_response_md_h,localoffset,bytes,PTL_NOACK_REQ, - match_id,portals->ptl,0,mbits,offst,0); - if (rc != PTL_OK){ - fprintf(stderr, "%d:PtlPutRegion: %s\n", portals->Srank, - ARMCI_NET_ERRTOSTR(rc) ); - armci_die("PtlPutRegion failed",0); - } - ARMCI_PR_SDBG("exit",0); -#endif -} - - -void armci_rcv_req(void *mesg,void *phdr,void *pdescr,void *pdata,int *buflen) -{ -int i,na; -char *a; -double *tmp; -request_header_t *msginfo = (request_header_t *)mesg; - ARMCI_PR_SDBG("enter",msginfo->operation); - *(void **)phdr = msginfo; - if(0){ - printf("%s [ds]: got %d req (hdrlen=%d dscrlen=%d datalen=%d %d) from %d\n", - Portals_ID(), msginfo->operation, sizeof(request_header_t), msginfo->dscrlen, - msginfo->datalen, msginfo->bytes,msginfo->from); - fflush(stdout); - } - /* we leave room for msginfo on the client side */ - *buflen = MSG_BUFLEN - sizeof(request_header_t); - - if(msginfo->bytes) { - *(void **)pdescr = msginfo+1; - *(void **)pdata = msginfo->dscrlen + (char*)(msginfo+1); - - if(msginfo->operation == GET) { - // the descriptor will exists after the request header - // but there will be no data buffer - // use the MessageRcvBuffer - *(void**) pdata = MessageSndBuffer; -// printf("%s (server) overriding pdata in rcv_req\n",Portals_ID()); - } - printf("%s [ds] oper=%d; bytes=%d\n",armci_me,msginfo->operation,msginfo->bytes); - } - else { - printf("%s [ds] bytes=%d\n",armci_me,msginfo->bytes); - *(void**)pdescr = NULL; - *(void**)pdata = MessageRcvBuffer; - } - ARMCI_PR_SDBG("exit",msginfo->operation); -} - -void armci_call_data_server() -{ -int rc; -ptl_event_t ev_t; -ptl_event_t *ev=&ev_t; -serv_buf_t *compbuf = NULL; -int loop=1; -int temp_proc; -int ccc=2,rrr; -cpu_set_t mycpuid,new_mask; -char str[CPU_SETSIZE]; -char ncid[8],*cidptr,cid[8]; -extern char * cpuset_to_cstr(cpu_set_t *mask, char *str); - ARMCI_PR_SDBG("enter",0); - //if(armci_me==0)unsetenv("CRAY_PORTALS_USE_BLOCKING_POLL"); - sprintf (cid, "%d", ccc); - rrr = cstr_to_cpuset(&new_mask,cid); - -/* ------------------------------------------------------------ *\ - Change affinity for the data server -\* ------------------------------------------------------------ */ - if(sched_setaffinity(0, sizeof (new_mask), &new_mask)) { - perror("sched_setaffinity"); - printf("failed to set pid %d's affinity.\n", getpid()); - } - if(DEBUG_SERV){ - rrr=sched_getaffinity(0, sizeof(mycpuid), &mycpuid); - if(rrr)perror("sched_getaffinity"); - cidptr = cpuset_to_cstr(&mycpuid,ncid); - printf("(%d):my affinity is to %s\n",armci_me,ncid); - fflush(stdout); - } - -/* ------------------------------------- *\ - Main data server loop -\* ------------------------------------- */ - while(armci_server_terminating==0){ - - /* ------------------------------------------------------------ *\ - check event queue for incoming data requests from remote CPs - \* ------------------------------------------------------------ */ - ev->type=0; - if((rc = PtlEQWait(portals->Seq_h, ev)) != PTL_OK){ - printf("(%d):PtlEQWait(): %d %s\n", armci_me,rc,ARMCI_NET_ERRTOSTR(rc) ); - armci_die("EQWait problem",rc); - } - if (ev->ni_fail_type != PTL_NI_OK) { - printf("(%d)%d,%d:NI sent %d in event.\n", - armci_me,portals->Srank.nid, portals->Srank.pid,ev->ni_fail_type); - fflush(stdout); - armci_die2("event failure problem",ev->initiator.nid,ev->initiator.pid); - } - if(DEBUG_SERV){ - printf("\n(%d):armci_call_data_server: ptl event detected=%d\n",armci_me,ev->type); - fflush(stdout); - } - - /* ------------------------------------------------------------ *\ - PTL_EVENT_SEND_END: is ignored. This event is triggered as - the DS returns data to a remote CP via a PtlPut. This event - signals that that PtlPut has complete. - \* ------------------------------------------------------------ */ - if(ev->type == PTL_EVENT_SEND_END) continue; - - - /* ------------------------------------------------------------ *\ - PTL_EVENT_PUT_END: this is the key portals event for the DS. - PUT_END signifies that a remote data request has come in - from a remote CP. This data request will be handled by the - data server: armci_data_server - \* ------------------------------------------------------------ */ - else if(ev->type == PTL_EVENT_PUT_END) { - if(DEBUG_SERV) { - printf("\n(%d):ev->offset=%d from %d%d",armci_me,ev->offset, - ev->initiator.pid,ev->initiator.nid); - fflush(stdout); - } - armci_data_server(((char *)serv_bufs->buf+ev->offset)); - } - - /* ------------------------------------------------------------ *\ - Unexpected Portals Event -- Panic! - \* ------------------------------------------------------------ */ - else { - armci_die("unexpected event in data server",ev->type); - } - } - ARMCI_PR_SDBG("exit",0); -} - -void x_buf_wait_ack(request_header_t *msginfo, BUF_INFO_T *bufinfo) -{ - ARMCI_PR_DBG("enter",bufinfo->ar.val); - if(DEBUG_COMM){printf("\n%d:waiting for ack at %p",armci_me,&(msginfo->tag.ack));fflush(stdout);} - x_net_wait_ackresp(&(bufinfo->ar)); - armci_util_wait_long(&(msginfo->tag.ack),ARMCI_STAMP,10000); - if(DEBUG_COMM){printf("\n%d:done waiting for ack at %p",armci_me,&(msginfo->tag.ack));fflush(stdout);} - msginfo->tag.ack=0; - ARMCI_PR_DBG("exit",0); -} - -void x_net_send_ack(request_header_t *msginfo, int proc,void *dst,void *src) -{ -long *tail; -int bytes=sizeof(long); -ptl_size_t offst; -ptl_match_bits_t ignbits = 0xFFFFFFFF0FFFFFFF; -ptl_match_bits_t mbits = 1; -ptl_process_id_t match_id; -int rc; - - /* set tail ack, make sure it is alligned */ - ARMCI_PR_SDBG("enter",0); - - - armci_portals_client_buf_info((char *)dst,&mbits,&offst,proc); - -# ifdef ARMCI_CHECK_STATE - sarlist_add(proc,msginfo->operation,offst); -# endif - - match_id.nid = portals->procid_map[proc].nid; - match_id.pid = portals->procid_map[proc].pid; - if(DEBUG_SERV){ - printf("\n(%d):dst=%p,mbits=%d,offst=%d,proc=%d,nid=%d,pid=%d len=%d\n",armci_me, - dst,mbits,offst,proc,portals->procid_map[proc].nid, - portals->procid_map[proc].pid,bytes);fflush(stdout); - } - - rc = PtlPutRegion(serv_ack_md_h,0,bytes,PTL_NOACK_REQ, - match_id,portals->ptl,0,mbits,offst,0); - if (rc != PTL_OK){ - fprintf(stderr, "%d:PtlPutRegion: %s\n", portals->Srank, - ARMCI_NET_ERRTOSTR(rc) ); - armci_die("PtlPutRegion failed",0); - } - ARMCI_PR_SDBG("exit",0); -} - -long x_net_offset(char *buf,int proc) -{ -#ifdef ARMCI_REGISTER_SHMEM -int i; -#if NEW_MALLOC - if((i=check_meminfo(buf,1,proc))==0) - armci_die("x_net_offset,reg not found",proc); - return(all_meminfo[proc].serv_offs[i]); -#else - ARMCI_PR_DBG("enter",_rem_meminfo[proc].reg_count); - if(DEBUG_COMM){printf("\n%d:%s:buf=%p",armci_me,__FUNCTION__,buf);fflush(stdout); } - for(i=0;i<_rem_meminfo[proc].reg_count;i++){ - if(IN_REGION(buf,_rem_meminfo[proc].reginfo[i])){ -#ifdef DEBUG_MEM - {printf("\n%d:found it in reg=%d (%p,%d) for proc=%d",armci_me,i,_rem_meminfo[proc].reginfo[i].base_ptr,_rem_meminfo[proc].reginfo[i].size,proc);} -#endif - return((long)((char *)_rem_meminfo[proc].reginfo[i].serv_ptr-(char *)_rem_meminfo[proc].reginfo[i].base_ptr)); - } - } -#endif - ARMCI_PR_DBG("exit",0); -#else - printf("x_net_offset called; this shouldn't happen ...\n"); abort(); -#endif -} - -void armci_set_serv_mutex_arr(void *ptr) -{ -int i; -long offset; - ARMCI_PR_DBG("enter",0); - offset=x_net_offset(ptr,armci_me); - - _armci_server_mutex_ready=1; - _armci_server_mutex_ptr = (char *)ptr+offset; - ARMCI_PR_DBG("exit",0); - -} - diff --git a/armci/src-gemini/armci_portals.h b/armci/src-gemini/armci_portals.h deleted file mode 100644 index 8fed1dac7..000000000 --- a/armci/src-gemini/armci_portals.h +++ /dev/null @@ -1,150 +0,0 @@ -#ifndef ARMCI_PORTALS_H -#define ARMCI_PORTALS_H - -/* portals header file */ - -#include -#include -#include - -#define NUM_COMP_DSCR 4 - -#define ARMCI_PORTALS_PTL_NUMBER 37 - -#define HAS_RDMA_GET -#define NUM_SERV_BUFS 1 - -/*corresponds to num of different armci mem regions*/ -#define MAX_MEM_REGIONS 10 - -#define VBUF_DLEN_ORG 4*64*1024 -#define VBUF_DLEN 16*1024 -#define MSG_BUFLEN_DBL_VT ((VBUF_DLEN)>>3) - -/* VBUF_DLEN are only used in Vinod's code */ - -#ifdef PORTALS_USE_RENDEZ_VOUS -# define MSG_BUFLEN_DBL 262144 /* for rendez-vous, this can go bigger i think */ -#else -# define MSG_BUFLEN_DBL 1280 /* this is smaller when rendez-vous is off */ -#endif - - - -#define ARMCI_NET_ERRTOSTR(__ARMCI_ERC_) ptl_err_str[__ARMCI_ERC_] - -typedef enum op { - ARMCI_PORTALS_PUT, - ARMCI_PORTALS_NBPUT, - ARMCI_PORTALS_GET, - ARMCI_PORTALS_NBGET, - ARMCI_PORTALS_ACC, - ARMCI_PORTALS_NBACC, - ARMCI_PORTALS_GETPUT, - ARMCI_PORTALS_NBGETPUT -} armci_portals_optype; - -typedef struct { - void *data_ptr; /* pointer where the data should go */ - long ack; /* header ack */ - void *ack_ptr; /* pointer where the data should go */ - void *user_ptr; -#if defined(SERV_QUEUE) - int imm_msg; - size_t data_len; -#endif -} msg_tag_t; - -typedef struct armci_portals_desc{ - int active; - int tag; - int dest_id; - armci_portals_optype type; - ptl_md_t mem_dsc; - ptl_handle_md_t mem_dsc_hndl; - char *bufptr; -}comp_desc; - -/*for buffers*/ -extern char *armci_portals_client_buf_allocate(int); -#define BUF_ALLOCATE armci_portals_client_buf_allocate -#define BUF_EXTRA_FIELD_T comp_desc* - -#define INIT_SEND_BUF(_field,_snd,_rcv) _snd=1;_rcv=1;_field=NULL - -#define GET_SEND_BUFFER _armci_buf_get -#define FREE_SEND_BUFFER _armci_buf_release - -#define CLEAR_SEND_BUF_FIELD(_field,_snd,_rcv,_to,_op) if((_op==UNLOCK || _op==PUT || ARMCI_ACC(_op)) && _field!=NULL)x_buf_wait_ack((request_header_t *)((void **)&(_field)+1),((char *)&(_field)-sizeof(BUF_INFO_T)));_field=NULL; -#define TEST_SEND_BUF_FIELD(_field,_snd,_rcv,_to,_op,_ret) -#define COMPLETE_HANDLE _armci_buf_complete_nb_request - -#define NB_CMPL_T comp_desc* -#define ARMCI_NB_WAIT(_cntr) if(_cntr){\ - int rc;\ - if(nb_handle->tag)\ - if(nb_handle->tag==_cntr->tag)\ - rc = armci_client_complete(0,nb_handle->proc,nb_handle->tag,_cntr);\ -} else{\ -printf("\n%d:wait null ctr\n",armci_me);} - -#ifndef MAX_DS -#define MAX_DS 16 -#endif - -/* structure of computing process */ -typedef struct { - ptl_pt_index_t ptl; - ptl_process_id_t rank; - ptl_handle_ni_t ni_h; - ptl_handle_eq_t eq_h; - ptl_process_id_t Srank; - ptl_handle_ni_t Sni_h; - ptl_handle_eq_t Seq_h; - int outstanding_puts; - int outstanding_gets; - ptl_process_id_t *procid_map; - ptl_process_id_t *servid_map; - int free_comp_desc_index; - caddr_t dsbase[MAX_DS]; - size_t dssizes[MAX_DS]; - ptl_match_bits_t heap_mb[MAX_DS]; - ptl_md_t heap_md[MAX_DS]; - ptl_handle_me_t heap_me_h[MAX_DS]; - ptl_handle_md_t heap_md_h[MAX_DS]; - void *brval[MAX_DS]; - long serv_offs[MAX_DS]; - int cur_ds; -}armci_portals_proc_t; - -typedef struct { - ptl_match_bits_t mb; - ptl_md_t md; - ptl_handle_me_t me_h; - ptl_handle_md_t md_h; -}armci_portals_serv_mem_t; - -typedef struct { - int reg_count; - int outstanding_puts; - int outstanding_gets; - armci_portals_serv_mem_t meminfo[MAX_MEM_REGIONS]; -}armci_portals_serv_t; - - -extern void print_mem_desc_table(void); -extern int armci_init_portals(caddr_t); -extern void armci_fini_portals(void); -extern int armci_post_descriptor(ptl_md_t *md); -extern int armci_prepost_descriptor(void* start, long bytes); -extern ptl_size_t armci_get_offset(ptl_md_t md, void *ptr,int proc); -extern int armci_get_md(void * start, int bytes , ptl_md_t * md, ptl_match_bits_t * mb); -extern void armci_portals_put(int,void *,void *,int,void **,int ); -extern void armci_portals_get(int,void *,void *,int,void **,int ); -extern void comp_desc_init(); -extern int armci_client_complete(ptl_event_kind_t evt,int proc_id, int nb_tag ,comp_desc * cdesc); -extern void armci_portals_memsetup(long); - -extern MPI_Comm portals_smp_comm; - -#endif /* ARMCI_PORTALS_H */ diff --git a/armci/src-gemini/armci_shmem.h b/armci/src-gemini/armci_shmem.h deleted file mode 100644 index 5db18fff9..000000000 --- a/armci/src-gemini/armci_shmem.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _SHMEM_H_ -#define _SHMEM_H_ -extern void Set_Shmem_Limit(unsigned long shmemlimit); -extern void Delete_All_Regions(); -extern char* Create_Shared_Region(long idlist[], long size, long *offset); -extern char* Attach_Shared_Region(long idlist[], long size, long offset); -extern void Free_Shmem_Ptr(long id, long size, char* addr); -extern long armci_shmem_reg_size(int i, long id); -extern char* armci_shmem_reg_ptr(int i); - -#define POST_ALLOC_CHECK(temp,size) ; - -#define MAX_REGIONS 64 - -#if defined(WIN32) -#define SHMIDLEN 3 -#else -#define SHMIDLEN (MAX_REGIONS + 2) -#endif - -#define IDLOC (SHMIDLEN - 3) - -#endif diff --git a/armci/src-gemini/armcip.h b/armci/src-gemini/armcip.h deleted file mode 100644 index 671e47851..000000000 --- a/armci/src-gemini/armcip.h +++ /dev/null @@ -1,509 +0,0 @@ -/* $Id: armcip.h,v 1.82.2.9 2007-08-29 17:32:31 manoj Exp $ */ -/* armci private header file */ -#ifndef _ARMCI_P_H - -#define _ARMCI_P_H -#include -#include "armci.h" -#include "message.h" -// #include "code_options.h" -#if 0 -#define ARMCI_PR_DBG(__ARMCI_ST,__ARMCI_NU) \ - printf("\n%d:%s:%d:%s:%s:%d",armci_me,__FILE__,__LINE__,__FUNCTION__,__ARMCI_ST,__ARMCI_NU);fflush(stdout) -#define ARMCI_PR_SDBG(__ARMCI_ST,__ARMCI_NU) \ - printf("\n(%d):%s:%d:%s:%s:%d",armci_me,__FILE__,__LINE__,__FUNCTION__,__ARMCI_ST,__ARMCI_NU);fflush(stdout) -#else -#define ARMCI_PR_DBG(__ARMCI_ST,__ARMCI_NU) -#define ARMCI_PR_SDBG(__ARMCI_ST,__ARMCI_NU) -#endif - -#ifdef LIBONESIDED -#include "armci-onesided.h" -#endif - -#define DATA_SERVER -#define SERVER_THREAD -/*#define ARMCI_CHECK_STATE*/ - -#define ARMCI_STAMP 11214 -#define ARMCI_TAIL 31121 -#ifdef QUADRICS -#include -#ifdef QSNETLIBS_VERSION_CODE -#ifndef DECOSF -# define ELAN_ACC -# define PENDING_OPER(x) ARMCI_ACC_INT -#endif - -# if QSNETLIBS_VERSION_CODE > QSNETLIBS_VERSION(1,5,0) -# define LIBELAN_ATOMICS -# endif - -#endif -extern void armci_elan_fence(int p); -#endif - -/* we got problems on IA64/Linux64 with Elan if inlining is used */ -#if defined(__GNUC__) && !defined(QUADRICS) -# define INLINE inline -#else -# define INLINE -#endif - -#ifdef WIN32 -#include -#define sleep(x) Sleep(100*(x)) -#else -#include -#endif - -#if (defined(SYSV) || defined(WIN32)|| defined(MMAP)) && !defined(NO_SHM) && !defined(HITACHI) && !defined(CATAMOUNT) -#define CLUSTER - -#ifdef SERVER_THREAD -# define SERVER_NODE(c) (armci_clus_info[(c)].master); -# define NODE_SERVER(c) (c); -#else -# define SOFFSET -1000000 -# define SERVER_NODE(c) ((int)(SOFFSET -armci_clus_info[(c)].master)); -# define NODE_SERVER(c) ((int)(SOFFSET - c)) -#endif - -#endif - - -/*\GPC call stuff -\*/ -typedef struct { - int hndl, hlen, dlen; - void *hdr, *data; -}gpc_send_t; - - -/*\ Stuff for non-blocking API -\*/ -#define NB_MULTI -1 /*more than one armci buffer(buffers.c) used for nbcall*/ -#define NB_NONE -2 /*no armci buffer(buffers.c) used for nbcall*/ -extern unsigned int _armci_get_next_tag(); -#define GET_NEXT_NBTAG _armci_get_next_tag -#define ARMCI_MAX_IMPLICIT 15 - -typedef struct{ - int len; - int last; - void *exthdr; -} ext_header_t; - -typedef struct{ -int val; -void *ptr; -} armci_flag_t; - - -#if defined(LAPI) || defined(PTHREADS) || defined(POSIX_THREADS) -# include - typedef pthread_t thread_id_t; -# define THREAD_ID_SELF pthread_self -#elif defined(WIN32) -# include - typedef DWORD thread_id_t; -# define THREAD_ID_SELF GetCurrentThreadId -#else - typedef int thread_id_t; -# define THREAD_ID_SELF() 1 -#endif - -extern thread_id_t armci_usr_tid; -#ifdef SERVER_THREAD -# define SERVER_CONTEXT (armci_usr_tid != THREAD_ID_SELF()) -#else -# define SERVER_CONTEXT (armci_me<0) -#endif - -#if defined(LAPI) || defined(CLUSTER) || defined(CRAY) \ - || defined(CRAY_SHMEM) || defined(BGML) || defined(DCMF) -# include "request.h" -#endif - -/* ------------------------ ARMCI threads support ------------------------- */ -#define ARMCI_THREADS_LIMIT 32 - -#include "utils.h" -#if defined(THREAD_SAFE) -typedef struct { - int max; /* max # of threads per proc */ - int avail; /* next available position */ - thread_id_t *ids; /* list of threads' ids */ - thread_lock_t lock; /* general case lock */ - thread_lock_t buf_lock; /* lock for buffer access */ - thread_lock_t net_lock; /* lock for network accees */ -} armci_user_threads_t; - -extern armci_user_threads_t armci_user_threads; - -extern void armci_init_threads(); -extern void armci_finalize_threads(); -extern int armci_thread_idx(); -extern INLINE int armci_register_thread(thread_id_t id); - -#define ARMCI_THREAD_IDX armci_thread_idx() /* needs to be optimized */ - -#else -# define ARMCI_THREAD_IDX 0 -#endif - -/* ------------------------------------------------------------------------ */ - -/* min amount of data in strided request to be sent in single TCP/IP message*/ -#if defined(SOCKETS) || defined(MPI_SPAWN_ZEROCOPY) -# define TCP_PAYLOAD 128 -# define LONG_GET_THRESHOLD TCP_PAYLOAD -# define LONG_GET_THRESHOLD_STRIDED LONG_GET_THRESHOLD -# define LONG_PUT_THRESHOLD 128 -#endif - -#ifdef WIN32 -# define bzero(a,len){\ - int _i;\ - char *_c = (char*)(a);\ - for(_i=0; _i< (int)(len); _i++)_c[_i]=(char)0;\ - } -# define bcopy(a,b,len) memcpy(b,a,len) -#else -# include -#endif - -/*#define ACC_COPY*/ -#if defined(CRAY_T3E) || defined(FUJITSU)\ - || defined(HITACHI) || (defined(QUADRICS) && !defined(ELAN_ACC)) -#define ACC_COPY -#endif - -#ifndef FATR -# ifdef WIN32 -# define FATR __stdcall -# else -# define FATR -# endif -#endif - -#define MAX_PROC 8096 -#define MAX_STRIDE_LEVEL ARMCI_MAX_STRIDE_LEVEL - -/* msg tag ARMCI uses in collective ops */ -#define ARMCI_TAG 30000 - -#ifndef EXTRA_MSG_BUFLEN_DBL -# define RESERVED_BUFLEN ((sizeof(request_header_t)>>3)+3*MAX_STRIDE_LEVEL) -#else -# define RESERVED_BUFLEN ((sizeof(request_header_t)>>3)+3*MAX_STRIDE_LEVEL +\ - EXTRA_MSG_BUFLEN_DBL) -#endif - -#if defined(HITACHI) -# define BUFSIZE ((0x50000) * sizeof(double)) -#else - /* packing algorithm for double complex numbers requires even number */ -# ifdef MSG_BUFLEN_DBL -# define BUFSIZE_DBL (MSG_BUFLEN_DBL - RESERVED_BUFLEN) -# else -# define BUFSIZE_DBL 32768 -# endif -# define BUFSIZE (BUFSIZE_DBL * sizeof(double)) -#endif - -/* note opcodes must be lower than ARMCI_ACC_OFF !!! */ -#define PUT 1 -#define GET 2 -#define RMW 3 -#define LOCK 4 -#define UNLOCK 5 -#define ACK 6 -#define STATE 11214 - -/* must fit in two bits, see msginfo->format in request.h */ -#define STRIDED 1 -#define VECTOR 2 - -extern int armci_me, armci_nproc; -extern int _armci_initialized; -#ifdef HITACHI - extern int sr8k_server_ready; - extern double *armci_internal_buffer; -#else - extern double armci_internal_buffer[BUFSIZE_DBL]; -#endif -extern int armci_getbufsize(); -extern void armci_shmem_init(); -extern void armci_krmalloc_init_localmem(); -extern void armci_die(char *msg, int code); -extern void armci_die2(char *msg, int code1, int code2); -extern void armci_write_strided(void *ptr, int stride_levels, - int stride_arr[], int count[], char *buf); -extern void armci_read_strided(void *ptr, int stride_levels, - int stride_arr[], int count[], char *buf); -extern int armci_op_strided(int op, void* scale, int proc,void *src_ptr, - int src_stride_arr[], void* dst_ptr, - int dst_stride_arr[], int count[], - int stride_levels, int lockit,armci_ihdl_t nb_handle); -extern int armci_copy_vector(int op, /* operation code */ - armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc /* remote process(or) ID */ - ); - -extern int armci_acc_vector(int op, /* operation code */ - void *scale, /* scale factor */ - armci_giov_t darr[],/* descriptor array */ - int len, /* length of descriptor array */ - int proc /* remote process(or) ID */ - ); - -extern int armci_pack_strided(int op, void* scale, int proc, - void *src_ptr, int src_stride_arr[], - void* dst_ptr, int dst_stride_arr[], - int count[], int stride_levels, ext_header_t *hdr, - int fit_level, int nb, int last,armci_ihdl_t nb_handle); - -extern int armci_pack_vector(int op, void *scale, - armci_giov_t darr[],int len,int proc,armci_ihdl_t nb_handle); - -extern void armci_lockmem(void *pstart, void* pend, int proc); -extern void armci_unlockmem(int proc); - -extern int armci_acc_copy_strided(int optype, void* scale, int proc, - void* src_ptr, int src_stride_arr[], - void* dst_ptr, int dst_stride_arr[], - int count[], int stride_levels); - -extern void armci_vector_to_buf(armci_giov_t darr[], int len, void* buf); -extern void armci_vector_from_buf(armci_giov_t darr[], int len, void* buf); -extern void armci_init_fence(); - -#ifdef SOCKETS -#ifdef SERVER_THREAD - extern void armci_create_server_thread ( void* (* func)(void*) ); - extern void armci_terminate_server_thread(); -#else - extern void armci_create_server_process ( void* (* func)(void*) ); - extern void armci_wait_server_process(); - extern void RestoreSigChldDfl(); -#endif -#endif - -#ifdef MPI_SPAWN - extern void armci_create_server_MPIprocess (); -#endif - - -#define ARMCI_MAX(a,b) (((a)>(b))?(a):(b)) -#define ARMCI_MIN(a,b) (((a)<(b))?(a):(b)) -#define ARMCI_ABS(a) (((a) >= 0) ? (a) : (-(a))) -#define ARMCI_ACC(op) ((((int)(op))-ARMCI_ACC_INT)>=0) - - -#ifdef CLUSTER - extern char *_armci_fence_arr; -# define FENCE_ARR(p_) (_armci_fence_arr[p_]) - -# define SAMECLUSNODE(p)\ - ( ((p) <= armci_clus_last) && ((p) >= armci_clus_first) ) -#elif defined(__crayx1) -# define SAMECLUSNODE(p) 1 -#elif defined(ARMCIX) -# define SAMECLUSNODE(p) 0 -#else -# define SAMECLUSNODE(p) ((p)==armci_me) -#endif - - -#if defined(LAPI) || defined(ELAN_ACC) -# define ORDER(op,proc)\ - if( proc == armci_me || ( ARMCI_ACC(op) && ARMCI_ACC(PENDING_OPER(proc))) );\ - else FENCE_NODE(proc) -# define UPDATE_FENCE_INFO(proc_) -#elif defined(CLUSTER) && !defined(QUADRICS) && !defined(HITACHI)\ - && !defined(CRAY_SHMEM) -# define ORDER(op_,proc_)\ - if(!SAMECLUSNODE(proc_) && op_ != GET )FENCE_ARR(proc_)=1 -# define UPDATE_FENCE_INFO(proc_) if(!SAMECLUSNODE(proc_))FENCE_ARR(proc_)=1 -#else -# if defined(GM) && defined(ACK_FENCE) -# define ORDER(op,proc) -# else -# define ORDER(op,proc) if(proc != armci_me) FENCE_NODE(proc) -# endif -# define UPDATE_FENCE_INFO(proc_) -#endif - -typedef struct { - int ptr_array_len; - int bytes; - void **ptr_array; -} armci_riov_t; - -/*\ consider up to HOSTNAME_LEN characters in host name - * we can truncate names of the SP nodes since it is not used - * to establish socket communication like on the networks of workstations - * SP node names must be distinct within first HOSTNAME_LEN characters -\*/ -#if defined(LAPI) && defined(AIX) -# define HOSTNAME_TRUNCATE -# define HOSTNAME_LEN 12 -#else -# define HOSTNAME_LEN 64 -#endif - -typedef struct { - int master; - int nslave; - char hostname[HOSTNAME_LEN]; -} armci_clus_t; - -extern armci_clus_t *armci_clus_info; -extern int armci_nclus, armci_clus_me, armci_master; -extern int armci_clus_first, armci_clus_last; -extern int armci_clus_id(int p); -extern void armci_init_clusinfo(); -extern void armci_set_mem_offset(void *ptr); -extern int _armci_terminating; -extern void armci_acc_2D(int op, void* scale, int proc, void *src_ptr, - void *dst_ptr, int bytes, int cols, int src_stride, - int dst_stride, int lockit); -extern void armci_lockmem_scatter(void *ptr_array[], int len, int bytes, int p); -extern void armci_generic_rmw(int op, void *ploc, void *prem, int extra, int p); -extern unsigned long armci_max_region(); -extern void armci_dispatch_strided(void *ptr, int stride_arr[], int count[], - int strides, int fit_level, int nb, int bufsize, - void (*fun)(void*,int*,int*,int,void*), void *arg); -extern void armci_msg_gop_init(); -extern void armci_util_spin(int n, void *notused); - -#if defined(SYSV) || defined(WIN32) -extern void armci_shmem_init(); -extern void armci_set_shmem_limit_per_core(unsigned long shmemlimit); -extern void armci_set_shmem_limit_per_node(int nslaves); -extern void armci_set_shmem_limit(unsigned long shmemlimit); -#endif - -#define ALIGN_PTR_LONG(type, x) if( ((long)(x)) % sizeof(long)) { long _y = (long)(x);\ - if(sizeof(long)==8){_y>>=3; _y<<=3; }\ - else { _y>>=2; _y<<=2; }\ - _y += sizeof(long); (x) = (type*)_y; } - -#define SIXTYFOUR 64 -#define ALIGN64ADD(buf) (SIXTYFOUR-(((ssize_t)(buf))%SIXTYFOUR)) -#define ALIGNLONGADD(buf) ((((ssize_t)(buf))%sizeof(long))?(sizeof(long)-(((ssize_t)(buf))%sizeof(long))):0) - -#define SET 1 -#define UNSET 0 - -extern int armci_agg_save_strided_descriptor(void *src_ptr, - int src_stride_arr[], - void* dst_ptr, - int dst_stride_arr[], - int count[], - int stride_levels, int proc, - int op, armci_ihdl_t nb_handle); - -extern int armci_agg_save_giov_descriptor(armci_giov_t darr[], int len, - int proc, int op, - armci_ihdl_t nb_handle); - -extern int armci_agg_save_descriptor(void *src, void *dst, int bytes, - int proc, int op, int is_registered_put, - armci_ihdl_t nb_handle); - -extern void armci_agg_complete(armci_ihdl_t nb_handle, int condition); - -extern armci_ihdl_t armci_set_implicit_handle (int op, int proc); - -extern int armci_getnumcpus(void); -extern long armci_util_long_getval(long* p); -extern int armci_util_int_getval(int* p); -extern void armci_region_register_shm(void *start, long size); -extern void armci_region_register_loc(void *start, long size); -extern void armci_region_clus_record(int node, void *start, long size); -extern void armci_region_init(); -extern int armci_region_clus_found(int node, void *start, int size); -extern int armci_region_loc_found(void *start, int size); -extern int armci_region_both_found(void *loc, void *rem, int size, int node); -#ifdef REGIONS_REQUIRE_MEMHDL -extern int get_armci_region_local_hndl(void *loc, int node, ARMCI_MEMHDL_T **loc_memhdl); -#endif -extern void armci_region_exchange(void *start, long size); -extern void cpu_yield(); - -#ifdef ALLOW_PIN -extern void armci_global_region_exchange(void *, long); -#endif - - -/* -------------------- ARMCI Groups ---------------------- */ -/* data structure that caches a group's attribute */ -#ifdef BGML -#define PCLASS 3 -#endif -#ifdef MSG_COMMS_MPI -typedef int ARMCI_Datatype; - -extern int ATTR_KEY; /* attribute key */ - -/* #define ARMCI_GROUP /\*Generic ARMCI implementation*\/ */ - -typedef struct { - armci_clus_t *grp_clus_info; - int grp_me; /* my group id */ - int grp_nclus; /* number of cluster nodes */ - int grp_clus_me; /* my cluster node id */ - int mem_offset; /* memory offset */ -#ifdef ARMCI_GROUP - int nproc; /* #procs in this group*/ - int *proc_list; /* Ids of procs in this group - (w.r.t. MPI_COMM_WORLD)*/ -#endif -}armci_grp_attr_t; - -#include "mpi.h" - -/**dup of MPI_COMM_WORLD for internal MPI communication*/ -extern MPI_Comm ARMCI_COMM_WORLD; - -#ifdef PORTALS -#include "portals.h" -#endif - -typedef MPI_Comm ARMCI_Comm; -typedef struct { -#ifndef ARMCI_GROUP - MPI_Comm icomm; - MPI_Group igroup; -#endif - armci_grp_attr_t grp_attr; -}ARMCI_iGroup; - -armci_grp_attr_t *ARMCI_Group_getattr(ARMCI_Group *grp); - -extern void armci_group_init(); -extern void armci_group_finalize(); -extern ARMCI_iGroup* armci_get_igroup_from_group(ARMCI_Group *group); - -#endif /* ifdef MSG_COMMS_MPI */ -/* -------------------------------------------------------- */ - -/* ------------ ARMCI Chekcpointing/Recovery -------------- */ -#ifdef DO_CKPT -extern int armci_init_checkpoint(); -extern void armci_create_ckptds(armci_ckpt_ds_t *ckptds, int count); -extern int armci_icheckpoint_init(char *filename, ARMCI_Group *grp, - int savestack, int saveheap, - armci_ckpt_ds_t *ckptds); -extern int armci_icheckpoint(int rid); -extern int armci_irecover(int rid,int iamreplacement); -extern void armci_icheckpoint_finalize(int rid); - - -#endif /* ifdef DO_CKPT */ -/* -------------------------------------------------------- */ - -#endif diff --git a/armci/src-gemini/atomics-i386.h b/armci/src-gemini/atomics-i386.h deleted file mode 100644 index dd5899a0e..000000000 --- a/armci/src-gemini/atomics-i386.h +++ /dev/null @@ -1,22 +0,0 @@ -/** Atomic instructions for i386. To be populated as need arises. - * @author Sriram Krishnamoorthy - */ -#ifndef __ATOMICS_I386__ -#define __ATOMICS_I386__ - -#include - -#define v4b (volatile unsigned int *) - -static inline void atomic_exchange(void *val, void *ptr, int size) { - assert(size == 4); - __asm__ __volatile__ ("xchgl %0, %1" - : "=r"(*v4b(val)), "+m"(*v4b(ptr)) - : "0"(*v4b(val)) - : "memory"); -} - -#undef v4b - -#endif /*__ATOMICS_I386__*/ - diff --git a/armci/src-gemini/bufalloc.c b/armci/src-gemini/bufalloc.c deleted file mode 100644 index 8c6458eed..000000000 --- a/armci/src-gemini/bufalloc.c +++ /dev/null @@ -1,436 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: bufalloc.c,v 1.2 2001-06-07 23:23:23 d3h325 Exp $ - * storage manager for a chunk of memory passed by user in armci_init_buf_alloc - * derived from K&R that manages a chunk of memory - */ - -#include - -#define USAGE_ - -extern char *buf_allocate(); /* Used to get memory from the system */ -extern void armci_die(); - -#define VALID1 0xaaaaaaaa /* For validity check on headers */ -#define VALID2 0x55555555 -#define LOG_ALIGN 6 -#define ALIGNMENT (1 << LOG_ALIGN) -#define DEFAULT_NALLOC (1024 - ALIGNMENT) - -#ifdef USAGE -static struct shmalloc_struct { - size_t total; /* Amount request from system in units */ - long nchunk; /* No. of chunks of system memory */ - long inuse; /* Amount in use in units */ - long maxuse; /* Maximum value of inuse */ - long nfrags; /* No. of fragments divided into */ - long nmcalls; /* No. of calls to shmalloc */ - long nfcalls; /* No. of calls to buf_free */ -} usage; -#endif - -union header{ - struct { - unsigned valid1; /* Token to check if is not overwritten */ - union header *ptr; /* next block if on free list */ - size_t size; /* size of this block*/ - unsigned valid2; /* Another token acting as a guard */ - } s; - char align[ALIGNMENT]; /* Align to ALIGNMENT byte boundary */ -}; -typedef union header Header; - -static Header base; /* empty list to get started */ -static Header *freep = NULL; /* start of free list */ -static Header *usedp = NULL; /* start of used list */ -static size_t nalloc = DEFAULT_NALLOC; -static size_t max_nalloc = DEFAULT_NALLOC; -static int do_verify = 0; /* Flag for automatic heap verification */ -static int initialized=0; - - -static void buf_error(char* s, unsigned long i) -{ - void buf_alloc_print_stats(); - fflush(stdout); - fprintf(stderr,"buf_alloc error: %s %ld(0x%lx)\n", s, i, i); - fflush(stderr); -#ifdef USAGE - buf_alloc_print_stats(); -#endif - armci_die("buf_alloc: fatal error", i); -} - -void armci_buf_alloc_request(size_t size, size_t maxsize) -{ - nalloc = (size+ALIGNMENT-1) >> LOG_ALIGN; - max_nalloc = (maxsize+ALIGNMENT-1) >> LOG_ALIGN; -} - -void armci_buf_alloc_debug(int code) -{ - do_verify = code; -} - - -void armci_buf_alloc_verify() -{ - Header *p; - - if ( freep ) { - - /* Check the used list */ - for (p=usedp; p; p=p->s.ptr) { - if (p->s.valid1 != VALID1 || p->s.valid2 != VALID2) - buf_error("invalid header on usedlist", (unsigned long) p->s.valid1); - -#ifdef USAGE - if (p->s.size > usage.total) - buf_error("invalid size in header usedlist",(unsigned long)p->s.size); -#endif - } - - /* Check the free list */ - p = base.s.ptr; - while (p != &base) { - if (p->s.valid1 != VALID1 || p->s.valid2 != VALID2) - buf_error("invalid header on freelist", (unsigned long) p->s.valid1); - -#ifdef USAGE - if (p->s.size > usage.total) - buf_error("invalid size in header freelist",(unsigned long)p->s.size); -#endif - - p = p->s.ptr; - } - } /* end if */ -} - - -static void addtofree(char* ap) -{ - Header *bp, *p, **up; - -#ifdef USAGE - usage.nfcalls++; -#endif - if (do_verify) armci_buf_alloc_verify(); - - /* only do something if pointer is not NULL */ - if ( ap ) { - - bp = (Header *) ap - 1; /* Point to block header */ - - if (bp->s.valid1 != VALID1 || bp->s.valid2 != VALID2) - buf_error("buf_free: pointer not from buf_alloc", (unsigned long) ap); - -#ifdef USAGE - usage.inuse -= bp->s.size; /* Decrement memory usage */ -#endif - - /* Extract the block from the used linked list ... for debug only */ - for (up=&usedp; ; up = &((*up)->s.ptr)) { - if (!*up) - buf_error("buf_free:block not found in used list\n",(unsigned long)ap); - if (*up == bp) { - *up = bp->s.ptr; - break; - } - } - - /* Join the memory back into the free linked list */ - for (p=freep; !(bp > p && bp < p->s.ptr); p = p->s.ptr) - if (p >= p->s.ptr && (bp > p || bp < p->s.ptr)) - break; /* Freed block at start or end of arena */ - - if (bp + bp->s.size == p->s.ptr) {/* join to upper neighbour */ - bp->s.size += p->s.ptr->s.size; - bp->s.ptr = p->s.ptr->s.ptr; -#ifdef USAGE - usage.nfrags--; /* Lost a fragment */ -#endif - } else - bp->s.ptr = p->s.ptr; - - if (p + p->s.size == bp) { /* Join to lower neighbour */ - p->s.size += bp->s.size; - p->s.ptr = bp->s.ptr; -#ifdef USAGE - usage.nfrags--; /* Lost a fragment */ -#endif - } else - p->s.ptr = bp; - - freep = p; - - } /* end if on ap */ -} - - -void armci_init_buf_alloc(size_t len, void* buffer) -{ - char *cp; - Header *up, *prevp; - size_t nu; - /* need to initialize the free list */ - if (sizeof(Header) != ALIGNMENT) - buf_error("Alignment is not valid", (unsigned long) ALIGNMENT); - - if(initialized)armci_die("armci_init_buf_alloc: already initialized",0); - -#ifdef USAGE - usage.total = 0; /* Initialize statistics */ - usage.nchunk = 0; - usage.inuse = 0; - usage.nfrags = 0; - usage.maxuse = 0; - usage.nmcalls= 0; - usage.nfcalls= 0; -#endif - - base.s.ptr = freep = prevp = &base; /* Initialize linked list */ - base.s.size = 0; - base.s.valid1 = VALID1; - base.s.valid2 = VALID2; - - nu = len/sizeof(Header); /* nu must by a multiplicity of nalloc */ - max_nalloc = nu*nalloc; - if(nu<1) armci_die("buffer less than nalloc",(int)len); - cp = (char*)buffer; - -#ifdef USAGE - usage.total += nu; /* Have just got nu more units */ - usage.nchunk++; /* One more chunk */ - usage.nfrags++; /* Currently one more frag */ - usage.inuse += nu; /* Inuse will be decremented by buf_free */ -#endif - - up = (Header *) cp; - up->s.size = nu; - up->s.valid1 = VALID1; - up->s.valid2 = VALID2; - - /* Insert into linked list of blocks in use so that buf_free works - ... for debug only */ - up->s.ptr = usedp; - usedp = up; - - addtofree((char *)(up+1)); /* Try to join into the free list */ -} - - -/*\ return a chunk memory of given size -\*/ -char *armci_buf_alloc(size_t nbytes) -{ - Header *p, *prevp; - size_t nunits; - char *return_ptr; - - /* need to initialize the free list */ - if ((prevp = freep) == NULL) armci_die("not initialized", 0); - -#ifdef USAGE - usage.nmcalls++; -#endif - - if (do_verify) armci_buf_alloc_verify(); - - /* Rather than divide make the alignment a known power of 2 */ - nunits = ((nbytes + sizeof(Header) - 1)>>LOG_ALIGN) + 1; - - for (p=prevp->s.ptr; ; prevp = p, p = p->s.ptr) { - if (p->s.size >= nunits) { /* Big enuf */ - if (p->s.size == nunits) /* exact fit */ - prevp->s.ptr = p->s.ptr; - else { /* allocate tail end */ - p->s.size -= nunits; - p += p->s.size; - p->s.size = nunits; - p->s.valid1 = VALID1; - p->s.valid2 = VALID2; -#ifdef USAGE - usage.nfrags++; /* Have just increased the fragmentation */ -#endif - } - - /* Insert into linked list of blocks in use ... for debug only */ - p->s.ptr = usedp; - usedp = p; - -#ifdef USAGE - usage.inuse += nunits; /* Record usage */ - if (usage.inuse > usage.maxuse) - usage.maxuse = usage.inuse; -#endif - freep = prevp; - return_ptr = (char *) (p+1); - break; - } - - if (p == freep){ /* wrapped around the free list */ - return_ptr = (char *) NULL; - break; - } - } - return return_ptr; -} - - -void armci_buf_free(char *ap) -{ - Header *bp, *p, **up; -#ifdef USAGE - usage.nfcalls++; -#endif - if (do_verify) armci_buf_alloc_verify(); - - /* only do something if pointer is not NULL */ - - if ( ap ) { - - bp = (Header *) ap - 1; /* Point to block header */ - - if (bp->s.valid1 != VALID1 || bp->s.valid2 != VALID2) - buf_error("buf_free: pointer not from buf_alloc", (unsigned long) ap); - -#ifdef USAGE - usage.inuse -= bp->s.size; /* Decrement memory usage */ -#endif - - /* Extract the block from the used linked list for debug only */ - for (up=&usedp; ; up = &((*up)->s.ptr)) { - if (!*up) - buf_error("buf_free:block not found in used list\n",(unsigned long)ap); - if (*up == bp) { - *up = bp->s.ptr; - break; - } - } - - /* Join the memory back into the free linked list */ - for (p=freep; !(bp > p && bp < p->s.ptr); p = p->s.ptr) - if (p >= p->s.ptr && (bp > p || bp < p->s.ptr)) - break; /* Freed block at start or end of arena */ - - if (bp + bp->s.size == p->s.ptr) {/* join to upper neighbour */ - bp->s.size += p->s.ptr->s.size; - bp->s.ptr = p->s.ptr->s.ptr; -#ifdef USAGE - usage.nfrags--; /* Lost a fragment */ -#endif - } else - bp->s.ptr = p->s.ptr; - - if (p + p->s.size == bp) { /* Join to lower neighbour */ - p->s.size += bp->s.size; - p->s.ptr = bp->s.ptr; -#ifdef USAGE - usage.nfrags--; /* Lost a fragment */ -#endif - } else - p->s.ptr = bp; - - freep = p; - - } /* end if on ap */ -} - - -#ifdef USAGE -/* - Return stats on buf_alloc performance. Use arg list instead of - returning structure so that FORTRAN can eventually use it -*/ -void buf_alloc_stats(size_t *total, long* nchunk, size_t * inuse, - size_t * maxuse, long* nfrags, long* nmcalls,long* nfcalls) -{ - *total = usage.total * sizeof(Header); - *nchunk = usage.nchunk; - *inuse = (size_t)usage.inuse * sizeof(Header); - *maxuse = (size_t)usage.maxuse* sizeof(Header); - *nfrags = usage.nfrags; - *nmcalls= usage.nmcalls; - *nfcalls= usage.nfcalls; -} - -/* - Print to standard output the usage statistics. -*/ -void buf_alloc_print_stats() -{ - size_t total, inuse, maxuse; - long nchunk, nfrags, nmcalls, nfcalls; - - buf_alloc_stats(&total, &nchunk, &inuse, &maxuse, &nfrags, - &nmcalls, &nfcalls); - - fflush(stderr); - printf("\nbuf_alloc statistics\n-------------------\n\n"); - printf("Total memory from system ... %ld bytes\n", (long)total); - printf("Current memory usage ....... %ld bytes\n", (long)inuse); - printf("Maximum memory usage ....... %ld bytes\n", (long)maxuse); - printf("No. chunks from system ..... %ld\n", nchunk); - printf("No. of fragments ........... %ld\n", nfrags); - printf("No. of calls to buf_alloc ... %ld\n", nmcalls); - printf("No. of calls to buf_free ..... %ld\n", nfcalls); - printf("\n"); - fflush(stdout); -} -#endif - - -#if 0 -void armci_die(char *str, int c) -{ -fprintf(stderr,"%s %d\n",str,c); -_exit(1); -} - - -#define LEN (16*1024) -char buf[LEN]; - -main (int argc, char **argv) -{ -int i,k,total=0,size=1024; -char *ar[100]; - armci_init_buf_alloc(LEN, buf); - - for(i=0; i<100; i++)ar[i]=(char*)0; - - for(i=0; i<100; i++){ - ar[i] =armci_buf_alloc(size); - if(!ar[i]){ - printf("i =%d total=%d\n", i, total); - buf_alloc_print_stats(); - k=i; - break; - } - total+=size; - } - for(i=0; i -#include -#include -#include "armcip.h" -#include "request.h" -#ifdef WIN32 -# include - typedef unsigned long ssize_t; -#else -# include -#endif - -# define EQ_TAGS(a_, b_) !memcmp(&(a_), &(b_), sizeof(a_)) - -#define ALIGN64ADD(buf) (SIXTYFOUR-(((ssize_t)(buf))%SIXTYFOUR)) -/* the following symbols should be defined if needed in protocol specific - header file: BUF_EXTRA_FIELD, BUF_ALLOCATE -*/ - -#ifndef BUF_ALLOCATE -# define BUF_ALLOCATE malloc -#endif -#if defined LIBONESIDED -# define SMALL_BUF_LEN ARMCI_SMALL_BUF_SIZE -#else -# if defined(SERV_QUEUE) -# define SMALL_BUF_LEN 4096 -# else -# define SMALL_BUF_LEN 2048 -# endif -#endif - -#ifndef MSG_BUFLEN_SMALL -# define MSG_BUFLEN_SMALL (MSG_BUFLEN >>0) -#endif - -#define LEFT_GUARD 11.11e11 -#define RIGHT_GUARD 22.22e22 -#define CLEAR_TABLE_SLOT(idx) *((int*)(_armci_buf_state->table+(idx))) =0 - -#ifndef BUF_NET_INIT -#define BUF_NET_INIT(x,xX,Xx) -#endif -_buf_ackresp_t *_buf_ackresp_first,*_buf_ackresp_cur; -/* we allow multiple buffers (up to 15) per single request - * adjacent buffers can be coalesced into a large one - */ -typedef struct { - int op; /* pending operation code */ - int snd; /* if 1 then buffer is used for sending request */ - int rcv; /* if 1 then buffer is used for receiving data */ - int async; /* if 1 then request is nonblocking */ - int first; /* id of the 1st buffer in the set in same request */ - int count; /* count is not used and is always 1 (or 0???) */ - /*unsigned int count:4; \* how many buffers used for this request 8 possible */ - int busy; /* if 1 buffer is used and cannot be completed */ - int cmpl; /* set to 1 if buffer was completed and can be released */ - int to; /* serv/proc to which request was sent, 8172 possible */ -}buf_state_t; - - -#ifndef BUFID_PAD_T -#define BUFID_PAD_T BUF_INFO_T -#endif - -/* message send buffer data structure */ -typedef struct { - BUF_INFO_T id; -# ifdef BUF_EXTRA_FIELD_T - BUF_EXTRA_FIELD_T field; -# endif - char buffer[MSG_BUFLEN_SMALL]; -} buf_ext_t; - -/* message send buffer data structure */ -typedef struct { - BUF_INFO_T id; -# ifdef BUF_EXTRA_FIELD_T - BUF_EXTRA_FIELD_T field; -# endif - char buffer[SMALL_BUF_LEN]; -} buf_smext_t; - -/* we keep table and buffer pointer together for better locality */ -typedef struct { - double left_guard; /* stamp to verify if array was corrupted */ - buf_state_t table[MAX_BUFS+MAX_SMALL_BUFS]; /*array with state of buffer */ - buf_ext_t *buf; /* address of buffer pool */ - buf_smext_t *smallbuf; /* address of the large buffer pool */ - int avail; - int smavail; - int pad; - double right_guard; /* stamp to verify if array was corrupted */ - - unsigned buf_bitmap; /* bitmaps to track available buffers: */ - unsigned smbuf_bitmap;/* 1 - available, 0 - not available */ -} reqbuf_pool_t; - -#ifndef BUF_EXTRA_FIELD_T -# define SIZE_BUF_EXTRA_FIELD 0 -# define BUF_TO_EBUF(buf) (buf_ext_t*)(((char*)buf) - sizeof(BUFID_PAD_T) -\ - SIZE_BUF_EXTRA_FIELD) -# define BUF_TO_SMEBUF(buf) (buf_smext_t*)(((char*)buf)- sizeof(BUFID_PAD_T) -\ - SIZE_BUF_EXTRA_FIELD) -#else -# define BUF_TO_EBUF(buf) (buf_ext_t*)(((char*)buf) - sizeof(BUFID_PAD_T) -\ - sizeof(BUF_EXTRA_FIELD_T)) -# define BUF_TO_SMEBUF(buf) (buf_smext_t*)(((char*)buf)- sizeof(BUFID_PAD_T) -\ - sizeof(BUF_EXTRA_FIELD_T)) -#endif - -#define BUF_TO_BUFINDEX(buf) (BUF_TO_EBUF((buf)))->id.bufid -#define BUF_TO_SMBUFINDEX(buf) (BUF_TO_SMEBUF((buf)))->id.bufid - - -buf_ext_t *_armci_buffers; /* these are the actual buffers */ -buf_smext_t *_armci_smbuffers; /* no, these are the actual buffers */ -reqbuf_pool_t* _armci_buf_state; /* array that describes state of each buf */ - -extern active_socks_t *_armci_active_socks; - -/* returns bufinfo, given bufid */ -INLINE BUF_INFO_T *_armci_id_to_bufinfo(int bufid) { - if (bufid < 0 || bufid >= (MAX_BUFS+MAX_SMALL_BUFS)) - armci_die2("_armci_id_to_bufinfo: bad id",bufid,MAX_BUFS); - - return bufid < MAX_BUFS ? &(_armci_buf_state->buf[bufid].id) : - &(_armci_buf_state->smallbuf[bufid-MAX_BUFS].id); -} - - - -/*\ we allocate alligned buffer space - * this operation can be implemented in platform specific files -\*/ -void _armci_buf_init() -{ -char *tmp; -int extra=0; -int smallbuf_size = sizeof(buf_smext_t)*(MAX_SMALL_BUFS); - // tmp = (char *) BUF_ALLOCATE((MAX_BUFS*sizeof(buf_ext_t) + 64 + smallbuf_size + 64)); - tmp = (char *) malloc((MAX_BUFS*sizeof(buf_ext_t) + 64 + smallbuf_size + 64)); - bzero(tmp,MAX_BUFS*sizeof(buf_ext_t) + 64 + smallbuf_size + 64); - extra= ALIGN64ADD(tmp); - - /* libonesided: register buffer memory */ - # ifdef CRAY_REGISTER_ARMCI_MALLOC - onesided_hnd_t cp_hnd; - cos_mdesc_t local_mdesc; - uint64_t length = (MAX_BUFS*sizeof(buf_ext_t) + 64 + smallbuf_size + 64); - - // get the onesided v2.0 api handle for the compute process - cpGetOnesidedHandle(&cp_hnd); - - // register the memory - onesided_mem_register(cp_hnd, tmp, length, 0, &local_mdesc); - - // for now; until we can search through the linked-list of registered memory - // to deregister it by pointer (ptr) value only [see ARMCI_Free_local], we'll - // take advanatage of lazy deregistration and assume that this segment will - // be kept around as long as it's active. - onesided_mem_deregister(cp_hnd, &local_mdesc); - # endif - - - _armci_buffers = (buf_ext_t *) (tmp + extra); - - tmp = (char *)(_armci_buffers + MAX_BUFS); - extra = ALIGN64ADD(tmp); - _armci_smbuffers = (buf_smext_t *) (tmp + extra); - - - if(DEBUG2_){ - printf("%d:armci_init_bufs: pointer %p, before align ptr=%p bufptr=%p end of region is %p size=%d extra=%d\n", - armci_me,_armci_buffers,tmp,_armci_buffers->buffer,(_armci_buffers+MAX_BUFS), - MAX_BUFS*sizeof(buf_ext_t),extra); - fflush(stdout); - } - - /* now allocate state array */ - tmp = malloc(sizeof(reqbuf_pool_t) + 64); - bzero(tmp,sizeof(reqbuf_pool_t) + 64); - if(!tmp)armci_die("_armci_buf_init calloc failed",0); - extra= ALIGN64ADD(tmp); - _armci_buf_state = (reqbuf_pool_t*)(tmp + extra); - - - /* initialize it */ - _armci_buf_state->left_guard = LEFT_GUARD; - _armci_buf_state->right_guard = RIGHT_GUARD; - _armci_buf_state->avail =0; - _armci_buf_state->smavail =MAX_BUFS; - _armci_buf_state->buf = _armci_buffers; - _armci_buf_state->smallbuf = _armci_smbuffers; - - _buf_ackresp_first=_buf_ackresp_cur=NULL; - - if(BUF_TO_EBUF(_armci_buf_state->buf[0].buffer)!=_armci_buf_state->buf) - armci_die("buffers.c, internal structure alignment problem",0); - - # ifdef LIBONESIDED - int i; - for(i=0; i (void *)_armci_buffers && buf < (void *)(_armci_buffers+MAX_BUFS)){ - index = BUF_TO_BUFINDEX(ptr); - if((index >= MAX_BUFS)|| (index<0)) - armci_die2("armci_buf_to_index: bad index:",index,MAX_BUFS); - return(index); - } - else if(buf > (void *)_armci_smbuffers && buf < (void *)(_armci_smbuffers+MAX_SMALL_BUFS)){ - index = BUF_TO_SMBUFINDEX(ptr); - if((index >= MAX_BUFS+MAX_SMALL_BUFS)|| (indextable + index; - ARMCI_PR_DBG("enter",0); - if(index>=MAX_BUFS){ - int relidx; - relidx = index-MAX_BUFS; - CLEAR_SEND_BUF_FIELD(_armci_buf_state->smallbuf[relidx].field,buf_state->snd,buf_state->rcv,buf_state->to,buf_state->op); - } - else - CLEAR_SEND_BUF_FIELD(_armci_buf_state->buf[index].field,buf_state->snd,buf_state->rcv,buf_state->to,buf_state->op); - ARMCI_PR_DBG("exit",0); -} - -/*\ complete outstanding operation that uses the specified buffer -\*/ -void _armci_buf_complete_index(int idx, int called) -{ -int count; -buf_state_t *buf_state = _armci_buf_state->table +idx; -cos_request_t *req = NULL; - - count = buf_state->count; - if(DEBUG_ || 0) { - printf("%d:buf_complete_index:%d op=%d first=%d count=%d called=%d\n", - armci_me,idx,buf_state->op,buf_state->first,buf_state->count, - called); - fflush(stdout); - } - - if(buf_state->first != (unsigned int)idx){ - armci_die2("complete_buf_index:inconsistent Index:",idx,buf_state->first); - } - - /* need to call platform specific function */ - if(idx>=MAX_BUFS){ - int relidx,rr; - relidx = idx-MAX_BUFS; - //printf("\n%d:in clear idx=%d %d",armci_me,idx,_armci_buf_state->smallbuf[relidx].id.tag);fflush(stdout); - /* ------------------------------------------------------------------------------------------- *\ - active buffers need to be completed - \* ------------------------------------------------------------------------------------------- */ - # ifdef LIBONESIDED - req = &_armci_buf_state->smallbuf[relidx].id.ar.req; - cpReqWait(req); - - # else - - if(_armci_buf_state->smallbuf[relidx].id.tag && (_armci_buf_state->smallbuf[relidx].field)->tag>0) { - rr=armci_client_complete(0,buf_state->to,_armci_buf_state->smallbuf[relidx].id.tag,_armci_buf_state->smallbuf[relidx].field); - } - CLEAR_SEND_BUF_FIELD(_armci_buf_state->smallbuf[relidx].field,buf_state->snd,buf_state->rcv,buf_state->to,buf_state->op); - - # endif - - // later, we might just need to do this for all operations, not just get - // Complete NBGET: non-blocking get - if(_armci_buf_state->smallbuf[relidx].id.tag!=0 &&(buf_state->op == GET)){ - armci_complete_req_buf(&(_armci_buf_state->smallbuf[relidx].id), - _armci_buf_state->smallbuf[relidx].buffer); - } - _armci_buf_state->smallbuf[relidx].id.tag=0; - } - else { - int rr; - - /* ------------------------------------------------------------------------------------------- *\ - active buffers need to be completed - \* ------------------------------------------------------------------------------------------- */ - # ifdef LIBONESIDED - req = &_armci_buf_state->buf[idx].id.ar.req; - cpReqWait(req); - - # else - - if(_armci_buf_state->buf[idx].id.tag && (_armci_buf_state->buf[idx].field)->tag>0 ) - rr=armci_client_complete(0,buf_state->to,_armci_buf_state->buf[idx].id.tag,_armci_buf_state->buf[idx].field); - CLEAR_SEND_BUF_FIELD(_armci_buf_state->buf[idx].field,buf_state->snd,buf_state->rcv,buf_state->to,buf_state->op); - //printf("\n%d:in clear large idx=%d %d",armci_me,idx,_armci_buf_state->buf[idx].id.tag);fflush(stdout); - # endif - - // later, we might just need to do this for all operations, not just get - // Complete NBGET: non-blocking get - if(_armci_buf_state->buf[idx].id.tag!=0 &&(buf_state->op == GET)){ - armci_complete_req_buf(&(_armci_buf_state->buf[idx].id), - _armci_buf_state->buf[idx].buffer); - } - _armci_buf_state->buf[idx].id.tag=0; - } - /* clear table slots for all the buffers in the set for this request */ - for(; count; count--, buf_state++) *(int*)buf_state = 0; -} - - -/*\ test outstanding operation that uses the specified buffer for complete - * It is important not to change the state of the buffer, the buffer has - * to remain as it was, only completion has to be indicated -\*/ -int _armci_buf_test_index(int idx, int called) -{ -int count,retval=0; -buf_state_t *buf_state = _armci_buf_state->table +idx; - count = buf_state->count; - if(DEBUG_ ){ - printf("%d:buf_test_index:%d op=%d first=%d count=%d called=%d\n", - armci_me,idx,buf_state->op,buf_state->first,buf_state->count, - called); - fflush(stdout); - } - if(buf_state->first != (unsigned int)idx){ - armci_die2("_buf_test_index:inconsistent index:",idx,buf_state->first); - } -# ifdef BUF_EXTRA_FIELD_T - /* need to call platform specific function */ - if(idx>=MAX_BUFS){ - int relidx; - relidx = idx-MAX_BUFS; - /*printf("\n%d:relidx=%d \n",armci_me,relidx);fflush(stdout);*/ - TEST_SEND_BUF_FIELD(_armci_buf_state->smallbuf[relidx].field,buf_state->snd,buf_state->rcv,buf_state->to,buf_state->op,&retval); - - } - else { - TEST_SEND_BUF_FIELD(_armci_buf_state->buf[idx].field,buf_state->snd,buf_state->rcv,buf_state->to,buf_state->op,&retval); - } -# endif - if(DEBUG_ ){ - printf("%d:buf_test_index:%d op=%d first=%d count=%d called=%d ret=%d\n", - armci_me,idx,buf_state->op,buf_state->first,buf_state->count, - called,retval); - fflush(stdout); - } - return(retval); -} - -/** -an addition to the below operation to allow for multiple outstanding operations -per server node -*/ -void _armci_buf_ensure_pend_outstanding_op_per_node(void *buf, int node) -{ -int i; -int index =_armci_buf_to_index(buf); -int this = _armci_buf_state->table[index].first; -int nfirst, nlast; -void _armci_buf_release_index(int i); -int buf_pend_count=0; -int changeid=0; - nfirst=armci_clus_info[node].master; - nlast = nfirst+armci_clus_info[node].nslave-1; - if(_armci_buf_state->table[index].to<0){ - _armci_buf_state->table[index].to = 0-1e6-_armci_buf_state->table[index].to; - changeid=1; - } - - if((_armci_buf_state->table[index].to<(unsigned int) nfirst) || - (_armci_buf_state->table[index].to>(unsigned int) nlast)) - armci_die2("_armci_buf_ensure_pend_outstanding_op_per_node: bad to",node, - (int)_armci_buf_state->table[index].to); - - buf_pend_count=0; - for(i=0;itable +i; - if((buf_state->to >= nfirst) && (buf_state->to<= (unsigned int) nlast)) - if( (buf_state->first != (unsigned int) this) && (buf_state->first==(unsigned int) i) && buf_state->op){ - buf_pend_count++; - if(buf_pend_count == NUM_SERV_BUFS){ - _armci_buf_complete_index(i,0); - _armci_buf_release_index(i); - break; - } - } - } - if(changeid)_armci_buf_state->table[index].to = 0-1e6-_armci_buf_state->table[index].to; -} - -/*\ make sure that there are no other pending operations to that smp node - * this operation is called from platforms specific routine that sends - * request - * we could have accomplished the same in armci_buf_get but as Vinod - * is pointing out, it is better to delay completing outstanding - * calls to overlap memcpy for the current buffer with communication -\*/ -void _armci_buf_ensure_one_outstanding_op_per_node(void *buf, int node) -{ - int i; - int index =_armci_buf_to_index(buf); - int this = _armci_buf_state->table[index].first; - int nfirst, nlast; - void _armci_buf_release_index(int i); - - nfirst=armci_clus_info[node].master; - nlast = nfirst+armci_clus_info[node].nslave-1; - if((_armci_buf_state->table[index].to<(unsigned int) nfirst) || - (_armci_buf_state->table[index].to>(unsigned int) nlast)) - armci_die2("_armci_buf_ensure_one_outstanding_op_per_node: bad to",node, - (int)_armci_buf_state->table[index].to); - - for(i=0;itable +i; - if((buf_state->to >= nfirst) && (buf_state->to<= (unsigned int) nlast)) { - if((buf_state->first != (unsigned int) this)&&(buf_state->first==(unsigned int) i) && buf_state->op) { - _armci_buf_complete_index(i,0); - _armci_buf_release_index(i); - } - } - } -} - -/*\ same as above but for process -\*/ -void _armci_buf_ensure_one_outstanding_op_per_proc(void *buf, int proc) -{ - int i; - int index = _armci_buf_to_index(buf); - int this = _armci_buf_state->table[index].first; - void _armci_buf_release_index(int i); - - if(_armci_buf_state->table[index].to !=(unsigned int) proc ) - armci_die2("_armci_buf_ensure_one_outstanding_op_per_proc: bad to", proc, - (int)_armci_buf_state->table[index].to); - - for(i=0;itable +i; - if(buf_state->to == (unsigned int) proc) { - if((buf_state->first != (unsigned int) this)&&(buf_state->first==(unsigned int) i) && buf_state->op) { - _armci_buf_complete_index(i,0); - _armci_buf_release_index(i); - } - } - } -} - - -#define HISTORY__ -#ifdef HISTORY -typedef struct{ int size; int op; int count; int id; } history_t; -history_t history[100]; -int h=0; - -void print_history() -{ -int i; - fflush(stdout); - printf("%d records\n",h); - for(i=0; ibuf[history[i].id].buffer, history[i].count, - history[i].op); - - fflush(stdout); -} -#endif - -/*\ call corresponding to GET_SEND_BUF -\*/ -char *_armci_buf_get_small(int size, int operation, int to) -{ -int avail=_armci_buf_state->smavail,i; -_buf_ackresp_t *ar; - if(_armci_buf_state->table[avail].op || - _armci_buf_state->table[avail].first || - _armci_buf_state->smallbuf[avail-MAX_BUFS].id.ar.req.active) { - - for(i=MAX_BUFS;itable[i].op && - !_armci_buf_state->table[i].first && - !_armci_buf_state->smallbuf[i-MAX_BUFS].id.ar.req.active) - break; - } - if(i<(MAX_SMALL_BUFS+MAX_BUFS))avail = i; - else { - _armci_buf_complete_index(avail,1); - } - } - _armci_buf_state->table[avail].op = operation; - _armci_buf_state->table[avail].to = to; - _armci_buf_state->table[avail].count= 1; - _armci_buf_state->table[avail].first = avail; - _armci_buf_state->smallbuf[avail-MAX_BUFS].id.tag=0; - _armci_buf_state->smallbuf[avail-MAX_BUFS].id.bufid= avail; - _armci_buf_state->smallbuf[avail-MAX_BUFS].id.protocol=0; - ar=&_armci_buf_state->smallbuf[avail-MAX_BUFS].id.ar; - assert(ar->val==0);assert(ar->next==NULL);assert(ar->previous==NULL); - ar->req.active = 1; - if(_buf_ackresp_cur!=NULL) - _buf_ackresp_cur->next=ar; - if(_buf_ackresp_first==NULL) - _buf_ackresp_first=ar; - ar->previous=_buf_ackresp_cur; - ar->next=NULL; - _buf_ackresp_cur=ar; - - if(DEBUG_ || 0) { - printf("%d:buf_get_sm1:size=%d max=%d got %d ptr=%p op=%d to=%d count=%d first=%d\n", - armci_me,size,SMALL_BUF_LEN,avail, - _armci_buf_state->smallbuf[avail-MAX_BUFS].buffer,operation,to, - (int)_armci_buf_state->table[avail].count,(int)_armci_buf_state->table[avail].first); - fflush(stdout); - } - -# ifdef BUF_EXTRA_FIELD_T - INIT_SEND_BUF(_armci_buf_state->smallbuf[avail-MAX_BUFS].field,_armci_buf_state->table[avail].snd,_armci_buf_state->table[avail].rcv); -#endif - - _armci_buf_state->smavail = (avail+1-MAX_BUFS)%MAX_SMALL_BUFS + MAX_BUFS; - - if(DEBUG_ || 0) { - printf("%d:buf_get_sm:size=%d max=%d got %d ptr=%p op=%d to=%d count=%d first=%d\n", - armci_me,size,SMALL_BUF_LEN,avail, - _armci_buf_state->smallbuf[avail-MAX_BUFS].buffer,operation,to, - _armci_buf_state->table[avail].count,_armci_buf_state->table[avail].first); - fflush(stdout); - } - - return(_armci_buf_state->smallbuf[avail-MAX_BUFS].buffer); - -} - -/*\ call corresponding to GET_SEND_BUF -\*/ -static char *rmo_buffer = NULL; - -char *_armci_buf_get(int size, int operation, int to) -{ -int avail=_armci_buf_state->avail; -int count=1, i; -_buf_ackresp_t *ar; - - /*if small buffer, we go to another routine that gets smallbuf*/ - if(size MSG_BUFLEN_SMALL) ){ - double val = (double)size; /* use double due to a bug in gcc */ - val /= MSG_BUFLEN_SMALL; - count=(int)val; - if(size%MSG_BUFLEN_SMALL) count++; - assert(0); - } - /* start from 0 if there is not enough bufs available from here */ - if((avail+count) > MAX_BUFS)avail = 0; - - /* avail should never point to buffer in a middle of a set of used bufs */ - if(_armci_buf_state->table[avail].op && - (_armci_buf_state->table[avail].first != (unsigned int) avail)){ sleep(1); - printf("%d: inconsistent first. avail=%d count=%d first=%d size=%d\n", - armci_me, avail, count, _armci_buf_state->table[avail].first, size); - armci_die2("armci_buf_get: inconsistent first", avail, - _armci_buf_state->table[avail].first); - } - - /* we need complete "count" number of buffers */ - for(i=0;itable[cur].op && - _armci_buf_state->table[cur].first==(unsigned int) cur) || - _armci_buf_state->buf[cur].id.ar.req.active) { - _armci_buf_complete_index(cur,1); - } - } - - for(i=0; itable[avail+i].op = operation; - _armci_buf_state->table[avail+i].to = to; - _armci_buf_state->table[avail+i].count= count; - _armci_buf_state->table[avail+i].first = avail; - } - - _armci_buf_state->buf[avail].id.tag=0; - _armci_buf_state->buf[avail].id.bufid=avail; - _armci_buf_state->buf[avail].id.protocol=0; - ar=&_armci_buf_state->buf[avail].id.ar; - - assert(ar->val==0);assert(ar->next==NULL);assert(ar->previous==NULL); - assert(ar->req.active == 0); - - ar->req.active = 1; - - if(_buf_ackresp_cur!=NULL) - _buf_ackresp_cur->next=ar; - if(_buf_ackresp_first==NULL) - _buf_ackresp_first=ar; - ar->previous=_buf_ackresp_cur; - ar->next=NULL; - _buf_ackresp_cur = ar; - -# ifdef BUF_EXTRA_FIELD_T - INIT_SEND_BUF(_armci_buf_state->buf[avail].field,_armci_buf_state->table[avail].snd,_armci_buf_state->table[avail].rcv); -#endif - -#ifdef HISTORY - history[h].size=size; - history[h].op=operation; - history[h].count=count; - history[h].id = avail; - h++; -#endif - - if(DEBUG_ || 0) { - printf("%d:buf_get:size=%d max=%d got %d ptr=%p count=%d op=%d to=%d\n", - armci_me,size,MSG_BUFLEN_SMALL,avail, - _armci_buf_state->buf[avail].buffer, count,operation,to); - fflush(stdout); - } - - /* select candidate buffer for next allocation request */ - _armci_buf_state->avail = avail+count; - _armci_buf_state->avail %= MAX_BUFS; - - return(_armci_buf_state->buf[avail].buffer); -} - - -void _armci_buf_release_index(int index) { - int count; - buf_state_t *buf_state = _armci_buf_state->table +index; - char *_armci_buf_ptr_from_id(int id); - - if((index >= MAX_BUFS+MAX_SMALL_BUFS)|| (index<0)) - armci_die2("armci_buf_release: bad index:",index,MAX_BUFS); - - count = _armci_buf_state->table[index].count; - - if(DEBUG_ || 0) { - printf("%d:_armci_buf_release_index %d ptr=%p count=%d op=%d smavail=%d\n", - armci_me,index,_armci_buf_ptr_from_id(index),count, _armci_buf_state->table[index].op,_armci_buf_state->smavail); - fflush(stdout); - } - - /* clear table slots for all the buffers in the set for this request */ - for(; count; count--, buf_state++) *(int*)buf_state = 0; - if(index >= MAX_BUFS){ - _armci_buf_state->smallbuf[index-MAX_BUFS].id.tag=0; - //_armci_buf_state->smavail = index; - } - else{ - _armci_buf_state->buf[index].id.tag=0; - // _armci_buf_state->avail = index; - } - /* the current buffer is prime candidate to satisfy next buffer request */ -} - - -/*\ release buffer when it becomes free -\*/ -void _armci_buf_release(void *buf) { - _armci_buf_release_index(_armci_buf_to_index(buf)); -} - - -/*\ return pointer to buffer number id -\*/ -char *_armci_buf_ptr_from_id(int id) -{ - if(id <0 || id >=(MAX_BUFS+MAX_SMALL_BUFS)) - armci_die2("armci_buf_ptr_from_id: bad id",id,MAX_BUFS); - if(id >=MAX_BUFS)return(_armci_buf_state->smallbuf[id-MAX_BUFS].buffer); - return(_armci_buf_state->buf[id].buffer); -} - - - -/*\function called from PARMCI_Wait to wait for non-blocking ops -\*/ -void _armci_buf_complete_nb_request(int bufid,unsigned int tag, int *retcode) -{ -int i=0; -#if 0 - printf("\n%d:wait called with bufid=%d tag=%d \n",armci_me,bufid,tag); - fflush(stdout); -#endif - - if(bufid == NB_NONE) *retcode=0; - else if(bufid == NB_MULTI) { - for(i=0;ibuf[i].id.tag) - _armci_buf_complete_index(i,1); - } - for(i=0;ismallbuf[i].id.tag) - _armci_buf_complete_index(i+MAX_BUFS,1); - } - *retcode=0; - } - else { - if(bufidbuf[bufid].id.tag) - _armci_buf_complete_index(bufid,1); - } - else{ - if(tag && tag==_armci_buf_state->smallbuf[bufid-MAX_BUFS].id.tag) - _armci_buf_complete_index(bufid,1); - } - *retcode=0; - } -} - - -/*\function called from PARMCI_Test to test completion of non-blocking ops -\*/ -void _armci_buf_test_nb_request(int bufid,unsigned int tag, int *retcode) -{ -int i; - if(bufid == NB_NONE) *retcode=0; - else if(bufid == NB_MULTI) { - for(i=0;ibuf[i].id.tag){ - if(_armci_buf_test_index(i,1)){ - *retcode=1; - break; - } - } - } - for(i=0;ismallbuf[i].id.tag) - if(_armci_buf_test_index(i+MAX_BUFS,1)){ - *retcode=1; - break; - } - } - } - else { - if(bufidbuf[bufid].id.tag) - *retcode = _armci_buf_test_index(bufid,1); - } - else{ - if(tag && tag==_armci_buf_state->smallbuf[bufid-MAX_BUFS].id.tag) - *retcode = _armci_buf_test_index(bufid,1); - } - } -} - -/*\function to set the buffer tag and the protocol -\*/ -void _armci_buf_set_tag(void *bufptr,unsigned int tag,short int protocol) -{ -int index = _armci_buf_to_index(bufptr); - /*_armci_buf_state->table[index].async=1;*/ - if(indexbuf[index].id.tag=tag; - _armci_buf_state->buf[index].id.protocol=protocol; - } - else{ - _armci_buf_state->smallbuf[index-MAX_BUFS].id.tag=tag; - _armci_buf_state->smallbuf[index-MAX_BUFS].id.protocol=protocol; - } -} - -int _armci_buf_get_tag(void *bufptr) -{ -int index = _armci_buf_to_index(bufptr); - if(indexbuf[index].id.tag); - else - return(_armci_buf_state->smallbuf[index-MAX_BUFS].id.tag); -} - -/*\function to return bufinfo, given buf ptr -\*/ -BUF_INFO_T *_armci_buf_to_bufinfo(void *buf){ - if(buf > (void *)_armci_buffers && buf < (void *)(_armci_buffers+MAX_BUFS)){ - return(&((BUF_TO_EBUF(buf))->id)); - } - else if(buf > (void *)_armci_smbuffers && buf < (void *)(_armci_smbuffers+MAX_SMALL_BUFS)){ - return(&((BUF_TO_SMEBUF(buf))->id)); - } - else { - armci_die("armci_buf_to_index: bad pointer",0); - return(0); - } -} - -/*\function to clear all buffers -\*/ -void _armci_buf_clear_all() -{ -int i; - for(i=0;itable[i].op || _armci_buf_state->table[i].first) - CLEAR_SEND_BUF_FIELD(_armci_buf_state->buf[i].field,_armci_buf_state->table[i].snd,_armci_buf_state->table[i].rcv,_armci_buf_state->table[i].to,_armci_buf_state->table[i].op); -#endif - } - for(i=MAX_BUFS;itable[i].op || _armci_buf_state->table[i].first) - CLEAR_SEND_BUF_FIELD(_armci_buf_state->smallbuf[i-MAX_BUFS].field,_armci_buf_state->table[i].snd,_armci_buf_state->table[i].rcv,_armci_buf_state->table[i].to,_armci_buf_state->table[i].op); -#endif - } -} - -/* function to return bufinfo, given buf tag */ -BUF_INFO_T *_armci_tag_to_bufinfo(msg_tag_t tag) { - int idx; - - for (idx=0; idx < MAX_BUFS; idx++) - if (EQ_TAGS(_armci_buffers[idx].id.tag, tag)) break; - - if (idx == MAX_BUFS) {/* not found is regular buffers */ - for (idx = 0; idx < MAX_SMALL_BUFS; idx++) - if (EQ_TAGS(_armci_smbuffers[idx].id.tag, tag)) break; - if (idx == MAX_SMALL_BUFS) /* not found at all */ - armci_die("_armci_tag_to_bufinfo: bad tag",0); - - return &(_armci_smbuffers[idx].id); - } else return &(_armci_buffers[idx].id); -} - - -/* inline primitives for buffer state management */ -INLINE char *_armci_buf_get_clear_busy(int size, int operation, int to) { - char *buf = _armci_buf_get(size, operation, to); - _armci_buf_set_busy(buf, 0); - return buf; -} - -INLINE void _armci_buf_set_busy(void *buf, int state) { - _armci_buf_state->table[_armci_buf_to_index(buf)].busy = state; -} - -INLINE void _armci_buf_set_busy_idx(int idx, int state) { - _armci_buf_state->table[idx].busy = state; -} - -#if 0 -INLINE int _armci_buf_cmpld(void *buf) { - return _armci_buf_state->table[_armci_buf_to_index(buf)].cmpl; -} -#else -INLINE int _armci_buf_cmpld(int bufid) { - return _armci_buf_state->table[bufid].cmpl; -} -#endif - - -INLINE void _armci_buf_set_cmpld(void *buf, int state) { - _armci_buf_state->table[_armci_buf_to_index(buf)].cmpl = state; -} - -INLINE void _armci_buf_set_cmpld_idx(int idx, int state) { - _armci_buf_state->table[idx].cmpl = state; -} - - diff --git a/armci/src-gemini/build.env b/armci/src-gemini/build.env deleted file mode 100644 index dee42bbd5..000000000 --- a/armci/src-gemini/build.env +++ /dev/null @@ -1,14 +0,0 @@ -setenv NWCHEM_TOP `pwd` -setenv USE_MPI y -setenv LIBMPI " " -setenv LARGE_FILES y -setenv ARMCI_NETWORK LIBONESIDED -setenv NWCHEM_TARGET LINUX64 -setenv TARGET LINUX64 -setenv USE_64TO32 y -setenv HAS_BLAS yes -setenv USE_SCALAPACK y -setenv TARGET LINUX64 -setenv BLASOPT " " -setenv BLAS_OPT " " -alias gmake "make FC=ftn _FC=pgf90 COPT='-O0 -g'" diff --git a/armci/src-gemini/caccumulate.c b/armci/src-gemini/caccumulate.c deleted file mode 100644 index c4a05324c..000000000 --- a/armci/src-gemini/caccumulate.c +++ /dev/null @@ -1,798 +0,0 @@ -/*************************************************************************** - - COPYRIGHT - -The following is a notice of limited availability of the code, and disclaimer -which must be included in the prologue of the code and in all source listings -of the code. - -Copyright Notice - + 2009 University of Chicago - -Permission is hereby granted to use, reproduce, prepare derivative works, and -to redistribute to others. This software was authored by: - -Jeff R. Hammond -Leadership Computing Facility -Argonne National Laboratory -Argonne IL 60439 USA -phone: (630) 252-5381 -e-mail: jhammond@anl.gov - - GOVERNMENT LICENSE - -Portions of this material resulted from work developed under a U.S. -Government Contract and are subject to the following license: the Government -is granted for itself and others acting on its behalf a paid-up, nonexclusive, -irrevocable worldwide license in this computer software to reproduce, prepare -derivative works, and perform publicly and display publicly. - - DISCLAIMER - -This computer code material was prepared, in part, as an account of work -sponsored by an agency of the United States Government. Neither the United -States, nor the University of Chicago, nor any of their employees, makes any -warranty express or implied, or assumes any legal liability or responsibility -for the accuracy, completeness, or usefulness of any information, apparatus, -product, or process disclosed, or represents that its use would not infringe -privately owned rights. - - ***************************************************************************/ - -/*********************************************************************** - * accumulate operation for the following datatypes: - * real, double precision, complex, double complex, integer - * - * WARNING: This file must be compiled WITH optimization under AIX. - * IBM fortran compilers generate bad code with -g option. - * - * Two versions of each routine are provided: - * original and unrolled loops. - * - ***********************************************************************/ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include "acc.h" - -#if 0 - subroutine d_accumulate_1d(alpha, A, B, rows) - integer rows, r - double precision A(*), B(*), alpha -ccdir$ no_cache_alloc a,b - do r = 1, rows - A(r) = A(r)+ alpha*B(r) - enddo - end -#endif - -void c_d_accumulate_1d_(const double* const restrict alpha, - double* restrict A, - const double* const restrict B, - const int* const restrict rows) -{ - int i; - for ( i = 0 ; i < (*rows) ; i++ ){ - A[i] += (*alpha) * B[i]; - } - return; -} - - -#if 0 - subroutine d_accumulate_2d(alpha, rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - double precision A(ald,*), B(bld,*), alpha -ccdir$ no_cache_alloc a,b - do c = 1, cols - do r = 1, rows - A(r,c) = A(r,c)+ alpha*B(r,c) - enddo - enddo - end -#endif - -void c_d_accumulate_2d_(const double* const alpha, - const int* const rows, - const int* const cols, - double* restrict A, - const int* const ald, - const double* const B, - const int* const bld) -{ - int r, c; - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - } - } - return; -} - -#if 0 - subroutine f_accumulate_1d(alpha, A, B, rows) - integer rows, r - real A(*), B(*), alpha - do r = 1, rows - A(r) = A(r)+ alpha*B(r) - enddo - end -#endif - -void c_f_accumulate_1d_(const float* const restrict alpha, - float* const restrict A, - const float* const restrict B, - const int* const restrict rows) -{ - int i; - for ( i = 0 ; i < (*rows) ; i++ ){ - A[i] += (*alpha) * B[i]; - } - return; -} - -#if 0 - subroutine f_accumulate_2d(alpha, rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - real A(ald,*), B(bld,*), alpha - do c = 1, cols - do r = 1, rows - A(r,c) = A(r,c)+ alpha*B(r,c) - enddo - enddo - end -#endif - -void c_f_accumulate_2d_(const float* const alpha, - const int* const rows, - const int* const cols, - float* restrict A, - const int* const ald, - const float* const B, - const int* const bld) -{ - int r, c; - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - } - } - return; -} - -#if 0 - subroutine z_accumulate_1d(alpha, A, B, rows) - integer rows, r - double complex A(*), B(*), alpha - do r = 1, rows - A(r) = A(r)+ alpha*B(r) - enddo - end -#endif - -void c_c_accumulate_1d_(const complex_t* const restrict alpha, - complex_t* const restrict A, - const complex_t* const restrict B, - const int* const restrict rows) -{ - int i; - for ( i = 0 ; i < (*rows) ; i++ ){ - A[i].real += (*alpha).real * B[i].real - (*alpha).imag * B[i].imag; - A[i].imag += (*alpha).imag * B[i].real + (*alpha).real * B[i].imag; - } - return; -} - -#if 0 - subroutine z_accumulate_2d(alpha, rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - double complex A(ald,*), B(bld,*), alpha - do c = 1, cols - do r = 1, rows - A(r,c) = A(r,c)+ alpha*B(r,c) - enddo - enddo - end -#endif - -void c_c_accumulate_2d_(const complex_t* const alpha, - const int* const rows, - const int* const cols, - complex_t* restrict A, - const int* const ald, - const complex_t* const B, - const int* const bld) -{ - int r, c; - for ( c = 0 ; c < (*cols) ; c++ ) { - for ( r = 0 ; r < (*rows) ; r++ ) { - A[ c * (*ald) + r ].real += (*alpha).real * B[ c * (*bld) + r ].real - (*alpha).imag * B[ c * (*bld) + r ].imag; - A[ c * (*ald) + r ].imag += (*alpha).imag * B[ c * (*bld) + r ].real + (*alpha).real * B[ c * (*bld) + r ].imag; - } - } - return; -} - -#if 0 - subroutine c_accumulate_1d(alpha, A, B, rows) - integer rows, r - complex A(*), B(*), alpha - do r = 1, rows - A(r) = A(r)+ alpha*B(r) - enddo - end -#endif - -void c_z_accumulate_1d_(const dcomplex_t* const restrict alpha, - dcomplex_t* const restrict A, - const dcomplex_t* const restrict B, - const int* const restrict rows) -{ - int i; - for ( i = 0 ; i < (*rows) ; i++ ){ - A[i].real += (*alpha).real * B[i].real - (*alpha).imag * B[i].imag; - A[i].imag += (*alpha).imag * B[i].real + (*alpha).real * B[i].imag; - } - return; -} - -#if 0 - subroutine c_accumulate_2d(alpha, rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - complex A(ald,*), B(bld,*), alpha - do c = 1, cols - do r = 1, rows - A(r,c) = A(r,c)+ alpha*B(r,c) - enddo - enddo - end -#endif - -void c_z_accumulate_2d_(const dcomplex_t* const alpha, - const int* const rows, - const int* const cols, - dcomplex_t* restrict A, - const int* const ald, - const dcomplex_t* const B, - const int* const bld) -{ - int r, c; - for ( c = 0 ; c < (*cols) ; c++ ) { - for ( r = 0 ; r < (*rows) ; r++ ) { - A[ c * (*ald) + r ].real += (*alpha).real * B[ c * (*bld) + r ].real - (*alpha).imag * B[ c * (*bld) + r ].imag; - A[ c * (*ald) + r ].imag += (*alpha).imag * B[ c * (*bld) + r ].real + (*alpha).real * B[ c * (*bld) + r ].imag; - } - } - return; -} - -#if 0 - subroutine i_accumulate_2d(alpha, rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - integer A(ald,*), B(bld,*), alpha - do c = 1, cols - do r = 1, rows - A(r,c) = A(r,c)+ alpha*B(r,c) - enddo - enddo - end -#endif - -void c_i_accumulate_1d_(const int* const restrict alpha, - int* const restrict A, - const int* const restrict B, - const int* const restrict rows) -{ - int i; - for ( i = 0 ; i < (*rows) ; i++ ){ - A[i] += (*alpha) * B[i]; - } - return; -} - -void c_l_accumulate_1d_(const long* const restrict alpha, - long* const restrict A, - const long* const restrict B, - const int* const restrict rows) -{ - int i; - for ( i = 0 ; i < (*rows) ; i++ ){ - A[i] += (*alpha) * B[i]; - } - return; -} - -void c_ll_accumulate_1d_(const long long* const restrict alpha, - long long* const restrict A, - const long long* const restrict B, - const int* const restrict rows) -{ - int i; - for ( i = 0 ; i < (*rows) ; i++ ){ - A[i] += (*alpha) * B[i]; - } - return; -} - -#if 0 - subroutine i_accumulate_1d(alpha, A, B, rows) - integer rows, r - integer A(*), B(*), alpha - do r = 1, rows - A(r) = A(r)+ alpha*B(r) - enddo - end -#endif - -void c_i_accumulate_2d_(const int* const alpha, - const int* const rows, - const int* const cols, - int* restrict A, - const int* const ald, - const int* const B, - const int* const bld) -{ - int r, c; - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - } - } - return; -} - -void c_l_accumulate_2d_(const long* const alpha, - const int* const rows, - const int* const cols, - long* restrict A, - const int* const ald, - const long* const B, - const int* const bld) -{ - int r, c; - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - } - } - return; -} - -void c_ll_accumulate_2d_(const long long* const alpha, - const int* const rows, - const int* const cols, - long long* restrict A, - const int* const ald, - const long long* const B, - const int* const bld) -{ - int r, c; - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - } - } - return; -} - -#if 0 - subroutine d_accumulate_2d_u(alpha, rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - double precision A(ald,*), B(bld,*), alpha - integer r1 - doubleprecision d1, d2, d3, d4 - do c = 1, cols - r1 = iand(max0(rows,0),3) - do r = 1, r1 - a(r,c) = a(r,c) + alpha*b(r,c) - end do - do r = r1 + 1, rows, 4 - d1 = a(r,c) + alpha*b(r,c) - d2 = a(r+1,c) + alpha*b(r+1,c) - d3 = a(r+2,c) + alpha*b(r+2,c) - d4 = a(r+3,c) + alpha*b(r+3,c) - a(r,c) = d1 - a(r+1,c) = d2 - a(r+2,c) = d3 - a(r+3,c) = d4 - enddo - enddo - end -#endif - -void c_d_accumulate_2d_u_(const double* const alpha, - const int* const rows, - const int* const cols, - double* restrict A, - const int* const ald, - const double* const B, - const int* const bld) -{ - int r, c; - int m = (*rows) - ((*rows)%4); - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < m ; r+=4 ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - A[ c * (*ald) + r+1 ] += (*alpha) * B[ c * (*bld) + r+1 ]; - A[ c * (*ald) + r+2 ] += (*alpha) * B[ c * (*bld) + r+2 ]; - A[ c * (*ald) + r+3 ] += (*alpha) * B[ c * (*bld) + r+3 ]; - } - for ( r = m ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - } - } - return; -} - -#if 0 - subroutine f_accumulate_2d_u(alpha, rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - real A(ald,*), B(bld,*), alpha - integer r1 - real d1, d2, d3, d4 - do c = 1, cols - r1 = iand(max0(rows,0),3) - do r = 1, r1 - a(r,c) = a(r,c) + alpha*b(r,c) - end do - do r = r1 + 1, rows, 4 - d1 = a(r,c) + alpha*b(r,c) - d2 = a(r+1,c) + alpha*b(r+1,c) - d3 = a(r+2,c) + alpha*b(r+2,c) - d4 = a(r+3,c) + alpha*b(r+3,c) - a(r,c) = d1 - a(r+1,c) = d2 - a(r+2,c) = d3 - a(r+3,c) = d4 - enddo - enddo - end -#endif - -void c_f_accumulate_2d_u_(const float* const alpha, - const int* const rows, - const int* const cols, - float* restrict A, - const int* const ald, - const float* const B, - const int* const bld) -{ - int r, c; - int m = (*rows) - ((*rows)%4); - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < m ; r+=4 ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - A[ c * (*ald) + r+1 ] += (*alpha) * B[ c * (*bld) + r+1 ]; - A[ c * (*ald) + r+2 ] += (*alpha) * B[ c * (*bld) + r+2 ]; - A[ c * (*ald) + r+3 ] += (*alpha) * B[ c * (*bld) + r+3 ]; - } - for ( r = m ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - } - } - return; -} - -#if 0 - subroutine z_accumulate_2d_u(alpha, rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - double complex A(ald,*), B(bld,*), alpha - integer r1 - double complex x1, x2, x3, x4 - do c = 1, cols - r1 = iand(max0(rows,0),3) - do r = 1, r1 - a(r,c) = a(r,c) + alpha*b(r,c) - end do - do r = r1 + 1, rows, 4 - x1 = a(r,c) + alpha*b(r,c) - x2 = a(r+1,c) + alpha*b(r+1,c) - x3 = a(r+2,c) + alpha*b(r+2,c) - x4 = a(r+3,c) + alpha*b(r+3,c) - a(r,c) = x1 - a(r+1,c) = x2 - a(r+2,c) = x3 - a(r+3,c) = x4 - enddo - enddo - end -#endif - -void c_c_accumulate_2d_u_(const complex_t* const alpha, - const int* const rows, - const int* const cols, - complex_t* restrict A, - const int* const ald, - const complex_t* const B, - const int* const bld) -{ - int r, c; - int jA, jB; - int m = (*rows) - ((*rows)%4); - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < m ; r+=4 ){ - jA = c * (*ald) + r; - jB = c * (*bld) + r; - A[ jA ].real += (*alpha).real * B[ jB ].real - (*alpha).imag * B[ jB ].imag; - A[ jA ].imag += (*alpha).imag * B[ jB ].real + (*alpha).real * B[ jB ].imag; - A[ jA+1 ].real += (*alpha).real * B[ jB+1 ].real - (*alpha).imag * B[ jB+1 ].imag; - A[ jA+1 ].imag += (*alpha).imag * B[ jB+1 ].real + (*alpha).real * B[ jB+1 ].imag; - A[ jA+2 ].real += (*alpha).real * B[ jB+2 ].real - (*alpha).imag * B[ jB+2 ].imag; - A[ jA+2 ].imag += (*alpha).imag * B[ jB+2 ].real + (*alpha).real * B[ jB+2 ].imag; - A[ jA+3 ].real += (*alpha).real * B[ jB+3 ].real - (*alpha).imag * B[ jB+3 ].imag; - A[ jA+3 ].imag += (*alpha).imag * B[ jB+3 ].real + (*alpha).real * B[ jB+3 ].imag; - } - for ( r = m ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ].real += (*alpha).real * B[ c * (*bld) + r ].real - (*alpha).imag * B[ c * (*bld) + r ].imag; - A[ c * (*ald) + r ].imag += (*alpha).imag * B[ c * (*bld) + r ].real + (*alpha).real * B[ c * (*bld) + r ].imag; - } - } - return; -} - -#if 0 - subroutine c_accumulate_2d_u(alpha, rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - complex A(ald,*), B(bld,*), alpha - integer r1 - complex x1, x2, x3, x4 - do c = 1, cols - r1 = iand(max0(rows,0),3) - do r = 1, r1 - a(r,c) = a(r,c) + alpha*b(r,c) - end do - do r = r1 + 1, rows, 4 - x1 = a(r,c) + alpha*b(r,c) - x2 = a(r+1,c) + alpha*b(r+1,c) - x3 = a(r+2,c) + alpha*b(r+2,c) - x4 = a(r+3,c) + alpha*b(r+3,c) - a(r,c) = x1 - a(r+1,c) = x2 - a(r+2,c) = x3 - a(r+3,c) = x4 - enddo - enddo - end -#endif - -void c_z_accumulate_2d_u_(const dcomplex_t* const alpha, - const int* const rows, - const int* const cols, - dcomplex_t* restrict A, - const int* const ald, - const dcomplex_t* const B, - const int* const bld) -{ - int r, c; - int jA, jB; - int m = (*rows) - ((*rows)%4); - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < m ; r+=4 ){ - jA = c * (*ald) + r; - jB = c * (*bld) + r; - A[ jA ].real += (*alpha).real * B[ jB ].real - (*alpha).imag * B[ jB ].imag; - A[ jA ].imag += (*alpha).imag * B[ jB ].real + (*alpha).real * B[ jB ].imag; - A[ jA+1 ].real += (*alpha).real * B[ jB+1 ].real - (*alpha).imag * B[ jB+1 ].imag; - A[ jA+1 ].imag += (*alpha).imag * B[ jB+1 ].real + (*alpha).real * B[ jB+1 ].imag; - A[ jA+2 ].real += (*alpha).real * B[ jB+2 ].real - (*alpha).imag * B[ jB+2 ].imag; - A[ jA+2 ].imag += (*alpha).imag * B[ jB+2 ].real + (*alpha).real * B[ jB+2 ].imag; - A[ jA+3 ].real += (*alpha).real * B[ jB+3 ].real - (*alpha).imag * B[ jB+3 ].imag; - A[ jA+3 ].imag += (*alpha).imag * B[ jB+3 ].real + (*alpha).real * B[ jB+3 ].imag; - } - for ( r = m ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ].real += (*alpha).real * B[ c * (*bld) + r ].real - (*alpha).imag * B[ c * (*bld) + r ].imag; - A[ c * (*ald) + r ].imag += (*alpha).imag * B[ c * (*bld) + r ].real + (*alpha).real * B[ c * (*bld) + r ].imag; - } - } - return; -} - -#if 0 - subroutine i_accumulate_2d_u(alpha, rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - integer A(ald,*), B(bld,*), alpha - - integer r1, j2, j3, j4, j5 - do c = 1, cols - r1 = iand(max0(rows,0),3) - do r = 1, r1 - a(r,c) = a(r,c) + alpha*b(r,c) - end do - do r = r1 + 1, rows, 4 - j2 = a(r,c) + alpha*b(r,c) - j3 = a(r+1,c) + alpha*b(r+1,c) - j4 = a(r+2,c) + alpha*b(r+2,c) - j5 = a(r+3,c) + alpha*b(r+3,c) - a(r,c) = j2 - a(r+1,c) = j3 - a(r+2,c) = j4 - a(r+3,c) = j5 - enddo - enddo - end -#endif - -void c_i_accumulate_2d_u_(const int* const alpha, - const int* const rows, - const int* const cols, - int* restrict A, - const int* const ald, - const int* const B, - const int* const bld) -{ - int r, c; - int m = (*rows) - ((*rows)%4); - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < m ; r+=4 ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - A[ c * (*ald) + r+1 ] += (*alpha) * B[ c * (*bld) + r+1 ]; - A[ c * (*ald) + r+2 ] += (*alpha) * B[ c * (*bld) + r+2 ]; - A[ c * (*ald) + r+3 ] += (*alpha) * B[ c * (*bld) + r+3 ]; - } - for ( r = m ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - } - } - return; -} - -void c_l_accumulate_2d_u_(const long* const alpha, - const int* const rows, - const int* const cols, - long* restrict A, - const int* const ald, - const long* const B, - const int* const bld) -{ - int r, c; - int m = (*rows) - ((*rows)%4); - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < m ; r+=4 ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - A[ c * (*ald) + r+1 ] += (*alpha) * B[ c * (*bld) + r+1 ]; - A[ c * (*ald) + r+2 ] += (*alpha) * B[ c * (*bld) + r+2 ]; - A[ c * (*ald) + r+3 ] += (*alpha) * B[ c * (*bld) + r+3 ]; - } - for ( r = m ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - } - } - return; -} - -void c_ll_accumulate_2d_u_(const long long* const alpha, - const int* const rows, - const int* const cols, - long long* restrict A, - const int* const ald, - const long long* const B, - const int* const bld) -{ - int r, c; - int m = (*rows) - ((*rows)%4); - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < m ; r+=4 ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - A[ c * (*ald) + r+1 ] += (*alpha) * B[ c * (*bld) + r+1 ]; - A[ c * (*ald) + r+2 ] += (*alpha) * B[ c * (*bld) + r+2 ]; - A[ c * (*ald) + r+3 ] += (*alpha) * B[ c * (*bld) + r+3 ]; - } - for ( r = m ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - } - } - return; -} - -#if 0 -c---------- operations used in armci gops -------------- -c - subroutine fort_dadd(n, x, work) - integer n,i - double precision x(n), work(n) - do i= 1,n - x(i) = x(i) + work(i) - enddo - end -#endif - -void c_dadd_(const int* const restrict n, - double* const restrict x, - const double* const restrict work) -{ - int i; - for ( i = 0 ; i < (*n) ; i++ ){ - x[i] += work[i]; - } - return; -} - -#if 0 - subroutine fort_dadd2(n, x, work, work2) - integer n,i - double precision x(n), work(n), work2(n) - do i= 1,n - x(i) = work(i) + work2(i) - enddo - end -#endif - -void c_dadd2_(const int* const restrict n, - double* const restrict x, - const double* const restrict work, - const double* const restrict work2) -{ - int i; - for ( i = 0 ; i < (*n) ; i++ ){ - x[i] = work[i] + work2[i]; - } - return; -} - -#if 0 - subroutine fort_dmult(n, x, work) - integer n,i - double precision x(n), work(n) - do i= 1,n - x(i) = x(i) * work(i) - enddo - end -#endif - -void c_dmult_(const int* const restrict n, - double* const restrict x, - const double* const restrict work) -{ - int i; - for ( i = 0 ; i < (*n) ; i++ ){ - x[i] *= work[i]; - } - return; -} - -#if 0 - subroutine fort_dmult2(n, x, work,work2) - integer n,i - double precision x(n), work(n) - do i= 1,n - x(i) = work(i)*work2(i) - enddo - end -#endif - -void c_dmult2_(const int* const restrict n, - double* const restrict x, - const double* const restrict work, - const double* const restrict work2) -{ - int i; - for ( i = 0 ; i < (*n) ; i++ ){ - x[i] = work[i] * work2[i]; - } - return; -} - - -// specific to src-portals && to src-gemini -void RA_ACCUMULATE_2D_(long* alpha, int* rows, int* cols, long* a, - int* lda, long* b, int* ldb) -{ -int i,j; - for(j=0;j< *cols; j++){ - long *aa = a + j* *lda; - long *bb = b + j* *ldb; - for(i=0;i< *rows; i++) - aa[i] ^= bb[i]; - } -} diff --git a/armci/src-gemini/ccopy.c b/armci/src-gemini/ccopy.c deleted file mode 100644 index e99f8f025..000000000 --- a/armci/src-gemini/ccopy.c +++ /dev/null @@ -1,337 +0,0 @@ -/*************************************************************************** - - COPYRIGHT - -The following is a notice of limited availability of the code, and disclaimer -which must be included in the prologue of the code and in all source listings -of the code. - -Copyright Notice - + 2009 University of Chicago - -Permission is hereby granted to use, reproduce, prepare derivative works, and -to redistribute to others. This software was authored by: - -Jeff R. Hammond -Leadership Computing Facility -Argonne National Laboratory -Argonne IL 60439 USA -phone: (630) 252-5381 -e-mail: jhammond@anl.gov - - GOVERNMENT LICENSE - -Portions of this material resulted from work developed under a U.S. -Government Contract and are subject to the following license: the Government -is granted for itself and others acting on its behalf a paid-up, nonexclusive, -irrevocable worldwide license in this computer software to reproduce, prepare -derivative works, and perform publicly and display publicly. - - DISCLAIMER - -This computer code material was prepared, in part, as an account of work -sponsored by an agency of the United States Government. Neither the United -States, nor the University of Chicago, nor any of their employees, makes any -warranty express or implied, or assumes any legal liability or responsibility -for the accuracy, completeness, or usefulness of any information, apparatus, -product, or process disclosed, or represents that its use would not infringe -privately owned rights. - - ***************************************************************************/ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include "copy.h" - -/* ONE-DIMENSIONAL COPY OPERATIONS */ - -#if 0 - subroutine dcopy1d_n(A, B, n) - integer n,i - double precision A(n), B(n) -ccdir$ no_cache_alloc a,b - do i = 1, n - B(i) = A(i) - end do - end -#endif - -void c_dcopy1d_n_(const double* const restrict A, - double* const restrict B, - const int* const restrict n) -{ - int i; - for ( i = 0 ; i < (*n) ; i++ ){ - B[i] = A[i]; - } - return; -} - -#if 0 - subroutine dcopy1d_u(A, B, n) - integer n,n1,i - double precision A(n), B(n) - double precision d1, d2, d3, d4 - n1 = iand(max0(n,0),3) - do i = 1, n1 - B(i) = A(i) - end do - do i = n1+1, n, 4 - d1 = a(i) - d2 = a(i+1) - d3 = a(i+2) - d4 = a(i+3) - b(i) = d1 - b(i+1) = d2 - b(i+2) = d3 - b(i+3) = d4 - end do - end -#endif - -void c_dcopy1d_u_(const double* const restrict A, - double* const restrict B, - const int* const restrict n) -{ - int i; - int m = (*n) - ((*n)%4); - for ( i = 0 ; i < m ; i+=4 ){ - B[i ] = A[i ]; - B[i+1] = A[i+1]; - B[i+2] = A[i+2]; - B[i+3] = A[i+3]; - } - for ( i = m ; i < (*n) ; i++ ){ - B[i] = A[i]; - } - return; -} - -/* TWO-DIMENSIONAL COPY OPERATIONS */ - -#if 0 - subroutine dcopy21(rows, cols, A, ald, buf, cur) - integer rows, cols - integer c, r, ald, cur - double precision A(ald,*), buf(ald) - cur = 0 - do c = 1, cols - do r = 1, rows - cur = cur+1 - buf(cur) = A(r,c) - end do - end do - end -#endif - -void c_dcopy21_(const int* const restrict rows, - const int* const restrict cols, - const double* const restrict A, - const int* const restrict ald, - double* const restrict buf, - int* const restrict cur) -{ - int r, c, i=0; - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < (*rows) ; r++ ){ - buf[i++] = A[ c * (*ald) + r ]; - } - } - (*cur) = i; - return; -} - -#if 0 - subroutine dcopy12(rows, cols, A, ald, buf, cur) - integer rows, cols - integer c, r, ald, cur - double precision A(ald,*), buf(ald) - cur = 0 - do c = 1, cols - do r = 1, rows - cur = cur+1 - A(r,c) = buf(cur) - end do - end do - end -#endif - -void c_dcopy12_(const int* const restrict rows, - const int* const restrict cols, - double* const restrict A, - const int* const restrict ald, - const double* const restrict buf, - int* const restrict cur) -{ - int r, c, i=0; - i = 0; - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] = buf[i++]; - } - } - (*cur) = i; - return; -} - -#if 0 - subroutine dcopy2d_n(rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - double precision A(ald,*), B(bld,*) - do c = 1, cols - do r = 1, rows - B(r,c) = A(r,c) - end do - end do - end -#endif - -void c_dcopy2d_n_(const int* const restrict rows, - const int* const restrict cols, - const double* const restrict A, - const int* const restrict ald, - double* const restrict B, - const int* const restrict bld) -{ - int r, c; - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < (*rows) ; r++ ){ - B[ c * (*bld) + r ] = A[ c * (*ald) + r ]; - } - } - return; -} - -#if 0 - subroutine dcopy2d_u(rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - double precision A(ald,*), B(bld,*) - integer r1 - double precision d1, d2, d3, d4 - do c = 1, cols - r1 = iand(max0(rows,0),3) - do r = 1, r1 -c$$$ b(r,c) = a(r,c) + b(r,c) * 0 - b(r,c) = a(r,c) - end do - do r = r1 + 1, rows, 4 - d1 = a(r,c) - d2 = a(r+1,c) - d3 = a(r+2,c) - d4 = a(r+3,c) - b(r,c) = d1 - b(r+1,c) = d2 - b(r+2,c) = d3 - b(r+3,c) = d4 -c$$$ b(r,c) = a(r,c) + b(r,c) * 0 -c$$$ b(r+1,c) = a(r+1,c) + b(r+1,c) * 0 -c$$$ b(r+2,c) = a(r+2,c) + b(r+2,c) * 0 -c$$$ b(r+3,c) = a(r+3,c) + b(r+3,c) * 0 - enddo - enddo - end -#endif - -void c_dcopy2d_u_(const int* const restrict rows, - const int* const restrict cols, - const double* const restrict A, - const int* const restrict ald, - double* const restrict B, - const int* const restrict bld) -{ - int r, c; - for ( c = 0 ; c < (*cols) ; c++ ){ - int m = (*rows) - ((*rows)%4); - for ( r = 0 ; r < m ; r+=4 ){ - B[ c * (*bld) + r ] = A[ c * (*ald) + r ]; - B[ c * (*bld) + r+1 ] = A[ c * (*ald) + r+1 ]; - B[ c * (*bld) + r+2 ] = A[ c * (*ald) + r+2 ]; - B[ c * (*bld) + r+3 ] = A[ c * (*ald) + r+3 ]; - } - for ( r = m ; r < (*rows) ; r++ ){ - B[ c * (*bld) + r ] = A[ c * (*ald) + r ]; - } - } - return; -} - -/* THREE-DIMENSIONAL COPY OPERATIONS */ - -#if 0 - subroutine dcopy31(rows, cols, planes, A, aldr, aldc, buf, cur) - integer rows, cols, planes - integer c, r, p, aldr, aldc, cur - double precision A(aldr, aldc, *), buf(aldr) - cur = 0 - do p = 1, planes - do c = 1, cols - do r = 1, rows - cur = cur+1 - buf(cur) = A(r,c,p) - end do - end do - end do - end -#endif - -void c_dcopy31_(const int* const restrict rows, - const int* const restrict cols, - const int* const restrict plns, - const double* const restrict A, - const int* const restrict aldr, - const int* const restrict aldc, - double* const restrict buf, - int* const restrict cur) -{ - int r, c, p, i=0; - for ( p = 0 ; p < (*plns) ; p++ ){ - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < (*rows) ; r++ ){ - buf[i++] = A[ p * (*aldc) * (*aldr) + c * (*aldr) + r ]; - } - } - } - (*cur) = i; - return; -} - -#if 0 - subroutine dcopy13(rows, cols, planes, A, aldr, aldc, buf, cur) - integer rows, cols, planes - integer c, r, p, aldr, aldc, cur - double precision A(aldr, aldc, *), buf(aldr) - cur = 0 - do p = 1, planes - do c = 1, cols - do r = 1, rows - cur = cur+1 - A(r,c,p) = buf(cur) - end do - end do - end do - end -#endif - -void c_dcopy13_(const int* const restrict rows, - const int* const restrict cols, - const int* const restrict plns, - double* const restrict A, - const int* const restrict aldr, - const int* const restrict aldc, - const double* const restrict buf, - int* const restrict cur) -{ - int r, c, p, i=0; - for ( p = 0 ; p < (*plns) ; p++ ){ - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < (*rows) ; r++ ){ - A[ p * (*aldc) * (*aldr) + c * (*aldr) + r ] = buf[i++]; - } - } - } - (*cur) = i; - return; -} diff --git a/armci/src-gemini/clusterinfo.c b/armci/src-gemini/clusterinfo.c deleted file mode 100644 index 9f51a06c5..000000000 --- a/armci/src-gemini/clusterinfo.c +++ /dev/null @@ -1,499 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: clusterinfo.c,v 1.36.2.3 2007-06-13 00:46:13 vinod Exp $ */ -/****************************************************************************** -* file: cluster.c -* purpose: Determine cluster info i.e., number of machines and processes -* running on each of them. -* -*******************************************************************************/ - -#include -#include -#include -#ifdef unix -#include -#endif -#include "message.h" -#include "armcip.h" - -#ifdef WIN32 - /* this is where gethostbyname is declared */ -# include -#endif - -/* NO_SHMEM enables to simulate cluster environment on a single workstation. - * Must define NO_SHMMAX_SEARCH in shmem.c to prevent depleting shared memory - * due to a gready shmem request by the master process on cluster node 0. - */ -#if defined(DECOSF) && defined(QUADRICS) -# if !defined(REGION_ALLOC) -# define NO_SHMEM - extern int armci_enable_alpha_hack(); -# endif -#else -# define armci_enable_alpha_hack() 1 -#endif - -#define DEBUG 0 -#define MAX_HOSTNAME 80 -#define CHECK_NODE_NAMES - -/* print info on how many cluster nodes detected */ -#ifdef CLUSTER -# define PRINT_CLUSTER_INFO 1 -#else -# define PRINT_CLUSTER_INFO 0 -#endif - -#if defined(GM) - static char *network_protocol="Myrinet GM"; -#elif defined(VIA) - static char *network_protocol="VIA"; -#elif defined(MELLANOX) - static char *network_protocol="Mellanox Verbs API"; -#elif defined(OPENIB) - static char *network_protocol="OpenIB Verbs API"; -#elif defined(DOELAN4) - static char *network_protocol="Quadrics ELAN-4"; -#elif defined(QUADRICS) - static char *network_protocol="Quadrics ELAN-3"; -#elif defined(PM) - static char *network_protocol="Score PM"; -#elif defined(PORTALS) - static char *network_protocol="PORTALS"; -#elif defined(MPI_SPAWN) - static char *network_protocol="MPI-SPAWN"; -#elif defined(LIBONESIDED) - static char *network_protocol="Cray Onesided"; -#else - static char *network_protocol="TCP/IP Sockets"; -#endif - - -/*** stores cluster configuration ***/ -armci_clus_t *armci_clus_info; - -#ifdef HITACHI -#include -# define GETHOSTNAME sr_gethostname -ndes_t _armci_group; - -static int sr_gethostname(char *name, int len) -{ -int no; -pid_t ppid; - - if(hmpp_nself (&_armci_group,&no,&ppid,0,NULL) <0) - return -1; - - if(len<6)armci_die("len too small",len); - if(no>1024)armci_die("expected node id <1024",no); - sprintf(name,"n%d",no); - return 0; -} -#elif defined(SGIALTIX) -# define GETHOSTNAME altix_gethostname -static int altix_gethostname(char *name, int len) { - sprintf(name,"altix"); - return 0; -} -#elif defined(XT3) && !defined(PORTALS) -#define GETHOSTNAME cnos_gethostname -static int cnos_gethostname(char *name, int len) -{ - sprintf(name,"%d",cnos_get_rank()); -} -#else -# define GETHOSTNAME gethostname -#endif - -static char* merge_names(char *name) -{ - int jump = 1, rem, to, from; - int lenmes, lenbuf, curlen, totbuflen= armci_nproc*HOSTNAME_LEN; - int len = strlen(name); - char *work = malloc(totbuflen); - - if(!work)armci_die("armci: merge_names: malloc failed: ",totbuflen); - - strcpy(work, name); - curlen = len+1; - - /* prefix tree merges names in the order of process numbering in log(P)time - * result = name_1//name_2//...//name_P-1 - */ - do { - jump *= 2; rem = armci_me%jump; - if(rem){ - to = armci_me - rem; - armci_msg_snd(ARMCI_TAG, work, curlen, to); - break; - }else{ - from = armci_me + jump/2; - if(from < armci_nproc){ - lenbuf = totbuflen - curlen; - armci_msg_rcv(ARMCI_TAG, work+curlen, lenbuf, &lenmes, from); - curlen += lenmes; - } - } - }while (jump < armci_nproc); - return(work); -} - - -static void process_hostlist(char *names) -{ -#ifdef CLUSTER - - int i, cluster=0; - char *s,*master; - int len, root=0; - - /******** inspect list of machine names to determine locality ********/ - if (armci_me==0){ - - /* first find out how many cluster nodes we got */ - armci_nclus =1; s=master=names; - for(i=1; i < armci_nproc; i++){ - s += strlen(s)+1; - if(strcmp(s,master)){ - /* we found a new machine name on the list */ - master = s; - armci_nclus++; - /*fprintf(stderr,"new name %s len =%d\n",master, strlen(master));*/ - - } - } - - /* allocate memory */ - armci_clus_info = (armci_clus_t*)malloc(armci_nclus*sizeof(armci_clus_t)); - if(!armci_clus_info)armci_die("malloc failed for clusinfo",armci_nclus); - - /* fill the data structure -- go through the list again */ - s=names; - master="*-"; /* impossible hostname */ - cluster =0; - for(i=0; i < armci_nproc; i++){ - if(strcmp(s,master)){ - /* we found a new machine name on the list */ - master = s; - armci_clus_info[cluster].nslave=1; - armci_clus_info[cluster].master=i; - strcpy(armci_clus_info[cluster].hostname, master); - -#ifdef CHECK_NODE_NAMES - /* need consecutive task id allocated on the same node - * the current test only compares hostnames against first cluster */ - if(cluster) if(!strcmp(master,armci_clus_info[0].hostname)){ - /* we have seen that hostname before */ - fprintf(stderr, "\nIt appears that tasks allocated on the same"); - fprintf(stderr, " host machine do not have\n"); - fprintf(stderr, "consecutive message-passing IDs/numbers. "); - fprintf(stderr,"This is not acceptable \nto the ARMCI library "); - fprintf(stderr,"as it prevents SMP optimizations and would\n"); - fprintf(stderr,"lead to poor resource utilization.\n\n"); - fprintf(stderr,"Please contact your System Administrator "); - fprintf(stderr,"or, if you can, modify the "); -# if defined(MSG_COMMS_MPI) - fprintf(stderr,"MPI"); -# elif defined(TCGMSG) - fprintf(stderr,"TCGMSG"); -# elif defined(PVM) - fprintf(stderr,"PVM"); -# endif - fprintf(stderr,"\nmessage-passing job startup configuration.\n\n"); -#ifdef HITACHI - fprintf(stderr,"On Hitachi it can be done by setting environment variable MPIR_RANK_NO_ROUND, for example\n setenv MPIR_RANK_NO_ROUND yes\n\n"); -#endif - sleep(1); - armci_die("Cannot run: improper task to host mapping!",0); - } -#endif - cluster++; - - }else{ - /* the process is still on the same host */ - armci_clus_info[cluster-1].nslave++; - } - s += strlen(s)+1; - } - - if(armci_nclus != cluster) - armci_die("inconsistency processing clusterinfo",armci_nclus); - - } - /******** process 0 got all data ********/ - - /* now broadcast locality info struct to all processes - * two steps are needed because of the unknown length of hostname list - */ - len = sizeof(int); - armci_msg_brdcst(&armci_nclus, len, root); - - if(armci_me){ - /* allocate memory */ - armci_clus_info = (armci_clus_t*)malloc(armci_nclus*sizeof(armci_clus_t)); - if(!armci_clus_info)armci_die("malloc failed for clusinfo",armci_nclus); - } - - len = sizeof(armci_clus_t)*armci_nclus; - armci_msg_brdcst(armci_clus_info, len, root); - - /******** all processes 0 got all data ********/ - - /* now determine current cluster node id by comparing me to master */ - armci_clus_me = armci_nclus-1; - for(i =0; i< armci_nclus-1; i++) - if(armci_me < armci_clus_info[i+1].master){ - armci_clus_me=i; - break; - } -#else - - armci_clus_me=0; - armci_nclus=1; - armci_clus_info = (armci_clus_t*)malloc(armci_nclus*sizeof(armci_clus_t)); - if(!armci_clus_info)armci_die("malloc failed for clusinfo",armci_nclus); - strcpy(armci_clus_info[0].hostname, names); - armci_clus_info[0].master=0; - armci_clus_info[0].nslave=armci_nproc; -#endif - - armci_clus_first = armci_clus_info[armci_clus_me].master; - armci_clus_last = armci_clus_first +armci_clus_info[armci_clus_me].nslave-1; - -} - - -/*\ Substring Replacement: replace needle with nail in a haystack -\*/ -static char *substr_replace(char *haystack, char *needle, char *nail) -{ -char *tmp, *pos, *first; -size_t len=strlen(needle), nlen=strlen(nail),bytes; -size_t left; - - pos = strstr(haystack,needle); - if (pos ==NULL) return NULL; - first= tmp = calloc(strlen(haystack)+nlen-len+1+1,1); - if(first==NULL) return(NULL); - bytes = pos - haystack; - while(bytes){ *tmp = *haystack; tmp++; haystack++; bytes--;} - while(nlen) { *tmp = *nail; tmp++; nail++; nlen--;} - haystack += len; - left = strlen(haystack); - while(left>0){*tmp = *haystack; tmp++; haystack++; left --;} - *tmp='\0'; - return(first); -} - - -/*\ ARMCI_HOSTNAME_REPLACE contains "needle/nail" string to derive new hostname -\*/ -static char *new_hostname(char *host) -{ - char *tmp, *needle, *nail; - if((tmp =getenv("ARMCI_HOSTNAME_REPLACE"))){ - needle = strdup(tmp); - if(needle== NULL) return NULL; - nail = strchr(needle,'/'); - if(nail == NULL) return NULL; - *nail = '\0'; - nail++; - if(nail == (needle+1)){ - char* tmp1 = calloc(strlen(host)+strlen(nail)+1,1); - if(tmp1 == NULL) return NULL; - strcpy(tmp1,host); - strcat(tmp1,nail); - return tmp1; - } - return substr_replace(host,needle,nail); - } else return NULL; -} - - -static void print_clus_info() -{ -int i; - - if(PRINT_CLUSTER_INFO && armci_nclus >1 && armci_me ==0){ -#if defined(DATA_SERVER) || defined(SERVER_THREAD) - printf("ARMCI configured for %d cluster nodes. Network protocol is '%s'.\n", - armci_nclus, network_protocol); -#else - printf("ARMCI configured for %d cluster nodes\n", armci_nclus); -#endif - fflush(stdout); - } - - if(armci_me==0 && DEBUG) for(i=0;i= MAX_HOSTNAME) - armci_die("armci: hostname too long",strlen(tmp)); - strcpy(name,tmp); - printf("%d using %s hostname\n",armci_me, name); - fflush(stdout); - } - len = strlen(name); - /*a simple way to run as many servers as compute processes*/ - enval = getenv("ARMCI_NSERV_EQ_NPROC"); - if(enval != NULL){ - sprintf(name+len,"n%d",getpid()); - len = strlen(name); - printf("\n%s\n",name); - } - - -#ifdef HOSTNAME_TRUNCATE - { - /* in some cases (e.g.,SP) when name is used to determine - * cluster structure but not to establish communication - * we can truncate hostnames to save memory */ - int i; - limit = HOSTNAME_LEN-1; - for(i=0; i",i+1); - } - if(len>limit)name[limit]='\0'; - len =limit; - } -#else - if(len >= HOSTNAME_LEN-1) - armci_die("armci: gethostname overrun name string length",len); -#endif - -#ifdef NO_SHMEM - if(armci_enable_alpha_hack()) { - name[len]='0'+armci_me; - name[len+1]='\0'; - len++; - } -#endif - - if(DEBUG) - fprintf(stderr,"%d: %s len=%d\n",armci_me, name,(int)strlen(name)); - -#ifdef CLUSTER - merged = merge_names(name); /* create hostname list */ - process_hostlist(merged); /* compute cluster info */ - free(merged); -#else - process_hostlist(name); /* compute cluster info */ -#endif - -#ifndef NO_SHMEM - armci_set_shmem_limit_per_node(armci_clus_info[0].nslave); -#endif - armci_master = armci_clus_info[armci_clus_me].master; - -#ifdef NO_SHMEM - if(armci_enable_alpha_hack()) { - int i; - for(i=0;i= armci_nproc)armci_die("armci_clus_id: out of range",p); - - if(p < armci_clus_first){ from = 0; to = armci_clus_me;} - else {from = armci_clus_me; to = armci_nclus;} - - found = to-1; - for(c = from; c< to-1; c++) - if(p < armci_clus_info[c+1].master){ - found=c; - break; - } - - return (found); -} - -int armci_smp_master(int i) -{ -return(armci_clus_info[i].master); -} -/*\ return number of processes in the domain represented by id; id<0 means my node -\*/ -int armci_domain_nprocs(armci_domain_t domain, int id) -{ - if(id>= armci_nclus) armci_die2("armci domain error",id,armci_nclus); - if(id<0) id = armci_clus_me; - return armci_clus_info[id].nslave; -} - -/*\ return number of nodes in diven domain -\*/ -int armci_domain_count(armci_domain_t domain) -{ - return armci_nclus; -} - -/*\ return domain ID of the specified process -\*/ -int armci_domain_id(armci_domain_t domain, int glob_proc_id) -{ -int id = glob_proc_id; - if(id<0 || id>= armci_nproc) armci_die2("armci domain error",id,armci_nproc); - return armci_clus_id(glob_proc_id); -} - -/*\ return global ID of a process loc_proc_id in domain identified by id - * armci_domain_nproc(id)< loc_proc_id >=0 -\*/ -int armci_domain_glob_proc_id(armci_domain_t domain, int id, int loc_proc_id) -{ - if(id<0 || id>= armci_nclus) armci_die2("armci domain error",id,armci_nclus); - if(loc_proc_id<0 || loc_proc_id>= armci_clus_info[id].nslave) - armci_die2("armci domain proc error",loc_proc_id,armci_clus_info[id].nslave); - return (armci_clus_info[id].master + loc_proc_id); -} - -/*\ return ID of domain that the calling process belongs to -\*/ -int armci_domain_my_id(armci_domain_t domain) -{ - return(armci_clus_me); -} - -int armci_domain_same_id (armci_domain_t domain, int proc) -{ - int rc = SAMECLUSNODE(proc); - return(rc); -} diff --git a/armci/src-gemini/code_options.h b/armci/src-gemini/code_options.h deleted file mode 100644 index 4a30cd4fc..000000000 --- a/armci/src-gemini/code_options.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - Questions: - ORNL - tipparajuv@ornl.gov - CRAY - ryan@cray.com -*/ - -/* --------------------------------------------------------------------------- *\ - PORTALS_USE_RENDEZ_VOUS - ======================= - When the number of PEs gets very large, the data server is required to have - buffer space available for all possible incoming messages which is defined - by PORTALS_MAX_DESCRIPTORS = (MAX_BUFS+MAX_SMALL_BUFS). - For each PE, the DS must have at least: - min_memory_per_pe = PORTALS_MAX_BUFS*PORTALS_BUF_SIZE + - PORTALS_MAX_SMALL_BUFS*PORTALS_SMALL_BUF_SIZE - This becomes a memory constraint at large core count. - Rendez-vous message is one mechanism to get around requiring the DS to - have buffer space for all messages. When rendez-vous (RZV) messaging is - enabled, the messages what use the large buffers no longer send the entire - buffer "eagerly". Instead, only the data request (request_header_t) gets - sent to the data server. When the data server is ready to handle the - request, it "pulls" the entire buffer over via a portals_get operation. - One can immediately see that this can lead to a slow down in performance, - since the data server is idle when it has to pull the data over. This is - the price paid when you remove the bufferign for those messsages. Ideally, - when the DS is pulling the message, it could be processing another request. - This double buffering technique needs to be programmed in. Care must be - taken to ensure proper ARMCI behavior. The next request handled can not be - from the same PE, nor can it be a FENCE operation ... all other (?) - requests/operations can be double buffered. -\* --------------------------------------------------------------------------- */ - # define PORTALS_USE_RENDEZ_VOUS - - - -/* --------------------------------------------------------------------------- *\ - PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - ===================================== - Another means to reduce the required buffer needed by the data server is - to limit the number of cores that can talk to the data server at any given - moment. When this options is turned on, only 1 request per node is allowed - to be in the buffer of any given data server. On a 10 core node, the size - of the buffer required by the data server is reduced by more than an order - of magnitude. You get more than an order of magnitude, because you don't - need to reserve space for any of the small buffers, since you can only have - one small or one large from any given node in the ds buffer at any one time. - Another major benefit is you can increase MAX_BUFS and MAX_SMALL_BUFS to - increase concurrency without affecting the DS's buffer size. - - Can be used with PORTALS_USE_RENDEZ_VOUS. - - notes: every request needs to respond with an ack, even gets. - acks actually send data when we limit remote request ... the ack - response is needed to trigger that the outstanding request has - been finished by the data server ... the ack zeros out the index - in the active_requests_by_node array. -\* --------------------------------------------------------------------------- */ - # define PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE_TURNED_OFF - - -/* --------------------------------------------------------------------------- *\ - PORTALS_AFFINITY - ================ - When initializing compute processes and data servers, the affinity passed - in by aprun/alps is ignored. - - Compute processes are bound strictly to a particular core. Cores are - evenly divided between sockets keeping the last core (mask = 1 << (ncpus-1)) - free for the data server. - - If the node is not fully subscribed, then the data server is bound to the - last core on the node (mask = 1 << (ncpus-1)); otherwise, the data server - is "free floating" (mask = (1 << ncpus)-1) on a fully subscribed node. -\* --------------------------------------------------------------------------- */ - # define PORTALS_AFFINITY - # define PORTALS_AFFINITY_NSOCKETS 2 - - -/* --------------------------------------------------------------------------- *\ - CRAY_USE_MDMD_COPY - ================== - Used MDMD copy instead of PtlGetRegion for on-node "local" transfers -\* --------------------------------------------------------------------------- */ - # define CRAY_USE_MDMD_COPY - - - -/* --------------------------------------------------------------------------- *\ - ORNL_USE_DS_FOR_REMOTE_GETS - =========================== - Vinod informed us of a modification that can be made to enable the use of - the data server for remote gets. Without this option, direct gets are - used. This can cause severe network congestion, because many armci_gets - are not stride 1. The data server packs those gets into contiguous blocks - and sends them back as a single put. However, the direct gets, require - many small messages. - - Unfortunately, there is a small bug in the DS for remote gets. This bug - may cause the program to abort or print out the following message: - %d: server wrote data at unexpected offset %d - - This is a bug actively being worked on @ CRAY and ORNL. -\* --------------------------------------------------------------------------- */ - # define ORNL_USE_DS_FOR_REMOTE_GETS - # define CRAY_USE_ARMCI_CLIENT_BUFFERS diff --git a/armci/src-gemini/copy.h b/armci/src-gemini/copy.h deleted file mode 100644 index 37110a23b..000000000 --- a/armci/src-gemini/copy.h +++ /dev/null @@ -1,446 +0,0 @@ -/* $Id: copy.h,v 1.86.2.6 2007-08-29 17:32:32 manoj Exp $ */ -#ifndef _COPY_H_ -#define _COPY_H_ - -#include -#include -#ifdef WIN32 -# include -#endif -#ifdef DECOSF -#include -#endif - - -#ifndef EXTERN -# define EXTERN extern -#endif - - -#if defined(SGI) || defined(FUJITSU) || defined(HPUX) || defined(SOLARIS) || defined (DECOSF) || defined(__ia64__) || defined(__crayx1) -# define PTR_ALIGN -#endif - -#if defined(NB_NONCONT) && !defined(CRAY_SHMEM) && !defined(QUADRICS) -#error NB_NONCONT is only available on CRAY_SHMEM,QUADRICS and PORTALS -#endif - -#if defined(SHMEM_HANDLE_SUPPORTED) && !defined(CRAY_SHMEM) -#error SHMEM_HANDLE_SUPPORTED should not be defined on a non CRAY_SHMEM network -#endif - -/* 08/30/06 moved up here from lines 252-397, MEM_FENCE before FENCE_NODE */ - -#if defined(NEED_MEM_SYNC) -# ifdef AIX -# define MEM_FENCE {int _dummy=1; _clear_lock((int *)&_dummy,0); } -# elif defined(__ia64) -# if defined(__GNUC__) && !defined (__INTEL_COMPILER) -# define MEM_FENCE __asm__ __volatile__ ("mf" ::: "memory"); -# else /* Intel Compiler */ - extern void _armci_ia64_mb(); -# define MEM_FENCE _armci_ia64_mb(); -# endif -# elif defined(LINUX) && defined(__GNUC__) && defined(__ppc__) -# define MEM_FENCE \ - __asm__ __volatile__ ("isync" : : : "memory"); -# endif -#endif - -#ifndef armci_copy -# define armci_copy(src,dst,n) bcopy(src,dst,n) -#endif - -/****************************** 2D Copy *******************/ - - -# define DCopy2D(rows, cols, src_ptr, src_ld, dst_ptr, dst_ld){\ - int j, nbytes = sizeof(double)* rows;\ - char *ps=src_ptr, *pd=dst_ptr;\ - for (j = 0; j < cols; j++){\ - armci_copy(ps, pd, nbytes);\ - ps += sizeof(double)* src_ld;\ - pd += sizeof(double)* dst_ld;\ - }\ - } - - -# define ByteCopy2D(bytes, count, src_ptr, src_stride, dst_ptr,dst_stride){\ - int _j;\ - char *ps=src_ptr, *pd=dst_ptr;\ - for (_j = 0; _j < count; _j++){\ - armci_copy(ps, pd, bytes);\ - ps += src_stride;\ - pd += dst_stride;\ - }\ - } - -#if defined(FUJITSU) - -# define armci_put2D(p, bytes,count,src_ptr,src_stride,dst_ptr,dst_stride)\ - CopyPatchTo(src_ptr, src_stride, dst_ptr, dst_stride, count,bytes, p) - -# define armci_get2D(p, bytes, count, src_ptr,src_stride,dst_ptr,dst_stride)\ - CopyPatchFrom(src_ptr, src_stride, dst_ptr, dst_stride,count,bytes,p) - -#elif defined(HITACHI) || defined(_ELAN_PUTGET_H) && !defined(NB_NONCONT) - -#if defined(QUADRICS) -#if 0 -# define WAIT_FOR_PUTS elan_putWaitAll(elan_base->state,200) -# define WAIT_FOR_GETS elan_getWaitAll(elan_base->state,200) -#else -# define WAIT_FOR_PUTS armcill_wait_put() -# define WAIT_FOR_GETS armcill_wait_get() - extern void armcill_wait_put(); - extern void armcill_wait_get(); -#endif -#endif - - extern void armcill_put2D(int proc, int bytes, int count, - void* src_ptr,int src_stride, void* dst_ptr,int dst_stride); - extern void armcill_get2D(int proc, int bytes, int count, - void* src_ptr,int src_stride, void* dst_ptr,int dst_stride); -# define armci_put2D armcill_put2D -# define armci_get2D armcill_get2D - -#elif defined(NB_NONCONT) - - extern void armcill_wait_put(); - extern void armcill_wait_get(); -# define WAIT_FOR_PUTS armcill_wait_put() -# define WAIT_FOR_GETS armcill_wait_get() - - extern void armcill_put2D(int proc, int bytes, int count, - void* src_ptr,int src_stride, void* dst_ptr,int dst_stride); - extern void armcill_get2D(int proc, int bytes, int count, - void* src_ptr,int src_stride, void* dst_ptr,int dst_stride); -# define armci_put2D armcill_put2D -# define armci_get2D armcill_get2D - -# if defined(QUADRICS) - -# define armcill_nb_put(_dst, _src, _sz, _proc, _hdl)\ - _hdl = elan_put(elan_base->state,_src,_dst,(size_t)_sz,_proc) -# define armcill_nb_get(_dst, _src, _sz, _proc, _hdl)\ - _hdl = elan_get(elan_base->state,_src,_dst,(size_t)_sz,_proc) -# define armcill_nb_wait(_hdl)\ - elan_wait(_hdl,100) - -# elif defined(CRAY_SHMEM) - -# define armcill_nb_wait(_hdl)\ - shmem_wait_nb(_hdl) -/*VT:this should be ifdef'ed based on if shmem_handle is defined or not*/ -# if defined (CRAY_XT) -# define armcill_nb_put(_dst, _src, _sz, _proc, _hdl)\ - shmem_putmem(_dst, _src, (size_t)_sz, _proc) -# define armcill_nb_get(_dst, _src, _sz, _proc, _hdl)\ - shmem_getmem(_dst, _src, (size_t)_sz, _proc) -# else -# define armcill_nb_put(_dst, _src, _sz, _proc, _hdl)\ - _hdl = shmem_putmem_nb(_dst, _src, (size_t)_sz, _proc, &(_hdl)) -# define armcill_nb_get(_dst, _src, _sz, _proc, _hdl)\ - _hdl = shmem_getmem_nb(_dst, _src, (size_t)_sz, _proc, &(_hdl)) -# endif -# endif - -#else -# define armci_put2D(proc,bytes,count,src_ptr,src_stride,dst_ptr,dst_stride){\ - int _j;\ - char *ps=src_ptr, *pd=dst_ptr;\ - for (_j = 0; _j < count; _j++){\ - armci_put(ps, pd, bytes, proc);\ - ps += src_stride;\ - pd += dst_stride;\ - }\ - } - - -# define armci_get2D(proc,bytes,count,src_ptr,src_stride,dst_ptr,dst_stride){\ - int _j;\ - char *ps=src_ptr, *pd=dst_ptr;\ - for (_j = 0; _j < count; _j++){\ - armci_get(ps, pd, bytes, proc);\ - ps += src_stride;\ - pd += dst_stride;\ - }\ - } -#endif - -/* macros to ensure ordering of consecutive puts or gets following puts */ -#if defined(LAPI) - -# include "lapidefs.h" - -#elif defined(_CRAYMPP) || defined(QUADRICS) || defined(__crayx1)\ - || defined(CRAY_SHMEM) -#if defined(CRAY) || defined(CRAY_XT) -# include -#else -# include -#ifndef ptrdiff_t -# include -#endif -# include -#endif -# ifdef ELAN_ACC -# define FENCE_NODE(p) {\ - if(((p)armci_clus_last))armci_elan_fence(p);} -# define UPDATE_FENCE_STATE(p, op, nissued) -# else - int cmpl_proc; -# ifdef DECOSF -# define FENCE_NODE(p) if(cmpl_proc == (p)){\ - if(((p)armci_clus_last))shmem_quiet();\ - else asm ("mb"); } -# else -# define FENCE_NODE(p) if(cmpl_proc == (p)){\ - if(((p)armci_clus_last))shmem_quiet(); } -# endif -# define UPDATE_FENCE_STATE(p, op, nissued) if((op)==PUT) cmpl_proc=(p); -# endif -#else -# if defined(GM) && defined(ACK_FENCE) - extern void armci_gm_fence(int p); -# define FENCE_NODE(p) armci_gm_fence(p) -# elif defined(BGML) -# include "bgmldefs.h" -# define FENCE_NODE(p) BGML_WaitProc(p) -# else -# define FENCE_NODE(p) -# endif -# define UPDATE_FENCE_STATE(p, op, nissued) - -#endif - - -#ifdef NEC -# define THRESH 1 -# define THRESH1D 1 -#else -# define THRESH 32 -# define THRESH1D 512 -#endif -#define ALIGN_SIZE sizeof(double) - -/********* interface to C 1D and 2D memory copy functions ***********/ -/* dcopy2d_u_ uses explicit unrolled loops to depth 4 */ -void c_dcopy2d_n_(const int* const restrict rows, - const int* const restrict cols, - const double* const restrict A, - const int* const restrict ald, - double* const restrict B, - const int* const restrict bld); -void c_dcopy2d_u_(const int* const restrict rows, - const int* const restrict cols, - const double* const restrict A, - const int* const restrict ald, - double* const restrict B, - const int* const restrict bld); -void c_dcopy1d_n_(const double* const restrict A, - double* const restrict B, - const int* const restrict n); -void c_dcopy1d_u_(const double* const restrict A, - double* const restrict B, - const int* const restrict n); -void c_dcopy21_(const int* const restrict rows, - const int* const restrict cols, - const double* const restrict A, - const int* const restrict ald, - double* const restrict buf, - int* const restrict cur); -void c_dcopy12_(const int* const restrict rows, - const int* const restrict cols, - double* const restrict A, - const int* const restrict ald, - const double* const restrict buf, - int* const restrict cur); -void c_dcopy31_(const int* const restrict rows, - const int* const restrict cols, - const int* const restrict plns, - const double* const restrict A, - const int* const restrict aldr, - const int* const restrict aldc, - double* const restrict buf, - int* const restrict cur); -void c_dcopy13_(const int* const restrict rows, - const int* const restrict cols, - const int* const restrict plns, - double* const restrict A, - const int* const restrict aldr, - const int* const restrict aldc, - const double* const restrict buf, - int* const restrict cur); - -/********* interface to fortran 1D and 2D memory copy functions ***********/ -#if defined(AIX) || defined(BGML) -# define DCOPY2D c_dcopy2d_u_ -# define DCOPY1D c_dcopy1d_u_ -#elif defined(LINUX) || defined(__crayx1) || defined(HPUX64) || defined(DECOSF) || defined(CRAY) || defined(WIN32) || defined(HITACHI) -# define DCOPY2D c_dcopy2d_n_ -# define DCOPY1D c_dcopy1d_n_ -#else -# define DCOPY2D c_dcopy2d_u_ -# define DCOPY1D c_dcopy1d_u_ -#endif -#define DCOPY21 c_dcopy21_ -#define DCOPY12 c_dcopy12_ -#define DCOPY31 c_dcopy31_ -#define DCOPY13 c_dcopy13_ - - -/***************************** 1-Dimensional copy ************************/ -#if defined(QUADRICS) -# include - -# if defined(_ELAN_PUTGET_H) -# define qsw_put(src,dst,n,proc) \ - elan_wait(elan_put(elan_base->state,src,dst,n,proc),elan_base->waitType) -# define qsw_get(src,dst,n,proc) \ - elan_wait(elan_get(elan_base->state,src,dst,n,proc),elan_base->waitType) -/* -# define ARMCI_NB_PUT(src,dst,n,proc,phandle)\ - *(phandle)=elan_put(elan_base->state,src,dst,n,proc) -*/ -#ifdef DOELAN4 -extern void armci_elan_put_with_tracknotify(char *src,char *dst,int n,int proc, ELAN_EVENT **phandle); -# define ARMCI_NB_PUT(src,dst,n,proc,phandle)\ - armci_elan_put_with_tracknotify(src,dst,n,proc,phandle) -#endif - -# define ARMCI_NB_GET(src,dst,n,proc,phandle)\ - *(phandle)=elan_get(elan_base->state,src,dst,n,proc) -# define ARMCI_NB_WAIT(handle) if(handle)elan_wait(handle,elan_base->waitType) -# define ARMCI_NB_TEST(handle,_succ) (*(_succ))= (handle)? !elan_poll(handle,1L): 1 -# else -# define qsw_put(src,dst,n,proc) shmem_putmem((dst),(src),(int)(n),(proc)) -# define qsw_get(src,dst,n,proc) shmem_getmem((dst),(src),(int)(n),(proc)) -# endif - -# define armci_put(src,dst,n,proc)\ - if(((proc)<=armci_clus_last) && ((proc>= armci_clus_first))){\ - armci_copy(src,dst,n);\ - } else { qsw_put(src,dst,n,proc);} -# define armci_get(src,dst,n,proc) \ - if(((proc)<=armci_clus_last) && ((proc>= armci_clus_first))){\ - armci_copy(src,dst,n);\ - } else { qsw_get((src),(dst),(int)(n),(proc));} - -#elif defined(CRAY_T3E) || defined(CRAY_SHMEM) -# define armci_copy_disabled(src,dst,n)\ - if((n)<256 || n%sizeof(long) ) memcpy((dst),(src),(n));\ - else {\ - shmem_put((long*)(dst),(long*)(src),(int)(n)/sizeof(long),armci_me);\ - shmem_quiet(); } - -# define armci_put(src,dst,n,proc) \ - shmem_put32((void *)(dst),(void *)(src),(int)(n)/4,(proc));\ - shmem_quiet() - -# define armci_get(src,dst,n,proc) \ - shmem_get32((void *)(dst),(void *)(src),(int)(n)/4,(proc));\ - shmem_quiet() - -#elif defined(HITACHI) - - extern void armcill_put(void *src, void *dst, int bytes, int proc); - extern void armcill_get(void *src, void *dst, int bytes, int proc); - -# define armci_put(src,dst,n,proc) \ - if(((proc)<=armci_clus_last) && ((proc>= armci_clus_first))){\ - armci_copy(src,dst,n);\ - } else { armcill_put((src), (dst),(n),(proc));} - -# define armci_get(src,dst,n,proc)\ - if(((proc)<=armci_clus_last) && ((proc>= armci_clus_first))){\ - armci_copy(src,dst,n);\ - } else { armcill_get((src), (dst),(n),(proc));} - -#elif defined(FUJITSU) - -# include "fujitsu-vpp.h" -# ifndef __sparc -# define armci_copy(src,dst,n) _MmCopy((char*)(dst), (char*)(src), (n)) -# endif -# define armci_put CopyTo -# define armci_get CopyFrom - -#elif defined(LAPI) - -# include - extern lapi_handle_t lapi_handle; - -# define armci_put(src,dst,n,proc)\ - if(proc==armci_me){\ - armci_copy(src,dst,n);\ - } else {\ - if(LAPI_Put(lapi_handle, (uint)proc, (uint)n, (dst), (src),\ - NULL,&(ack_cntr[ARMCI_THREAD_IDX].cntr),&cmpl_arr[proc].cntr))\ - ARMCI_Error("LAPI_put failed",0); else;} - - /**** this copy is nonblocking and requires fence to complete!!! ****/ -# define armci_get(src,dst,n,proc) \ - if(proc==armci_me){\ - armci_copy(src,dst,n);\ - } else {\ - if(LAPI_Get(lapi_handle, (uint)proc, (uint)n, (src), (dst), \ - NULL, &(get_cntr[ARMCI_THREAD_IDX].cntr)))\ - ARMCI_Error("LAPI_Get failed",0);else;} - -# define ARMCI_NB_PUT(src,dst,n,proc,cmplt)\ - {if(LAPI_Setcntr(lapi_handle, &((cmplt)->cntr), 0))\ - ARMCI_Error("LAPI_Setcntr in NB_PUT failed",0);\ - (cmplt)->val=1;\ - if(LAPI_Put(lapi_handle, (uint)proc, (uint)n, (dst), (src),\ - NULL, &((cmplt)->cntr), &cmpl_arr[proc].cntr))\ - ARMCI_Error("LAPI_put failed",0); else;} - -# define ARMCI_NB_GET(src,dst,n,proc,cmplt)\ - {if(LAPI_Setcntr(lapi_handle, &((cmplt)->cntr), 0))\ - ARMCI_Error("LAPI_Setcntr in NB_GET failed",0);\ - (cmplt)->val=1;\ - if(LAPI_Get(lapi_handle, (uint)proc, (uint)n, (src), (dst), \ - NULL, &((cmplt)->cntr)))\ - ARMCI_Error("LAPI_Get NB_GET failed",0);else;} - -# define ARMCI_NB_WAIT(cmplt) CLEAR_COUNTER((cmplt)) -# define ARMCI_NB_TEST(cmplt,_succ) TEST_COUNTER((cmplt),(_succ)) - -#elif defined(PORTALS) -# define armci_put(src,dst,n,proc) \ - if(((proc)<=armci_clus_last) && ((proc>= armci_clus_first))){\ - armci_copy(src,dst,n);\ - } else { armci_portals_put((proc),(src), (dst),(n),NULL,0);} - -# define armci_get(src,dst,n,proc)\ - if(((proc)<=armci_clus_last) && ((proc>= armci_clus_first))){\ - armci_copy(src,dst,n);\ - } else { armci_portals_get((proc),(src), (dst),(n),NULL,0);} - -# define ARMCI_NB_PUT(src,dst,n,proc,cmplt)\ - nb_handle->tag=GET_NEXT_NBTAG();armci_portals_put((proc),(src),\ - (dst),(n),cmplt,nb_handle->tag) -# define ARMCI_NB_GET(src,dst,n,proc,cmplt)\ - nb_handle->tag=GET_NEXT_NBTAG();armci_portals_get((proc),(src),\ - (dst),(n),cmplt,nb_handle->tag) - -#elif defined(BGML) -#define armci_get(src, dst, n, p) PARMCI_Get(src, dst, n, p) -#define armci_put(src, dst, n, p) PARMCI_Put(src, dst, n, p) - -#else - -# define armci_get(src,dst,n,p) armci_copy((src),(dst),(n)) -# define armci_put(src,dst,n,p) armci_copy((src),(dst),(n)) - -#endif - -#ifndef MEM_FENCE -# define MEM_FENCE -#endif -#ifndef armci_copy_fence -# define armci_copy_fence armci_copy -#endif - -#endif diff --git a/armci/src-gemini/ds-shared.c b/armci/src-gemini/ds-shared.c deleted file mode 100644 index b038fb8c8..000000000 --- a/armci/src-gemini/ds-shared.c +++ /dev/null @@ -1,573 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include "armcip.h" -#include "request.h" -#include "message.h" -#include "memlock.h" -#include "copy.h" -#include "gpc.h" -#include -#include -#ifdef WIN32 -#include -#else -#include -#endif - -#define DEBUG_ 0 -#define DEBUG1 0 - -#ifndef SERV -# define SERV 2 -#endif - -#ifdef SOCKETS -# define EQ_TAGS(a_, b_) ((a_) == (b_)) -#else -# define EQ_TAGS(a_, b_) !memcmp(&(a_), &(b_), sizeof(a_)) -#endif - -int _armci_server_started=0; - -extern active_socks_t *_armci_active_socks; - -#ifdef ARMCI_CHECK_STATE -typedef struct sarns{ - int data; - long data1; - struct sarns *next; -} sarnode; - -sarnode **sarn_np=NULL; - -sarnode * sarlist_add(int pr, int i,long j) -{ -sarnode **p = &sarn_np[pr]; - sarnode *n = (sarnode *)malloc(sizeof(sarnode)); - assert(n != NULL); - - n->next = *p; - *p = n; - n->data = i; - n->data1 = j; - return *p; -} - -void sarlist_remove(sarnode **p) -{ - if(*p != NULL){ - sarnode *n = *p; - *p = (*p)->next; - free(n); - } -} - -sarnode **sarlist_search(sarnode **n, long i) -{ - while (*n != NULL){ - if ((*n)->data == i){ - return n; - } - n = &(*n)->next; - } - return NULL; -} - -void sarlist_print(int proc) -{ - sarnode *n =sarn_np[proc]; - if (n == NULL){ - /*printf("sarlist is empty\n");*/ - } - while (n != NULL){ - printf("(%d):%d %d next=%d\n", armci_me,n->data,n->data1,(n->next==NULL)?0:1); - n = n->next; - } -} -#endif - -/*\ client sends request to server -\*/ -void armci_send_req(int proc, request_header_t* msginfo, int len,int tag) -{ -int hdrlen = sizeof(request_header_t); -int bytes; - - ARMCI_PR_DBG("enter",0); - if(msginfo->operation == GET) { - if(msginfo->format==VECTOR && msginfo->ehlen > 0) { - printf("%d [cp] unhandled condition in send_req for VECTOR and ehlen\n",armci_me); - abort(); - bytes = msginfo->dscrlen + hdrlen + msginfo->datalen; - } else { - bytes = msginfo->dscrlen + hdrlen; - } - } else bytes = msginfo->bytes + hdrlen; - - if(0 || DEBUG_) { - printf("%d: sending req %d (len=%d dscr=%d data=%d) to %d \n", - armci_me, msginfo->operation, bytes,msginfo->dscrlen, - msginfo->datalen,proc); fflush(stdout); - } - if(bytes > len) armci_die2("armci_send_req:buffer overflow",bytes,len); - // msginfo->tag.data_ptr = (msginfo+1); // not really data, but dscr ptr - armci_send_req_msg(proc,msginfo, bytes,tag); - ARMCI_PR_DBG("exit",0); -} - - -/*\ client sends strided data + request to server -\*/ -void armci_send_strided(int proc, request_header_t *msginfo, char *bdata, - void *ptr, int strides, int stride_arr[], int count[],int tag) -{ -int hdrlen = sizeof(request_header_t); -int dscrlen = msginfo->dscrlen; -int datalen = msginfo->datalen; -int cluster = armci_clus_id(proc); -int bytes; -int i,na; -char *a; -double *tmp; - ARMCI_PR_DBG("enter",0); - bytes = msginfo->bytes + hdrlen; - if(0){ - printf("%d:sending strided %d to(%d,%d,%d) bytes=%d dslen=%d dlen=%d,\n", - armci_me, msginfo->operation, msginfo->to, - cluster, proc, bytes, dscrlen, datalen); fflush(stdout); - } - armci_write_strided(ptr, strides, stride_arr, count, bdata); - // msginfo->tag.data_ptr = (msginfo+1); -#ifdef RMO_DEBUG_ - a = (char *) (msginfo + 1); - a += msginfo->dscrlen; - tmp = (double *) a; - na = msginfo->datalen/sizeof(double); - for(i=0; idatalen; -char *buf; - ARMCI_PR_DBG("enter",0); - if(rcvlen)datalen=rcvlen; - if(DEBUG_) { - printf("%d:armci_rcv_data: bytes= %d \n", armci_me, datalen); - fflush(stdout); - } - - if(datalen == 0) armci_die("armci_rcv_data: no data to receive",datalen); - - buf = armci_ReadFromDirect(proc, msginfo, datalen); - - if(DEBUG_){ - printf("%d:armci_rcv_data: got %d bytes \n",armci_me,datalen); - fflush(stdout); - } - ARMCI_PR_DBG("exit",0); - return(buf); -} - -/*\ client receives vector data from server and unpacks to the right loc -\*/ -void armci_rcv_vector_data(int proc, request_header_t* msginfo, armci_giov_t darr[], int len) -{ - ARMCI_PR_DBG("enter",0); - char *buf = armci_rcv_data(proc, msginfo, 0); - armci_vector_from_buf(darr, len, buf); - ARMCI_PR_DBG("exit",0); -} - -/*\ client receives strided data from server -\*/ -void armci_rcv_strided_data(int proc, request_header_t* msginfo, int datalen, - void *ptr, int strides,int stride_arr[],int count[]) -{ - char *databuf; - ARMCI_PR_DBG("enter",0); - if(DEBUG_){ - printf("%d: armci_rcv_strided_data: expecting datalen %d from %d\n", - armci_me, datalen, proc); fflush(stdout); - } - databuf = armci_ReadFromDirect(proc,msginfo,0); - armci_read_strided(ptr, strides, stride_arr, count, databuf); - ARMCI_PR_DBG("exit",0); -} - -void armci_rem_state(int clus) -{ -int bufsize = sizeof(request_header_t)+sizeof(int); -int destproc = 0; -request_header_t *msginfo; -destproc = SERVER_NODE(clus); -msginfo = (request_header_t *)GET_SEND_BUFFER(bufsize,STATE,destproc); -int tag=0; - - ARMCI_PR_DBG("enter",0); - msginfo->dscrlen = 0; - msginfo->from = armci_me; - msginfo->to = SERVER_NODE(clus); - msginfo->operation = STATE; - msginfo->bytes =0; - msginfo->datalen =sizeof(int); - // msginfo->tag.data_ptr = (msginfo+1); - - if(DEBUG_){ - printf("%d(c):sending ACKreq to %d clus=%d\n",armci_me,msginfo->to,clus); - fflush(stdout); - } - - armci_send_req(armci_clus_info[clus].master, msginfo, bufsize,tag); - armci_rcv_data(armci_clus_info[clus].master, msginfo,0); /* receive */ - FREE_SEND_BUFFER(msginfo); - ARMCI_PR_DBG("exit",0); -} - - -/*\ get ACK from server -\*/ -void armci_rem_ack(int clus) -{ -int bufsize = sizeof(request_header_t)+sizeof(int); -int destproc = 0; -request_header_t *msginfo; -destproc = SERVER_NODE(clus); -msginfo = (request_header_t *) GET_SEND_BUFFER(bufsize,ACK,destproc); -int tag=0; - - ARMCI_PR_DBG("enter",0); - msginfo->dscrlen = 0; - msginfo->from = armci_me; - msginfo->to = SERVER_NODE(clus); - msginfo->operation = ACK; - msginfo->bytes =0; - msginfo->datalen =sizeof(int); - // msginfo->tag.data_ptr = (msginfo+1); - - if(DEBUG_){ - printf("%d(c):sending ACKreq to %d clus=%d\n",armci_me,msginfo->to,clus); - fflush(stdout); - } - - armci_send_req(armci_clus_info[clus].master, msginfo, bufsize,tag); - armci_rcv_data(armci_clus_info[clus].master, msginfo,0); /* receive ACK */ - FREE_SEND_BUFFER(msginfo); - ARMCI_PR_DBG("exit",0); -} - - -/*\ request to QUIT sent by client -\*/ -void armci_serv_quit() -{ -int bufsize = sizeof(request_header_t)+sizeof(int); -int destproc; -request_header_t *msginfo; -destproc = SERVER_NODE(armci_clus_me); -msginfo = (request_header_t*)GET_SEND_BUFFER(bufsize,QUIT,destproc); -int tag=0; - - ARMCI_PR_DBG("enter",0); - if(DEBUG_){ printf("%d master: sending quit request to server\n",armci_me); - fflush(stdout); - } - - msginfo->dscrlen = 0; - msginfo->from = armci_me; - msginfo->to = SERVER_NODE(armci_clus_me); - msginfo->operation = QUIT; - if(ACK_QUIT) - msginfo->bytes = msginfo->datalen = sizeof(int); /* ACK */ - else - msginfo->bytes = msginfo->datalen = 0; /* no ACK */ - - armci_send_req(armci_master, msginfo, bufsize,tag); - - if(ACK_QUIT){ - int stat; - stat = *(int*)armci_rcv_data(armci_master,msginfo,0); /* receive ACK */ - if(stat != QUIT) - armci_die("armci_serv_quit: wrong response from server", stat); - FREE_SEND_BUFFER(msginfo); - } - ARMCI_PR_SDBG("exit",0); -} - - -/***************************** server side *********************************/ - -static void armci_check_req(request_header_t *msginfo, int buflen) -{ - - ARMCI_PR_SDBG("enter",msginfo->operation); - if((msginfo->to != armci_me && msginfo->to < armci_master) || - msginfo->to >= armci_master + armci_clus_info[armci_clus_me].nslave) - /*armci_die("armci_check_req: invalid to", msginfo->to);*/ - printf("\n%d:got following to %d",armci_me,msginfo->to); - if(msginfo->dscrlen < 0) - armci_die("armci_check_req: dscrlen < 0", msginfo->dscrlen); - if(msginfo->datalen < 0) - armci_die("armci_check_req: datalen < 0", msginfo->datalen); - if(msginfo->dscrlen > (int)msginfo->bytes) - armci_die2("armci_check_req: dsclen > bytes", msginfo->dscrlen, - msginfo->bytes); - ARMCI_PR_SDBG("exit",0); -} - - -/*\ server response - send data to client -\*/ -void armci_send_data(request_header_t* msginfo, void *data) -{ - int to = msginfo->from; - ARMCI_PR_SDBG("enter",0); - armci_WriteToDirect(to, msginfo, data); - ARMCI_PR_SDBG("exit",0); -} - - -/*\ server sends strided data back to client -\*/ -void armci_send_strided_data(int proc, request_header_t *msginfo, - char *bdata, void *ptr, int strides, - int stride_arr[], int count[]) -{ - int i,na; - double *a = NULL; - int to = msginfo->from; - ARMCI_PR_SDBG("enter",0); - - if(DEBUG_){ printf("%d(server): sending datalen = %d to %d %p\n", - armci_me, msginfo->datalen, to,ptr); fflush(stdout); } - - /* for small contiguous blocks copy into a buffer before sending */ - armci_write_strided(ptr, strides, stride_arr, count, bdata); - -#ifdef RMO_PORTALS_DEBUG_GET - a = (double *) bdata; - na = msginfo->datalen/sizeof(double); - for(i=0; idatalen,to); - fflush(stdout); - } - ARMCI_PR_SDBG("exit",0); -} - - -/*\ server sends ACK to client -ptl_event_t *ev = (ptl_event_t *) msginfo->tag.user_ptr; - - ARMCI_PR_SDBG("enter",0); - if(DEBUG_){ - printf("%d server: terminating request by %d\n",armci_me,msginfo->from); - fflush(stdout); - } - - portals_ds_send_ack(ev->initiator,ev->hdr_data); - -\*/ -void armci_server_ack(request_header_t* msginfo) -{ -int *ack=(int*)(msginfo+1); - - ARMCI_PR_SDBG("enter",0); - if(DEBUG_){ - printf("%d server: sending ACK to %d\n",armci_me,msginfo->from); - fflush(stdout); - } - - *ack = ACK; - if(msginfo->datalen != sizeof(int)) - armci_die("armci_server_ack: bad datalen=",msginfo->datalen); - armci_send_data(msginfo,ack); - - ARMCI_PR_SDBG("exit",0); -} - - - -/*\ server action triggered by request to quit -\*/ -void armci_server_goodbye(request_header_t* msginfo) -{ -#ifdef LIBONESIDED - -#else - ptl_event_t *ev = (ptl_event_t *) msginfo->tag.user_ptr; - - ARMCI_PR_SDBG("enter",0); - if(DEBUG_){ - printf("%d server: terminating request by %d\n",armci_me,msginfo->from); - fflush(stdout); - } - - portals_ds_send_ack(ev->initiator,ev->hdr_data); - -#ifdef ARMCI_CHECK_STATE_ - for(int i=0;ifrom; - - if(DEBUG_){ - printf("%d(serv):got %d request from %d\n",armci_me,msginfo->operation, - from); - fflush(stdout); - } - -/*if(msginfo->operation==GET)fprintf(stderr,"GET request received with tag: %d\n",msginfo->tag);*/ - - switch(msginfo->operation){ -# ifdef ARMCI_CHECK_STATE - case STATE: - printf("[ds %d]: operation=%d not supported yet\n",armci_me,msginfo->operation); - abort(); - if(DEBUG_){printf("\n%d:state request\n",armci_me);fflush(stdout);} - sarlist_print(msginfo->from); - armci_WriteToDirect(msginfo->from, msginfo, (msginfo+1)); - break; -# endif - case QUIT: - if(DEBUG_){ - printf("%d(serv):got QUIT request from %d\n",armci_me, from); - fflush(stdout); - } - armci_server_goodbye(msginfo); - break; /*pessimism?*/ - - case ACK: - // printf("[ds %d]: operation=%d not supported yet\n",armci_me,msginfo->operation); - // abort(); - if(DEBUG_) { - fprintf(stdout, "%d(server): got ACK request from %d\n", - armci_me, msginfo->from); fflush(stdout); - } - armci_server_ack(msginfo); - break; - - case ATTACH: - printf("[ds %d]: operation=%d not supported yet\n",armci_me,msginfo->operation); - abort(); - // if(DEBUG_){ - // printf("%d(serv):got ATTACH request from%d\n",armci_me, from); - // fflush(stdout); - // } - // armci_server_ipc(msginfo, descr, buffer, buflen); - break; - case ARMCI_SWAP: - case ARMCI_SWAP_LONG: - case ARMCI_FETCH_AND_ADD: - case ARMCI_FETCH_AND_ADD_LONG: - armci_server_rmw(msginfo,descr,buffer); - break; - - case LOCK: - printf("[ds %d]: operation=%d not supported yet\n",armci_me,msginfo->operation); - abort(); - // armci_server_lock(msginfo); - break; - - case UNLOCK: - printf("[ds %d]: operation=%d not supported yet\n",armci_me,msginfo->operation); - abort(); - // armci_server_unlock(msginfo, descr); - // msginfo->tag.ack=ARMCI_STAMP; - // x_net_send_ack(msginfo,msginfo->from,msginfo->tag.ack_ptr,&msginfo->tag.ack); - break; - - default: - if(msginfo->format == VECTOR){ - armci_server_vector(msginfo, descr, buffer, buflen); // point 1 - if(msginfo->operation==PUT || ARMCI_ACC(msginfo->operation)) { // point 2 - armci_server_send_ack(msginfo); - } - // the obove if clause and the similar cause below for a strided operation - // was the reason for the race condition in the original portals code. - // if the original request was a get, it could return it's data to the CP - // once the the data is returned, the CP could fire off a new request which - // would overwrite the 'now' volatile msginfo ... in which case, after returning - // from armci_server_vector (having finished the get); the operation could now be - // a put, in which case, it would repy back that it has also finished the put, - // with out actually doing it. Msginfo could be different at points 1 and 2 if - // at point 1 the operation was a get. - } - else if(msginfo->format == STRIDED){ - // if(msginfo->operation != PUT && msginfo->operation != GET && !ARMCI_ACC(msginfo->operation)) { - // printf("[ds %d]: operation=%d (format==STRIDED) not supported yet\n",armci_me,msginfo->operation); - // abort(); - // } - armci_server(msginfo, descr, buffer, buflen); // point 1 - if(msginfo->operation==PUT || ARMCI_ACC(msginfo->operation)){ // point 2 - armci_server_send_ack(msginfo); - } - } - else - armci_die2("armci_data_serv: unknown format code", - msginfo->format, msginfo->from); - } - ARMCI_PR_SDBG("exit",0); -} - - - diff --git a/armci/src-gemini/fence.c b/armci/src-gemini/fence.c deleted file mode 100644 index 0eef7a28e..000000000 --- a/armci/src-gemini/fence.c +++ /dev/null @@ -1,91 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: fence.c,v 1.25.4.6 2007-08-30 19:17:02 manoj Exp $ */ -#include "armcip.h" -#include "armci.h" -#include "copy.h" -#include -#if defined(PVM) -# include -#elif defined(TCGMSG) -# include -#elif defined(BGML) -# include "bgml.h" -#else -# include -#endif - -char *_armci_fence_arr; - -void armci_init_fence() -{ - _armci_fence_arr=calloc(armci_nproc,1); - if(!_armci_fence_arr) - armci_die("armci_init_fence: calloc failed",0); -} - -void ARMCI_DoFence(int proc) -{ -int i; - if(!SAMECLUSNODE(proc) && (armci_nclus >1)){ - int cluster = armci_clus_id(proc); - armci_rem_ack(cluster); - } -} - -void PARMCI_Fence(int proc) -{ -int i; - -#if defined(DATA_SERVER) && !(defined(GM) && defined(ACK_FENCE)) -// printf("%d [cp] fence_arr(%d)=%d\n",armci_me,proc,FENCE_ARR(proc)); - if(FENCE_ARR(proc) && (armci_nclus >1)){ - - int cluster = armci_clus_id(proc); - int master=armci_clus_info[cluster].master; - - armci_rem_ack(cluster); - - /* one ack per cluster node suffices */ - /* note, in multi-threaded case it will only clear for current thread */ - bzero(&FENCE_ARR(master),armci_clus_info[cluster].nslave); - } -#elif defined(BGML) - BGML_WaitProc(proc); - MEM_FENCE; -#else - FENCE_NODE(proc); - MEM_FENCE; -#endif -} - - -/* - portals developers' note: - armci fence is not guaranteed to be correct unless PUT_START events are captured - PUT_ENDs do NOT guarantee order; only PUT_STARTs -*/ -void PARMCI_AllFence() -{ -#if defined(CLUSTER) - { int p; for(p=0;p profile_3d.dat -for (( i = 0 ; i <= 128; i++ )) -do - affile="armci_profile.${i}" - if test -s $affile - then - head -n 28 $affile | tail -n 22 | awk '{print $7" "$0+$1+$2" '$i'"}' | awk -F- '{print $2}' | awk -F")" '{print $1" "$2}' >> profile_3d.dat - - fi -done diff --git a/armci/src-gemini/gpc.c b/armci/src-gemini/gpc.c deleted file mode 100644 index 26cd747c5..000000000 --- a/armci/src-gemini/gpc.c +++ /dev/null @@ -1,345 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: gpc.c,v 1.7.4.4 2007-06-13 00:44:01 vinod Exp $ ***************************************************** - Prototype of Global Procedure Calls. - July/03 JN - shared memory version - -*************************************************************/ - -#include -#include "armcip.h" -#include "locks.h" -#include "gpc.h" - -#define GPC_SLOTS 32 -#define GPC_OFFSET -100 -static void *_table[GPC_SLOTS]={ -(void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, -(void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, -(void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, -(void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0}; - -#if (defined(LAPI) || defined(GM) || defined(VAPI) || defined(QUADRICS)) && ARMCI_ENABLE_GPC_CALLS - -/*\ callback functions must be registered -- user gets int handle back -\*/ -int ARMCI_Gpc_register( int (*func) ()) -{ - int handle =-1, candidate = 0; - - PARMCI_Barrier(); - do{ - if(!_table[candidate]){ - handle = candidate; - _table[candidate]=func; - } - candidate++; - }while(candidate < GPC_SLOTS && handle == -1); - return(GPC_OFFSET-handle); -} - -/*\ release/deassociate handle with previously registered callback function -\*/ -void ARMCI_Gpc_release(int handle) -{ - int h = -handle + GPC_OFFSET; - - PARMCI_Barrier(); - if(h<0 || h >= GPC_SLOTS) armci_die("ARMCI_Gpc_release: bad handle",h); - _table[h] = (void*)0; -} - - - -/*\ Send Request to Execute callback function in a global address space - * Arguments: - * f - handle to the callback function - * p - remote processor - * hdr - header data - used to pack extra args for callback (local buffer) - * hlen - size of header data < ARMCI_GPC_HLEN - * data - bulk data passed to callback (local buffer) - * dlen - length of bulk data - * rhdr - ptr to reply header (return args from callback) - * rhlen - length of buffer to store reply header < ARMCI_GPC_HLEN - * rdata - ptr to where reply data from callback should be stored (local buf) - * rdlen - size of the buffer to store reply data - * nbh - nonblocking handle - * -\*/ -int ARMCI_Gpc_exec(int h, int p, void *hdr, int hlen, void *data, int dlen, - void *rhdr, int rhlen, void *rdata, int rdlen, gpc_hdl_t* nbh) -{ - int hnd = -h + GPC_OFFSET; - int err = 0; - armci_hdl_t *ahdl = (nbh ? &(nbh->ahdl): NULL); - - if(hnd <0 || hnd>= GPC_SLOTS) - err += fprintf(stderr, "ARMCI_Gpc_exec: bad callback handle %d: %d\n",hnd,GPC_SLOTS); - if(!_table[hnd]) - err += fprintf(stderr, "ARMCI_Gpc_exec: NULL function %d",hnd); - - if(hlen<0 || hlen>=ARMCI_Gpc_get_hlen()) - err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid send header size %d %d\n", hlen, ARMCI_Gpc_get_hlen()); - if(rhlen<0 || rhlen>=ARMCI_Gpc_get_hlen()) - err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid recv header size %d %d\n", rhlen, ARMCI_Gpc_get_hlen()); - if(dlen<0 || dlen>=ARMCI_Gpc_get_dlen()) - err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid send data size %d %d\n", dlen, ARMCI_Gpc_get_dlen()); - if(rdlen<0 || rdlen>=ARMCI_Gpc_get_dlen()) - err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid recv data size %d %d\n", rdlen, ARMCI_Gpc_get_dlen()); - - if(hlen>0 && hdr==NULL) - err += fprintf(stderr, "ARMCI_Gpc_exec: Null send header for non-zero header size %d\n", hlen); - if(rhlen>0 && rhdr==NULL) - err += fprintf(stderr, "ARMCI_Gpc_exec: Null recv header for non-zero header size %d\n", rhlen); - if(dlen>0 && data==NULL) - err += fprintf(stderr, "ARMCI_Gpc_exec: Null send data for non-zero data size %d\n", dlen); - if(rdlen>0 && rdata==NULL) - err += fprintf(stderr, "ARMCI_Gpc_exec: Null recv data for non-zero header size %d\n", rdlen); - - if(p<0 || p >= armci_nproc) - err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid target processor id %d\n", p, armci_nproc); - - if(err) - return FAIL; - - if(rhlen + rdlen == 0) - armci_die("Zero reply header + data length not yet supported", 0); - - if(nbh) - nbh->proc = p; -#if 1 - if(SAMECLUSNODE(p) && armci_nproc==1) { - int rhsize, rdsize; - int (*func)(); - -/* fprintf(stderr, "%d:: armci gpc exec. SAMECLUSNODE\n", armci_me); */ - - func = _table[hnd]; - if(func(p, armci_me, hdr, hlen, data, dlen, rhdr, rhlen, &rhsize, - rdata, rdlen, &rdsize, GPC_INIT) != GPC_DONE) { - func(p, armci_me, hdr, hlen, data, dlen, rhdr, rhlen, &rhsize, - rdata, rdlen, &rdsize, GPC_WAIT); - } -#ifndef VAPI - PARMCI_Fence(p); -#endif - return 0; - } -#endif - -/* fprintf(stderr, "%d:: armci gpc exec. invoking armci gpc\n", armci_me); */ - return armci_gpc(h, p, hdr, hlen, data, dlen, - rhdr, rhlen, rdata, rdlen, ahdl); -} - -/* - func - handle to the function executed at each process in the chain - callba- handle to the callback to be executed when - hdr - header data used to pack extra args for callback (local buffer) - hlen - size of header data < ARMCI_GPC_HLEN - data - bulk data passed to callback (local buffer) - dlen - length of bulk data - rhdr - ptr to reply header (return args from callback) - rhlen - length of buffer to store reply header < ARMCI_GPC_HLEN - rdata - ptr to where reply data from callback should be stored (local buf) - rdlen - size of the buffer to store reply data - idlen - number of ID's - idslst- list of id's in the chained GPC - nbh - nonblocking handle which also acts as a context for each individual - GPC - Tree - the id of tree function used (default is 0=>binary, 1=>binomial, - n=> user defined) -*/ -int ARMCI_Gpc_chained_exec(int func, int callback, void *hdr, int hlen, - void *data, int dlen, void *rhdr, int rhlen, void *rdata, - int rdlen, int idlen, int *idlst, gpc_hdl_t* nbh, int TREE) -{ -#if 0 -int hnd = -func + GPC_OFFSET; -int err = 0; - armci_hdl_t *ahdl = (nbh ? &(nbh->ahdl): NULL); - - if(hnd <0 || hnd>= GPC_SLOTS) - err += fprintf(stderr, "ARMCI_Gpc_exec: bad callback handle %d: %d\n",hnd,GPC_SLOTS); - if(!_table[hnd]) - err += fprintf(stderr, "ARMCI_Gpc_exec: NULL function %d",hnd); - - if(hlen<0 || hlen>=ARMCI_Gpc_get_hlen()) - err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid send header size %d %d\n", hlen, ARMCI_Gpc_get_hlen()); - if(rhlen<0 || rhlen>=ARMCI_Gpc_get_hlen()) - err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid recv header size %d %d\n", rhlen, ARMCI_Gpc_get_hlen()); - if(dlen<0 || dlen>=ARMCI_Gpc_get_dlen()) - err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid send data size %d %d\n", dlen, ARMCI_Gpc_get_dlen()); - if(rdlen<0 || rdlen>=ARMCI_Gpc_get_dlen()) - err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid recv data size %d %d\n", rdlen, ARMCI_Gpc_get_dlen()); - - if(hlen>0 && hdr==NULL) - err += fprintf(stderr, "ARMCI_Gpc_exec: Null send header for non-zero header size %d\n", hlen); - if(rhlen>0 && rhdr==NULL) - err += fprintf(stderr, "ARMCI_Gpc_exec: Null recv header for non-zero header size %d\n", rhlen); - if(dlen>0 && data==NULL) - err += fprintf(stderr, "ARMCI_Gpc_exec: Null send data for non-zero data size %d\n", dlen); - if(rdlen>0 && rdata==NULL) - err += fprintf(stderr, "ARMCI_Gpc_exec: Null recv data for non-zero header size %d\n", rdlen); - - if(p<0 || p >= armci_nproc) - err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid target processor id %d\n", p, armci_nproc); - - if(err) - return FAIL; - - if(rhlen + rdlen == 0) - armci_die("Zero reply header + data length not yet supported", 0); - - tree_id = armci_msg_generate_tree(idlst,idlen,id_tree,TREE); - if(nbh) - nbh->proc = p; - -#if 1 - if(SAMECLUSNODE(p) && armci_nproc==1) { - int rhsize, rdsize; - int (*func)(); - - /* fprintf(stderr, "%d:: armci gpc exec. SAMECLUSNODE\n", armci_me); */ - - func = _table[hnd]; - if(func(p, armci_me, hdr, hlen, data, dlen, rhdr, rhlen, &rhsize, - rdata, rdlen, &rdsize, GPC_INIT) != GPC_DONE) { - func(p, armci_me, hdr, hlen, data, dlen, rhdr, rhlen, &rhsize, - rdata, rdlen, &rdsize, GPC_WAIT); - } -#ifndef VAPI - PARMCI_Fence(p); -#endif - return 0; - } -#endif -/* fprintf(stderr, "%d:: armci gpc exec. invoking armci gpc\n", armci_me); */ - return armci_gpc(h, p, hdr, hlen, data, dlen, - rhdr, rhlen, rdata, rdlen, ahdl); -#endif -} - - - -int armci_gpc_local_exec(int h, int to, int from, void *hdr, int hlen, - void *data, int dlen, - void *rhdr, int rhlen, - void *rdata, int rdlen, int rtype) { - int rhsize, rdsize; - int (*func)(); - int hnd = -h + GPC_OFFSET; - - if(hnd <0 || hnd>= GPC_SLOTS) - armci_die2("armci_gpc_local_exec: bad callback handle",hnd,GPC_SLOTS); - if(!_table[hnd]) armci_die("armci_gpc_local_exec: NULL function",hnd); - - func = _table[hnd]; - - if(!SAMECLUSNODE(to)) - armci_die("armci_gpc_local_exec: GPC call to a different node received!", - armci_me); - -/* func(to, from, hdr, hlen, data, dlen, rhdr, rhlen, &rhsize, */ -/* rdata, rdlen, &rdsize); */ -/* return 0; */ - return func(to, from, hdr, hlen, data, dlen, rhdr, rhlen, &rhsize, - rdata, rdlen, &rdsize, rtype); -} - -/*\ - * This is a template for the callback function - * The arguments are passed as specified in ARMCI_Gpc_exec - * In addition, - * rhsize specifies the actual size of reply header data returned - * rdsize specifies the actual size of reply data returned -\*/ -int example_func(int to, int from, void *hdr, int hlen, - void *data, int dlen, - void *rhdr, int rhlen, int *rhsize, - void *rdata, int rdlen, int *rdsize, - int rtype); - - -#ifdef LAPI -void armci_gpc_set_serverpid(){ -} -#endif - - -/*\ - * Translate pointer to memory on processor "proc" - * to be used in a callback function send by processor "from" -\*/ -void * ARMCI_Gpc_translate(void *ptr, int proc, int from) -{ -return ptr; -} - - -/*\ acquire lock in a callback function executed in context of processor "proc" -\*/ -void ARMCI_Gpc_lock(int proc) -{ -#if defined(CLUSTER) && !defined(SGIALTIX) - int lock = (proc-armci_clus_info[armci_clus_id(proc)].master)%NUM_LOCKS; -#else - int lock = 0; -#endif - NATIVE_LOCK(lock,proc); -} - -/*\ try acquire lock in a callback function to be executed in context of - * processor "proc" - * return value: 1 - success - * 0 - failure (already locked by another thread) -\*/ -int ARMCI_Gpc_trylock(int proc) -{ -armci_die("ARMCI_Gpc_trylock: not yet implemented",0); -return 0; -} - -/*\ release lock in a callback function executed in context of processor "proc" -\*/ -void ARMCI_Gpc_unlock(int proc) -{ -#if defined(CLUSTER) && !defined(SGIALTIX) - int lock = (proc-armci_clus_info[armci_clus_id(proc)].master)%NUM_LOCKS; -#else - int lock = 0; -#endif - NATIVE_UNLOCK(lock,proc); -} - -void ARMCI_Gpc_init_handle(gpc_hdl_t *nbh) { - nbh->proc = armci_me; - ARMCI_INIT_HANDLE(&nbh->ahdl); -} - -void ARMCI_Gpc_wait(gpc_hdl_t *nbh) { - if(SAMECLUSNODE(nbh->proc)) - return; - PARMCI_Wait(&nbh->ahdl); -} - -void ARMCI_Gpc_test(gpc_hdl_t *nbh) { - if(SAMECLUSNODE(nbh->proc)) - return; - PARMCI_Test(&nbh->ahdl); -} - -#define ARMCI_GPC_HLEN 65536 -#define ARMCI_GPC_DLEN 65536 -int ARMCI_Gpc_get_hlen() { - return ARMCI_GPC_HLEN; -} - -int ARMCI_Gpc_get_dlen() { - return ARMCI_GPC_DLEN; -} - -#endif - diff --git a/armci/src-gemini/gpc.h b/armci/src-gemini/gpc.h deleted file mode 100644 index ec165ab54..000000000 --- a/armci/src-gemini/gpc.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef __GPCDEF -#if ARMCI_ENABLE_GPC_CALLS -#define __GPCDEF - -#include "armci.h" - -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif - -#define GPC_INIT 1 -#define GPC_PROBE 2 -#define GPC_WAIT 3 -#define GPC_DONE 4 -#define GPC_PENDING 5 - -typedef struct { - int proc; - armci_hdl_t ahdl; -}gpc_hdl_t; - -/* #define ARMCI_GPC_HLEN 1024 */ -/* #define ARMCI_GPC_DLEN 1024*1024 */ -extern int ARMCI_Gpc_register( int (*func) ()); -extern void ARMCI_Gpc_release(int handle); -extern void * ARMCI_Gpc_translate(void *ptr, int proc, int from); -extern void ARMCI_Gpc_lock(int proc); -extern void ARMCI_Gpc_unlock(int proc); -extern int ARMCI_Gpc_trylock(int proc); -extern int ARMCI_Gpc_exec(int h,int p, void *hdr, int hlen, void *data,int dlen, - void *rhdr, int rhlen, void *rdata, int rdlen, - gpc_hdl_t* nbh); -extern int PARMCI_Get_gpc_hlen(); -extern int PARMCI_Get_gpc_dlen(); - -extern void ARMCI_Gpc_init_handle(gpc_hdl_t *nbh); -extern void ARMCI_Gpc_wait(gpc_hdl_t *nbh); - -#if defined(__cplusplus) || defined(c_plusplus) -} -#endif - -#endif -#endif diff --git a/armci/src-gemini/groups.c b/armci/src-gemini/groups.c deleted file mode 100644 index 8601f80a5..000000000 --- a/armci/src-gemini/groups.c +++ /dev/null @@ -1,568 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: groups.c,v 1.4.6.2 2007-08-15 08:37:16 manoj Exp $ */ - - -#if HAVE_STDIO_H -# include -#endif -#if HAVE_STDLIB_H -# include -#endif -#if HAVE_STRING_H -# include -#endif -#if HAVE_ASSERT_H -# include -#endif - -#ifndef MSG_COMMS_MPI -# define MSG_COMMS_MPI -#endif -#include "armcip.h" -#include "message.h" - -#define DEBUG_ 0 - -MPI_Comm ARMCI_COMM_WORLD; /*dup of MPI_COMM_WORLD. Initialized first thing in ARMCI_Init*/ - -ARMCI_Group ARMCI_Default_Proc_Group = 0; -ARMCI_Group ARMCI_World_Proc_Group = 0; - -typedef struct group_list_struct { - ARMCI_Group group; - ARMCI_iGroup igroup; - struct group_list_struct *next; -} group_list_t; - -group_list_t *group_list = NULL; - -ARMCI_iGroup* armci_get_igroup_from_group(ARMCI_Group *group) -{ - group_list_t *current_group_list_item = group_list; - - assert(group_list != NULL); - while (current_group_list_item != NULL) { - if (current_group_list_item->group == *group) { - return ¤t_group_list_item->igroup; - } - current_group_list_item = current_group_list_item->next; - } - armci_die("ARMCI_Group lookup failed", -1); - return NULL; -} - -static void armci_create_group_and_igroup(ARMCI_Group *group, ARMCI_iGroup **igroup) -{ - group_list_t *new_group_list_item = NULL; - group_list_t *last_group_list_item = NULL; - - /* create the new group in the linked list */ - last_group_list_item = group_list; - while (last_group_list_item->next != NULL) { - last_group_list_item = last_group_list_item->next; - } - - new_group_list_item = malloc(sizeof(group_list_t)); - new_group_list_item->group = last_group_list_item->group + 1; - new_group_list_item->next = NULL; - *igroup = &new_group_list_item->igroup; - *group = new_group_list_item->group; - last_group_list_item->next = new_group_list_item; -} - -#ifdef ARMCI_GROUP -void ARMCI_Bcast_(void *buffer, int len, int root, ARMCI_Group *group) -{ - armci_msg_group_bcast_scope(SCOPE_ALL, buffer, len, - ARMCI_Absolute_id(group, root), - group); -} -#else -void ARMCI_Bcast_(void *buffer, int len, int root, ARMCI_Comm comm) -{ - int result; - MPI_Comm_compare(comm, ARMCI_COMM_WORLD, &result); - if(result == MPI_IDENT) armci_msg_brdcst(buffer, len, root); - else MPI_Bcast(buffer, len, MPI_BYTE, root, (MPI_Comm)comm); -} -#endif - -int ARMCI_Group_rank(ARMCI_Group *group, int *rank) -{ - ARMCI_iGroup *igroup = armci_get_igroup_from_group(group); -#ifdef ARMCI_GROUP - if(!igroup) return MPI_ERR_GROUP; - *rank = igroup->grp_attr.grp_me; - return MPI_SUCCESS; -#else - return MPI_Group_rank((MPI_Group)(igroup->igroup), rank); -#endif -} - -void ARMCI_Group_size(ARMCI_Group *group, int *size) -{ - ARMCI_iGroup *igroup = armci_get_igroup_from_group(group); -#ifdef ARMCI_GROUP - *size = igroup->grp_attr.nproc; -#else - MPI_Group_size((MPI_Group)(igroup->igroup), size); -#endif -} - -int ARMCI_Absolute_id(ARMCI_Group *group,int group_rank) -{ - int abs_rank,status; - ARMCI_iGroup *igroup = armci_get_igroup_from_group(group); -#ifdef ARMCI_GROUP - assert(group_rank < igroup->grp_attr.nproc); - return igroup->grp_attr.proc_list[group_rank]; -#else - MPI_Group grp; - status = MPI_Comm_group(ARMCI_COMM_WORLD,&grp); - MPI_Group_translate_ranks(igroup->igroup,1,&group_rank,grp,&abs_rank); - return(abs_rank); -#endif -} - -void ARMCI_Group_set_default(ARMCI_Group *group) -{ - ARMCI_Default_Proc_Group = *group; -} - -void ARMCI_Group_get_default(ARMCI_Group *group_out) -{ - *group_out = ARMCI_Default_Proc_Group; -} - -void ARMCI_Group_get_world(ARMCI_Group *group_out) -{ - *group_out = ARMCI_World_Proc_Group; -} - -static void get_group_clus_id(ARMCI_iGroup *igroup, int grp_nproc, - int *grp_clus_id) -{ -#ifdef ARMCI_GROUP - int i; - assert(grp_nproc<=igroup->grp_attr.nproc); - for(i=0; igrp_attr.proc_list[i]); - } -#else - int i, *ranks1, *ranks2; - MPI_Group group2; - - /* Takes the list of processes from one group and attempts to determine - * the corresponding ranks in a second group (here, ARMCI_COMM_WORLD) */ - - ranks1 = (int *)malloc(2*grp_nproc*sizeof(int)); - ranks2 = ranks1 + grp_nproc; - for(i=0; iigroup, grp_nproc, ranks1, group2, ranks2); - - /* get the clus_id of processes */ - for(i=0; iicomm; -#endif - - int grp_me, grp_nproc, grp_nclus, grp_clus_me; - armci_clus_t *grp_clus_info=NULL; -#ifdef CLUSTER - int i, len, root=0; -#endif - -#ifndef ARMCI_GROUP - if(comm==MPI_COMM_NULL || igroup->igroup==MPI_GROUP_NULL) - armci_die("group_process_list: NULL COMMUNICATOR",0); -#endif - - ARMCI_Group_rank(group, &grp_me); - ARMCI_Group_size(group, &grp_nproc); - -#ifdef CLUSTER -# ifdef ARMCI_GROUP - /*all processes construct the clus_info structure in parallel*/ - grp_clus_info = group_construct_clusinfo(&grp_nclus, group); -# else - /* process 0 gets group cluster information: grp_nclus, grp_clus_info */ - if(grp_me == 0) { - grp_clus_info = group_construct_clusinfo(&grp_nclus, group); - } - - /* process 0 broadcasts group cluster information */ - len = sizeof(int); - ARMCI_Bcast_(&grp_nclus, len, root, comm); - - if(grp_me){ - /* allocate memory */ - grp_clus_info = (armci_clus_t*)malloc(grp_nclus*sizeof(armci_clus_t)); - if(!armci_clus_info)armci_die("malloc failed for clusinfo",armci_nclus); - } - - len = sizeof(armci_clus_t)*grp_nclus; - ARMCI_Bcast_(grp_clus_info, len, root, comm); -# endif - /* determine current group cluster node id by comparing me to master */ - grp_clus_me = grp_nclus-1; - for(i =0; i< grp_nclus-1; i++) { - if(grp_me < grp_clus_info[i+1].master){ - grp_clus_me=i; - break; - } - } -#else /* !CLUSTER */ - grp_clus_me = 0; - grp_nclus = 1; - grp_clus_info = (armci_clus_t*)malloc(grp_nclus*sizeof(armci_clus_t)); - if(!grp_clus_info)armci_die("malloc failed for clusinfo",grp_nclus); - strcpy(grp_clus_info[0].hostname, armci_clus_info[0].hostname); - grp_clus_info[0].master=0; - grp_clus_info[0].nslave=grp_nproc; -#endif /* CLUSTER */ -#ifdef ARMCI_GROUP - /*Set in ARMCI_Group_create. ARMCI_Group_rank is used before - setting this field. So moving it there in the generic - implementation.*/ -#else - grp_attr->grp_me = grp_me; -#endif - grp_attr->grp_clus_info = grp_clus_info; - grp_attr->grp_nclus = grp_nclus; - grp_attr->grp_clus_me = grp_clus_me; -} - -/* attribute caching: group_cluster_information and memory_offset should - be cached in group data structure */ -static void armci_cache_attr(ARMCI_Group *group) { - armci_grp_attr_t *grp_attr; - ARMCI_iGroup *igroup = armci_get_igroup_from_group(group); - - /* allocate storage for the attribute content. Note: Attribute contents - should be stored in persistent memory */ - grp_attr = &(igroup->grp_attr); - - /* get group cluster information and grp_attr */ - group_process_list(group, grp_attr); -} - -armci_grp_attr_t *ARMCI_Group_getattr(ARMCI_Group *group) -{ - ARMCI_iGroup *igroup = armci_get_igroup_from_group(group); - return(&(igroup->grp_attr)); - -} - -static void armci_igroup_finalize(ARMCI_iGroup *igroup) { -#ifdef ARMCI_GROUP - int world_me, i; - - world_me = armci_msg_me(); - for(i=0; igrp_attr.nproc; i++) { - if(igroup->grp_attr.proc_list[i] == world_me) { - break; - } - } - if(i==igroup->grp_attr.nproc) { - return; /*not in group to be freed*/ - } - - assert(igroup); - free(igroup->grp_attr.grp_clus_info); - free(igroup->grp_attr.proc_list); - igroup->grp_attr.nproc = 0; -#else - int rv; - - assert(igroup); - /*the following was causing seg fault*/ - /*free(igroup->grp_attr.grp_clus_info);*/ - - rv=MPI_Group_free(&(igroup->igroup)); - if(rv != MPI_SUCCESS) armci_die("MPI_Group_free: Failed ",armci_me); - - if(igroup->icomm != MPI_COMM_NULL) { - rv = MPI_Comm_free( (MPI_Comm*)&(igroup->icomm) ); - if(rv != MPI_SUCCESS) armci_die("MPI_Comm_free: Failed ",armci_me); - } -#endif -} - -void ARMCI_Group_free(ARMCI_Group *group) { - group_list_t *current_group_list_item = group_list; - group_list_t *previous_group_list_item = NULL; - - /* find the group to free */ - while (current_group_list_item != NULL) { - if (current_group_list_item->group == *group) { - break; - } - previous_group_list_item = current_group_list_item; - current_group_list_item = current_group_list_item->next; - } - /* make sure we found a group */ - assert(current_group_list_item != NULL); - /* remove the group from the linked list */ - if (previous_group_list_item != NULL) { - previous_group_list_item->next = current_group_list_item->next; - } - /* free the group */ - armci_igroup_finalize(¤t_group_list_item->igroup); - free(current_group_list_item); -} - -/* - Create a child group for to the given group. - @param n IN #procs in this group (<= that in group_parent) - @param pid_list IN The list of proc ids (w.r.t. group_parent) - @param group_out OUT Handle to store the created group - @param group_parent IN Parent group - */ -void ARMCI_Group_create_child(int n, int *pid_list, ARMCI_Group *group_out, - ARMCI_Group *grp_parent) -{ - int i,grp_me; - ARMCI_iGroup *igroup = NULL; - -#ifdef ARMCI_GROUP - int world_me, parent_grp_me; - armci_grp_attr_t *grp_attr = NULL; -#else - int rv; - ARMCI_iGroup *igroup_parent = NULL; - MPI_Group *group_parent = NULL; - MPI_Comm *comm_parent = NULL; -#endif - - armci_create_group_and_igroup(group_out, &igroup); - -#ifdef ARMCI_GROUP - grp_attr = &igroup->grp_attr; - ARMCI_Group_rank(grp_parent, &parent_grp_me); - for(i=0; inproc=0; - grp_attr->proc_list = NULL; - return; /*not in group to be created*/ - } - for(i=0; i pid_list[i+1]){ - armci_die("ARMCI_Group_create: Process ids are not sorted ",armci_me); - break; - } - } - grp_attr->grp_clus_info = NULL; - grp_attr->nproc = n; - grp_attr->proc_list = (int *)malloc(n*sizeof(int)); - assert(grp_attr->proc_list!=NULL); - for(i=0; iproc_list[i] = ARMCI_Absolute_id(grp_parent,pid_list[i]); - } - world_me = armci_msg_me(); - grp_attr->grp_me = grp_me = MPI_UNDEFINED; - for(i=0; igrp_attr.proc_list[i] == world_me) { - grp_attr->grp_me = grp_me = i; - break; - } - } - if(grp_me != MPI_UNDEFINED) armci_cache_attr(group_out); - armci_msg_group_barrier(group_out); -#else - igroup_parent = armci_get_igroup_from_group(grp_parent); - /* NOTE: default group is the parent group */ - group_parent = &(igroup_parent->igroup); - comm_parent = &(igroup_parent->icomm); - - rv=MPI_Group_incl(*group_parent, n, pid_list, &(igroup->igroup)); - if(rv != MPI_SUCCESS) armci_die("MPI_Group_incl: Failed ",armci_me); - - { - MPI_Comm comm, comm1, comm2; - int lvl=1, local_ldr_pos; - MPI_Group_rank((MPI_Group)(igroup->igroup), &grp_me); - if(grp_me == MPI_UNDEFINED) { - igroup->icomm = MPI_COMM_NULL; /*FIXME: keeping the group around for now*/ - return; - } - assert(grp_me>=0); /*SK: sanity check for the following bitwise operations*/ - MPI_Comm_dup(MPI_COMM_SELF, &comm); /*FIXME: can be optimized away*/ - local_ldr_pos = grp_me; - while(n> lvl) { - int tag=0; - int remote_ldr_pos = local_ldr_pos^lvl; - if(remote_ldr_pos < n) { - int remote_leader = pid_list[remote_ldr_pos]; - MPI_Comm peer_comm = *comm_parent; - int high = (local_ldr_posicomm = comm; - MPI_Group_free(&igroup->igroup); /*cleanup temporary group*/ - MPI_Comm_group(igroup->icomm, &igroup->igroup); /*the group associated with comm*/ - igroup->grp_attr.grp_clus_info=NULL; - /* processes belong to this group should cache attributes */ - armci_cache_attr(group_out); - } - -#endif -} - -void ARMCI_Group_create(int n, int *pid_list, ARMCI_Group *group_out) { - ARMCI_Group_create_child(n, pid_list, group_out, (ARMCI_Group *)&ARMCI_Default_Proc_Group); -} - -void armci_group_init() -{ -#ifdef ARMCI_GROUP - int i; -#else - int grp_me; -#endif - ARMCI_iGroup *igroup; - - /* Initially, World group is the default group */ - ARMCI_World_Proc_Group = 0; - ARMCI_Default_Proc_Group = 0; - - /* create the head of the group linked list */ - assert(group_list == NULL); - group_list = malloc(sizeof(group_list_t)); - group_list->group = ARMCI_World_Proc_Group; - group_list->next = NULL; - igroup = &group_list->igroup; - -#ifdef ARMCI_GROUP - /*setup the world proc group*/ - igroup->grp_attr.nproc = armci_msg_nproc(); - igroup->grp_attr.grp_me = armci_msg_me(); - igroup->grp_attr.proc_list = (int *)malloc(igroup->grp_attr.nproc*sizeof(int)); - assert(igroup->grp_attr.proc_list != NULL); - for(i=0; igrp_attr.nproc; i++) { - igroup->grp_attr.proc_list[i] = i; - } - igroup->grp_attr.grp_clus_info = NULL; - armci_cache_attr(&ARMCI_World_Proc_Group); -#else - /* save MPI world group and communicatior in ARMCI_World_Proc_Group */ - igroup->icomm = ARMCI_COMM_WORLD; - MPI_Comm_group(ARMCI_COMM_WORLD, &(igroup->igroup)); - - /* processes belong to this group should cache attributes */ - MPI_Group_rank((MPI_Group)(igroup->igroup), &grp_me); - if(grp_me != MPI_UNDEFINED) { - armci_cache_attr(&ARMCI_World_Proc_Group); - } -#endif -} - -void armci_group_finalize() { - group_list_t *current_group_list_item = group_list; - group_list_t *previous_group_list_item = NULL; - - /* don't free the world group (the list head) */ - current_group_list_item = current_group_list_item->next; - - while (current_group_list_item != NULL) { - previous_group_list_item = current_group_list_item; - current_group_list_item = current_group_list_item->next; - armci_igroup_finalize(&previous_group_list_item->igroup); - free(previous_group_list_item); - } -} - -/* - ISSUES: - 1. Make sure ARMCI_Group_free frees the attribute data structures - 2. replace malloc with, kr_malloc using local_context. -*/ diff --git a/armci/src-gemini/kr_malloc.c b/armci/src-gemini/kr_malloc.c deleted file mode 100644 index 4c25c9890..000000000 --- a/armci/src-gemini/kr_malloc.c +++ /dev/null @@ -1,605 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include -#include "kr_malloc.h" -#include "armcip.h" /* for DEBUG purpose only. remove later */ -#include "locks.h" - -#define DEBUG 0 - -/* Storage allocator basically copied from ANSI K&R and corrupted */ - -extern char *armci_allocate(); /* Used to get memory from the system */ -extern void armci_die(); -static char *kr_malloc_shmem(size_t nbytes, context_t *ctx); -static void kr_free_shmem(char *ap, context_t *ctx); - -/** - * DEFAULT_NALLOC: No. of units of length ALIGNMENT to get in every - * request to the system for memory (8MB/64 => 128*1024units) - * DEFAULT_MAX_NALLOC: Maximum number of units that can get i.e.1GB - * (if unit size=64bytes, then max units=1024MB/64 = 16*1024*1024) - */ -#define DEFAULT_NALLOC (128*1024) -#define DEFAULT_NALLOC_ALIGN 1024 -#define DEFAULT_MAX_NALLOC (1024*1024*16) - -/* mutual exclusion defs go here */ -#define LOCKED 100 -#define UNLOCKED 101 -static int lock_mode=UNLOCKED; - -/* enable locking only after armci is initailized as locks (and lock - data structures) are initialized in PARMCI_Init */ -#define LOCKIT(p) \ - if(_armci_initialized && lock_mode==UNLOCKED) { \ - NAT_LOCK(0,p); lock_mode=LOCKED; \ - } -#define UNLOCKIT(p) \ - if(_armci_initialized && lock_mode==LOCKED) { \ - NAT_UNLOCK(0,p); lock_mode=UNLOCKED; \ - } - -static int do_verify = 0; /* Flag for automatic heap verification */ - -#define VALID1 0xaaaaaaaa /* For validity check on headers */ -#define VALID2 0x55555555 - -#define USEDP 0 /* CHECK. By default anable this. */ - -static void kr_error(char *s, unsigned long i, context_t *ctx) { -char string[256]; - sprintf(string,"kr_malloc: %s %ld(0x%lx)\n", s, i, i); -#if 0 - kr_malloc_print_stats(ctx); -#endif - armci_die(string, i); -} - -static Header *morecore(size_t nu, context_t *ctx, size_t *last_size, char **last_ptr) { - char *cp; - Header *up; - -#if DEBUG - (void) printf("%d: morecore 1: Getting %ld more units of length %d nalloc=%d\n", armci_me, (long)nu, sizeof(Header),ctx->nalloc); - (void) fflush(stdout); -#endif - - if (ctx->total >= ctx->max_nalloc) { -# if DEBUG - armci_die("kr_malloc: morecore: maximum allocation reached",armci_me); -# endif - return (Header *) NULL; /* Enforce upper limit on core usage */ - } - -#if 1 - /* 07/03 ctx->nalloc is now the minimum # units we ask from OS */ - nu = DEFAULT_NALLOC_ALIGN*((nu-1)/DEFAULT_NALLOC_ALIGN+1); - if(nu < ctx->nalloc) nu = ctx->nalloc; -#else - nu = ctx->nalloc*((nu-1)/ctx->nalloc+1); /* nu must by a multiplicity of nalloc */ -#endif - -#if DEBUG - (void) printf("%d: morecore: Getting %ld more units of length %d\n", - armci_me, (long)nu, sizeof(Header)); - (void) fflush(stdout); -#endif - - if ((cp =(char *)(*ctx->alloc_fptr)((size_t)nu * sizeof(Header))) == (char *)NULL) - return (Header *) NULL; - if(last_size!=NULL && last_ptr!=NULL){ - *last_size = ((size_t)nu * sizeof(Header)); - *last_ptr = cp; - /*printf("\n%d:%s:got %p %d",armci_me,__FUNCTION__,*last_ptr,*last_size);*/ - } -/* if(armci_nclus==armci_nproc && armci_nclus!=1) - armci_register_shmem(cp,((size_t)nu * sizeof(Header)),NULL,0,cp); - */ - - ctx->total += nu; /* Have just got nu more units */ - ctx->nchunk++; /* One more chunk */ - ctx->nfrags++; /* Currently one more frag */ - ctx->inuse += nu; /* Inuse will be decremented by kr_free */ - - up = (Header *) cp; - up->s.size = nu; - up->s.valid1 = VALID1; - up->s.valid2 = VALID2; - - /* Insert into linked list of blocks in use so that kr_free works - ... for debug only */ - up->s.ptr = ctx->usedp; - ctx->usedp = up; - - kr_free((char *)(up+1), ctx); /* Try to join into the free list */ - return ctx->freep; -} - -void kr_malloc_init(size_t usize, /* unit size in bytes */ - size_t nalloc, - size_t max_nalloc, - void * (*alloc_fptr)(), /* memory alloc routine */ - int debug, - context_t *ctx) { - int scale; - - if(usize <= 0) usize = sizeof(Header); - - scale = usize>>LOG_ALIGN; - if(scale<1)fprintf(stderr,"Error: kr_malloc_init !!!\n"); - - if(nalloc==0) nalloc = DEFAULT_NALLOC; - if(max_nalloc==0) max_nalloc = DEFAULT_MAX_NALLOC; - - ctx->usize = sizeof(Header); - ctx->nalloc = nalloc * scale; - ctx->max_nalloc = max_nalloc * scale; - ctx->alloc_fptr = alloc_fptr; - ctx->freep = NULL; - ctx->usedp = NULL; - ctx->shmid = -1; - ctx->shmoffset = 0; - ctx->shmsize = 0; - ctx->ctx_type = -1; - do_verify = debug; -} - -char *_kr_last_ptr; -size_t _kr_last_size; - -char *kr_malloc(size_t nbytes, context_t *ctx, int record_allocation, void **new_base, size_t *new_size) { - Header *p, *prevp; - size_t nunits; - char *return_ptr; - -#if !((defined(SUN) || defined(SOLARIS)) && !defined(SHMMAX_SEARCH_NO_FORK)) - if(ctx->ctx_type == KR_CTX_SHMEM) return kr_malloc_shmem(nbytes,ctx); -#endif - - /* If first time in need to initialize the free list */ - if ((prevp = ctx->freep) == NULL) { - - if (sizeof(Header) != ALIGNMENT) - kr_error("Alignment is not valid", (unsigned long) ALIGNMENT, ctx); - - ctx->total = 0; /* Initialize statistics */ - ctx->nchunk = 0; - ctx->inuse = 0; - ctx->nfrags = 0; - ctx->maxuse = 0; - ctx->nmcalls= 0; - ctx->nfcalls= 0; - /* Initialize linked list */ - ctx->base.s.ptr = ctx->freep = prevp = &(ctx->base); - ctx->base.s.size = 0; - ctx->base.s.valid1 = VALID1; - ctx->base.s.valid2 = VALID2; - } - - ctx->nmcalls++; - - if (do_verify) - kr_malloc_verify(ctx); - - /* Rather than divide make the alignment a known power of 2 */ - - nunits = ((nbytes + sizeof(Header) - 1)>>LOG_ALIGN) + 1; - - for (p=prevp->s.ptr; ; prevp = p, p = p->s.ptr) { - if (p->s.size >= nunits) { /* Big enuf */ - if (p->s.size == nunits) /* exact fit */ - prevp->s.ptr = p->s.ptr; - else { /* allocate tail end */ - p->s.size -= nunits; - p += p->s.size; - p->s.size = nunits; - p->s.valid1 = VALID1; - p->s.valid2 = VALID2; - ctx->nfrags++; /* Have just increased the fragmentation */ - } - - /* Insert into linked list of blocks in use ... for debug only */ - p->s.ptr = ctx->usedp; - ctx->usedp = p; - - ctx->inuse += nunits; /* Record usage */ - if (ctx->inuse > ctx->maxuse) - ctx->maxuse = ctx->inuse; - ctx->freep = prevp; - return_ptr = (char *) (p+1); - break; - } - - if (p == ctx->freep) { /* wrapped around the free list */ - if ((p = morecore(nunits, ctx, &_kr_last_size,&_kr_last_ptr)) == (Header *) NULL) { - return_ptr = (char *) NULL; - break; - } - } - } - if(record_allocation){ - *((char **)new_base)=_kr_last_ptr; - *new_size=_kr_last_size; - } - return return_ptr; -} - - -void kr_free(char *ap, context_t *ctx) { - Header *bp, *p, **up; - -#if !((defined(SUN) || defined(SOLARIS)) && !defined(SHMMAX_SEARCH_NO_FORK)) - if(ctx->ctx_type == KR_CTX_SHMEM) { kr_free_shmem(ap,ctx); return; } -#endif - - ctx->nfcalls++; - - - if (do_verify) - kr_malloc_verify(ctx); - - /* only do something if pointer is not NULL */ - - if ( ap ) { - - bp = (Header *) ap - 1; /* Point to block header */ - - if (bp->s.valid1 != VALID1 || bp->s.valid2 != VALID2) - kr_error("kr_free: pointer not from kr_malloc", - (unsigned long) ap, ctx); - - ctx->inuse -= bp->s.size; /* Decrement memory ctx->usage */ - - /* Extract the block from the used linked list - ... for debug only */ - - for (up=&(ctx->usedp); ; up = &((*up)->s.ptr)) { - if (!*up) - kr_error("kr_free: block not found in used list\n", - (unsigned long) ap, ctx); - if (*up == bp) { - *up = bp->s.ptr; - break; - } - } - - /* Join the memory back into the free linked list */ - - for (p=ctx->freep; !(bp > p && bp < p->s.ptr); p = p->s.ptr) - if (p >= p->s.ptr && (bp > p || bp < p->s.ptr)) - break; /* Freed block at start or end of arena */ - - if (bp + bp->s.size == p->s.ptr) {/* join to upper neighbour */ - bp->s.size += p->s.ptr->s.size; - bp->s.ptr = p->s.ptr->s.ptr; - ctx->nfrags--; /* Lost a fragment */ - } else - bp->s.ptr = p->s.ptr; - - if (p + p->s.size == bp) { /* Join to lower neighbour */ - p->s.size += bp->s.size; - p->s.ptr = bp->s.ptr; - ctx->nfrags--; /* Lost a fragment */ - } else - p->s.ptr = bp; - ctx->freep = p; - - } /* end if on ap */ -} - -/* - Print to standard output the usage statistics. -*/ -void kr_malloc_print_stats(context_t *ctx) { - fflush(stderr); - printf("\nkr_malloc statistics\n-------------------\n\n"); - - printf("Total memory from system ... %ld bytes\n", - (long)(ctx->total*ctx->usize)); - printf("Current memory usage ....... %ld bytes\n", - (long)(ctx->inuse*ctx->usize)); - printf("Maximum memory usage ....... %ld bytes\n", - (long)(ctx->maxuse*ctx->usize)); - printf("No. chunks from system ..... %ld\n", ctx->nchunk); - printf("No. of fragments ........... %ld\n", ctx->nfrags); - printf("No. of calls to kr_malloc ... %ld\n", ctx->nmcalls); - printf("No. of calls to kr_free ..... %ld\n", ctx->nfcalls); - printf("\n"); - - fflush(stdout); -} - -/* - Currently assumes that are working in a single region. -*/ -void kr_malloc_verify(context_t *ctx) { - Header *p; - - if(_armci_initialized && lock_mode==UNLOCKED) { - LOCKIT(armci_master); lock_mode=LOCKED; - } - - if ( ctx->freep ) { - - /* Check the used list */ - - for (p=ctx->usedp; p; p=p->s.ptr) { - if (p->s.valid1 != VALID1 || p->s.valid2 != VALID2) - kr_error("invalid header on usedlist", - (unsigned long) p->s.valid1, ctx); - - if (p->s.size > ctx->total) - kr_error("invalid size in header on usedlist", - (unsigned long) p->s.size, ctx); - } - - /* Check the free list */ - - p = ctx->base.s.ptr; - while (p != &(ctx->base)) { - if (p->s.valid1 != VALID1 || p->s.valid2 != VALID2) - kr_error("invalid header on freelist", - (unsigned long) p->s.valid1, ctx); - - if (p->s.size > ctx->total) - kr_error("invalid size in header on freelist", - (unsigned long) p->s.size, ctx); - - p = p->s.ptr; - } - } /* end if */ - - if(_armci_initialized && lock_mode==LOCKED) { - UNLOCKIT(armci_master); lock_mode=UNLOCKED; - } -} - -/********************** BEGIN: kr_malloc for ctx_shmem *********************/ -#if defined(SYSV) || defined(MMAP) - -#include "armci_shmem.h" - -extern int armci_get_shmem_info(char *addrp, int* shmid, long *shmoffset, - size_t *shmsize); -extern Header *armci_get_shmem_ptr(int shmid, long shmoffset, size_t shmsize); - -/* returns, address of the shared memory region based on shmid, offset. - * (i.e. return_addr = stating address of shmid + offset) */ -#define SHM_PTR(hdr) armci_get_shmem_ptr((hdr)->s.shmid, (hdr)->s.shmoffset, (hdr)->s.shmsize) - -/* - * kr_malloc_shmem: memory allocator for shmem context (i.e ctx_shmem) - */ -static char *kr_malloc_shmem(size_t nbytes, context_t *ctx) { - Header *p, *prevp; - size_t nunits, prev_shmsize=0; - char *return_ptr; - int prev_shmid=-1; - long prev_shmoffset=0; - - LOCKIT(armci_master); - - /* Rather than divide make the alignment a known power of 2 */ - nunits = ((nbytes + sizeof(Header) - 1)>>LOG_ALIGN) + 1; - - /* If first time in need to initialize the free list */ - if ((prevp = ctx->freep) == NULL) { - - if (sizeof(Header) != ALIGNMENT) - kr_error("kr_malloc_shmem: Alignment is not valid", - (unsigned long) ALIGNMENT, ctx); - - ctx->total = 0; /* Initialize statistics */ - ctx->nchunk = ctx->inuse = ctx->maxuse = 0; - ctx->nfrags = ctx->nmcalls = ctx->nfcalls = 0; - - /* Initialize linked list */ - ctx->base.s.size = 0; - ctx->base.s.shmid = -1; - ctx->base.s.shmoffset = 0; - ctx->base.s.shmsize = 0; - ctx->base.s.valid1 = VALID1; - ctx->base.s.valid2 = VALID2; - if ((p = morecore(nunits, ctx,NULL,NULL)) == (Header *) NULL) return NULL; - ctx->base.s.ptr = prevp = ctx->freep; /* CHECK */ - } - - prev_shmid = ctx->shmid; - prev_shmoffset = ctx->shmoffset; - prev_shmsize = ctx->shmsize; - prevp = ctx->freep = armci_get_shmem_ptr(ctx->shmid, ctx->shmoffset, - ctx->shmsize); - - ctx->nmcalls++; - - if (do_verify) kr_malloc_verify(ctx); - - for (p=SHM_PTR(prevp); ; prevp = p, p = SHM_PTR(p)) { - - if (p->s.size >= nunits) { /* Big enuf */ - if (p->s.size == nunits) { /* exact fit */ - prevp->s.ptr = p->s.ptr; - prevp->s.shmid = p->s.shmid; - prevp->s.shmoffset = p->s.shmoffset; - prevp->s.shmsize = p->s.shmsize; - } - else { /* allocate tail end */ - p->s.size -= nunits; - p += p->s.size; - p->s.size = nunits; - p->s.valid1 = VALID1; - p->s.valid2 = VALID2; - ctx->nfrags++; /* Have just increased the fragmentation */ - } -#if USEDP - /* Insert into linked list of blocks in use ... for debug only */ - p->s.ptr = ctx->usedp; - ctx->usedp = p; -#endif - - ctx->inuse += nunits; /* Record usage */ - if (ctx->inuse > ctx->maxuse) - ctx->maxuse = ctx->inuse; - ctx->freep = prevp; - ctx->shmid = prev_shmid; - ctx->shmoffset = prev_shmoffset; - ctx->shmsize = prev_shmsize; - return_ptr = (char *) (p+1); - break; - } - - prev_shmid = prevp->s.shmid; - prev_shmoffset = prevp->s.shmoffset; - prev_shmsize = prevp->s.shmsize; - - if (p == ctx->freep) { /* wrapped around the free list */ - if ((p = morecore(nunits, ctx,NULL,NULL)) == (Header *) NULL) { - return_ptr = (char *) NULL; - break; - } - prev_shmid = ctx->shmid; - prev_shmoffset = ctx->shmoffset; - prev_shmsize = ctx->shmsize; - } - } - - UNLOCKIT(armci_master); - return return_ptr; -} - - -static void kr_free_shmem(char *ap, context_t *ctx) { - Header *bp, *p, **up, *nextp; - int shmid=-1; - long shmoffset=0; - size_t shmsize=0; - - LOCKIT(armci_master); - - ctx->nfcalls++; - - if (do_verify) - kr_malloc_verify(ctx); - - /* only do something if pointer is not NULL */ - - if ( ap ) { - - bp = (Header *) ap - 1; /* Point to block header */ - - if (bp->s.valid1 != VALID1 || bp->s.valid2 != VALID2) - kr_error("kr_free_shmem: pointer not from kr_malloc", - (unsigned long) ap, ctx); - - ctx->inuse -= bp->s.size; /* Decrement memory ctx->usage */ - -#if USEDP - /* Extract the block from the used linked list - ... for debug only */ - - for (up=&(ctx->usedp); ; up = &((*up)->s.ptr)) { - if (!*up) - kr_error("kr_free_shmem: block not found in used list\n", - (unsigned long) ap, ctx); - if (*up == bp) { - *up = bp->s.ptr; - break; - } - } -#endif - - if(ctx->shmid==-1) { - armci_get_shmem_info((char*)bp, &ctx->shmid, &ctx->shmoffset, - &ctx->shmsize); - - ctx->base.s.shmid = ctx->shmid; - ctx->base.s.shmsize = ctx->shmsize; - ctx->base.s.shmoffset = ctx->shmoffset; - - p = ctx->freep = bp; - p->s.ptr = bp; - p->s.size-=SHMEM_CTX_BYTES; /*memory to store shmem info in context*/ - p->s.shmid = ctx->shmid; - p->s.shmsize = ctx->shmsize; - p->s.shmoffset = ctx->shmoffset; - - UNLOCKIT(armci_master); - return; - } - - ctx->freep = armci_get_shmem_ptr(ctx->shmid, ctx->shmoffset, - ctx->shmsize); - - shmid = ctx->shmid; - shmoffset = ctx->shmoffset; - shmsize = ctx->shmsize; - - /* Join the memory back into the free linked list */ - p = ctx->freep; - nextp = SHM_PTR(p); - - for ( ; !(bp > p && bp < nextp); p=nextp, nextp=SHM_PTR(p)) { - if (p >= nextp && (bp > p || bp < nextp)) - break; /* Freed block at start or end of arena */ - nextp = SHM_PTR(p); - shmid = p->s.shmid; - shmoffset = p->s.shmoffset; - shmsize = p->s.shmsize; - } - - if (bp + bp->s.size == nextp) {/* join to upper neighbour */ - bp->s.size += nextp->s.size; - bp->s.ptr = nextp->s.ptr; - ctx->nfrags--; /* Lost a fragment */ - bp->s.shmid = nextp->s.shmid; - bp->s.shmoffset = nextp->s.shmoffset; - bp->s.shmsize = nextp->s.shmsize; - } else { - bp->s.ptr = nextp; - bp->s.shmid = p->s.shmid; - bp->s.shmoffset = p->s.shmoffset; - bp->s.shmsize = p->s.shmsize; - } - - if (p + p->s.size == bp) { /* Join to lower neighbour */ - p->s.size += bp->s.size; - p->s.ptr = bp->s.ptr; - ctx->nfrags--; /* Lost a fragment */ - p->s.shmid = bp->s.shmid; - p->s.shmoffset = bp->s.shmoffset; - p->s.shmsize = bp->s.shmsize; - } else { - p->s.ptr = bp; - armci_get_shmem_info((char*)bp, &p->s.shmid, &p->s.shmoffset, - &p->s.shmsize); - } - - ctx->freep = p; - ctx->shmid = shmid; - ctx->shmoffset = shmoffset; - ctx->shmsize = shmsize; - } /* end if on ap */ - - UNLOCKIT(armci_master); -} -#else /* #ifdef SYSV */ -/* What are doing here */ -static char *kr_malloc_shmem(size_t nbytes, context_t *ctx) -{ - armci_die("kr_malloc_shmem(): Invalid Function Call", 0L); -} -static void kr_free_shmem(char *ap, context_t *ctx) -{ - armci_die("kr_free_shmem(): Invalid Function Call", 0L); -} -#endif /* #ifdef SYSV */ -/********************** END: kr_malloc for ctx_shmem *********************/ - - -/** -issues: -1. do usage statistics only if debug/DEBUG is enabled -*/ diff --git a/armci/src-gemini/kr_malloc.h b/armci/src-gemini/kr_malloc.h deleted file mode 100644 index 5d6e01172..000000000 --- a/armci/src-gemini/kr_malloc.h +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef KR_MALLOC_H /* K&R malloc */ -#define KR_MALLOC_H - -#ifdef CRAY -#define LOG_ALIGN 6 -#elif defined(KSR) -#define LOG_ALIGN 7 -#else -#define LOG_ALIGN 6 -#endif - -#define ALIGNMENT (1 << LOG_ALIGN) - -#define KR_CTX_SHMEM 101 -#define KR_CTX_LOCALMEM 102 - -union header{ - struct { - unsigned valid1; /* Token to check if is not overwritten */ - union header *ptr; /* next block if on free list */ - int shmid; /* next block's shared memory id */ - long shmoffset; /* next block's shmem offset */ - size_t shmsize; /* next block's shared memory segment size */ - size_t size; /* size of this block*/ - unsigned valid2; /* Another token acting as a guard */ - } s; - char align[ALIGNMENT]; /* Align to ALIGNMENT byte boundary */ -}; - -typedef union header Header; - -typedef struct malloc_context { - size_t usize; /* unit size in bytes */ - size_t nalloc; /* No. of units of length ALIGNMENT */ - size_t max_nalloc; /* Maximum no. of units that can get */ - void * (*alloc_fptr)(); /* function pointer to memory alloc routine */ - size_t total; /* Amount request from system in units */ - long nchunk; /* No. of chunks of system memory */ - long inuse; /* Amount in use in units */ - long maxuse; /* Maximum value of inuse */ - long nfrags; /* No. of fragments divided into */ - long nmcalls; /* No. of calls to _armci_alloc() */ - long nfcalls; /* No. of calls to memfree */ - int ctx_type; /* context id. - -1 represents ctx_local context. - otherwise, it is ctx_shmem context. */ - int shmid; /* first free block's (i.e.freep) shmem id */ - long shmoffset; /* first free block's shmem offset */ - size_t shmsize; /* first free block's shmem total size */ - Header base; /* empty list to get started */ - Header *freep; /* start of free list */ - Header *usedp; /* start of used list */ -} context_t; - -/* Memory required to store the shmem context in shared memory. This shmem - context shuld be stored in shared memory and is stored in the first shared - memory segment created (i.e.armci_krmalloc_init_ctxshmem) */ -#define SHMEM_CTX_MEM (sizeof(context_t)+sizeof(void*)) -#define SHMEM_CTX_BYTES ((SHMEM_CTX_MEM + sizeof(Header) - 1)>>LOG_ALIGN) + 1; - -extern void kr_malloc_init(size_t usize, /* unit size in bytes */ - size_t nalloc, - size_t max_nalloc, - void * (*alloc_fptr)(), /* memory alloc routine */ - int debug, - context_t *ctx); - -/* - Returns data aligned on a quad boundary. Even if the request - size is zero it returns a non-zero pointer. -*/ -extern char *kr_malloc(size_t size, context_t *ctx, int new_allocation, void **new_base, size_t *new_size); - -/* - Frees memory allocated by kr_malloc(). Ignores NULL pointers - but must not be called twice for the same pointer or called - with non-memalloc'ed pointers -*/ -extern void kr_free(char *ptr, context_t *ctx); - -/* - Print to standard output the usage statistics ... a wrapper - for kr_malloc_stats(); -*/ -extern void kr_malloc_print_stats(context_t *ctx); - -extern void kr_malloc_verify(context_t *ctx); - -#endif diff --git a/armci/src-gemini/locks.c b/armci/src-gemini/locks.c deleted file mode 100644 index 8bc95b21c..000000000 --- a/armci/src-gemini/locks.c +++ /dev/null @@ -1,90 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: locks.c,v 1.15.6.1 2006-12-14 13:24:36 manoj Exp $ */ -#define _LOCKS_C_ -#include "armcip.h" -#include "locks.h" -#ifndef WIN32 -# include -#endif -#include - - -extern void armci_die(char*,int); - -#if defined(SPINLOCK) || defined(PMUTEXES) - -void **ptr_arr; - -void CreateInitLocks(int num_locks, lockset_t *plockid) -{ -int locks_per_proc, size; -extern void armci_set_serv_mutex_arr(void *); - ARMCI_PR_DBG("enter",0); - ptr_arr = (void**)malloc(armci_nproc*sizeof(void*)); - locks_per_proc = (num_locks*armci_nclus)/armci_nproc + 1; - size=locks_per_proc*sizeof(PAD_LOCK_T); - PARMCI_Malloc(ptr_arr, size); - _armci_int_mutexes = (PAD_LOCK_T*) ptr_arr[armci_master]; -# ifdef PORTALS_SPECIFIC_QUESTION - if(armci_me==armci_master)armci_set_serv_mutex_arr(_armci_int_mutexes); -# endif - - if(!_armci_int_mutexes) armci_die("Failed to create spinlocks",size); - -#ifdef PMUTEXES - if(armci_me == armci_master) { - int i; - pthread_mutexattr_t pshared; - if(pthread_mutexattr_init(&pshared)) - armci_die("armci_allocate_locks: could not init mutex attr",0); -# ifndef LINUX - if(pthread_mutexattr_setpshared(&pshared,PTHREAD_PROCESS_SHARED)) - armci_die("armci_allocate_locks: could not set PROCESS_SHARED",0); -# endif - - for(i=0; i< locks_per_proc*armci_clus_info[armci_clus_me].nslave; i++){ - if(pthread_mutex_init(_armci_int_mutexes+i,&pshared)) - armci_die("armci_allocate_locks: could not init mutex",i); - } - } -#else - - bzero((char*)ptr_arr[armci_me],size); - ARMCI_PR_DBG("exit",0); -#endif -} - -void InitLocks(int num_locks, lockset_t lockid) -{ - /* what are you doing here ? - All processes should've called CreateInitLocks(). - Check preprocessor directtives and see lock allocation in armci_init */ - armci_die("InitLocks(): what are you doing here ?",armci_me); -} - - -void DeleteLocks(lockset_t lockid) -{ - _armci_int_mutexes = (PAD_LOCK_T*)0; -} - -#else -/*********************** every thing else *************************/ - -void CreateInitLocks(int num_locks, lockset_t *lockid) -{} - -void InitLocks(int num_locks, lockset_t lockid) -{ -} - - -void DeleteLocks(lockset_t lockid) -{ -} - -#endif - diff --git a/armci/src-gemini/locks.h b/armci/src-gemini/locks.h deleted file mode 100644 index 2504b6176..000000000 --- a/armci/src-gemini/locks.h +++ /dev/null @@ -1,174 +0,0 @@ -#ifndef _ARMCI_LOCKS_H_ -#define _ARMCI_LOCKS_H_ - -#if HAVE_SYS_TYPES_H -# include -#endif - -#define MAX_LOCKS 1024 -#define NUM_LOCKS MAX_LOCKS - -#if !(defined(PMUTEX) || defined(PSPIN) || defined(CYGNUS) || defined(CRAY_XT)) -# include "spinlock.h" -#endif - -#if !(defined(PMUTEX) || defined(PSPIN) || defined(SPINLOCK)) -# error cannot run -#endif - -#if (defined(SPINLOCK) || defined(PMUTEX) || defined(PSPIN) || defined(HITACHI)) && !(defined(BGML) || defined(DCMF)) -# include "armci_shmem.h" -typedef struct { - long off; - long idlist[SHMIDLEN]; -} lockset_t; -extern lockset_t lockid; -#elif defined(BGML) || defined(DCMF) -typedef int lockset_t; -#endif - -#if defined(PMUTEX) -# warning SPINLOCK: pthread_mutex_lock -# include -# include -# define NAT_LOCK(x,p) pthread_mutex_lock(_armci_int_mutexes +x) -# define NAT_UNLOCK(x,p) pthread_mutex_unlock(_armci_int_mutexes +x) -# define LOCK_T pthread_mutex_t -# define PAD_LOCK_T LOCK_T -PAD_LOCK_T *_armci_int_mutexes; - -#elif defined(PSPIN) -# warning SPINLOCK: pthread_spin_lock -# include -# include -# define NAT_LOCK(x,p) pthread_spin_lock(_armci_int_mutexes +x) -# define NAT_UNLOCK(x,p) pthread_spin_unlock(_armci_int_mutexes +x) -# define LOCK_T pthread_spinlock_t -# define PAD_LOCK_T LOCK_T -PAD_LOCK_T *_armci_int_mutexes; - -#elif defined(SPINLOCK) && defined(SGIALTIX) -# define NAT_LOCK(x,p) armci_acquire_spinlock((LOCK_T*)( ((PAD_LOCK_T*)(((void**)_armci_int_mutexes)[p]))+x )) -# define NAT_UNLOCK(x,p) armci_release_spinlock((LOCK_T*)( ((PAD_LOCK_T*)(((void**)_armci_int_mutexes)[p]))+x )) -PAD_LOCK_T *_armci_int_mutexes; - -#elif defined(SPINLOCK) -# define NAT_LOCK(x,p) armci_acquire_spinlock((LOCK_T*)(_armci_int_mutexes+(x))) -# define NAT_UNLOCK(x,p) armci_release_spinlock((LOCK_T*)(_armci_int_mutexes+(x))) -PAD_LOCK_T *_armci_int_mutexes; - -#elif defined(HITACHI) -extern void armcill_lock(int mutex, int proc); -extern void armcill_unlock(int mutex, int proc); -# define LOCK_T int -# define PAD_LOCK_T LOCK_T -# define NAT_LOCK(x,p) armcill_lock((x),(p)) -# define NAT_UNLOCK(x,p) armcill_unlock((x),(p)) -PAD_LOCK_T *_armci_int_mutexes; - -#elif defined(SGI) -# define SGI_SPINS 100 -# include -typedef struct { - int id; - ulock_t * lock_array[NUM_LOCKS]; -}lockset_t; -extern lockset_t lockset; -# define NAT_LOCK(x,p) (void) uswsetlock(lockset.lock_array[(x)],SGI_SPINS) -# define NAT_UNLOCK(x,p) (void) usunsetlock(lockset.lock_array[(x)]) - -#elif defined(CONVEX) -# include -typedef struct{ - unsigned state; - unsigned pad[15]; -} lock_t; -typedef int lockset_t; -extern lock_t *lock_array; -extern void setlock(unsigned * volatile lp); -extern void unsetlock(unsigned * volatile lp); -# define NAT_LOCK(x,p) (void) setlock(&lock_array[x].state) -# define NAT_UNLOCK(x,p) (void) unsetlock(&lock_array[(x)].state) - -#elif defined(WIN32) -typedef int lockset_t; -extern void setlock(int); -extern void unsetlock(int); -# define NAT_LOCK(x,p) setlock(x) -# define NAT_UNLOCK(x,p) unsetlock(x) - -#elif defined(CRAY_YMP) && !defined(__crayx1) -# include -typedef int lockset_t; -extern lock_t cri_l[NUM_LOCKS]; -# pragma _CRI common cri_l -# define NAT_LOCK(x,p) t_lock(cri_l+(x)) -# define NAT_UNLOCK(x,p) t_unlock(cri_l+(x)) - -#elif defined(CRAY_T3E) || defined(__crayx1) || defined(CATAMOUNT) || defined(CRAY_SHMEM) || defined(PORTALS) -# include -# if defined(CRAY) || defined(CRAY_XT) -# include -# endif -# if defined(DECOSF) || defined(LINUX64) || defined(__crayx1) || defined(CATAMOUNT) -# define _INT_MIN_64 (LONG_MAX-1) -# endif -# undef NUM_LOCKS -# define NUM_LOCKS 4 -static long armci_lock_var[4]={0,0,0,0}; -typedef int lockset_t; -# define INVALID (long)(_INT_MIN_64 +1) -# define NAT_LOCK(x,p) while( shmem_swap(armci_lock_var+(x),INVALID,(p)) ) -# define NAT_UNLOCK(x,p) shmem_swap(armci_lock_var+(x), 0, (p)) - -#elif defined(SYSV) && defined(LAPI) && defined(AIX) -int **_armci_int_mutexes; -# define NAT_LOCK(x,p) armci_lapi_lock(_armci_int_mutexes[armci_master]+(x)) -# define NAT_UNLOCK(x,p) armci_lapi_unlock(_armci_int_mutexes[armci_master]+(x)) -typedef int lockset_t; - -#elif defined(CYGNUS) -typedef int lockset_t; -# define NAT_LOCK(x,p) armci_die("does not run in parallel",0) -# define NAT_UNLOCK(x,p) armci_die("does not run in parallel",0) - -#elif defined(LAPI) && !defined (LINUX) -# include -typedef int lockset_t; -extern pthread_mutex_t _armci_mutex_thread; -# define NAT_LOCK(x,p) pthread_mutex_lock(&_armci_mutex_thread) -# define NAT_UNLOCK(x,p) pthread_mutex_unlock(&_armci_mutex_thread) - -#elif defined(FUJITSU) -typedef int lockset_t; -# include "fujitsu-vpp.h" - -#elif defined(SYSV) || defined(MACX) -# include "semaphores.h" -# undef NUM_LOCKS -# define NUM_LOCKS ((MAX_LOCKS< SEMMSL) ? MAX_LOCKS:SEMMSL) -# define NAT_LOCK(x,p) P_semaphore(x) -# define NAT_UNLOCK(x,p) V_semaphore(x) -# ifndef _LOCKS_C_ -# define CreateInitLocks Sem_CreateInitLocks -# define InitLocks Sem_InitLocks -# define DeleteLocks Sem_DeleteLocks -# endif - -#else -# error -#endif - -extern void CreateInitLocks(int num, lockset_t *id); -extern void InitLocks(int num , lockset_t id); -extern void DeleteLocks(lockset_t id); - -#ifdef FUJITSU -# define NATIVE_LOCK(x,p) if(armci_nproc>1) { NAT_LOCK(p); } -# define NATIVE_UNLOCK(x,p) if(armci_nproc>1) { NAT_UNLOCK(p); } -#else -# define NATIVE_LOCK(x,p) if(armci_nproc>1) { NAT_LOCK(x,p); } -# define NATIVE_UNLOCK(x,p) if(armci_nproc>1) { NAT_UNLOCK(x,p); } -#endif - -#endif /* _ARMCI_LOCKS_H_ */ diff --git a/armci/src-gemini/memlock.c b/armci/src-gemini/memlock.c deleted file mode 100644 index dfae952c7..000000000 --- a/armci/src-gemini/memlock.c +++ /dev/null @@ -1,269 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: memlock.c,v 1.24.2.3 2007-08-29 17:32:32 manoj Exp $ */ -#include "armcip.h" -#include "locks.h" -#include "copy.h" -#include "memlock.h" -#include - -#define DEBUG_ 0 -#define INVALID_VAL -9999999 - -#ifdef DATA_SERVER -# define CORRECT_PTR -#endif -size_t armci_mem_offset=0; - -/* We start by using table: assign address of local variable set to 1 - * On shmem systems, this addres is overwritten by a shared memory location - * when memlock array is allocated in armci_init - * Therefore, any process within shmem node can reset armci_use_memlock_table - * to "not used" when offset changes. Since the variable is in shmem, everybody - * on that SMP node will see the change and use the same locking functions - */ -int init_use_memlock_table=1; -int *armci_use_memlock_table=&init_use_memlock_table; - -static int locked_slot=INVALID_VAL; - -volatile double armci_dummy_work=0.; -void **memlock_table_array; - -/* constants for cache line alignment */ -# define CALGN 64 -# define LOG_CALGN 6 - -#define ALIGN_ADDRESS(x) (char*)((((unsigned long)x) >> LOG_CALGN) << LOG_CALGN) -static memlock_t table[MAX_SLOTS]; - - -/*\ simple locking scheme that ignores addresses -\*/ -void armci_lockmem_(void *pstart, void *pend, int proc) -{ - -#if defined(CLUSTER) && !defined(SGIALTIX) - int lock = (proc-armci_clus_info[armci_clus_id(proc)].master)%NUM_LOCKS; -#else - int lock = 0; -#endif - - if(DEBUG_){ - printf("%d: armci_lockmem_ proc=%d lock=%d\n",armci_me,proc,lock); - fflush(stdout); - } - - NATIVE_LOCK(lock,proc); - if(DEBUG_){ - printf("%d: armci_lockmem_ done\n",armci_me); - fflush(stdout); - } -} - -void armci_unlockmem_(int proc) -{ - -#if defined(CLUSTER) && !defined(SGIALTIX) - int lock = (proc-armci_clus_info[armci_clus_id(proc)].master)%NUM_LOCKS; -#else - int lock = 0; -#endif - if(DEBUG_){ - printf("%d: armci_unlockmem_ proc=%d lock=%d\n",armci_me,proc,lock); - fflush(stdout); - } - NATIVE_UNLOCK(lock,proc); -} - - - -/*\ idle for a time proportional to factor -\*/ -void armci_waitsome(int factor) -{ -int i=factor*100000; - - if(factor <= 1) armci_dummy_work =0.; - if(factor < 1) return; - while(--i){ - armci_dummy_work = armci_dummy_work + 1./(double)i; - } -} - -/*\ acquire exclusive LOCK to MEMORY area owned by process "proc" - * . only one area can be locked at a time by the calling process - * . must unlock it with armci_unlockmem -\*/ -void armci_lockmem(void *start, void *end, int proc) -{ - register void* pstart, *pend; - register int slot, avail=0; - int turn=0, conflict=0; - memlock_t *memlock_table; -#if defined(CLUSTER) && !defined(SGIALTIX) - int lock = (proc-armci_clus_info[armci_clus_id(proc)].master)%NUM_LOCKS; -#else - int lock = 0; -#endif - -#ifdef CORRECT_PTR - if(! *armci_use_memlock_table){ - /* if offset invalid, use dumb locking scheme ignoring addresses */ - armci_lockmem_(start, end, proc); - return; - } - -# ifndef SGIALTIX - /* when processes are attached to a shmem region at different addresses, - * addresses written to memlock table must be adjusted to the node master - */ - if(armci_mem_offset){ - start = armci_mem_offset + (char*)start; - end = armci_mem_offset + (char*)end; - } -# endif -#endif - - if(DEBUG_){ - printf("%d: calling armci_lockmem for %d range %p -%p\n", - armci_me, proc, start,end); - fflush(stdout); - } - memlock_table = (memlock_t*)memlock_table_array[proc]; - - -#ifdef ALIGN_ADDRESS - /* align address range on cache line boundary to avoid false sharing */ - pstart = ALIGN_ADDRESS(start); - pend = CALGN -1 + ALIGN_ADDRESS(end); -#else - pstart=start; - pend =end; -#endif - -#ifdef CRAY_SHMEM - { /* adjust according the remote process raw address */ - long bytes = (long) ((char*)pend-(char*)pstart); - extern void* armci_shmalloc_remote_addr(void *ptr, int proc); - pstart = armci_shmalloc_remote_addr(pstart, proc); - pend = (char*)pstart + bytes; - } -#endif - while(1){ - NATIVE_LOCK(lock,proc); - - armci_get(memlock_table, table, sizeof(table), proc); -/* armci_copy(memlock_table, table, sizeof(table));*/ - - /* inspect the table */ - conflict = 0; avail =-1; - for(slot = 0; slot < MAX_SLOTS; slot ++){ - - /* nonzero starting address means the slot is occupied */ - if(table[slot].start == NULL){ - - /* remember a free slot to store address range */ - avail = slot; - - }else{ - /*check for conflict: overlap between stored and current range*/ - if( (pstart >= table[slot].start && pstart <= table[slot].end) - || (pend >= table[slot].start && pend <= table[slot].end) ){ - - conflict = 1; - break; - - } - /* - printf("%d: locking %ld-%ld (%d) conflict\n", - armci_me, */ - } - } - - if(avail != -1 && !conflict) break; - - NATIVE_UNLOCK(lock,proc); - armci_waitsome( ++turn ); - - } - - /* we got the memory lock: enter address into the table */ - table[avail].start = pstart; - table[avail].end = pend; - armci_put(table+avail,memlock_table+avail,sizeof(memlock_t),proc); - - FENCE_NODE(proc); - - NATIVE_UNLOCK(lock,proc); - locked_slot = avail; - -} - - -/*\ release lock to the memory area locked by previous call to armci_lockemem -\*/ -void armci_unlockmem(int proc) -{ - void *null[2] = {NULL,NULL}; - memlock_t *memlock_table; - -#ifdef CORRECT_PTR - if(! *armci_use_memlock_table){ - /* if offset invalid, use dumb locking scheme ignoring addresses */ - armci_unlockmem_(proc); - return; - } -#endif - -#ifdef DEBUG - if(locked_slot == INVALID_VAL) armci_die("armci_unlock: empty",0); - if(locked_slot >= MAX_SLOTS || locked_slot <0) - armci_die("armci_unlock: corrupted slot?",locked_slot); -#endif - - memlock_table = (memlock_t*)memlock_table_array[proc]; - armci_put(null,&memlock_table[locked_slot].start,2*sizeof(void*),proc); - -} - - - -/*\ based on address for set by master, determine correction for - * memory addresses set in memlock table - * if the correction/offset ever changes stop using memlock table locking -\*/ -void armci_set_mem_offset(void *ptr) -{ - extern size_t armci_mem_offset; - size_t off; - static int first_time=1; - volatile void *ref_ptr; - - ARMCI_PR_DBG("enter",0); - /* do not care if memlock not used */ - if(! *armci_use_memlock_table) return; - - if(!ptr) armci_die("armci_set_mem_offset : null ptr",0); - ref_ptr = *(void**)ptr; - off = (size_t)((char*)ref_ptr - (char*)ptr); - - if(first_time){ - - armci_mem_offset =off; - first_time =0; - if(DEBUG_){ - printf("%d memlock offset=%ld ref=%p ptr=%p\n",armci_me, - (long)armci_mem_offset, ref_ptr, ptr); fflush(stdout); - } - - }else{ - if(armci_mem_offset != off){ - *armci_use_memlock_table =0; - fprintf(stderr, "%d: WARNING:armci_set_mem_offset: offset changed %ld to %ld\n", - armci_me, (long)armci_mem_offset, (long)off); fflush(stdout); - } - } -} diff --git a/armci/src-gemini/memlock.h b/armci/src-gemini/memlock.h deleted file mode 100644 index cd3147d20..000000000 --- a/armci/src-gemini/memlock.h +++ /dev/null @@ -1,38 +0,0 @@ -/* $Id: memlock.h,v 1.18 2004-09-21 17:26:23 manoj Exp $ */ -#ifndef _MEMLOCK_H_ -#define _MEMLOCK_H_ - - -/* data structure for locking memory areas */ -#define MAX_SLOTS 8 -typedef struct{ - void *start; - void *end; -} memlock_t; - -/* SGI Altix Stuff */ -typedef struct { - void *seg_addr; /* master's starting address of the segment */ - size_t seg_size; - size_t tile_size; - size_t mem_offset; -}armci_memoffset_t; - -extern void** memlock_table_array; -extern int *armci_use_memlock_table; - -#if defined(LAPI) || defined(FUJITSU) || defined(PTHREADS) || defined(QUADRICS)\ - || defined(PORTALS) || defined(HITACHI) || (defined(LINUX64)&&defined(__GNUC__)&&defined(__alpha__))\ - || defined(CYGWIN) || defined(__crayx1) || defined(NEC) || defined(LIBONESIDED) -# define ARMCI_LOCKMEM armci_lockmem_ -# define ARMCI_UNLOCKMEM armci_unlockmem_ -#else -# define ARMCI_LOCKMEM armci_lockmem -# define ARMCI_UNLOCKMEM armci_unlockmem -# error "rmo: i think this is broken and should not be used" -#endif - -extern void ARMCI_LOCKMEM(void *pstart, void *pend, int proc); -extern void ARMCI_UNLOCKMEM(int proc); -#define MEMLOCK_SHMEM_FLAG -#endif diff --git a/armci/src-gemini/memory.c b/armci/src-gemini/memory.c deleted file mode 100644 index e023f9869..000000000 --- a/armci/src-gemini/memory.c +++ /dev/null @@ -1,1062 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: memory.c,v 1.56.2.3 2007-04-25 23:49:55 d3p687 Exp $ */ -#ifndef NEW_MALLOC -#include -#include -#include "armcip.h" -#include "message.h" -#include "kr_malloc.h" - -#define DEBUG_ 0 -#define USE_MALLOC -#define USE_SHMEM_ -#define SHM_UNIT 1024 - -static context_t ctx_localmem; -/* -static context_t ctx_mlocalmem; -*/ -#if defined(SYSV) || defined(WIN32) || defined(MMAP) || defined(HITACHI) -#include "armci_shmem.h" - -#if !defined(USE_SHMEM) && (defined(HITACHI) || defined(MULTI_CTX)) -# define USE_SHMEM -#endif - -#if !(defined(LAPI)||defined(QUADRICS)||defined(SERVER_THREAD)) ||\ - defined(USE_SHMEM)||defined(LIBONESIDED) -#define RMA_NEEDS_SHMEM -#endif - -void kr_check_local() -{ -#if 0 -kr_malloc_print_stats(&ctx_localmem); -#endif -kr_malloc_verify(&ctx_localmem); -} - -void armci_print_ptr(void **ptr_arr, int bytes, int size, void* myptr, int off) -{ -int i; -int nproc = armci_clus_info[armci_clus_me].nslave; - - ARMCI_PR_DBG("enter",0); - for(i=0; i< armci_nproc; i++){ - int j; - if(armci_me ==i){ - printf("%d master =%d nproc=%d off=%d\n",armci_me, - armci_master,nproc, off); - printf("%d:bytes=%d mptr=%p s=%d ",armci_me, bytes, myptr,size); - for(j = 0; j< armci_nproc; j++)printf(" %p",ptr_arr[j]); - printf("\n"); fflush(stdout); - } - armci_msg_barrier(); - } - ARMCI_PR_DBG("exit",0); -} - - -/*\ master exports its address of shmem region at the beggining of that region -\*/ -static void armci_master_exp_attached_ptr(void* ptr) -{ - ARMCI_PR_DBG("enter",0); - if(!ptr) armci_die("armci_master_exp_att_ptr: null ptr",0); - *(volatile void**)ptr = ptr; - ARMCI_PR_DBG("exit",0); -} - - -/*\ Collective Memory Allocation on shared memory systems -\*/ -void armci_shmem_malloc(void *ptr_arr[], armci_size_t bytes) -{ - void *myptr=NULL, *ptr=NULL; - long idlist[SHMIDLEN]; - long size=0, offset=0; - long *size_arr; - void **ptr_ref_arr; - int i,cn, len; - int nproc = armci_clus_info[armci_clus_me].nslave; - ARMCI_PR_DBG("enter",0); - bzero((char*)ptr_arr,armci_nproc*sizeof(void*)); - - /* allocate work arrays */ - size_arr = (long*)calloc(armci_nproc,sizeof(long)); - if(!size_arr)armci_die("armci_malloc:calloc failed",armci_nproc); - /* allocate arrays for cluster address translations */ - - ptr_ref_arr = calloc(armci_nclus,sizeof(void*)); /* must be zero */ - if(!ptr_ref_arr)armci_die("armci_malloc:calloc 2 failed",armci_nclus); - - /* combine all memory requests into size_arr */ - size_arr[armci_me] = bytes; - armci_msg_lgop(size_arr, armci_nproc, "+"); - - /* determine aggregate request size on the cluster node */ - for(i=0, size=0; i< nproc; i++) size += size_arr[i+armci_master]; - - /* master process creates shmem region and then others attach to it */ - if(armci_me == armci_master ){ - /* can malloc if there is no data server process and has 1 process/node*/ -# ifndef RMA_NEEDS_SHMEM - if(nproc == 1) - myptr = kr_malloc(size, &ctx_localmem, 0, NULL, NULL); - else -# endif - myptr = Create_Shared_Region(idlist+1,size,idlist); - if(!myptr && size>0 )armci_die("armci_malloc: could not create", (int)(size>>10)); - - /* place its address at begining of attached region for others to see */ - if(size)armci_master_exp_attached_ptr(myptr); - - if(DEBUG_){ - printf("%d:armci_malloc addr mptr=%p size=%ld\n",armci_me,myptr,size); - fflush(stdout); - } - } - - /* broadcast shmem id to other processes on the same cluster node */ - armci_msg_clus_brdcst(idlist, SHMIDLEN*sizeof(long)); - - if(armci_me != armci_master){ - myptr=(double*)Attach_Shared_Region(idlist+1,size,idlist[0]); - if(!myptr)armci_die("armci_malloc: could not attach", (int)(size>>10)); - - /* now every process in a SMP node needs to find out its offset - * w.r.t. master - this offset is necessary to use memlock table - */ - if(size) armci_set_mem_offset(myptr); - if(DEBUG_){ - printf("%d:armci_malloc attached addr mptr=%p ref=%p size=%ld\n", - armci_me,myptr, *(void**)myptr,size); fflush(stdout); - } - } -# if defined(DATA_SERVER) - /* get server reference address for every cluster node to perform - * remote address translation for global address space */ - if(armci_nclus>1){ - if(armci_me == armci_master){ -# ifdef SERVER_THREAD - ptr_ref_arr[armci_clus_me]=myptr; -# else - { - extern int _armci_server_started; - if(_armci_server_started) { - armci_serv_attach_req(idlist, SHMIDLEN*sizeof(long), size, - &ptr, sizeof(void*)); - ptr_ref_arr[armci_clus_me]= myptr; /* from server*/ - } - else /* server not yet started */ - ptr_ref_arr[armci_clus_me]=myptr; - } - - if(DEBUG_){ - printf("%d:addresses server=%p myptr=%p\n",armci_me,ptr,myptr); - fflush(stdout); - } -# endif - } - /* exchange ref addr of shared memory region on every cluster node*/ - armci_exchange_address(ptr_ref_arr, armci_nclus); - # ifdef ARMCI_REGISTER_SHMEM - armci_register_shmem(myptr,size,idlist+1,idlist[0],ptr_ref_arr[armci_clus_me]); - # endif - }else { - ptr_ref_arr[armci_master] = myptr; - } - /* translate addresses for all cluster nodes */ - for(cn = 0; cn < armci_nclus; cn++){ - int master = armci_clus_info[cn].master; - offset = 0; - /* on local cluster node use myptr directly */ - ptr = (armci_clus_me == cn) ? myptr: ptr_ref_arr[cn]; - /* compute addresses pointing to the memory regions on cluster node*/ - for(i=0; i< armci_clus_info[cn].nslave; i++){ - /* NULL if request size is 0*/ - ptr_arr[i+master] = (size_arr[i+master])? ((char*)ptr)+offset : NULL; - offset += size_arr[i+master]; - } - } -# else - /* compute addresses for local cluster node */ - offset =0; - for(i=0; i< nproc; i++) { - ptr_ref_arr[i] = (size_arr[i+armci_master])? ((char*)myptr)+offset : 0L; - offset += size_arr[i+armci_master]; - } - /* exchange addreses with all other processes */ - ptr_arr[armci_me] = (char*)ptr_ref_arr[armci_me-armci_master]; - armci_exchange_address(ptr_arr, armci_nproc); - /* overwrite entries for local cluster node with ptr_ref_arr */ - bcopy((char*)ptr_ref_arr, (char*)(ptr_arr+armci_master), nproc*sizeof(void*)); - /* armci_print_ptr(ptr_arr, bytes, size, myptr, offset);*/ -# endif - - armci_msg_barrier(); - - /* free work arrays */ - free(ptr_ref_arr); - free(size_arr); - ARMCI_PR_DBG("exit",0); - -} - -/******************************************************************** - * Non-collective Memory Allocation on shared memory systems -\*/ -void armci_shmem_memget(armci_meminfo_t *meminfo, size_t size) { - void *myptr=NULL; - void *armci_ptr=NULL; /* legal ARCMIptr used in ARMCI data xfer ops */ - long idlist[SHMIDLEN]; - - /* can malloc if there is no data server process & has 1 process/node*/ -#ifndef RMA_NEEDS_SHMEM - if( armci_clus_info[armci_clus_me].nslave == 1) - myptr = kr_malloc(size, &ctx_localmem, 0, NULL, NULL); - else -#endif - myptr = Create_Shared_Region(idlist+1,size,idlist); - - if(!myptr && size>0 ) - armci_die("armci_shmem_memget: create failed", (int)(size>>10)); - - if(DEBUG_) - { - printf("%d: armci_shmem_memget: addr=%p size=%ld %ld %ld \n", armci_me, - myptr, size, idlist[0], idlist[1]); - fflush(stdout); - } - - armci_ptr = myptr; - -#if defined(DATA_SERVER) - - /* get server reference address to perform - * remote address translation for global address space */ - if(armci_nclus>1) - { -# ifdef SERVER_THREAD - - /* data server thread runs on master process */ - if(armci_me != armci_master) { - armci_serv_attach_req(idlist, SHMIDLEN*sizeof(long), size, - &armci_ptr, sizeof(void*)); - } - -# else - /* ask dataserver process to attach to region and get ptr*/ - { - extern int _armci_server_started; - if(_armci_server_started) { - armci_serv_attach_req(idlist, SHMIDLEN*sizeof(long), size, - &armci_ptr, sizeof(void*)); - } - } -# endif - } -#endif - - /* fill the meminfo structure */ - meminfo->armci_addr = armci_ptr; - meminfo->addr = myptr; - meminfo->size = size; - meminfo->cpid = armci_me; - bcopy(idlist, meminfo->idlist, SHMIDLEN*sizeof(long)); - -} - -void* armci_shmem_memat(armci_meminfo_t *meminfo) { - void *ptr=NULL; - long size = (long) meminfo->size; - long *idlist = (long*) meminfo->idlist; - - if(SAMECLUSNODE(meminfo->cpid)) - { - /* Attach to the shared memory segment */ - ptr=(double*)Attach_Shared_Region(idlist+1,size,idlist[0]); - if(!ptr)armci_die("ARMCi_Memat: could not attach", (int)(size>>10)); - - /* CHECK: now every process in a SMP node needs to find out its offset - * w.r.t. master - this offset is necessary to use memlock table - */ - if(size) armci_set_mem_offset(ptr); - } - else - { - ptr = meminfo->armci_addr; /* remote address */ - } - - return ptr; -} - -void armci_shmem_memctl(armci_meminfo_t *meminfo) { - - /* only the creator can delete the segment */ - if(meminfo->cpid == armci_me) { - void *ptr = meminfo->addr; - -#ifdef RMA_NEEDS_SHMEM - Free_Shmem_Ptr(0,0,ptr); -#else - if(armci_clus_info[armci_clus_me].nslave>1) - Free_Shmem_Ptr(0,0,ptr); - else kr_free(ptr, &ctx_localmem); -#endif - } -} - -/****** End: Non-collective memory allocation on shared memory systems *****/ - -#ifdef MSG_COMMS_MPI -/******************************************************************** - * Group Memory Allocation on shared memory systems for ARMCI Groups -\*/ -void armci_shmem_malloc_group(void *ptr_arr[], armci_size_t bytes, - ARMCI_Group *group) -{ - void *myptr=NULL, *ptr=NULL; - long idlist[SHMIDLEN]; - long size=0, offset=0; - long *size_arr; - void **ptr_ref_arr; - int i,cn, len; - /* int nproc = armci_clus_info[armci_clus_me].nslave; ? change ? */ - int grp_me, grp_nproc, grp_nclus, grp_master, grp_clus_nproc, grp_clus_me; - armci_grp_attr_t *grp_attr=ARMCI_Group_getattr(group); - ARMCI_PR_DBG("enter",0); - - /* Get the group info: group size & group rank */ - ARMCI_Group_size(group, &grp_nproc); - ARMCI_Group_rank(group, &grp_me); - if(grp_me == MPI_UNDEFINED) { /* check if the process is in this group */ - armci_die("armci_malloc_group: process is not a member in this group", - armci_me); - } - - grp_nclus = grp_attr->grp_nclus; - grp_clus_me = grp_attr->grp_clus_me; - grp_master = grp_attr->grp_clus_info[grp_clus_me].master; - grp_clus_nproc = grp_attr->grp_clus_info[grp_clus_me].nslave; - - bzero((char*)ptr_arr,grp_nproc*sizeof(void*)); - - /* allocate work arrays */ - size_arr = (long*)calloc(grp_nproc,sizeof(long)); - if(!size_arr)armci_die("armci_malloc_group:calloc failed",grp_nproc); - - /* allocate arrays for cluster address translations */ -# if defined(DATA_SERVER) - len = grp_nclus; -# else - len = grp_clus_nproc; -# endif - - ptr_ref_arr = calloc(len,sizeof(void*)); /* must be zero */ - if(!ptr_ref_arr)armci_die("armci_malloc_group:calloc 2 failed",len); - - /* combine all memory requests into size_arr */ - size_arr[grp_me] = bytes; - armci_msg_group_gop_scope(SCOPE_ALL, size_arr, grp_nproc, "+", ARMCI_LONG, - group); - - /* determine aggregate request size on the cluster node */ - for(i=0, size=0; i< grp_clus_nproc; i++) size += size_arr[i+grp_master]; - - /* master process creates shmem region and then others attach to it */ - if(grp_me == grp_master ){ - - - /* can malloc if there is no data server process and has 1 process/node*/ -# ifndef RMA_NEEDS_SHMEM - if( armci_clus_info[armci_clus_me].nslave == 1) - myptr = kr_malloc(size, &ctx_localmem, 0, NULL, NULL); - else -# endif - myptr = Create_Shared_Region(idlist+1,size,idlist); - if(!myptr && size>0 ) - armci_die("armci_malloc_group: could not create", (int)(size>>10)); - - /* place its address at begining of attached region for others to see */ - if(size)armci_master_exp_attached_ptr(myptr); - - if(DEBUG_){ - printf("%d:armci_malloc_group addr mptr=%p ref=%p size=%ld %ld %ld \n",armci_me,myptr,*(void**)myptr, size,idlist[0],idlist[1]); - fflush(stdout); - } - } - - /* broadcast shmem id to other processes (in the same group) on the - same cluster node */ - armci_grp_clus_brdcst(idlist, SHMIDLEN*sizeof(long), grp_master, - grp_clus_nproc, group); - - if(grp_me != grp_master){ - myptr=(double*)Attach_Shared_Region(idlist+1,size,idlist[0]); - if(!myptr)armci_die("armci_malloc_group: could not attach", (int)(size>>10)); - - /* now every process in a SMP node needs to find out its offset - * w.r.t. master - this offset is necessary to use memlock table - */ - if(size) armci_set_mem_offset(myptr); - if(DEBUG_){ - printf("%d:armci_malloc_group attached addr mptr=%p ref=%p size=%ld\n", - armci_me,myptr, *(void**)myptr,size); fflush(stdout); - } - } - -# if defined(DATA_SERVER) - - /* get server reference address for every cluster node in the group - * to perform remote address translation for global address space */ - if(grp_nclus>1){ - if(grp_me == grp_master){ - -# ifdef SERVER_THREAD - - /* data server thread runs on master process */ - if(ARMCI_Absolute_id(group,grp_master)!=armci_master){ - /*printf("\n%d: grp_master=%d %ld %ld \n",armci_me,ARMCI_Absolute_id(group,grp_master),idlist[0],idlist[1]);*/ - armci_serv_attach_req(idlist, SHMIDLEN*sizeof(long), size, - &ptr, sizeof(void*)); - ptr_ref_arr[grp_clus_me]= ptr; /* from server*/ - } - else - ptr_ref_arr[grp_clus_me]=myptr; - -# else - /* ask data server process to attach to the region and get ptr */ - { - extern int _armci_server_started; - if(_armci_server_started) { - armci_serv_attach_req(idlist, SHMIDLEN*sizeof(long), size, - &ptr, sizeof(void*)); - ptr_ref_arr[grp_clus_me]= ptr; /* from server*/ - } - else /* server not yet started */ - ptr_ref_arr[grp_clus_me]=myptr; - } - - if(DEBUG_){ - printf("%d:addresses server=%p myptr=%p\n",grp_me,ptr,myptr); - fflush(stdout); - } -# endif - } - /* exchange ref addr of shared memory region on every cluster node*/ - { - int ratio = sizeof(void*)/sizeof(int); - if(DEBUG_)printf("%d: exchanging %ld ratio=%d\n",armci_me, - (long)ptr_arr[grp_me], ratio); - armci_msg_group_gop_scope(SCOPE_ALL, ptr_ref_arr, grp_nclus*ratio, - "+", ARMCI_INT, group); - # ifdef ARMCI_REGISTER_SHMEM - armci_register_shmem_grp(myptr,size,idlist+1,idlist[0],ptr_ref_arr[armci_clus_me],group); - # endif - } - }else { - - ptr_ref_arr[grp_master] = myptr; - - } - - /* translate addresses for all cluster nodes */ - for(cn = 0; cn < grp_nclus; cn++){ - - int master = grp_attr->grp_clus_info[cn].master; - offset = 0; - - /* on local cluster node use myptr directly */ - ptr = (grp_clus_me == cn) ? myptr: ptr_ref_arr[cn]; - - /* compute addresses pointing to the memory regions on cluster node*/ - for(i=0; i< grp_attr->grp_clus_info[cn].nslave; i++){ - - /* NULL if request size is 0*/ - ptr_arr[i+master] =(size_arr[i+master])? ((char*)ptr)+offset: NULL; - offset += size_arr[i+master]; - } - } - -# else - - /* compute addresses for local cluster node */ - offset =0; - for(i=0; i< grp_clus_nproc; i++) { - - ptr_ref_arr[i] = (size_arr[i+grp_master])? ((char*)myptr)+offset : 0L; - offset += size_arr[i+grp_master]; - - } - - /* exchange addreses with all other processes */ - ptr_arr[grp_me] = (char*)ptr_ref_arr[grp_me-grp_master]; - armci_exchange_address_grp(ptr_arr, grp_nproc, group); - - /* overwrite entries for local cluster node with ptr_ref_arr */ - bcopy((char*)ptr_ref_arr, (char*)(ptr_arr+grp_master), grp_clus_nproc*sizeof(void*)); - -# endif - - /* armci_print_ptr(ptr_arr, bytes, size, myptr, offset);*/ - - armci_msg_group_barrier(group); - - /* free work arrays */ - free(ptr_ref_arr); - free(size_arr); - ARMCI_PR_DBG("exit",0); -} -#endif /* ifdef MSG_COMMS_MPI */ - -#else - -void armci_shmem_malloc(void* ptr_arr[], int bytes) -{ - armci_die("armci_shmem_malloc should never be called on this system",0); -} -void armci_shmem_memget(armci_meminfo_t *meminfo, size_t size) { - armci_die("armci_shmem_memget should never be called on this system",0); -} -void* armci_shmem_memat(armci_meminfo_t *meminfo) { - armci_die("armci_shmem_memat should never be called on this system",0); -} -void armci_shmem_memctl(armci_meminfo_t *meminfo) { - armci_die("armci_shmem_memctl should never be called on this system",0); -} -# ifdef MSG_COMMS_MPI - void armci_shmem_malloc_group(void *ptr_arr[], armci_size_t bytes, - ARMCI_Group *group) { - armci_die("armci_shmem_malloc_group should never be called on this system",0); - } -# endif - -#endif - - -/* public constructor to initialize the kr_malloc context */ -void armci_krmalloc_init_localmem() { -#if defined(ALLOW_PIN) - kr_malloc_init(0, 0, 0, reg_malloc, 0, &ctx_localmem); - kr_malloc_init(0, 0, 0, malloc, 0, &ctx_mlocalmem); - ctx_mlocalmem.ctx_type = KR_CTX_LOCALMEM; -#elif defined(CRAY_SHMEM) && defined(CRAY_XT) -# ifdef CATAMOUNT - int units_avail = (cnos_shmem_size() - 1024 * 1024) / SHM_UNIT; -# else - extern size_t get_xt_heapsize(); - int units_avail = (get_xt_heapsize() - 1024 * 1024) / SHM_UNIT; -# endif - - if(DEBUG_) - { - fprintf(stderr,"%d:krmalloc_init_localmem: symheap=%llu,units(%d)=%d\n", - armci_me, SHM_UNIT*units_avail, SHM_UNIT, units_avail); - } - kr_malloc_init(SHM_UNIT, units_avail, units_avail, shmalloc, 0, - &ctx_localmem); - armci_shmalloc_exchange_offsets(&ctx_localmem); -#else - - kr_malloc_init(0, 0, 0, malloc, 0, &ctx_localmem); - -#endif - - ctx_localmem.ctx_type = KR_CTX_LOCALMEM; -} - -/** - * Local Memory Allocation and Free - */ -void *PARMCI_Malloc_local(armci_size_t bytes) { - void *rptr; - ARMCI_PR_DBG("enter",0); - ARMCI_PR_DBG("exit",0); - rptr = (void *)kr_malloc((size_t)bytes, &ctx_localmem, 0, NULL, NULL); - - # ifdef CRAY_REGISTER_ARMCI_MALLOC - onesided_hnd_t cp_hnd; - cos_mdesc_t local_mdesc; - - // get the onesided v2.0 api handle for the compute process - cpGetOnesidedHandle(&cp_hnd); - - // register the memory - onesided_mem_register(cp_hnd, rptr, bytes, 0, &local_mdesc); - - // for now; until we can search through the linked-list of registered memory - // to deregister it by pointer (ptr) value only [see ARMCI_Free_local], we'll - // take advanatage of lazy deregistration and assume that this segment will - // be kept around as long as it's active. - onesided_mem_deregister(cp_hnd, &local_mdesc); - # endif - - //printf("\n%d:%s:%d:%p\n",armci_me,__FUNCTION__,bytes,rptr); - return rptr; -} - -int PARMCI_Free_local(void *ptr) { - ARMCI_PR_DBG("enter",0); - kr_free((char *)ptr, &ctx_localmem); - ARMCI_PR_DBG("exit",0); - return 0; -} - - -/*\ Collective Memory Allocation - * returns array of pointers to blocks of memory allocated by everybody - * Note: as the same shared memory region can be mapped at different locations - * in each process address space, the array might hold different values - * on every process. However, the addresses are legitimate - * and can be used in the ARMCI data transfer operations. - * ptr_arr[nproc] -\*/ -int PARMCI_Malloc(void *ptr_arr[], armci_size_t bytes) -{ - void *ptr; - char *new_base; - size_t new_size=0; - ARMCI_PR_DBG("enter",0); - if(DEBUG_){ - fprintf(stderr,"%d bytes in armci_malloc %d\n",armci_me, (int)bytes); - fflush(stderr); - armci_msg_barrier(); - } - -# ifdef USE_MALLOC - if(armci_nproc == 1) { - - # ifdef CRAY_REGISTER_ARMCI_MALLOC - printf("%d: special case where ARMCI_Malloc uses malloc for nppn=1 - broken!\n",armci_me); - abort(); - # endif - - ptr = kr_malloc((size_t) bytes, &ctx_localmem, 0, NULL, NULL); - if(bytes) if(!ptr) armci_die("armci_malloc:malloc 1 failed",(int)bytes); - ptr_arr[armci_me] = ptr; - ARMCI_PR_DBG("exit",0); - return (0); - } -# endif - - // static int one_time = 0; - - if( ARMCI_Uses_shm() ) { - // if(one_time++ == 0 && armci_me==0) printf("%d: ARMCI_Uses_shm = true\n",armci_me); - armci_shmem_malloc(ptr_arr,bytes); - } else { - /* on distributed-memory systems just malloc & collect all addresses */ - ptr = kr_malloc(bytes, &ctx_localmem, 1, &new_base, &new_size); - if(bytes) if(!ptr) armci_die("armci_malloc:malloc 2 failed",bytes); - - bzero((char*)ptr_arr,armci_nproc*sizeof(void*)); - ptr_arr[armci_me] = ptr; - - /* now combine individual addresses into a single array */ - armci_exchange_address(ptr_arr, armci_nproc); - - # ifdef ARMCI_REGISTER_SHMEM - if(new_size) - armci_register_shmem(new_base,new_size,NULL,0,new_base); - else - armci_register_shmem(ptr,bytes,NULL,0,ptr); - # endif - - } - - # ifdef CRAY_REGISTER_ARMCI_MALLOC - int i; - cos_comm_t info; - cos_mdesc_t mdesc, *mdhs; - onesided_hnd_t cp_hnd; - //uint64_t options = 0; - uint64_t options = ONESIDED_MEM_NO_UDREG | ONESIDED_MEM_NO_RX_CQH; - remote_mdh_node_t *ll; - int node_master = armci_me; - long total_bytes, lbytes = (long) bytes; - NTK_MPI_GetComm(ARMCI_COMM_WORLD, &info); - long *bytes_per_rank = (long *) malloc(info.numa_np*sizeof(long)); - - // not a wonderfully scalable solution - // revisit this at a later time and make the storage node based - if(info.np > 80000 && armci_me == 0) { - // make it obvious!!!! - for(i=0; i<50; i++) printf("WARNING: Examine ARMCI_Malloc for memory scaling issues at large scale.\n"); - } - mdhs = (cos_mdesc_t *) malloc(info.np*sizeof(cos_mdesc_t)); - - ptr = ptr_arr[armci_me]; - - // determine the total number of bytes on the node that were allocated - // also allgather the number of bytes per rank on the node so we can set the offsets - MPI_Allreduce(&lbytes, &total_bytes, 1, MPI_LONG, MPI_SUM, info.numa_comm); - MPI_Allgather(&lbytes, 1, MPI_LONG, bytes_per_rank, 1, MPI_LONG, info.numa_comm); - - // get the onesided v2.0 api handle for the compute process - cpGetOnesidedHandle(&cp_hnd); - - if(info.numa_me == 0 && total_bytes) { - // register the data for the entire node - // ABHINAV: ASSERT(armci_me == node master) - onesided_mem_register(cp_hnd, ptr, total_bytes, options, &mdesc); - } else { - bzero(&mdesc, sizeof(cos_mdesc_t)); - } - - // bcast rank of the node master and and the mdesc for the nodes shared-memory segment - MPI_Bcast(&node_master, 1, MPI_INT, 0, info.numa_comm); - MPI_Bcast(&mdesc, sizeof(cos_mdesc_t), MPI_BYTE, 0, info.numa_comm); - - // each rank need to compare is starting virtual address to the master's starting virtual - // address. if it is differnet (ptr != mdesc.addr), then set the offset - - // each rank will update mdesc to point at the memory region it owns - uint64_t offset = 0; - for(i=0; i<(armci_me-node_master); i++) offset += bytes_per_rank[i]; - mdesc.addr += offset; - mdesc.length = bytes_per_rank[armci_me-node_master]; - - // now we allgather over all np ranks - this used to be over the node master - much more scalable - // but i couldn't come up with a simple solution to look up remote mdhs - MPI_Allgather(&mdesc, sizeof(cos_mdesc_t), MPI_BYTE, - mdhs, sizeof(cos_mdesc_t), MPI_BYTE, info.world_comm); - - // update the linked list - ll = remote_mdh_base_node; - if(ll == NULL) { - remote_mdh_base_node = ll = (remote_mdh_node_t *) malloc(sizeof(remote_mdh_node_t)); - ll->ptrs = ptr_arr; - ll->mdhs = mdhs; - ll->next = NULL; - } else { - while(ll->next != NULL) { ll = ll->next; } - assert(ll->next == NULL); - ll->next = (remote_mdh_node_t *) malloc(sizeof(remote_mdh_node_t)); - ll = ll->next; - ll->ptrs = ptr_arr; - ll->mdhs = mdhs; - ll->next = NULL; - } - # endif - - ARMCI_PR_DBG("exit",0); - //printf("\n%d:%s:%d:%p\n",armci_me,__FUNCTION__,bytes,ptr_arr[armci_me]); - return(0); -} - -/*\ - * Wrapper on PARMCI_Malloc to keep old code from breaking -\*/ -int PARMCI_Malloc_memdev(void *ptr_arr[], armci_size_t bytes, const char *device) -{ - return PARMCI_Malloc(ptr_arr,bytes); -} - - -/*\ shared memory is released to kr_malloc only on process 0 - * with data server malloc cannot be used -\*/ -int PARMCI_Free(void *ptr) -{ - ARMCI_PR_DBG("enter",0); - - # ifdef CRAY_REGISTER_ARMCI_MALLOC - // assumes that PARMCI_Free is a collective operation, the following function requires - // a collective operation for all ranks on the node - armci_onesided_remove_from_remote_mdh_list(ptr); - # endif - - // if ptr is NULL, we can now return - if(!ptr)return 1; - -# if (defined(SYSV) || defined(WIN32) || defined(MMAP)) && !defined(NO_SHM) -# ifdef USE_MALLOC - if(armci_nproc > 1) -# endif - if(ARMCI_Uses_shm()){ - if(armci_me==armci_master){ -# ifdef RMA_NEEDS_SHMEM - Free_Shmem_Ptr(0,0,ptr); -# else - if(armci_clus_info[armci_clus_me].nslave>1) { - kr_free(ptr, &ctx_localmem); - // Free_Shmem_Ptr(0,0,ptr); - } -# endif - } - ptr = NULL; - return 0; - } -# endif - // kr_free(ptr, &ctx_localmem); - //armci_unregister_shmem(ptr,0); - ptr = NULL; - ARMCI_PR_DBG("exit",0); - return 0; -} - -int PARMCI_Free_memdev(void *ptr) -{ - return PARMCI_Free(ptr); -} - - -int ARMCI_Uses_shm() -{ - int uses=0; - -#if (defined(SYSV) || defined(WIN32) || defined(MMAP) ||defined(HITACHI)) \ - && !defined(NO_SHM) -# ifdef RMA_NEEDS_SHMEM - if(armci_nproc >1) uses= 1; /* always unless serial mode */ -# else - if(armci_nproc != armci_nclus)uses= 1; /* only when > 1 node used */ -# endif -#endif - if(DEBUG_ || 0) fprintf(stderr,"%d:uses shmem %d\n",armci_me, uses); - return uses; -} -#ifdef MSG_COMMS_MPI - -int ARMCI_Uses_shm_grp(ARMCI_Group *group) -{ - int uses=0, grp_me, grp_nproc, grp_nclus; - ARMCI_PR_DBG("enter",0); - armci_grp_attr_t *grp_attr=ARMCI_Group_getattr(group); - - ARMCI_Group_size(group, &grp_nproc); - ARMCI_Group_rank(group, &grp_me); - grp_nclus = grp_attr->grp_nclus; - -#if (defined(SYSV) || defined(WIN32) || defined(MMAP) ||defined(HITACHI)) \ - && !defined(NO_SHM) -# ifdef RMA_NEEDS_SHMEM - if(grp_nproc >1) uses= 1; /* always unless serial mode */ -# else - if(grp_nproc != grp_nclus)uses= 1; /* only when > 1 node used */ -# endif -#endif - if(DEBUG_) fprintf(stderr,"%d (grp_id=%d):uses shmem %d\n",armci_me, grp_me, uses); - ARMCI_PR_DBG("exit",0); - return uses; -} - -/*\ ************** Begin Group Collective Memory Allocation ****************** - * returns array of pointers to blocks of memory allocated by everybody - * Note: as the same shared memory region can be mapped at different locations - * in each process address space, the array might hold different values - * on every process. However, the addresses are legitimate - * and can be used in the ARMCI data transfer operations. - * ptr_arr[nproc] -\*/ -int ARMCI_Malloc_group(void *ptr_arr[], armci_size_t bytes, - ARMCI_Group *group) -{ - void *ptr; - int grp_me, grp_nproc; - ARMCI_PR_DBG("enter",0); - ARMCI_Group_size(group, &grp_nproc); - ARMCI_Group_rank(group, &grp_me); - if(DEBUG_)fprintf(stderr,"%d (grp_id=%d) bytes in armci_malloc_group %d\n", - armci_me, grp_me, (int)bytes); -#ifdef USE_MALLOC - if(grp_nproc == 1) { - ptr = kr_malloc((size_t) bytes, &ctx_localmem, 0, NULL, NULL); - if(bytes) if(!ptr) armci_die("armci_malloc_group:malloc 1 failed",(int)bytes); - ptr_arr[grp_me] = ptr; - ARMCI_PR_DBG("exit",0); - return (0); - } -#endif - - if( ARMCI_Uses_shm_grp(group) ) { -# ifdef SGIALTIX - armci_altix_shm_malloc_group(ptr_arr,bytes,group); -# else - armci_shmem_malloc_group(ptr_arr,bytes,group); -# endif - } - else { - void *new_base=NULL; - size_t new_size=NULL; - ptr = kr_malloc(bytes, &ctx_localmem, 1, &new_base, &new_size); - if(bytes) if(!ptr) armci_die("armci_malloc:malloc 2 failed",bytes); - - bzero((char*)ptr_arr,grp_nproc*sizeof(void*)); - ptr_arr[grp_me] = ptr; - - /* now combine individual addresses into a single array */ - armci_exchange_address_grp(ptr_arr, grp_nproc, group); - - } - ARMCI_PR_DBG("exit",0); - return(0); -} - -/*\ - * Wrapper on ARMCI_Malloc_group to keep old code from breaking -\*/ -int ARMCI_Malloc_group_memdev(void *ptr_arr[], armci_size_t bytes, - ARMCI_Group *group, const char *device) -{ - return ARMCI_Malloc_group(ptr_arr,bytes,group); -} - - -/*\ shared memory is released to kr_malloc only on process 0 - * with data server malloc cannot be used - \*/ -int ARMCI_Free_group(void *ptr, ARMCI_Group *group) -{ - int grp_me, grp_nproc, grp_master, grp_clus_me; - armci_grp_attr_t *grp_attr=ARMCI_Group_getattr(group); - ARMCI_PR_DBG("enter",0); - - if(!ptr)return 1; - - ARMCI_Group_size(group, &grp_nproc); - ARMCI_Group_rank(group, &grp_me); - if(grp_me == MPI_UNDEFINED) { /* check if the process is in this group */ - armci_die("armci_malloc_group: process is not a member in this group", - armci_me); - } - /* get the group cluster info */ - grp_clus_me = grp_attr->grp_clus_me; - grp_master = grp_attr->grp_clus_info[grp_clus_me].master; - -# if (defined(SYSV) || defined(WIN32) || defined(MMAP)) && !defined(NO_SHM) -# ifdef USE_MALLOC - if(grp_nproc > 1) -# endif - if(ARMCI_Uses_shm_grp(group)){ - if(grp_me == grp_master) { -# ifdef RMA_NEEDS_SHMEM - Free_Shmem_Ptr(0,0,ptr); -# else - if(armci_clus_info[armci_clus_me].nslave>1) Free_Shmem_Ptr(0,0,ptr); - else kr_free(ptr, &ctx_localmem); -# endif - } - ptr = NULL; - ARMCI_PR_DBG("exit",0); - return 0; - } -# endif - kr_free(ptr, &ctx_localmem); - - ptr = NULL; - ARMCI_PR_DBG("exit",0); - return 0; -} -/* ***************** End Group Collective Memory Allocation ******************/ - -/* ************** Begin Non-Collective Memory Allocation ****************** - * Prototype similar to SysV shared memory. - */ - -/** - * CHECK: On Altix we are forced to use SysV as shmalloc is collective. We - * may use a preallocated shmalloc memory, however, it may NOT still solve - * our problem... - * NOTE: "int memflg" option for future optimiztions. - */ -void PARMCI_Memget(size_t bytes, armci_meminfo_t *meminfo, int memflg) { - - void *myptr=NULL; - void *armci_ptr=NULL; /* legal ARCMI ptr used in ARMCI data xfer ops*/ - size_t size = bytes; - - if(size<=0) armci_die("PARMCI_Memget: size must be > 0", (int)size); - if(meminfo==NULL) armci_die("PARMCI_Memget: Invalid arg #2 (NULL ptr)",0); - if(memflg!=0) armci_die("PARMCI_Memget: Invalid memflg", memflg); - - if( !ARMCI_Uses_shm() ) - { - armci_ptr = myptr = kr_malloc(size, &ctx_localmem, 0, NULL, NULL); - if(size) if(!myptr) armci_die("PARMCI_Memget failed", (int)size); - - /* fill the meminfo structure */ - meminfo->armci_addr = armci_ptr; - meminfo->addr = myptr; - meminfo->size = size; - meminfo->cpid = armci_me; - /* meminfo->attr = NULL; */ - } - else - { - armci_shmem_memget(meminfo, size); - } - - if(DEBUG_){ - printf("%d: PARMCI_Memget: addresses server=%p myptr=%p bytes=%ld\n", - armci_me, meminfo->armci_addr, meminfo->addr, bytes); - fflush(stdout); - } -} - -void* PARMCI_Memat(armci_meminfo_t *meminfo, long memflg) { - void *ptr=NULL; - - if(meminfo==NULL) armci_die("PARMCI_Memget: Invalid arg #2 (NULL ptr)",0); - if(memflg!=0) armci_die("PARMCI_Memget: Invalid memflg", memflg); - - if(meminfo->cpid==armci_me) { ptr = meminfo->addr; return ptr; } - - if( !ARMCI_Uses_shm()) - { - ptr = meminfo->addr; - } - else - { - ptr = armci_shmem_memat(meminfo); - } - - if(DEBUG_) - { - printf("%d:PARMCI_Memat: attached addr mptr=%p size=%ld\n", - armci_me, ptr, meminfo->size); fflush(stdout); - } - - return ptr; -} - -void ARMCI_Memdt(armci_meminfo_t *meminfo, int memflg) { - /** - * Do nothing. May be we need to have reference counting in future. This - * is to avoid the case of dangling pointers when the creator of shm - * segment calls Memctl and other processes are still attached to this - * segment - */ -} - -void ARMCI_Memctl(armci_meminfo_t *meminfo) { - - if(meminfo==NULL) armci_die("PARMCI_Memget: Invalid arg #2 (NULL ptr)",0); - - /* only the creator can delete the segment */ - if(meminfo->cpid == armci_me) - { - if( !ARMCI_Uses_shm() ) - { - void *ptr = meminfo->addr; - kr_free(ptr, &ctx_localmem); - } - else - { - armci_shmem_memctl(meminfo); - } - } - - meminfo->addr = NULL; - meminfo->armci_addr = NULL; - /* if(meminfo->attr!=NULL) free(meminfo->attr); */ -} - -/* ***************** End Non-Collective Memory Allocation ******************/ - -#endif -#endif diff --git a/armci/src-gemini/message.c b/armci/src-gemini/message.c deleted file mode 100644 index b0f8d130d..000000000 --- a/armci/src-gemini/message.c +++ /dev/null @@ -1,2174 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: message.c,v 1.58.6.4 2007-04-24 10:08:26 vinod Exp $ */ -#if defined(BGML) -# include "bgml.h" -#elif defined(PVM) -# include -#elif defined(TCGMSG) -# include -#else -# ifndef MSG_COMMS_MPI -# define MSG_COMMS_MPI -# endif -# include -#endif -#include "message.h" -#include "armcip.h" -#include "copy.h" -#if HAVE_STDIO_H -# include -#endif -#if HAVE_ASSERT_H -# include -#endif -#ifdef _POSIX_PRIORITY_SCHEDULING -#ifndef HITACHI -# include -#endif -#endif -#include "armci.h" -#include "acc.h" - -#define DEBUG_ 0 -#if defined(SYSV) || defined(MMAP) ||defined (WIN32) -# include "armci_shmem.h" -#endif - -/* global operations are use buffer size of BUF_SIZE doubles */ -#define BUF_SIZE (4*2048) -#define INFO_BUF_SIZE (BUF_SIZE*sizeof(BUF_SIZE) - sizeof(double)) -#undef EMPTY -#define EMPTY 0 -#define FULL 1 - -static double *work=NULL; -static long *lwork = NULL; -static long long *llwork = NULL; -static int *iwork = NULL; -static float *fwork = NULL; -static int _armci_gop_init=0; /* tells us if we have a buffers allocated */ -static int _armci_gop_shmem =0; /* tells us to use shared memory for gops */ -extern void armci_util_wait_int(volatile int *, int , int ); -static int empty=EMPTY,full=FULL; -#if !defined(SGIALTIX) && defined(SYSV) || defined(MMAP) || defined(WIN32) -static void **ptr_arr=NULL; -#endif - -typedef struct { - union { - volatile int flag; - double dummy[16]; - }a; - union { - volatile int flag; - double dummy[16]; - }b; - double array[BUF_SIZE]; -} bufstruct; - -static bufstruct *_gop_buffer; - -#define GOP_BUF(p) (_gop_buffer+((p)-armci_master)) - -/*\ macro to set a flag includes mem barrier to assure that flag is not set - * before any outstanding writes complete -\*/ -#ifdef NEED_MEM_SYNC -# ifdef AIX -# define SET_SHM_FLAG(_flg,_val) _clear_lock((int *)(_flg),_val); -# elif defined(NEC) -# define SET_SHM_FLAG(_flg,_val) MEM_FENCE; *(_flg)=(_val) -# elif defined(__ia64) -# if defined(__GNUC__) && !defined (__INTEL_COMPILER) -# define SET_SHM_FLAG(_flg,_val)\ - __asm__ __volatile__ ("mf" ::: "memory"); *(_flg)=(_val) -# else /* Intel Compiler */ - extern void _armci_ia64_mb(); -# define SET_SHM_FLAG(_flg,_val)\ - _armci_ia64_mb(); *(_flg)=(_val); -# endif -# elif defined(MACX) -# if defined(__GNUC__) -# define SET_SHM_FLAG(_flg,_val)\ - *(_flg)=(_val);__asm__ __volatile__ ("isync" ::: "memory") -# endif -# endif -#endif - -#ifndef SET_SHM_FLAG -# define SET_SHM_FLAG(_flg,_val) *(_flg)=_val; -#endif - - - -/*\ - * Variables/structures for use in Barrier and for Binomial tree -\*/ -#if HAVE_MATH_H -# include -#endif -int barr_switch; -static int LnB=0,powof2nodes,Lp2; -typedef struct { - volatile int flag1; - double dum[16]; - volatile int flag2; -} barrier_struct; -barrier_struct *_bar_buff; -#define BAR_BUF(p) (_bar_buff+((p))) -void **barr_snd_ptr,**barr_rcv_ptr; -int _armci_barrier_init=0; -int _armci_barrier_shmem=0; - - -/*\ - * Tree generation code -\*/ -static void _dfs_bintree_parse(int *idlist, int index, int max, int *result) -{ -int left = (int)2*index+1; -int right = (int) 2*index+2; -static int pos=0; -int r_end,l_end; - l_end=pos++; - result[pos++]=idlist[index]; - if(leftarray; /* each process finds its place */ - GOP_BUF(armci_me)->a.flag=EMPTY; /* initially buffer is empty */ - GOP_BUF(armci_me)->b.flag=EMPTY; /* initially buffer is empty */ - if(armci_me == armci_master ){ - GOP_BUF(armci_clus_last+1)->a.flag=EMPTY;/*initially buffer is empty*/ - GOP_BUF(armci_clus_last+2)->a.flag=EMPTY;/*initially buffer is empty*/ - GOP_BUF(armci_clus_last+1)->b.flag=EMPTY;/*initially buffer is empty*/ - GOP_BUF(armci_clus_last+2)->b.flag=EMPTY;/*initially buffer is empty*/ - } - _armci_gop_shmem = 1; - } -#endif - /*stuff needed for barrier and binomial bcast/reduce*/ -#ifdef LAPI - if(!_armci_barrier_shmem){ - int size = 2*sizeof(int); - /*allocate memory to send/rcv data*/ - barr_snd_ptr = (void **)malloc(sizeof(void *)*armci_nproc); - barr_rcv_ptr = (void **)malloc(sizeof(void *)*armci_nproc); - - if(PARMCI_Malloc(barr_snd_ptr,size))armci_die("malloc barrinit failed",0); - if(PARMCI_Malloc(barr_rcv_ptr,size))armci_die("malloc barrinit failed",0); - if(barr_rcv_ptr[armci_me]==NULL || barr_snd_ptr[armci_me]==NULL) - armci_die("problems in malloc barr_init",0); - powof2nodes=1; - LnB = floor(log(armci_nclus)/log(2))+1; - if(pow(2,LnB-1)-1) _armci_dummy_work *=DUMMY_INIT; - if(_armci_dummy_work>(double)armci_msg_nproc())_armci_dummy_work=DUMMY_INIT; -} - - -/***************************Barrier Code*************************************/ - -void armci_msg_barr_init(){ -#if defined(SYSV) || defined(MMAP) || defined(WIN32) - int size=sizeof(barrier_struct)*armci_clus_info[armci_clus_me].nslave; - char *tmp; - void **ptr_arr; - barr_switch=0; - /*First allocate space for flags*/ - - ptr_arr = (void**)malloc(armci_nproc*sizeof(void*)); - if(armci_me==armci_master) size = size+128; - else size=0; - PARMCI_Malloc(ptr_arr, size); - tmp = (char*)ptr_arr[armci_master]; - size=2*sizeof(int); - - if(!tmp)armci_die("allocate barr shm failed",0); - _bar_buff=(barrier_struct *)tmp; - - SET_SHM_FLAG(&(BAR_BUF(armci_me-armci_master)->flag1),empty); - SET_SHM_FLAG(&(BAR_BUF(armci_me-armci_master)->flag2),empty); - - /*allocate memory to send/rcv data*/ - barr_snd_ptr = (void **)malloc(sizeof(void *)*armci_nproc); - barr_rcv_ptr = (void **)malloc(sizeof(void *)*armci_nproc); - - if(PARMCI_Malloc(barr_snd_ptr,size))armci_die("malloc barr_init failed",0); - if(PARMCI_Malloc(barr_rcv_ptr,size))armci_die("malloc barr_init failed",0); - if(barr_rcv_ptr[armci_me]==NULL || barr_snd_ptr[armci_me]==NULL) - armci_die("problems in malloc barr_init",0); - - /*we have to figure if we have power of ,two nodes*/ - powof2nodes=1; - LnB = (int)floor(log(armci_nclus)/log(2))+1; - if(pow(2,LnB-1)flag1,FULL,100000); - SET_SHM_FLAG(&(BAR_BUF(i)->flag1),empty); - } - if(armci_nclus>1){ - last = ((int)pow(2,(LnB-1)))^armci_clus_me; - if(last>=0 && lastarmci_clus_me){ /*the pow2 set of procs*/ - if(last=0 && next armci_me){ - armci_msg_snd(ARMCI_TAG, srcp,4,next_node); - armci_msg_rcv(ARMCI_TAG, dstn,4,NULL,next_node); - } - else{ - /*would we gain anything by doing a snd,rcv instead of rcv,snd*/ - armci_msg_rcv(ARMCI_TAG, dstn,4,NULL,next_node); - armci_msg_snd(ARMCI_TAG, srcp,4,next_node); - } - armci_util_wait_int((volatile int *)dstn,barr_count,100000); - } - } - if(last1*/ - for(i=1;iflag2),full); - } - else { /*if not master, partake in the smp barrier,only*/ - i=armci_me-armci_master; - SET_SHM_FLAG(&(BAR_BUF(i)->flag1),full); - armci_util_wait_int(&BAR_BUF(i)->flag2,FULL,100000); - SET_SHM_FLAG(&(BAR_BUF(i)->flag2),empty); - } -} - -#endif /*barrier enabled only for lapi*/ -void parmci_msg_barrier() -{ -#ifdef BGML - bgml_barrier (3); /* this is always faster than MPI_Barrier() */ -#elif defined(MSG_COMMS_MPI) - MPI_Barrier(ARMCI_COMM_WORLD); -# elif defined(PVM) - pvm_barrier(mp_group_name, armci_nproc); -# elif defined(LAPI) -#if !defined(NEED_MEM_SYNC) - if(_armci_barrier_init) - _armci_msg_barrier(); - else -#endif - { - tcg_synch(ARMCI_TAG); - } -# else - { - tcg_synch(ARMCI_TAG); - } -# endif -} -/***********************End Barrier Code*************************************/ - - -void armci_msg_init(int *argc, char ***argv) -{ -#if defined(TCGMSG) - if (!tcg_ready()) { - tcg_pbegin(argc,argv); - } -#elif defined(BGML) - /* empty */ -#elif defined(MSG_COMMS_MPI) - int flag=0; - MPI_Initialized(&flag); - if (!flag) { -# if defined(DCMF) || defined(MPI_MT) - int provided; - MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, &provided); -# else - MPI_Init(argc, argv); -# endif - } - if (!PARMCI_Initialized()) { - MPI_Comm_dup(MPI_COMM_WORLD, &ARMCI_COMM_WORLD); - } -#endif -} - -#ifdef MSG_COMMS_MPI -void armci_msg_init_comm(MPI_Comm comm) -{ - if (!PARMCI_Initialized()) { - MPI_Comm_dup(comm, &ARMCI_COMM_WORLD); - } -} -#endif - - -int armci_msg_me() -{ -#ifdef BGML - return BGML_Messager_rank(); -#elif defined(DCMF) - return DCMF_Messager_rank(); -#elif defined(MSG_COMMS_MPI) - static int counter = 0; - if (counter == 0) { - int me; - MPI_Comm_rank(ARMCI_COMM_WORLD, &me); - armci_me = me; - counter = 1; - } - return armci_me; - -#elif defined(PVM) - return(pvm_getinst(mp_group_name,pvm_mytid())); -#else - return (int)tcg_nodeid(); -#endif -} - - -int armci_msg_nproc() -{ -#ifdef BGML - return BGML_Messager_size(); -#elif defined(DCMF) - return DCMF_Messager_size(); -#elif defined(MSG_COMMS_MPI) - static int counter = 0; - if (counter == 0) { - int nproc; - MPI_Comm_size(ARMCI_COMM_WORLD, &nproc); - armci_nproc = nproc; - counter = 1; - } - return armci_nproc; -#elif defined(PVM) - return(pvm_gsize(mp_group_name)); -#else - return (int)tcg_nnodes(); -#endif -} - -#ifdef CRAY_YMP -#define BROKEN_MPI_ABORT -#endif - -#ifndef PVM -double armci_timer() -{ -#ifdef BGML - return BGML_Timer(); -#elif defined(DCMF) - return DCMF_Timer(); -#elif defined(MSG_COMMS_MPI) - - return MPI_Wtime(); -#else - return tcg_time(); -#endif -} -#endif - - -void armci_msg_abort(int code) -{ -#ifdef BGML - fprintf(stderr,"ARMCI aborting [%d]\n", code); -#elif defined(DCMF) - fprintf(stderr,"ARMCI aborting [%d]\n", code); -#elif defined(MSG_COMMS_MPI) -# ifndef BROKEN_MPI_ABORT - MPI_Abort(ARMCI_COMM_WORLD,code); -# endif -#elif defined(PVM) - char error_msg[25]; - sprintf(error_msg, "ARMCI aborting [%d]", code); - pvm_halt(); -#else - tcg_error("ARMCI aborting",(long)code); -#endif - fprintf(stderr,"%d:aborting\n",armci_me); - /* trap for broken abort in message passing libs */ - _exit(1); -} - -void armci_msg_finalize() -{ -#if defined(TCGMSG) - tcg_pend(); -#elif defined(MSG_COMMS_MPI) - MPI_Finalize(); -#endif -} - -void armci_msg_bintree(int scope, int* Root, int *Up, int *Left, int *Right) -{ -int root, up, left, right, index, nproc; - if(scope == SCOPE_NODE){ - root = armci_clus_info[armci_clus_me].master; - nproc = armci_clus_info[armci_clus_me].nslave; - index = armci_me - root; - up = (index-1)/2 + root; if( up < root) up = -1; - left = 2*index + 1 + root; if(left >= root+nproc) left = -1; - right = 2*index + 2 + root; if(right >= root+nproc)right = -1; - }else if(scope ==SCOPE_MASTERS){ - root = armci_clus_info[0].master; - nproc = armci_nclus; - if(armci_me != armci_master){up = -1; left = -1; right = -1; } - else{ - index = armci_clus_me - root; - up = (index-1)/2 + root; - up = ( up < root)? -1: armci_clus_info[up].master; - left = 2*index + 1 + root; - left = ( left >= root+nproc)? -1: armci_clus_info[left].master; - right = 2*index + 2 + root; - right =( right >= root+nproc)? -1: armci_clus_info[right].master; - } - }else{ - root = 0; - nproc = armci_nproc; - index = armci_me - root; - up = (index-1)/2 + root; if( up < root) up = -1; - left = 2*index + 1 + root; if(left >= root+nproc) left = -1; - right = 2*index + 2 + root; if(right >= root+nproc)right = -1; - } - - *Up = up; - *Left = left; - *Right = right; - *Root = root; -} - -/*\ root broadcasts to everyone else -\*/ -void armci_msg_bcast_scope(int scope, void *buf, int len, int root) -{ - int up, left, right, Root; - - if(!buf)armci_die("armci_msg_bcast: NULL pointer", len); -#ifdef BGML - BGTr_Bcast(root, buf, len, 3); -#else - armci_msg_bintree(scope, &Root, &up, &left, &right); - - if(root !=Root){ - if(armci_me == root) armci_msg_snd(ARMCI_TAG, buf,len, Root); - if(armci_me ==Root) armci_msg_rcv(ARMCI_TAG, buf, len, NULL, root); - } - - /* printf("%d: scope=%d left=%d right=%d up=%d\n",armci_me, scope, - left, right, up);*/ - - if(armci_me != Root && up!=-1) armci_msg_rcv(ARMCI_TAG, buf, len, NULL, up); - if (left > -1) armci_msg_snd(ARMCI_TAG, buf, len, left); - if (right > -1) armci_msg_snd(ARMCI_TAG, buf, len, right); -#endif -} - - - - -/*\ shared memory based broadcast for a single SMP node -\*/ -void armci_smp_bcast(void *x, int n , int root) -{ -int ndo, len,i, bufsize = BUF_SIZE*sizeof(double); -static int bufid=1; - - if(armci_clus_info[armci_clus_me].nslave<2) return; /* nothing to do */ - - if(!x)armci_die("armci_msg_bcast: NULL pointer", n); - - /* enable or balance pipeline for messages comparable to bufsize */ - if((n>bufsize/2) && (n <(2*bufsize-64))){ - bufsize = n/2; bufsize>>=3; bufsize<<=3; - } - - while ((ndo = (n<=bufsize) ? n : bufsize)) { - len = ndo; - - if(armci_me==root){ - - /* wait for the flag protecting the buffer to clear */ - armci_util_wait_int(&(GOP_BUF(armci_clus_last+bufid)->a.flag),EMPTY,100); - SET_SHM_FLAG(&(GOP_BUF(armci_clus_last+bufid)->a.flag),full); -#if 0 - for(i=armci_clus_first; i <= armci_clus_last; i++) - if(i!=root)armci_util_wait_int(&GOP_BUF(i)->b.flag, EMPTY, 100); - armci_copy(x,GOP_BUF(armci_clus_last+bufid+1)->array,len); - for(i=armci_clus_first; i <= armci_clus_last; i++) - if(i!=root) GOP_BUF(i)->b.flag=FULL; -#else - armci_copy(x,GOP_BUF(armci_clus_last+bufid)->array,len); - for(i=armci_clus_first; i <= armci_clus_last; i++) - if(i!=root){ - armci_util_wait_int(&GOP_BUF(i)->b.flag, EMPTY, 100); - SET_SHM_FLAG(&(GOP_BUF(i)->b.flag),full); - } -#endif - }else{ - armci_util_wait_int(&GOP_BUF(armci_me)->b.flag, FULL, 100); - armci_copy(GOP_BUF(armci_clus_last+bufid)->array,x,len); - SET_SHM_FLAG(&(GOP_BUF(armci_me)->b.flag),empty); - } - - n -=ndo; - x = len + (char*)x; - - bufid = (bufid)%2 +1; - - /* since root waited for everybody to check in the previous buffer is free*/ - if(armci_me==root){ - SET_SHM_FLAG(&(GOP_BUF(armci_clus_last+bufid)->a.flag),empty); - } - } -} - - - -/*\ shared memory based broadcast for a single SMP node out of shmem buffer -\*/ -void armci_smp_buf_bcast(void *x, int n, int root, void *shmbuf ) -{ -int i, nslave = armci_clus_info[armci_clus_me].nslave; - - if(nslave<2){ - armci_copy(shmbuf,x,n); - return; /* nothing to do */ - } - if(!x)armci_die("armci_msg_bcast: NULL pointer", n); - if(!shmbuf)armci_die("armci_msg_bcast: NULL pointer", n); - - if(armci_me==root){ - /* notify others that the data in buffer is ready */ - for(i=armci_clus_first; i <= armci_clus_last; i++) - if(i!=root){ - armci_util_wait_int(&GOP_BUF(i)->b.flag, EMPTY, 100); - GOP_BUF(i)->b.flag=FULL; - } - /* root also needs to copy */ - armci_copy(shmbuf,x,n); - /* wait until everybody is finished -- can reclaim buffer */ - for(i=armci_clus_first; i <= armci_clus_last; i++) - if(i!=root)armci_util_wait_int(&GOP_BUF(i)->b.flag, EMPTY,100000); - - }else{ - /* spin until data in buffer is ready */ - armci_util_wait_int(&GOP_BUF(armci_me)->b.flag , FULL, 100000); - armci_copy(shmbuf,x,n); /* copy data */ - GOP_BUF(armci_me)->b.flag = EMPTY; /* indicate we are done */ - } -} - -void _armci_msg_binomial_bcast(void *buf, int len, int root){ - int Root = armci_master; - int nslave = armci_clus_info[armci_clus_me].nslave; - int i,next_node,next; -/* int my_rank,root_rank,next_rank; */ - /* inter-node operation between masters */ - if(root !=armci_clus_info[0].master){ - Root = armci_clus_info[0].master; - if(armci_me == root) armci_msg_snd(ARMCI_TAG, buf,len, Root); - if(armci_me ==Root) armci_msg_rcv(ARMCI_TAG, buf, len, NULL, root); - root = Root; - Root = armci_master; - } - if(armci_nclus>1 &&armci_me==armci_master){/*the internode bcast, first*/ - /*first do the recv*/ - int rcv_proc=armci_clus_me,flag=1,diff=1; - if(armci_me!=root){ - while(!(rcv_proc & flag)){ - diff=diff<<1; - flag=flag<<1; - } - rcv_proc = armci_clus_info[armci_clus_me-diff].master; - armci_msg_rcv(ARMCI_TAG, buf,len,NULL,rcv_proc); - /*printf("\n%d: recv from %d \n",armci_me,rcv_proc);fflush(stdout);*/ - } - else - diff = Lp2; - - /*printf("\n%d: %d diff>>1 = %d\n",armci_me,Lp2,diff>>1);*/ - for(i=diff>>1;i>=1;i=i>>1){ - next=i^armci_clus_me; - if(next>=0 && next1)armci_msg_bcast_scope(SCOPE_MASTERS, buf, len, root); - else Root = root; - - /* intra-node operation */ -#if 1 - if(_armci_gop_shmem && nslave<33) - armci_smp_bcast(buf, len, Root); - else -#endif - armci_msg_bcast_scope(SCOPE_NODE, buf, len, Root); -} -#endif - - - -void armci_msg_brdcst(void* buffer, int len, int root) -{ - if(!buffer)armci_die("armci_msg_brdcast: NULL pointer", len); - -#ifdef BGML - BGTr_Bcast(root, buffer, len, PCLASS); -# elif defined(MSG_COMMS_MPI) - MPI_Bcast(buffer, len, MPI_CHAR, root, ARMCI_COMM_WORLD); -# elif defined(PVM) - armci_msg_bcast(buffer, len, root); -# else - { - long ttag=ARMCI_TAG, llen=len, rroot=root; - tcg_brdcst(ttag, buffer, llen, rroot); - } -# endif -} - - -void armci_msg_snd(int tag, void* buffer, int len, int to) -{ -# ifdef MSG_COMMS_MPI - MPI_Send(buffer, len, MPI_CHAR, to, tag, ARMCI_COMM_WORLD); -# elif defined(PVM) - pvm_psend(pvm_gettid(mp_group_name, to), tag, buffer, len, PVM_BYTE); -# elif defined(BGML) - /* We don't actually used armci_msg_snd in ARMCI. we use optimized - * collectives where - * armci_msg_snd is used. If you build Global Arrays, the MSG_COMMS_MPI flag is - * set, so that - * will work fine - */ - armci_die("bgl shouldn't use armci_msg_snd", armci_me); -# else - long ttag=tag, llen=len, tto=to, block=1; - tcg_snd(ttag, buffer, llen, tto, block); -# endif -} - - -/*\ receive message of specified tag from proc and get its len if msglen!=NULL -\*/ -void armci_msg_rcv(int tag, void* buffer, int buflen, int *msglen, int from) -{ -# ifdef MSG_COMMS_MPI - MPI_Status status; - MPI_Recv(buffer, buflen, MPI_CHAR, from, tag, ARMCI_COMM_WORLD, &status); - if(msglen) MPI_Get_count(&status, MPI_CHAR, msglen); -# elif defined(PVM) - int src, rtag,mlen; - pvm_precv(pvm_gettid(mp_group_name, from), tag, buffer, buflen, PVM_BYTE, - &src, &rtag, &mlen); - if(msglen)*msglen=mlen; -#elif defined(BGML) - armci_die("bgl shouldn't use armci_msg_rcv", armci_me); -# else - long ttag=tag, llen=buflen, mlen, ffrom=from, sender, block=1; - tcg_rcv(ttag, buffer, llen, &mlen, ffrom, &sender, block); - if(msglen)*msglen = (int)mlen; -# endif -} - - -int armci_msg_rcvany(int tag, void* buffer, int buflen, int *msglen) -{ -#if defined(MSG_COMMS_MPI) - int ierr; - MPI_Status status; - - ierr = MPI_Recv(buffer, buflen, MPI_CHAR, MPI_ANY_SOURCE, tag, - ARMCI_COMM_WORLD, &status); - if(ierr != MPI_SUCCESS) armci_die("armci_msg_rcvany: Recv failed ", tag); - - if(msglen)if(MPI_SUCCESS!=MPI_Get_count(&status, MPI_CHAR, msglen)) - armci_die("armci_msg_rcvany: count failed ", tag); - return (int)status.MPI_SOURCE; -# elif defined(PVM) - int src, rtag,mlen; - pvm_precv(-1, tag, buffer, buflen, PVM_BYTE, &src, &rtag, &mlen); - if(msglen)*msglen=mlen; - return(pvm_getinst(mp_group_name,src)); -# elif defined (BGML) - armci_die("bgl shouldn't use armci_msg_rcvany", armci_me); -# else - long ttag=tag, llen=buflen, mlen, ffrom=-1, sender, block=1; - tcg_rcv(ttag, buffer, llen, &mlen, ffrom, &sender, block); - if(msglen)*msglen = (int)mlen; - return (int)sender; -# endif -} - - -/*\ cluster master broadcasts to everyone else in the same cluster -\*/ -void armci_msg_clus_brdcst(void *buf, int len) -{ -int root, up, left, right; -int tag=ARMCI_TAG, lenmes; - - armci_msg_bintree(SCOPE_NODE, &root, &up, &left, &right); - if(armci_me != root) armci_msg_rcv(tag, buf, len, &lenmes, up); - if (left > -1) armci_msg_snd(tag, buf, len, left); - if (right > -1) armci_msg_snd(tag, buf, len, right); -} - - -/*\ reduce operation for long -\*/ -static void ldoop(int n, char *op, long *x, long* work) -{ - if (strncmp(op,"+",1) == 0) - while(n--) - *x++ += *work++; - else if (strncmp(op,"*",1) == 0) - while(n--) - *x++ *= *work++; - else if (strncmp(op,"max",3) == 0) - while(n--) { - *x = ARMCI_MAX(*x, *work); - x++; work++; - } - else if (strncmp(op,"min",3) == 0) - while(n--) { - *x = ARMCI_MIN(*x, *work); - x++; work++; - } - else if (strncmp(op,"absmax",6) == 0) - while(n--) { - register long x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work); - *x = ARMCI_MAX(x1, x2); - x++; work++; - } - else if (strncmp(op,"absmin",6) == 0) - while(n--) { - register long x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work); - *x = ARMCI_MIN(x1, x2); - x++; work++; - } - else if (strncmp(op,"or",2) == 0) - while(n--) { - *x |= *work; - x++; work++; - } - /* these are new */ - else if ((strncmp(op, "&&", 2) == 0) || (strncmp(op, "land", 4) == 0)) { - while(n--) { - *x = *x && *work; - x++; work++; - } - } - else if ((strncmp(op, "||", 2) == 0) || (strncmp(op, "lor", 3) == 0)) { - while(n--) { - *x = *x || *work; - x++; work++; - } - } - else if ((strncmp(op, "&", 1) == 0) || (strncmp(op, "band", 4) == 0)) { - while(n--) { - *x &= *work; - x++; work++; - } - } - else if ((strncmp(op, "|", 1) == 0) || (strncmp(op, "bor", 3) == 0)) { - while(n--) { - *x |= *work; - x++; work++; - } - } - else - armci_die("ldoop: unknown operation requested", n); -} - -/*\ reduce operation for long x= op(work,work2) -\*/ -static void ldoop2(int n, char *op, long *x, long* work, long* work2) -{ - if (strncmp(op,"+",1) == 0) - while(n--) - *x++ = *work++ + *work2++; - else if (strncmp(op,"*",1) == 0) - while(n--) - *x++ = *work++ * *work2++; - else if (strncmp(op,"max",3) == 0) - while(n--) { - *x = ARMCI_MAX(*work2, *work); - x++; work++; work2++; - } - else if (strncmp(op,"min",3) == 0) - while(n--) { - *x = ARMCI_MIN(*work2, *work); - x++; work++; work2++; - } - else if (strncmp(op,"absmax",6) == 0) - while(n--) { - register long x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2); - *x = ARMCI_MAX(x1, x2); - x++; work++; work2++; - } - else if (strncmp(op,"absmin",6) == 0) - while(n--) { - register long x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2); - *x = ARMCI_MIN(x1, x2); - x++; work++; work2++; - } - else if (strncmp(op,"or",2) == 0) - while(n--) { - *x = *work | *work2; - x++; work++; work2++; - } - else - armci_die("ldoop2: unknown operation requested", n); -} - -/*\ reduce operation for long long -\*/ -static void lldoop(int n, char *op, long long *x, long long* work) -{ - if (strncmp(op,"+",1) == 0) - while(n--) - *x++ += *work++; - else if (strncmp(op,"*",1) == 0) - while(n--) - *x++ *= *work++; - else if (strncmp(op,"max",3) == 0) - while(n--) { - *x = ARMCI_MAX(*x, *work); - x++; work++; - } - else if (strncmp(op,"min",3) == 0) - while(n--) { - *x = ARMCI_MIN(*x, *work); - x++; work++; - } - else if (strncmp(op,"absmax",6) == 0) - while(n--) { - register long long x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work); - *x = ARMCI_MAX(x1, x2); - x++; work++; - } - else if (strncmp(op,"absmin",6) == 0) - while(n--) { - register long long x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work); - *x = ARMCI_MIN(x1, x2); - x++; work++; - } - else if (strncmp(op,"or",2) == 0) - while(n--) { - *x |= *work; - x++; work++; - } - else - armci_die("lldoop: unknown operation requested", n); -} - -/*\ reduce operation for long long x= op(work,work2) -\*/ -static void lldoop2(int n, char *op, long long *x, long long* work, - long long* work2) -{ - if (strncmp(op,"+",1) == 0) - while(n--) - *x++ = *work++ + *work2++; - else if (strncmp(op,"*",1) == 0) - while(n--) - *x++ = *work++ * *work2++; - else if (strncmp(op,"max",3) == 0) - while(n--) { - *x = ARMCI_MAX(*work2, *work); - x++; work++; work2++; - } - else if (strncmp(op,"min",3) == 0) - while(n--) { - *x = ARMCI_MIN(*work2, *work); - x++; work++; work2++; - } - else if (strncmp(op,"absmax",6) == 0) - while(n--) { - register long long x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2); - *x = ARMCI_MAX(x1, x2); - x++; work++; work2++; - } - else if (strncmp(op,"absmin",6) == 0) - while(n--) { - register long long x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2); - *x = ARMCI_MIN(x1, x2); - x++; work++; work2++; - } - else if (strncmp(op,"or",2) == 0) - while(n--) { - *x = *work | *work2; - x++; work++; work2++; - } - else - armci_die("ldoop2: unknown operation requested", n); -} - -/*\ reduce operation for int -\*/ -static void idoop(int n, char *op, int *x, int* work) -{ - if (strncmp(op,"+",1) == 0) - while(n--) - *x++ += *work++; - else if (strncmp(op,"*",1) == 0) - while(n--) - *x++ *= *work++; - else if (strncmp(op,"max",3) == 0) - while(n--) { - *x = ARMCI_MAX(*x, *work); - x++; work++; - } - else if (strncmp(op,"min",3) == 0) - while(n--) { - *x = ARMCI_MIN(*x, *work); - x++; work++; - } - else if (strncmp(op,"absmax",6) == 0) - while(n--) { - register int x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work); - *x = ARMCI_MAX(x1, x2); - x++; work++; - } - else if (strncmp(op,"absmin",6) == 0) - while(n--) { - register int x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work); - *x = ARMCI_MIN(x1, x2); - x++; work++; - } - else if (strncmp(op,"or",2) == 0) - while(n--) { - *x |= *work; - x++; work++; - } - else - armci_die("idoop: unknown operation requested", n); -} - -/*\ reduce operation for int x= op(work,work2) -\*/ -static void idoop2(int n, char *op, int *x, int* work, int* work2) -{ - if (strncmp(op,"+",1) == 0) - while(n--) - *x++ = *work++ + *work2++; - else if (strncmp(op,"*",1) == 0) - while(n--) - *x++ = *work++ * *work2++; - else if (strncmp(op,"max",3) == 0) - while(n--) { - *x = ARMCI_MAX(*work2, *work); - x++; work++; work2++; - } - else if (strncmp(op,"min",3) == 0) - while(n--) { - *x = ARMCI_MIN(*work2, *work); - x++; work++; work2++; - } - else if (strncmp(op,"absmax",6) == 0) - while(n--) { - register int x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2); - *x = ARMCI_MAX(x1, x2); - x++; work++; work2++; - } - else if (strncmp(op,"absmin",6) == 0) - while(n--) { - register int x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2); - *x = ARMCI_MIN(x1, x2); - x++; work++; work2++; - } - else if (strncmp(op,"or",2) == 0) - while(n--) { - *x = *work | *work2; - x++; work++; work2++; - } - else - armci_die("idoop2: unknown operation requested", n); -} - -/*\ reduce operation for double -\*/ -static void ddoop(int n, char* op, double* x, double* work) -{ - if (strncmp(op,"+",1) == 0){ - if(n>63) FORT_DADD(&n,x,work); - else while(n--) *x++ += *work++; - }else if (strncmp(op,"*",1) == 0){ - if(n>63) FORT_DMULT(&n,x,work); - else while(n--) *x++ *= *work++; - }else if (strncmp(op,"max",3) == 0) - while(n--) { - *x = ARMCI_MAX(*x, *work); - x++; work++; - } - else if (strncmp(op,"min",3) == 0) - while(n--) { - *x = ARMCI_MIN(*x, *work); - x++; work++; - } - else if (strncmp(op,"absmax",6) == 0) - while(n--) { - register double x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work); - *x = ARMCI_MAX(x1, x2); - x++; work++; - } - else if (strncmp(op,"absmin",6) == 0) - while(n--) { - register double x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work); - *x = ARMCI_MIN(x1, x2); - x++; work++; - } - else - armci_die("ddoop: unknown operation requested", n); -} - -/*\ reduce operation for double x= op(work,work2) -\*/ -static void ddoop2(int n, char *op, double *x, double* work, double* work2) -{ - if (strncmp(op,"+",1) == 0){ - if(n>63) FORT_DADD2(&n,x,work,work2); - else while(n--) *x++ = *work++ + *work2++; - }else if (strncmp(op,"*",1) == 0){ - if(n>63) FORT_DMULT2(&n,x,work,work2); - while(n--) *x++ = *work++ * *work2++; - }else if (strncmp(op,"max",3) == 0) - while(n--) { - *x = ARMCI_MAX(*work2, *work); - x++; work++; work2++; - } - else if (strncmp(op,"min",3) == 0) - while(n--) { - *x = ARMCI_MIN(*work2, *work); - x++; work++; work2++; - } - else if (strncmp(op,"absmax",6) == 0) - while(n--) { - register double x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2); - *x = ARMCI_MAX(x1, x2); - x++; work++; work2++; - } - else if (strncmp(op,"absmin",6) == 0) - while(n--) { - register double x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2); - *x = ARMCI_MIN(x1, x2); - x++; work++; work2++; - } - else - armci_die("ddoop2: unknown operation requested", n); -} - - -/*\ reduce operation for float -\*/ -static void fdoop(int n, char* op, float* x, float* work) -{ - if (strncmp(op,"+",1) == 0) - while(n--) - *x++ += *work++; - else if (strncmp(op,"*",1) == 0) - while(n--) - *x++ *= *work++; - else if (strncmp(op,"max",3) == 0) - while(n--) { - *x = ARMCI_MAX(*x, *work); - x++; work++; - } - else if (strncmp(op,"min",3) == 0) - while(n--) { - *x = ARMCI_MIN(*x, *work); - x++; work++; - } - else if (strncmp(op,"absmax",6) == 0) - while(n--) { - register float x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work); - *x = ARMCI_MAX(x1, x2); - x++; work++; - } - else if (strncmp(op,"absmin",6) == 0) - while(n--) { - register float x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work); - *x = ARMCI_MIN(x1, x2); - x++; work++; - } - else - armci_die("fdoop: unknown operation requested", n); -} - -/*\ reduce operation for float x= op(work,work2) -\*/ -static void fdoop2(int n, char *op, float *x, float* work, float* work2) -{ - if (strncmp(op,"+",1) == 0) - while(n--) - *x++ = *work++ + *work2++; - else if (strncmp(op,"*",1) == 0) - while(n--) - *x++ = *work++ * *work2++; - else if (strncmp(op,"max",3) == 0) - while(n--) { - *x = ARMCI_MAX(*work2, *work); - x++; work++; work2++; - } - else if (strncmp(op,"min",3) == 0) - while(n--) { - *x = ARMCI_MIN(*work2, *work); - x++; work++; work2++; - } - else if (strncmp(op,"absmax",6) == 0) - while(n--) { - register float x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2); - *x = ARMCI_MAX(x1, x2); - x++; work++; work2++; - } - else if (strncmp(op,"absmin",6) == 0) - while(n--) { - register float x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2); - *x = ARMCI_MIN(x1, x2); - x++; work++; work2++; - } - else - armci_die("fdoop2: unknown operation requested", n); -} - -/*\ combine array of longs/ints accross all processes -\*/ -void armci_msg_gop_scope(int scope, void *x, int n, char* op, int type) -{ -int root, up, left, right, size; -int tag=ARMCI_TAG; -int ndo, len, lenmes, orign =n, ratio; -void *origx =x; - if(!x)armci_die("armci_msg_gop: NULL pointer", n); - if(work==NULL)_allocate_mem_for_work(); -#ifdef BGML - BGML_Dt dt; - BGML_Op theop; - - if(n > 0 && (strncmp(op, "+", 1) == 0) && (type==ARMCI_INT || type==ARMCI_DOUBLE)) - { - theop=BGML_SUM; - if(type==ARMCI_INT) - dt=BGML_SIGNED_INT; - else if(type==ARMCI_DOUBLE) - dt=BGML_DOUBLE; - BGTr_Allreduce(origx, x, n, dt, theop, -1, PCLASS); - } - else if(n > 0 && (strncmp(op, "max", 3) == 0) && (type==ARMCI_INT || type==ARMCI_DOUBLE)) - { - theop=BGML_MAX; - if(type==ARMCI_INT) - dt=BGML_SIGNED_INT; - else if(type==ARMCI_DOUBLE) - dt=BGML_DOUBLE; - BGTr_Allreduce(origx, x, n, dt, theop, -1, PCLASS); - } - else if(n > 0 && (strncmp(op, "min", 3) == 0) && (type==ARMCI_INT || type==ARMCI_DOUBLE)) - { - theop=BGML_MIN; - if(type==ARMCI_INT) - dt=BGML_SIGNED_INT; - else if(type==ARMCI_DOUBLE) - dt=BGML_DOUBLE; - BGTr_Allreduce(origx, x, n, dt, theop, -1, PCLASS); - } - else -#endif - { - armci_msg_bintree(scope, &root, &up, &left, &right); - - if(type==ARMCI_INT) size = sizeof(int); - else if(type==ARMCI_LONG) size = sizeof(long); - else if(type==ARMCI_LONG_LONG) size = sizeof(long long); - else if(type==ARMCI_FLOAT) size = sizeof(float); - else size = sizeof(double); - - ratio = sizeof(double)/size; - - while ((ndo = (n<=BUF_SIZE*ratio) ? n : BUF_SIZE*ratio)) { - len = lenmes = ndo*size; - - if (left > -1) { - armci_msg_rcv(tag, lwork, len, &lenmes, left); - if(type==ARMCI_INT) idoop(ndo, op, (int*)x, iwork); - else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, lwork); - else if(type==ARMCI_LONG_LONG) lldoop(ndo, op,(long long*)x,llwork); - else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, fwork); - else ddoop(ndo, op, (double*)x, work); - } - - if (right > -1) { - armci_msg_rcv(tag, lwork, len, &lenmes, right); - if(type==ARMCI_INT) idoop(ndo, op, (int*)x, iwork); - else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, lwork); - else if(type==ARMCI_LONG_LONG) lldoop(ndo, op,(long long*)x,llwork); - else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, fwork); - else ddoop(ndo, op, (double*)x, work); - } - if (armci_me != root && up!=-1) armci_msg_snd(tag, x, len, up); - - n -=ndo; - x = len + (char*)x; - } - - /* Now, root broadcasts the result down the binary tree */ - len = orign*size; - armci_msg_bcast_scope(scope, origx, len, root); - } -} - - -void armci_msg_reduce_scope(int scope, void *x, int n, char* op, int type) -{ -int root, up, left, right, size; -int tag=ARMCI_TAG; -int ndo, len, lenmes, ratio; - if(!x)armci_die("armci_msg_gop: NULL pointer", n); - if(work==NULL)_allocate_mem_for_work(); - - armci_msg_bintree(scope, &root, &up, &left, &right); - - if(type==ARMCI_INT) size = sizeof(int); - else if(type==ARMCI_LONG) size = sizeof(long); - else if(type==ARMCI_LONG_LONG) size = sizeof(long long); - else if(type==ARMCI_FLOAT) size = sizeof(float); - else size = sizeof(double); - - ratio = sizeof(double)/size; - - while ((ndo = (n<=BUF_SIZE*ratio) ? n : BUF_SIZE*ratio)) { - len = lenmes = ndo*size; - - if (left > -1) { - armci_msg_rcv(tag, lwork, len, &lenmes, left); - if(type==ARMCI_INT) idoop(ndo, op, (int*)x, iwork); - else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, lwork); - else if(type==ARMCI_LONG_LONG) lldoop(ndo, op,(long long*)x,llwork); - else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, fwork); - else ddoop(ndo, op, (double*)x, work); - } - - if (right > -1) { - armci_msg_rcv(tag, lwork, len, &lenmes, right); - if(type==ARMCI_INT) idoop(ndo, op, (int*)x, iwork); - else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, lwork); - else if(type==ARMCI_LONG_LONG) lldoop(ndo, op,(long long*)x,llwork); - else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, fwork); - else ddoop(ndo, op, (double*)x, work); - } - if (armci_me != root && up!=-1) armci_msg_snd(tag, x, len, up); - - n -=ndo; - x = len + (char*)x; - } -} - -static void gop(int type, int ndo, char* op, void *x, void *work) -{ - if(type==ARMCI_INT) idoop(ndo, op, (int*)x, (int*)work); - else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, (long*)work); - else if(type==ARMCI_LONG_LONG) lldoop(ndo, op, (long long*)x, (long long*)work); - else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, (float*)work); - else ddoop(ndo, op, (double*)x, (double*)work); -} - - -static void gop2(int type, int ndo, char* op, void *x, void *work, void *work2) -{ -#if 0 - int size; - if(type==ARMCI_INT) size = sizeof(int); - else if(type==ARMCI_LONG) size = sizeof(long); - else if(type==ARMCI_LONG_LONG) size = sizeof(long long); - else if(type==ARMCI_FLOAT) size = sizeof(float); - else size = sizeof(double); - - armci_copy(work2,x,ndo*size); - - if(type==ARMCI_INT) idoop(ndo, op, (int*)x, (int*)work); - else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, (long*)work); - else if(type==ARMCI_LONG_LONG) lldoop(ndo, op, (long long*)x, (long long*)work); - else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, (float*)work); - else ddoop(ndo, op, (double*)x, (double*)work); -#else - if(type==ARMCI_INT) idoop2(ndo, op, (int*)x, (int*)work, (int*)work2); - else if(type==ARMCI_LONG)ldoop2(ndo,op,(long*)x,(long*)work,(long*)work2); - else if(type==ARMCI_LONG_LONG) lldoop2(ndo,op,(long long*)x,(long long*)work,(long long*)work2); - else if(type==ARMCI_FLOAT)fdoop2(ndo,op,(float*)x,(float*)work,(float*)work2); - else ddoop2(ndo, op, (double*)x, (double*)work,(double*)work2); -#endif -} - - - - -/*\ shared memory based reduction for a single SMP node -\*/ -static void armci_smp_reduce(void *x, int n, char* op, int type) -{ -int root, up, left, right, size; -int ndo, len, lenmes, ratio; -int nslave = armci_clus_info[armci_clus_me].nslave; - - if(nslave<2) return; /* nothing to do */ - - if(!x)armci_die("armci_msg_gop: NULL pointer", n); - - armci_msg_bintree(SCOPE_NODE, &root, &up, &left, &right); - - if(type==ARMCI_INT) size = sizeof(int); - else if(type==ARMCI_LONG) size = sizeof(long); - else if(type==ARMCI_LONG_LONG) size = sizeof(long long); - else if(type==ARMCI_FLOAT) size = sizeof(float); - else size = sizeof(double); - ratio = sizeof(double)/size; - - while ((ndo = (n<=BUF_SIZE*ratio) ? n : BUF_SIZE*ratio)) { - len = lenmes = ndo*size; - - armci_util_wait_int(&GOP_BUF(armci_me)->a.flag, EMPTY, 100); - -#if 1 - if(left<0 && right<0) armci_copy(x,GOP_BUF(armci_me)->array,len); - - /* version oblivious to the order of data arrival */ - { - int need_left = left >-1; - int need_right = right >-1; - int from, first =1, maxspin=100, count=0; - bufstruct *b; - - while(need_left || need_right){ - from =-1; - if(need_left && GOP_BUF(left)->a.flag == FULL){ - from =left; - need_left =0; - }else if(need_right && GOP_BUF(right)->a.flag == FULL) { - from =right; - need_right =0; - } - if(from != -1){ - b = GOP_BUF(from); -#if 1 - if(armci_me == root) gop(type, ndo, op, x, b->array); - else { - if(first) - gop2(type, ndo, op, GOP_BUF(armci_me)->array, b->array,x); - else - gop(type, ndo, op, GOP_BUF(armci_me)->array, b->array); - } - first =0; -#else - gop(type, ndo, op, GOP_BUF(armci_me)->array, b->array); -#endif - SET_SHM_FLAG(&( b->a.flag),empty); - }else if((++count)array,len); - - /* this version requires a specific order of data arrival */ - if (left >-1) { - while(GOP_BUF(left)->a.flag != FULL) cpu_yield(); - gop(type, ndo, op, GOP_BUF(armci_me)->array, GOP_BUF(left)->array); - SET_SHM_FLAG(&( GOP_BUF(left)->a.flag),empty); - } - if (right >-1 ) { - while(GOP_BUF(right)->a.flag != FULL) cpu_yield(); - gop(type, ndo, op, GOP_BUF(armci_me)->array, GOP_BUF(right)->array); - GOP_BUF(right)->a.flag = EMPTY; - } -#endif - - if (armci_me != root ) { - SET_SHM_FLAG(&(GOP_BUF(armci_me)->a.flag),full); - } -#if 0 - else - /* NOTE: this copy can be eliminated in a cluster configuration */ - armci_copy(GOP_BUF(armci_me)->array,x,len); -#endif - - n -=ndo; - x = len + (char*)x; - } -} - -void _armci_msg_binomial_reduce(void *x, int n, char* op, int type){ - int root = armci_clus_info[0].master; - int i,next_node,next; - int size, ratio, ndo, lenmes,len; -/* int my_rank,root_rank,next_rank; */ - if(work==NULL)_allocate_mem_for_work(); - if(armci_me!=armci_master)return; - if(type==ARMCI_INT) size = sizeof(int); - else if(type==ARMCI_LONG) size = sizeof(long); - else if(type==ARMCI_LONG_LONG) size = sizeof(long long); - else if(type==ARMCI_FLOAT) size = sizeof(float); - else size = sizeof(double); - ratio = sizeof(double)/size; - - while ((ndo = (n<=BUF_SIZE*ratio) ? n : BUF_SIZE*ratio)) { - int snd_proc=armci_clus_me,flag=1,diff=1; - - len = lenmes = ndo*size; - if(armci_me!=root){ - while(!(snd_proc & flag)){ - diff=diff<<1; - flag=flag<<1; - } - snd_proc = armci_clus_info[armci_clus_me-diff].master; - } - else - diff = Lp2; - - /*printf("\n%d: %d diff>>1 = %d\n",armci_me,Lp2,diff>>1);*/ - for(i=diff>>1;i>=1;i=i>>1){ - next=i^armci_clus_me; - if(next>=0 && next1){ -#ifdef LAPI - if(_armci_gop_init) - _armci_msg_binomial_reduce(x,n,op,type); - else -#endif - armci_msg_reduce_scope(SCOPE_MASTERS, x, n, op, type); - } -} - - -static void armci_msg_gop2(void *x, int n, char* op, int type) -{ -int size, root=0; - if(work==NULL)_allocate_mem_for_work(); - if(type==ARMCI_INT) size = sizeof(int); - else if(type==ARMCI_LONG) size = sizeof(long); - else if(type==ARMCI_LONG_LONG) size = sizeof(long long); - else if(type==ARMCI_FLOAT) size = sizeof(float); - else size = sizeof(double); -#ifdef BGML /*optimize what we can at the message layer */ - void *origx=x; - BGML_Dt dt; - BGML_Op rop; - - if(n>0 && (strncmp(op, "+", 1) == 0)) - { - rop=BGML_SUM; - if(type == ARMCI_INT) - { - dt=BGML_SIGNED_INT; - BGTr_Allreduce(origx, x, n, dt, rop, -1, 3); - } - else if(type == ARMCI_LONG || type == ARMCI_LONG_LONG) - { - armci_msg_reduce(x, n, op, type); - armci_msg_bcast(x, size*n, root); -/* dt=BGML_UNSIGNED_LONG; */ -/* BGTr_Allreduce(origx, x, n, dt, rop, -1, 3);*/ - } - else if(type == ARMCI_DOUBLE) - { - dt=BGML_DOUBLE; - BGTr_Allreduce(origx, x, n, dt, rop, -1, 3); - } - else if(type == ARMCI_FLOAT) - { - armci_msg_reduce(x, n, op, type); - armci_msg_bcast(x, size*n, root); - } - else - { - fprintf(stderr,"Unknown data type\n"); - exit(1); - } - } - - else if(n>0 && ((strncmp(op, "max", 3) == 0) || (strncmp(op, "min", 3) ==0 ))) - { - if(strncmp(op, "max", 3) == 0) - rop=BGML_MAX; - else - rop=BGML_MIN; - - if(type == ARMCI_INT) - dt=BGML_SIGNED_INT; - else if(type == ARMCI_DOUBLE) - dt=BGML_DOUBLE; - else if(type == ARMCI_FLOAT) - dt=BGML_FLOAT; - else if(type == ARMCI_LONG) - dt=BGML_SIGNED_LONG; - else if(type == ARMCI_LONG_LONG) - { - armci_msg_reduce(x, n, op, type); - armci_msg_bcast(x, size*n, root); - } - else - { - fprintf(stderr,"Unknown data type\n"); - exit(1); - } - if(type != ARMCI_LONG_LONG) - BGTr_Allreduce(origx, x, n, dt, rop, -1, 3); - } - - else -#endif - { /* brackets needed for final gelse clause of bgml */ - - armci_msg_reduce(x, n, op, type); - armci_msg_bcast(x, size*n, root); - } -} - - -static void armci_sel(int type, char *op, void *x, void* work, int n) -{ -int selected=0; - switch (type) { - case ARMCI_INT: - if(strncmp(op,"min",3) == 0){ - if(*(int*)x > *(int*)work) selected=1; - }else - if(*(int*)x < *(int*)work) selected=1; - break; - case ARMCI_LONG: - if(strncmp(op,"min",3) == 0){ - if(*(long*)x > *(long*)work) selected=1; - }else - if(*(long*)x < *(long*)work) selected=1; - break; - case ARMCI_LONG_LONG: - if(strncmp(op,"min",3) == 0){ - if(*(long long*)x > *(long long*)work) selected=1; - }else - if(*(long long*)x < *(long long*)work) selected=1; - break; - case ARMCI_FLOAT: - if(strncmp(op,"min",3) == 0){ - if(*(float*)x > *(float*)work) selected=1; - }else - if(*(float*)x < *(float*)work) selected=1; - break; - default: - if(strncmp(op,"min",3) == 0){ - if(*(double*)x > *(double*)work) selected=1; - }else - if(*(double*)x < *(double*)work) selected=1; - } - if(selected)armci_copy(work,x, n); -} - - - -/*\ global for op with extra info -\*/ -void armci_msg_sel_scope(int scope, void *x, int n, char* op, int type, int contribute) -{ -int root, up, left, right; -int tag=ARMCI_TAG; -int len, lenmes, min; - - min = (strncmp(op,"min",3) == 0); - if(!min && (strncmp(op,"max",3) != 0)) - armci_die("armci_msg_gop_info: operation not supported ", 0); - - if(!x)armci_die("armci_msg_gop_info: NULL pointer", n); - - if(n>((int)INFO_BUF_SIZE))armci_die("armci_msg_gop_info: info too large",n); - - len = lenmes = n; - - armci_msg_bintree(scope, &root, &up, &left, &right); - - if (left > -1) { - - /* receive into work if contributing otherwise into x */ - if(contribute)armci_msg_rcv(tag, work, len, &lenmes, left); - else armci_msg_rcv(tag, x, len, &lenmes, left); - - if(lenmes){ - if(contribute) armci_sel(type, op, x, work, n); - else contribute =1; /* now we got data to pass */ - } - } - - if (right > -1) { - /* receive into work if contributing otherwise into x */ - if(contribute) armci_msg_rcv(tag, work, len, &lenmes, right); - else armci_msg_rcv(tag, x, len, &lenmes, right); - - if(lenmes){ - if(contribute) armci_sel(type, op, x, work, n); - else contribute =1; /* now we got data to pass */ - } - } - - if (armci_me != root){ - if(contribute) armci_msg_snd(tag, x, len, up); - else armci_msg_snd(tag, x, 0, up); /* send 0 bytes */ - } - - /* Now, root broadcasts the result down the binary tree */ - armci_msg_bcast_scope(scope, x, n, root); -} - - -/*\ combine array of longs/ints/doubles accross all processes -\*/ - -#if defined(NEC) - -void armci_msg_igop(int *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_ALL,x, n, op, ARMCI_INT); } - -void armci_msg_lgop(long *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_ALL,x, n, op, ARMCI_LONG); } - -void armci_msg_llgop(long long *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_ALL,x, n, op, ARMCI_LONG_LONG); } - -void armci_msg_dgop(double *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_ALL,x, n, op, ARMCI_DOUBLE); } - -void armci_msg_fgop (float *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_ALL,x, n, op, ARMCI_FLOAT);} - -#else -void armci_msg_igop(int *x, int n, char* op) { armci_msg_gop2(x, n, op, ARMCI_INT); } -void armci_msg_lgop(long *x, int n, char* op) { armci_msg_gop2(x, n, op, ARMCI_LONG); } -void armci_msg_llgop(long long *x, int n, char* op) { armci_msg_gop2(x, n, op, ARMCI_LONG_LONG); } -void armci_msg_fgop(float *x, int n, char* op) { armci_msg_gop2(x, n, op, ARMCI_FLOAT); } -void armci_msg_dgop(double *x, int n, char* op) { armci_msg_gop2(x, n, op, ARMCI_DOUBLE); } -#endif - - -/*\ add array of longs/ints within the same cluster node -\*/ -void armci_msg_clus_igop(int *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_NODE,x, n, op, ARMCI_INT); } - -void armci_msg_clus_lgop(long *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_NODE,x, n, op, ARMCI_LONG); } - -void armci_msg_clus_llgop(long long *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_NODE,x, n, op, ARMCI_LONG_LONG); } - -void armci_msg_clus_fgop(float *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_NODE,x, n, op, ARMCI_FLOAT); } - -void armci_msg_clus_dgop_scope(double *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_NODE,x, n, op, ARMCI_DOUBLE); } - - - -void armci_exchange_address(void *ptr_ar[], int n) -{ - int ratio = sizeof(void*)/sizeof(int); -/* - armci_msg_lgop((long*)ptr_ar, n, "+"); -*/ - if(DEBUG_)printf("%d: exchanging %ld ratio=%d\n",armci_me,(long)ptr_ar[armci_me],ratio); - - armci_msg_gop2(ptr_ar, n*ratio, "+",ARMCI_INT); -} - -/** - * ********************* Begin ARMCI Groups Code **************************** - * NOTE: This part is MPI dependent (i.e. ifdef MSG_COMMS_MPI) - */ -#ifdef MSG_COMMS_MPI -MPI_Comm armci_group_comm(ARMCI_Group *group) -{ -#ifdef ARMCI_GROUP - return MPI_COMM_NULL; -#else - ARMCI_iGroup *igroup = armci_get_igroup_from_group(group); - return (MPI_Comm)igroup->icomm; -#endif -} - -void parmci_msg_group_barrier(ARMCI_Group *group) -{ - ARMCI_iGroup *igroup = armci_get_igroup_from_group(group); - -#ifdef ARMCI_GROUP - { - int val=0; - armci_msg_group_igop(&val, 1, "+", group); - } -#else - MPI_Barrier((MPI_Comm)(igroup->icomm)); -#endif -} - -#ifdef ARMCI_GROUP -extern void ARMCI_Bcast_(void *buffer, int len, int root, ARMCI_Group *group); -#else -extern void ARMCI_Bcast_(void *buffer, int len, int root, ARMCI_Comm comm); -#endif -void armci_grp_clus_brdcst(void *buf, int len, int grp_master, - int grp_clus_nproc, ARMCI_Group *mastergroup) { - ARMCI_iGroup *igroup = armci_get_igroup_from_group(mastergroup); - int i, *pid_list, root=0; -#ifdef ARMCI_GROUP - ARMCI_Group group; -#else - MPI_Group group_world; - MPI_Group group; - MPI_Comm comm; -#endif - - /* create a communicator for the processes with in a node */ - pid_list = (int *)malloc(grp_clus_nproc*sizeof(int)); - for(i=0; iicomm), &group_world); - MPI_Group_incl(group_world, grp_clus_nproc, pid_list, &group); - - MPI_Comm_create((MPI_Comm)(igroup->icomm), (MPI_Group)group, - (MPI_Comm*)&comm); - - /* Broadcast within the node (for this sub group of processes) */ - ARMCI_Bcast_(buf, len, root, comm); - - free(pid_list); - MPI_Comm_free(&comm); /* free the temporary communicator */ - MPI_Group_free(&group); -#endif -} - - -/* to avoid warning */ -extern int ARMCI_Absolute_id(ARMCI_Group *group,int group_rank); - -void armci_msg_group_bintree(int scope, int* Root, int *Up, int *Left, int *Right, - ARMCI_Group *group) -{ - int root, up, left, right, index, nproc,grp_clus_me,grp_me,grp_master,grp_nproc; - armci_grp_attr_t *grp_attr=ARMCI_Group_getattr(group); - grp_me = grp_attr->grp_me; - grp_clus_me = grp_attr->grp_clus_me; - grp_master = grp_attr->grp_clus_info[grp_clus_me].master; - ARMCI_Group_size(group, &grp_nproc); - if(scope == SCOPE_NODE){ - root = grp_attr->grp_clus_info[grp_clus_me].master; - nproc = grp_attr->grp_clus_info[grp_clus_me].nslave; - index = grp_me - root; - up = (index-1)/2 + root; if( up < root) up = -1; - left = 2*index + 1 + root; if(left >= root+nproc) left = -1; - right = 2*index + 2 + root; if(right >= root+nproc)right = -1; - }else if(scope ==SCOPE_MASTERS){ - root = grp_attr->grp_clus_info[0].master; - nproc = grp_attr->grp_nclus; - if(grp_me != grp_master){up = -1; left = -1; right = -1; } - else{ - index = grp_clus_me - root; - up = (index-1)/2 + root; - up = ( up < root)? -1: grp_attr->grp_clus_info[up].master; - left = 2*index + 1 + root; - left =( left >= root+nproc)?-1:grp_attr->grp_clus_info[left].master; - right= 2*index + 2 + root; - right=( right>=root+nproc)?-1:grp_attr->grp_clus_info[right].master; - } - }else{ - root = 0; - nproc = grp_nproc; - index = grp_me - root; - up = (index-1)/2 + root; if( up < root) up = -1; - left = 2*index + 1 + root; if(left >= root+nproc) left = -1; - right = 2*index + 2 + root; if(right >= root+nproc)right = -1; - } - - *Up = (up==-1)?up:ARMCI_Absolute_id(group,up); - *Left = (left==-1)?left:ARMCI_Absolute_id(group,left); - *Right = (right==-1)?right:ARMCI_Absolute_id(group,right); - *Root = (root==-1)?root:ARMCI_Absolute_id(group,root); -} - -void armci_msg_group_bcast_scope(int scope, void *buf, int len, int root, - ARMCI_Group *group) -{ - int up, left, right, Root; - int grp_me; - ARMCI_iGroup *igroup = armci_get_igroup_from_group(group); - - if(!buf)armci_die("armci_msg_bcast: NULL pointer", len); - - if(!group)armci_msg_bcast_scope(scope,buf,len,root); - else grp_me = igroup->grp_attr.grp_me; - armci_msg_group_bintree(scope, &Root, &up, &left, &right,group); - - if(root !=Root){ - if(armci_me == root) armci_msg_snd(ARMCI_TAG, buf,len, Root); - if(armci_me ==Root) armci_msg_rcv(ARMCI_TAG, buf, len, NULL, root); - } - - /* printf("%d: scope=%d left=%d right=%d up=%d\n",armci_me, scope, - left, right, up);*/ - - if(armci_me != Root && up!=-1) armci_msg_rcv(ARMCI_TAG, buf, len, NULL, up); - if (left > -1) armci_msg_snd(ARMCI_TAG, buf, len, left); - if (right > -1) armci_msg_snd(ARMCI_TAG, buf, len, right); -} - -void -armci_msg_group_gop_scope(int scope, void *x, int n, char* op, int type, - ARMCI_Group *group) -{ - int root, up, left, right, size; - int tag=ARMCI_TAG,grp_me; - int ndo, len, lenmes, orign =n, ratio; - void *origx =x; - ARMCI_iGroup *igroup = armci_get_igroup_from_group(group); - - if(!group)armci_msg_gop_scope(scope,x,n,op,type); - else grp_me = igroup->grp_attr.grp_me; - if(!x)armci_die("armci_msg_gop: NULL pointer", n); - if(work==NULL)_allocate_mem_for_work(); - - armci_msg_group_bintree(scope, &root, &up, &left, &right,group); - - if(type==ARMCI_INT) size = sizeof(int); - else if(type==ARMCI_LONG) size = sizeof(long); - else if(type==ARMCI_LONG_LONG) size = sizeof(long long); - else if(type==ARMCI_FLOAT) size = sizeof(float); - else size = sizeof(double); - - ratio = sizeof(double)/size; - - while ((ndo = (n<=BUF_SIZE*ratio) ? n : BUF_SIZE*ratio)) { - len = lenmes = ndo*size; - - if (left > -1) { - armci_msg_rcv(tag, lwork, len, &lenmes, left); - if(type==ARMCI_INT) idoop(ndo, op, (int*)x, iwork); - else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, lwork); - else if(type==ARMCI_LONG_LONG) lldoop(ndo, op, (long long*)x,llwork); - else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, fwork); - else ddoop(ndo, op, (double*)x, work); - } - - if (right > -1) { - armci_msg_rcv(tag, lwork, len, &lenmes, right); - if(type==ARMCI_INT) idoop(ndo, op, (int*)x, iwork); - else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, lwork); - else if(type==ARMCI_LONG_LONG) lldoop(ndo, op,(long long*)x, llwork); - else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, fwork); - else ddoop(ndo, op, (double*)x, work); - } - if (armci_me != root && up!=-1) armci_msg_snd(tag, x, len, up); - - n -=ndo; - x = len + (char*)x; - } - - /* Now, root broadcasts the result down the binary tree */ - len = orign*size; - armci_msg_group_bcast_scope(scope, origx, len, root,group); -} - -void armci_exchange_address_grp(void *ptr_arr[], int n, ARMCI_Group *group) -{ - int ratio = sizeof(void*)/sizeof(int); - ARMCI_iGroup *igroup = armci_get_igroup_from_group(group); - int grp_me = igroup->grp_attr.grp_me; - if(DEBUG_){ - printf("%d: exchanging %ld ratio=%d\n",armci_me, - (long)ptr_arr[grp_me], ratio); - } - armci_msg_group_gop_scope(SCOPE_ALL, ptr_arr, n*ratio, - "+", ARMCI_INT, group); -} - -/*\ combine array of longs/ints/doubles accross all processes -\*/ -void armci_msg_group_igop(int *x, int n, char* op, ARMCI_Group *group) -{ armci_msg_group_gop_scope(SCOPE_ALL,x, n, op, ARMCI_INT,group); } - -void armci_msg_group_lgop(long *x, int n, char* op,ARMCI_Group *group) -{ armci_msg_group_gop_scope(SCOPE_ALL,x, n, op, ARMCI_LONG,group); } - -void armci_msg_group_llgop(long long *x, int n, char* op,ARMCI_Group *group) -{ armci_msg_group_gop_scope(SCOPE_ALL,x, n, op, ARMCI_LONG_LONG,group); } - -void armci_msg_group_fgop(float *x, int n, char* op,ARMCI_Group *group) -{ armci_msg_group_gop_scope(SCOPE_ALL,x, n, op, ARMCI_FLOAT,group); } - -void armci_msg_group_dgop(double *x, int n, char* op,ARMCI_Group *group) -{ armci_msg_group_gop_scope(SCOPE_ALL,x, n, op, ARMCI_DOUBLE,group); } - -# endif /* ifdef MSG_COMMS_MPI */ -/*********************** End ARMCI Groups Code ****************************/ - - -#ifdef PVM -/* set the group name if using PVM */ -void ARMCI_PVM_Init(char *mpgroup) -{ -#ifdef CRAY - mp_group_name = (char *)NULL; -#else - if(mpgroup != NULL) { -/* free(mp_group_name); */ - mp_group_name = (char *)malloc(25 * sizeof(char)); - strcpy(mp_group_name, mpgroup); - } -#endif -} -#endif diff --git a/armci/src-gemini/message.h b/armci/src-gemini/message.h deleted file mode 100644 index aa30cda89..000000000 --- a/armci/src-gemini/message.h +++ /dev/null @@ -1,80 +0,0 @@ -#ifndef _MESSAGE_H_ -#define _MESSAGE_H_ - -#include "armci.h" - -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif - -#define ARMCI_INT -99 -#define ARMCI_LONG -101 -#define ARMCI_LONG_LONG -102 -#define ARMCI_FLOAT -306 -#define ARMCI_DOUBLE -307 - -#define SCOPE_ALL 333 -#define SCOPE_NODE 337 -#define SCOPE_MASTERS 339 - -#define armci_msg_sel(x,n,op,type,contribute)\ - armci_msg_sel_scope(SCOPE_ALL,(x),(n),(op),(type),(contribute)) -#if 0 -#define armci_msg_bcast(buffer, len, root)\ - armci_msg_bcast_scope(SCOPE_ALL, (buffer), (len), (root)) -#else -extern void armci_msg_bcast(void *buffer, int len, int root); -#endif - -extern void armci_msg_sel_scope(int scope, void *x, int n, char* op, - int type, int contribute); -extern void armci_msg_bcast_scope(int scope, void* buffer, int len, int root); -extern void armci_msg_brdcst(void* buffer, int len, int root); -extern void armci_msg_snd(int tag, void* buffer, int len, int to); -extern void armci_msg_rcv(int tag, void* buffer, int buflen, int *msglen, int from); -extern int armci_msg_rcvany(int tag, void* buffer, int buflen, int *msglen); -extern void armci_msg_reduce(void *x, int n, char *op, int type); -extern void armci_msg_reduce_scope(int scope, void *x, int n, char *op, int type); - -extern void armci_msg_gop_scope(int scope, void *x, int n, char* op, int type); -extern void armci_msg_igop(int *x, int n, char* op); -extern void armci_msg_lgop(long *x, int n, char* op); -extern void armci_msg_llgop(long long *x, int n, char* op); -extern void armci_msg_fgop(float *x, int n, char* op); -extern void armci_msg_dgop(double *x, int n, char* op); -extern void armci_exchange_address(void *ptr_ar[], int n); -extern void armci_msg_barrier(); -extern void armci_msg_bintree(int scope, int* Root, int *Up, int *Left, int *Right); - -extern int armci_msg_me(); -extern int armci_msg_nproc(); -extern void armci_msg_abort(int code); -extern void armci_msg_init(int *argc, char ***argv); -extern void armci_msg_init_comm(MPI_Comm comm); -extern void armci_msg_finalize(); -extern double armci_timer(); - -extern void armci_msg_clus_brdcst(void *buf, int len); -extern void armci_msg_clus_igop(int *x, int n, char* op); -extern void armci_msg_clus_fgop(float *x, int n, char* op); -extern void armci_msg_clus_lgop(long *x, int n, char* op); -extern void armci_msg_clus_llgop(long long *x, int n, char* op); -extern void armci_msg_clus_dgop(double *x, int n, char* op); - -extern void armci_msg_group_gop_scope(int scope, void *x, int n, char* op, int type, ARMCI_Group *group); -extern void armci_msg_group_igop(int *x, int n, char* op,ARMCI_Group *group); -extern void armci_msg_group_lgop(long *x, int n, char* op,ARMCI_Group *group); -extern void armci_msg_group_llgop(long long *x, int n, char* op,ARMCI_Group *group); -extern void armci_msg_group_fgop(float *x, int n, char* op,ARMCI_Group *group); -extern void armci_msg_group_dgop(double *x, int n,char* op,ARMCI_Group *group); -extern void armci_exchange_address_grp(void *ptr_arr[], int n, ARMCI_Group *group); -extern void armci_msg_group_barrier(ARMCI_Group *group); -extern void armci_msg_group_bcast_scope(int scope, void *buf, int len, int root, ARMCI_Group *group); -extern void armci_grp_clus_brdcst(void *buf, int len, int grp_master, int grp_clus_nproc,ARMCI_Group *mastergroup); - -#if defined(__cplusplus) || defined(c_plusplus) -} -#endif - - -#endif diff --git a/armci/src-gemini/mutex.c b/armci/src-gemini/mutex.c deleted file mode 100644 index 719fa3585..000000000 --- a/armci/src-gemini/mutex.c +++ /dev/null @@ -1,406 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: mutex.c,v 1.24.10.1 2006-12-21 23:50:48 manoj Exp $ */ -#include "armcip.h" -#include "copy.h" -#include "request.h" -#include - -#define DEBUG 0 -#define MAX_LOCKS 32768 -#define SPINMAX 1000 - -#if defined(LAPI) || defined(GM) -# define SERVER_LOCK -#endif - -double _dummy_work_=0.; -#ifdef LAPI /*fix to if cmpl handler for a pending unlock runs after destroy*/ -int mymutexcount; -double _dummy_server_work_=0.; -#endif -static int num_mutexes=0, *tickets; - -typedef struct { - int mutex; - int turn; - msg_tag_t tag; -} waiting_list_t; - - -/* data structure to store info about blocked (waiting) process for mutex */ -static waiting_list_t* blocked=(waiting_list_t*)0; - - -typedef struct { -int* token; -int* turn; -int* tickets; -int count; -} mutex_entry_t; - -void** mutex_mem_ar; -mutex_entry_t *glob_mutex; - - -int PARMCI_Create_mutexes(int num) -{ -int rc,p, totcount; -int *mutex_count = (int*)armci_internal_buffer; - - if((sizeof(int)*armci_nproc) > armci_getbufsize()){ - mutex_count = (double *)malloc(sizeof(int)*armci_nproc); - } - if (num < 0 || num > MAX_LOCKS) return(FAIL); - if(num_mutexes) armci_die("mutexes already created",num_mutexes); - - if(armci_nproc == 1){ num_mutexes=1; return(0); } - - /* local memory allocation for mutex arrays*/ - mutex_mem_ar = (void*) malloc(armci_nproc*sizeof(void*)); - if(!mutex_mem_ar) armci_die("PARMCI_Create_mutexes: malloc failed",0); - glob_mutex = (void*)malloc(armci_nproc*sizeof(mutex_entry_t)); - if(!glob_mutex){ - free(mutex_mem_ar); - armci_die("PARMCI_Create_mutexes: malloc 2 failed",0); - } - - -/* bzero(mutex_count,armci_nproc*sizeof(int));*/ - bzero((char*)mutex_count,sizeof(int)*armci_nproc); - - /* find out how many mutexes everybody allocated */ - mutex_count[armci_me]=num; - armci_msg_igop(mutex_count, armci_nproc, "+"); - for(p=totcount=0; p< armci_nproc; p++)totcount+=mutex_count[p]; - - tickets = calloc(totcount,sizeof(int)); - if(!tickets) { - free(glob_mutex); - free(mutex_mem_ar); - return(FAIL2); - } - - /* we need memory for token and turn - 2 ints */ - rc = PARMCI_Malloc(mutex_mem_ar,2*num*sizeof(int)); - if(rc){ - free(glob_mutex); - free(mutex_mem_ar); - free(tickets); - return(FAIL3); - } - - if(num)bzero((char*)mutex_mem_ar[armci_me],2*num*sizeof(int)); - - /* setup global mutex array */ - for(p=totcount=0; p< armci_nproc; p++){ - glob_mutex[p].token = mutex_mem_ar[p]; - glob_mutex[p].turn = glob_mutex[p].token + mutex_count[p]; - glob_mutex[p].count = mutex_count[p]; - glob_mutex[p].tickets = tickets + totcount; - totcount += mutex_count[p]; - } - - num_mutexes= totcount; -#ifdef LAPI - mymutexcount = num; -#endif - PARMCI_Barrier(); - - if(DEBUG) - fprintf(stderr,"%d created (%d,%d) mutexes\n",armci_me,num,totcount); - - return(0); -} - - -void armci_serv_mutex_create() -{ - int mem = armci_nproc*sizeof(waiting_list_t); - blocked = (waiting_list_t*)malloc(mem); - if(!blocked) armci_die("armci server:error allocating mutex memory ",0); -} - - -void armci_serv_mutex_close() -{ - if(blocked) free(blocked ); - blocked = (waiting_list_t*)0; -} - - -int PARMCI_Destroy_mutexes() -{ -#ifdef LAPI /*fix to if cmpl handler for a pending unlock runs after destroy*/ - int proc, mutex, i,factor=0; -#endif - if(num_mutexes==0)armci_die("armci_destroy_mutexes: not created",0); - if(armci_nproc == 1) return(0); - - armci_msg_barrier(); - -#ifdef LAPI /*fix to if cmpl handler for a pending unlock runs after destroy*/ - for(proc=0;proc myturn) - armci_die2("armci: problem with tickets",myturn,next_in_line); - - /* apply a linear backoff delay before retrying */ - for(i=0; i< SPINMAX * factor; i++) _dummy_work_ += 1.; - - factor += 1; - - }while (myturn != next_in_line); - - glob_mutex[proc].tickets[mutex] = myturn; /* save ticket value */ -} - - -static void armci_generic_unlock(int mutex, int proc) -{ -int *mutex_ticket= glob_mutex[proc].turn + mutex; -int *newval = glob_mutex[proc].tickets +mutex; -int len=sizeof(int); - - /* update ticket for next process requesting this mutex */ - (*newval) ++; - - /* write new ticket value stored previously in tickets */ - PARMCI_Put(newval, mutex_ticket, len, proc); - MEM_FENCE; -} - - -/*\ Acquire mutex for "proc" - * -must be executed in hrecv/AM handler thread - * -application thread must use generic_lock routine -\*/ -int armci_server_lock_mutex(int mutex, int proc, msg_tag_t tag) -{ -int myturn; -int *mutex_ticket, next_in_line, len=sizeof(int); -int owner = armci_me; - - - if(DEBUG)fprintf(stderr,"SLOCK=%d owner=%d p=%d m=%d\n", - armci_me,owner, proc,mutex); - - mutex_ticket= glob_mutex[owner].turn + mutex; - myturn = register_in_mutex_queue(mutex, owner); - - armci_copy(mutex_ticket, &next_in_line, len); - - if(next_in_line > myturn) - armci_die2("armci-s: problem with tickets",myturn,next_in_line); - - if(next_in_line != myturn){ - if(!blocked)armci_serv_mutex_create(); - blocked[proc].mutex = mutex; - blocked[proc].turn = myturn; - blocked[proc].tag = tag; - if(DEBUG) fprintf(stderr,"SLOCK=%d proc=%d blocked (%d,%d)\n", - armci_me, proc, next_in_line,myturn); - return -1; - - } else { - - if(DEBUG) fprintf(stderr,"SLOCK=%d proc=%d sending ticket (%d)\n", - armci_me, proc, myturn); - - /* send ticket to requesting node */ - /* GA_SEND_REPLY(tag, &myturn, sizeof(int), proc); */ - return (myturn); - } -} - - - -/*\ Release mutex "id" held by proc - * called from hrecv/AM handler AND application thread -\*/ -int armci_server_unlock_mutex(int mutex, int proc, int Ticket, msg_tag_t* ptag) -{ -#define NOBODY -1 -int owner = armci_me; -int i, p=NOBODY, *mutex_ticket= glob_mutex[owner].turn + mutex; -int len=sizeof(int); - - if(DEBUG) fprintf(stderr,"SUNLOCK=%d node=%d mutex=%d ticket=%d\n", - armci_me,proc,mutex,Ticket); - - Ticket++; - armci_copy(&Ticket, mutex_ticket, len); - - /* if mutex is free then nobody is reqistered in queue */ - if(armci_mutex_free(mutex, proc)) return -1; - - /* search for the next process in queue waiting for this mutex */ - for(i=0; i< armci_nproc; i++){ - if(!blocked)break; /* not allocated yet - nobody is waiting */ - if(DEBUG)fprintf(stderr,"SUNLOCK=%d node=%d list=(%d,%d)\n", - armci_me, i, blocked[i].mutex, blocked[i].turn); - if((blocked[i].mutex == mutex) && (blocked[i].turn == Ticket)){ - p = i; - break; - } - } - - /* send Ticket to a process waiting for mutex */ - if(p != NOBODY){ - if(p == armci_me)armci_die("server_unlock: cannot unlock self",0); - else { - - if(DEBUG)fprintf(stderr,"SUNLOCK=%d node=%d unlock ticket=%d go=%d\n", - armci_me, proc, Ticket, p); - - /* GA_SEND_REPLY(blocked[p].tag, &Ticket, sizeof(int), p); */ - *ptag = blocked[p].tag; - return p; - - } - } - - return -1; /* nobody is waiting */ -} - - - -void PARMCI_Lock(int mutex, int proc) -{ -#if defined(SERVER_LOCK) -int direct; -#endif - - if(DEBUG)fprintf(stderr,"%d enter lock\n",armci_me); - - if(!num_mutexes) armci_die("armci_lock: create mutexes first",0); - - if(mutex > glob_mutex[proc].count) - armci_die2("armci_lock: mutex not allocated", mutex, - glob_mutex[proc].count); - - if(armci_nproc == 1) return; - -# if defined(SERVER_LOCK) - direct=SAMECLUSNODE(proc); - if(!direct) - armci_rem_lock(mutex,proc, glob_mutex[proc].tickets + mutex ); - else -# endif - armci_generic_lock(mutex,proc); - - if(DEBUG)fprintf(stderr,"%d leave lock\n",armci_me); -} - - - -void PARMCI_Unlock(int mutex, int proc) -{ - if(DEBUG)fprintf(stderr,"%d enter unlock\n",armci_me); - - if(!num_mutexes) armci_die("armci_lock: create mutexes first",0); - - if(mutex > glob_mutex[proc].count) - armci_die2("armci_lock: mutex not allocated", mutex, - glob_mutex[proc].count); - - if(armci_nproc == 1) return; - -# if defined(SERVER_LOCK) - if(armci_nclus >1) { - if(proc != armci_me) - armci_rem_unlock(mutex, proc, glob_mutex[proc].tickets[mutex]); - else { - int ticket = glob_mutex[proc].tickets[mutex]; - msg_tag_t tag; - int waiting; - - waiting = armci_server_unlock_mutex(mutex, proc, ticket, &tag); - if(waiting >-1) - armci_unlock_waiting_process(tag, waiting, ++ticket); - } - } - else -# endif - armci_generic_unlock(mutex, proc); - - if(DEBUG)fprintf(stderr,"%d leave unlock\n",armci_me); -} diff --git a/armci/src-gemini/new_memory.c b/armci/src-gemini/new_memory.c deleted file mode 100644 index a0f0df8b8..000000000 --- a/armci/src-gemini/new_memory.c +++ /dev/null @@ -1,409 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include -#include -#include -#include -#include -#include -#include "armcip.h" -#include "message.h" - -#define DEBUG_ 0 -#define USE_SHMEM_ -#define SHM_UNIT 1024 - -void armci_print_ptr(void **ptr_arr, int bytes, int size, void* myptr, int off) -{ -int i; -int nproc = armci_clus_info[armci_clus_me].nslave; - - ARMCI_PR_DBG("enter",0); - for(i=0; i< armci_nproc; i++){ - int j; - if(armci_me ==i){ - printf("%d master =%d nproc=%d off=%d\n",armci_me, - armci_master,nproc, off); - printf("%d:bytes=%d mptr=%p s=%d ",armci_me, bytes, myptr,size); - for(j = 0; j< armci_nproc; j++)printf(" %p",ptr_arr[j]); - printf("\n"); fflush(stdout); - } - armci_msg_barrier(); - } - ARMCI_PR_DBG("exit",0); -} - - -/******************************************************************** - * Non-collective Memory Allocation on shared memory systems -\*/ -void armci_shmem_memget(armci_meminfo_t *meminfo, size_t size) { - void *myptr=NULL; - void *armci_ptr=NULL; /* legal ARCMIptr used in ARMCI data xfer ops */ - - /* can malloc if there is no data server process & has 1 process/node*/ -} - -void* armci_shmem_memat(armci_meminfo_t *meminfo) { - return NULL; -} - -void armci_shmem_memctl(armci_meminfo_t *meminfo) { - -} - -/****** End: Non-collective memory allocation on shared memory systems *****/ - -/** - * Local Memory Allocation and Free - */ -void *PARMCI_Malloc_local(armci_size_t bytes) { - void *rptr; - ARMCI_PR_DBG("enter",0); - ARMCI_PR_DBG("exit",0); - return malloc(bytes); -} - -int PARMCI_Free_local(void *ptr) { - ARMCI_PR_DBG("enter",0); - free(ptr); - ARMCI_PR_DBG("exit",0); - return 0; -} - -/*\ A wrapper to shmget. Just to be sure that ID is not 0. -\*/ -static int armci_shmget(size_t size,char *from) -{ -int id; - - id = shmget(IPC_PRIVATE, size, (IPC_CREAT | 00600)); - - /*attaching with id 0 somehow fails (Seen on pentium4+linux24+gm163) - *so if id=0, shmget again. */ - while(id==0){ - /* free id=0 and get a new one */ - if(shmctl((int)id,IPC_RMID,(struct shmid_ds *)NULL)) { - fprintf(stderr,"id=%d \n",id); - armci_die("allocate: failed to _delete_ shared region ",id); - } - id = shmget(IPC_PRIVATE, size, (IPC_CREAT | 00600)); - } - if(DEBUG_){ - printf("\n%d:armci_shmget sz=%ld caller=%s id=%d\n",armci_me,(long)size, - from,id); - fflush(stdout); - } - return(id); -} - - -/*\ Collective Memory Allocation - * returns array of pointers to blocks of memory allocated by everybody - * Note: as the same shared memory region can be mapped at different locations - * in each process address space, the array might hold different values - * on every process. However, the addresses are legitimate - * and can be used in the ARMCI data transfer operations. - * ptr_arr[nproc] -\*/ -#define CLEANUP_CMD(command) sprintf(command,"/usr/bin/ipcrm shm %d",id); -int PARMCI_Malloc(void *ptr_arr[], armci_size_t bytes) -{ -int mynslave = armci_clus_info[armci_clus_me].nslave; -void *servptr,*mynodeptrs[mynslave]; -int id,nodeids[mynslave],mynodeid=armci_me-armci_master; - - ARMCI_PR_DBG("enter",0); -#ifdef DEBUG_MEM - fprintf(stderr,"%d bytes in armci_malloc %d\n",armci_me, (int)bytes); - fflush(stderr); - armci_msg_barrier(); -#endif - if(bytes>0){ - if(mynslave>1){ - -#ifdef DEBUG_MEM - printf("\n%d:%s:mynslave is %d",armci_me,__FUNCTION__,mynslave);fflush(stdout); -#endif - bzero((void *)nodeids,sizeof(int)*mynslave); - id =nodeids[mynodeid]= armci_shmget(bytes,"PARMCI_Malloc"); - armci_msg_gop_scope(SCOPE_NODE,nodeids,mynslave,"+",ARMCI_INT); - for(int i=0;i1){ - servptr = armci_server_ptr(id); - } - else servptr = mynodeptrs[mynodeid]; - - } - else{ -#ifdef DEBUG_MEM - printf("\n%d:%s:mynslave is %d, doing malloc",armci_me,__FUNCTION__,mynslave);fflush(stdout); -#endif - mynodeptrs[mynodeid] = servptr = malloc(bytes); - } - } - else{ - mynodeptrs[mynodeid] = servptr = NULL; - } - - bzero((char*)ptr_arr,armci_nproc*sizeof(void*)); - /*ptr_arr[armci_me] = servptr;*/ - ptr_arr[armci_me] = mynodeptrs[mynodeid]; - armci_exchange_address(ptr_arr,armci_nproc); - - if(mynslave>1)for(int i=0;i1){ - armci_portals_memsetup((long)servptr-(long)ptr_arr[armci_me]); - } - - ARMCI_PR_DBG("exit",0); - return(0); - -} - -/*\ - * Wrapper on PARMCI_Malloc to keep old code from breaking -\*/ -int PARMCI_Malloc_memdev(void *ptr_arr[], armci_size_t bytes, const char *device) -{ - return PARMCI_Malloc(ptr_arr[],bytes); -} - -int PARMCI_Free(void *ptr) -{ - ARMCI_PR_DBG("enter",0); - if(!ptr)return 1; - - ARMCI_PR_DBG("exit",0); - return 0; -} - -/*\ - * Wrapper on PARMCI_Free to keep old code from breaking -\*/ -int PARMCI_Free_memdev(void *ptr) -{ - return PARMCI_Free(ptr); -} - -int ARMCI_Uses_shm() -{ - int uses=0; - -#if (defined(SYSV) || defined(WIN32) || defined(MMAP) ||defined(HITACHI)) \ - && !defined(NO_SHM) -# ifdef RMA_NEEDS_SHMEM - if(armci_nproc >1) uses= 1; /* always unless serial mode */ -# else - if(armci_nproc != armci_nclus)uses= 1; /* only when > 1 node used */ -# endif -#endif - if(DEBUG_) fprintf(stderr,"%d:uses shmem %d\n",armci_me, uses); - return uses; -} -#ifdef MSG_COMMS_MPI - -int ARMCI_Uses_shm_grp(ARMCI_Group *group) -{ - int uses=0, grp_me, grp_nproc, grp_nclus; - ARMCI_PR_DBG("enter",0); - armci_grp_attr_t *grp_attr=ARMCI_Group_getattr(group); - - ARMCI_Group_size(group, &grp_nproc); - ARMCI_Group_rank(group, &grp_me); - grp_nclus = grp_attr->grp_nclus; - -#if (defined(SYSV) || defined(WIN32) || defined(MMAP) ||defined(HITACHI)) \ - && !defined(NO_SHM) -# ifdef RMA_NEEDS_SHMEM - if(grp_nproc >1) uses= 1; /* always unless serial mode */ -# else - if(grp_nproc != grp_nclus)uses= 1; /* only when > 1 node used */ -# endif -#endif - if(DEBUG_) fprintf(stderr,"%d (grp_id=%d):uses shmem %d\n",armci_me, grp_me, uses); - ARMCI_PR_DBG("exit",0); - return uses; -} - -/*\ ************** Begin Group Collective Memory Allocation ****************** - * returns array of pointers to blocks of memory allocated by everybody - * Note: as the same shared memory region can be mapped at different locations - * in each process address space, the array might hold different values - * on every process. However, the addresses are legitimate - * and can be used in the ARMCI data transfer operations. - * ptr_arr[nproc] -\*/ -int ARMCI_Malloc_group(void *ptr_arr[], armci_size_t bytes, - ARMCI_Group *group) -{ - void *ptr; - int grp_me, grp_nproc; - ARMCI_PR_DBG("enter",0); - ARMCI_Group_size(group, &grp_nproc); - ARMCI_Group_rank(group, &grp_me); - if(DEBUG_)fprintf(stderr,"%d (grp_id=%d) bytes in armci_malloc_group %d\n", - armci_me, grp_me, (int)bytes); - - ARMCI_PR_DBG("exit",0); - return(0); -} - -/*\ - * Wrapper on PARMCI_Malloc_group to keep old code from breaking -\*/ -int ARMCI_Malloc_group_memdev(void *ptr_arr[], armci_size_t bytes, - ARMCI_Group *group, const char *device) -{ - return ARMCI_Malloc_group(ptr_arr,bytes,group); -} - - -int ARMCI_Free_group(void *ptr, ARMCI_Group *group) -{ - int grp_me, grp_nproc, grp_master, grp_clus_me; - armci_grp_attr_t *grp_attr=ARMCI_Group_getattr(group); - ARMCI_PR_DBG("enter",0); - - if(!ptr)return 1; - - ARMCI_Group_size(group, &grp_nproc); - ARMCI_Group_rank(group, &grp_me); - if(grp_me == MPI_UNDEFINED) { /* check if the process is in this group */ - armci_die("armci_malloc_group: process is not a member in this group", - armci_me); - } - /* get the group cluster info */ - grp_clus_me = grp_attr->grp_clus_me; - grp_master = grp_attr->grp_clus_info[grp_clus_me].master; - - ARMCI_PR_DBG("exit",0); - return 0; -} -/* ***************** End Group Collective Memory Allocation ******************/ - -/* ************** Begin Non-Collective Memory Allocation ****************** - * Prototype similar to SysV shared memory. - */ - -/** - * CHECK: On Altix we are forced to use SysV as shmalloc is collective. We - * may use a preallocated shmalloc memory, however, it may NOT still solve - * our problem... - * NOTE: "int memflg" option for future optimiztions. - */ -void PARMCI_Memget(size_t bytes, armci_meminfo_t *meminfo, int memflg) { - - void *myptr=NULL; - void *armci_ptr=NULL; /* legal ARCMI ptr used in ARMCI data xfer ops*/ - size_t size = bytes; - - if(size<=0) armci_die("PARMCI_Memget: size must be > 0", (int)size); - if(meminfo==NULL) armci_die("PARMCI_Memget: Invalid arg #2 (NULL ptr)",0); - if(memflg!=0) armci_die("PARMCI_Memget: Invalid memflg", memflg); - - if( !ARMCI_Uses_shm() ) - { - - /* fill the meminfo structure */ - meminfo->armci_addr = armci_ptr; - meminfo->addr = myptr; - meminfo->size = size; - meminfo->cpid = armci_me; - /* meminfo->attr = NULL; */ - } - else - { - armci_shmem_memget(meminfo, size); - } - - if(DEBUG_){ - printf("%d: PARMCI_Memget: addresses server=%p myptr=%p bytes=%ld\n", - armci_me, meminfo->armci_addr, meminfo->addr, bytes); - fflush(stdout); - } -} - -void* PARMCI_Memat(armci_meminfo_t *meminfo, long memflg) { - void *ptr=NULL; - - if(meminfo==NULL) armci_die("PARMCI_Memget: Invalid arg #2 (NULL ptr)",0); - if(memflg!=0) armci_die("PARMCI_Memget: Invalid memflg", memflg); - - if(meminfo->cpid==armci_me) { ptr = meminfo->addr; return ptr; } - - if( !ARMCI_Uses_shm()) - { - ptr = meminfo->addr; - } - else - { - ptr = armci_shmem_memat(meminfo); - } - - if(DEBUG_) - { - printf("%d:PARMCI_Memat: attached addr mptr=%p size=%ld\n", - armci_me, ptr, meminfo->size); fflush(stdout); - } - - return ptr; -} - -void ARMCI_Memdt(armci_meminfo_t *meminfo, int memflg) { - /** - * Do nothing. May be we need to have reference counting in future. This - * is to avoid the case of dangling pointers when the creator of shm - * segment calls Memctl and other processes are still attached to this - * segment - */ -} - -void ARMCI_Memctl(armci_meminfo_t *meminfo) { - - if(meminfo==NULL) armci_die("PARMCI_Memget: Invalid arg #2 (NULL ptr)",0); - - /* only the creator can delete the segment */ - if(meminfo->cpid == armci_me) - { - if( !ARMCI_Uses_shm() ) - { - void *ptr = meminfo->addr; - } - else - { - armci_shmem_memctl(meminfo); - } - } - - meminfo->addr = NULL; - meminfo->armci_addr = NULL; - /* if(meminfo->attr!=NULL) free(meminfo->attr); */ -} - -/* ***************** End Non-Collective Memory Allocation ******************/ - -#endif diff --git a/armci/src-gemini/pack.c b/armci/src-gemini/pack.c deleted file mode 100644 index 8db0b76a7..000000000 --- a/armci/src-gemini/pack.c +++ /dev/null @@ -1,360 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: pack.c,v 1.36.10.1 2006-12-14 13:24:37 manoj Exp $ */ -#include "armcip.h" -#include - -#if !defined(ACC_COPY) &&!defined(CRAY_YMP) &&!defined(CYGNUS)&&!defined(CYGWIN) &&!defined(BGML) -# define REMOTE_OP -#endif - -#if defined(REMOTE_OP) -# define OP_STRIDED armci_rem_strided -#else -# define OP_STRIDED(_a,_b,_c,_d,_e,_f,_g,_h,_i,_delete1,_j,_hdl)\ - armci_op_strided(_a,_b,_c,_d,_e,_f,_g,_h,_i,_j,_hdl) -#endif - - -/*\ determine if patch fits in the ARMCI buffer, and if not - * at which stride level (patch dim) need to decompose it - * *fit_level is the value of stride level to perform packing at - * *nb means number of elements of count[*fit_level] that fit in buf -\*/ -static void armci_fit_buffer(int count[], int stride_levels, int* fit_level, - int *nb, int bufsize) -{ - int bytes=1, sbytes=1; - int level; - - /* find out at which stride level BUFFER becomes too small */ - for(level=0; level<= stride_levels; level++){ - sbytes = bytes; /* store #bytes at current level to save div cost later */ - bytes *= count[level]; - if(bufsize < bytes) break; - } - - /* buffer big enough for entire patch */ - if(bufsize >= bytes){ - *fit_level = stride_levels; - *nb = count[stride_levels]; - return; - } - - /* buffer too small */ - switch (level){ - case 0: - /* smaller than a single column */ - *fit_level = 0; - *nb = bufsize; - break; - case -1: /* one column fits */ - *fit_level = 0; - *nb = sbytes; - break; - default: - /* it could keep nb instances of (level-1)-dimensional patch */ - *fit_level = level; - *nb = bufsize/sbytes; - } -} - - -/*\ The function decomposes a multi-dimensional patch so that it fits in the - * internal ARMCI buffer. - * It works by recursively reducing patch dimension until some portion of the - * subpatch fits in the buffer. - * The recursive process is controlled by "fit_level" and "nb" arguments, - * which have to be set to -1 at the top-level of the recursion tree. - * - * Argument last and variable looplast are used to indicate to sending/packing - * routine that we are dealing with the last portion of the request. - * Due to the recursive nature of packing code, the algorithm is following: - * if last=1 then internal for loop passes 1 for the last chunk - * else it passes 0 - * -\*/ -int armci_pack_strided(int op, void* scale, int proc, - void *src_ptr, int src_stride_arr[], - void* dst_ptr, int dst_stride_arr[], - int count[], int stride_levels, ext_header_t *h, - int fit_level, int nb, int last,armci_ihdl_t nb_handle) -{ - int rc=0, sn, bufsize=BUFSIZE,noswap=0; - void *src, *dst; -#ifdef REMOTE_OP - int flag=0; -#else - int flag=1; -#endif - int b; - static int call_count; - -#ifdef STRIDED_GET_BUFLEN - if(op==GET) bufsize=STRIDED_GET_BUFLEN; -# ifdef HITACHI - else - if(stride_levels || ARMCI_ACC(op)) bufsize=MSG_BUFLEN_SMALL-PAGE_SIZE; -# endif -#endif - -#if (defined(GM_) || defined(VIA_) || defined(VAPI_)) - /*we cant assume that the entire available buffer will be used for data, - fact that the header and descriptor also go in the same buffer should be - considered while packing. - */ - bufsize-=(sizeof(request_header_t)+(MAX_STRIDE_LEVEL+4)*sizeof(int)+2*sizeof(void *)); -# if defined(PIPE_BUFSIZE) && defined(MAX_PIPELINE_CHUNKS) - bufsize-=8*MAX_PIPELINE_CHUNKS; -# endif -#endif - -#ifdef BALANCE_FACTOR - /* Added the following for balancing buffers */ - if(op==PUT){ - int bytes=1, i; - for(i=0; i<= stride_levels; i++) - bytes *= count[i]; - if(bytes > bufsize && bytes/bufsize < 3 && bytes%bufsize < BALANCE_BUFSIZE){ - /* bytes div bufsize - 1 is to increase the balence factor for 3 buffer case */ - bufsize = bytes/ (bytes/bufsize - 1 + BALANCE_FACTOR); - noswap = 1; /*** yuck: if set to 1, error in buffers.c ***/ - } - bytes = bufsize%8; - bufsize -= bytes; - } -#endif - - /* determine decomposition of the patch to fit in the buffer */ - if(fit_level<0){ - armci_fit_buffer(count, stride_levels, &fit_level, &nb, bufsize); - last = 1; - } - -// printf("%s [cp]: pack_strided: flag=%d, bufsize=%ld; fit_level=%d; stride_level=%d; nb=%d\n",Portals_ID(),flag,(long) bufsize,fit_level,stride_levels,nb); - - if(fit_level == stride_levels){ - - /* we can fit subpatch into the buffer */ - int chunk = count[fit_level]; - int dst_stride, src_stride; - - if(nb == chunk){ /* take shortcut when whole patch fits in the buffer */ - if(h) h->last = last?1:0; - if(nb_handle && call_count ){ - nb_handle->bufid=NB_MULTI; - call_count++; - } - return(OP_STRIDED(op, scale, proc, src_ptr, src_stride_arr, - dst_ptr,dst_stride_arr,count,stride_levels,h,flag,nb_handle)); - } - - if(fit_level){ - dst_stride = dst_stride_arr[fit_level -1]; - src_stride = src_stride_arr[fit_level -1]; - }else{ - dst_stride = src_stride = 1; - } - if(op == GET || noswap == 1) b =nb; - else{ b = chunk%nb; if(b==0)b=nb; } /* put smallest piece first */ - - for(sn = 0; sn < chunk; ){ - src = (char*)src_ptr + src_stride* sn; - dst = (char*)dst_ptr + dst_stride* sn; - count[fit_level] = ARMCI_MIN(b, chunk-sn); /*modify count for this level*/ - - if(h) h->last = (last && ((sn+b)>=chunk))? 1: 0 ; - if(nb_handle)call_count++; - rc = OP_STRIDED( op, scale, proc, src, src_stride_arr, - dst,dst_stride_arr,count,fit_level,h,flag,nb_handle); - if(rc) break; - - sn += b; - b = nb; - } - count[fit_level] = chunk; /* restore original count */ - - } - else { - for(sn = 0; sn < count[stride_levels]; sn++){ - int looplast =0; - src = (char*)src_ptr + src_stride_arr[stride_levels -1]* sn; - dst = (char*)dst_ptr + dst_stride_arr[stride_levels -1]* sn; - - if(last && (sn == count[stride_levels]-1)) looplast =1; - rc = armci_pack_strided(op, scale, proc, src, src_stride_arr, - dst, dst_stride_arr, count, stride_levels -1, - h,fit_level, nb, looplast,nb_handle); - if(rc) return rc; - } - } - if(nb_handle && call_count ) - nb_handle->bufid=NB_MULTI; - return rc; -} - -/*\ decompose strided data into chunks and call func on each chunk -\*/ -void armci_dispatch_strided(void *ptr, int stride_arr[], int count[], - int strides, int fit_level, int nb, int bufsize, - void (*fun)(void*,int*,int*,int,void*), void *arg) -{ - int sn,first_call=0; - void *ptr_upd; - - /* determine decomposition of the patch to fit in the buffer */ - if(fit_level<0){ - first_call=1; - armci_fit_buffer(count, strides, &fit_level, &nb, bufsize); - } - - - if(fit_level == strides){ - - /* we can fit subpatch into the buffer */ - int chunk = count[fit_level]; - int stride_upd; - -# ifdef PIPE_MEDIUM_BUFSIZE_ - /* for first call we adjust nb for performance in medium request */ - if(first_call && strides==0) - if(chunk<2*bufsize && chunk>PIPE_MEDIUM_BUFSIZE) - nb = PIPE_MEDIUM_BUFSIZE; -# endif - - if(nb == chunk){ /* take shortcut when whole patch fits in the buffer */ - fun(ptr, stride_arr, count, strides, arg); - } - - if(fit_level) - stride_upd = stride_arr[fit_level -1]; - else - stride_upd = 1; - - for(sn = 0; sn < chunk; sn += nb){ - - ptr_upd = (char*)ptr + stride_upd* sn; - count[fit_level] = ARMCI_MIN(nb, chunk-sn); /*modify count for this level*/ - fun(ptr_upd, stride_arr, count, fit_level, arg); - } - count[fit_level] = chunk; /* restore original count */ - - }else for(sn = 0; sn < count[strides]; sn++){ - ptr_upd = (char*)ptr + stride_arr[strides -1]* sn; - armci_dispatch_strided(ptr_upd, stride_arr, count, strides -1, - fit_level, nb, bufsize, fun, arg); - } -} - -/* how much space is needed to move data + reduced descriptor ? */ -int armci_vector_bytes( armci_giov_t darr[], int len) -{ -int i, bytes=0; - for(i=0; isrc_ptr_array=NULL; - /* go through the sets looking for set to be split */ - for(s=0;sBUFSIZE1){ - - split =(BUFSIZE1 -bytes-2*sizeof(int))/(darr[s].bytes +sizeof(void*)); - if(split == 0) s--; /* no room available - do not split */ - break; - - }else bytes+=csize; - - if(BUFSIZE1 -bytes < 64) break; /* stop here if almost full */ - } - - if(s==len)s--; /* adjust loop counter should be < number of sets */ - *nlen = s+1; - - if(split){ - - /* save the value to be overwritten only if "save" is not filled */ - if(!save->src_ptr_array)*save= darr[s]; - - /* split the set: reduce # of elems, "extra" keeps info for rest of set*/ - *extra = darr[s]; - darr[s].ptr_array_len = split; - extra->ptr_array_len -= split; - extra->src_ptr_array = &extra->src_ptr_array[split]; - extra->dst_ptr_array = &extra->dst_ptr_array[split]; - } -} - - - -int armci_pack_vector(int op, void *scale, armci_giov_t darr[],int len, - int proc,armci_ihdl_t nb_handle) -{ -armci_giov_t extra; /* keeps data remainder of set to be processed in chunks */ -armci_giov_t save; /* keeps original value of set to be processed in chunks */ -armci_giov_t *ndarr; /* points to first array element to be processed now */ -int rc=0, nlen, count=0; - - ndarr = darr; - - save.src_ptr_array=NULL; /* indicates that save slot is empty */ - while(len){ - - armci_split_dscr_array(ndarr, len, &extra, &nlen, &save); -# if defined(REMOTE_OP) - rc = armci_rem_vector(op, scale, ndarr,nlen,proc,0,nb_handle); -# else - if(ARMCI_ACC(op))rc=armci_acc_vector(op,scale,ndarr,nlen,proc); - else rc = armci_copy_vector(op,ndarr,nlen,proc); -# endif - if(rc) break; - - /* non-NULL pointer indicates that set was split */ - if(extra.src_ptr_array){ - - if(nb_handle) { - nb_handle->bufid = NB_MULTI; /*can be set multiple times here; but not reset here*/ - } - - ndarr[nlen-1]=extra; /* set the pointer to remainder of last set */ - nlen--; /* since last set not done in full need to process it again */ - - }else{ - - if(save.src_ptr_array){ - ndarr[0]=save; - save.src_ptr_array=NULL; /* indicates that save slot is empty */ - } - - if(nlen==0) - armci_die("vector packetization problem:buffer too small",BUFSIZE1); - } - - len -=nlen; - ndarr +=nlen; - count ++; - } - - return rc; -} diff --git a/armci/src-gemini/pendbufs.c b/armci/src-gemini/pendbufs.c deleted file mode 100644 index 6f7193efe..000000000 --- a/armci/src-gemini/pendbufs.c +++ /dev/null @@ -1,697 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#if defined(PEND_BUFS) - -#include "pendbufs.h" -#include "armcip.h" -#include -#include -#include -#include - -#define DEBUG_SERVER 0 - -/*-------------------Attributes-------------------------*/ - -/**Attributes to control buffer count and sizes. Implement this way to - hide the global variables, and provide get/set methods.*/ - -#define NUM_ATTRIBUTES 4 -#define ATTRIB_IMMBUF_LEN 0 -#define ATTRIB_IMMBUF_NUM 1 -#define ATTRIB_PNDBUF_LEN 2 -#define ATTRIB_PNDBUF_NUM 3 - -/** List of hidden attributes and their operations. - * @param attid IN Attribute id. Choose from the list above - * @param gs IN Get(=0)/Set(=1) - * @param v IN Value (used only when gs==1) - * @return Value of the attribute on return - */ -static int att_ops(int attid, int gs, int v) { - static not_first[NUM_ATTRIBUTES]; /*auto-init to zero*/ - static val[NUM_ATTRIBUTES]; - assert(attid>=0 && attidIMM_BUF_LEN*/ - INIT=7, /**buf; - proc_waitlist_t *info = &pbuf_proc_list_info[msginfo->from]; - assert(msginfo->tag.imm_msg == 0); - pbuf = _armci_serv_pendbuf_getbuf(); - if(pbuf) { - pbuf->status = INIT; - pbuf->avail = 0; - pbuf->vbuf = vbuf; - memcpy(pbuf->buf, vbuf->buf, sizeof(request_header_t)+msginfo->dscrlen); -/* pbuf_proc_list_info[msginfo->from].waiting_on=pbuf; */ - pbuf->order_prev = info->order_tail; - if(info->order_tail) info->order_tail->order_next = pbuf; - info->order_tail = pbuf; - if(!info->order_head) info->order_head = pbuf; - } - return pbuf; -} - - -/**Free a pending buffer - * @param pbuf IN Pointer to Pending buffer to be freed - * @return none - */ -static void _armci_serv_pendbuf_freebuf(pendbuf_t *pbuf){ - const request_header_t *msginfo = (request_header_t *)pbuf->buf; - proc_waitlist_t *info = &pbuf_proc_list_info[msginfo->from]; - ARMCI_PR_DBG("enter",0); - assert(pbuf != NULL); - pbuf->avail=1; - pbuf->status = -1; - pbuf->vbuf = NULL; -/* assert(info->waiting_on == pbuf); */ -/* info->waiting_on = NULL; */ - if(pbuf->order_prev) - pbuf->order_prev->order_next = pbuf->order_next; - if(pbuf->order_next) - pbuf->order_next->order_prev = pbuf->order_prev; - if(info->order_head == pbuf) { - assert(pbuf->order_prev == NULL); - info->order_head = pbuf->order_next; - } - if(info->order_tail == pbuf) { - assert(pbuf->order_next == NULL); - info->order_tail = pbuf->order_prev; - } - pbuf->order_prev = pbuf->order_next = NULL; /*not necessary here*/ - - _nPendBufsUsed -= 1; - assert(_nPendBufsUsed>=0); - ARMCI_PR_DBG("exit",0); -} - -/** Implement ordering between messages. This function needs to be - * implemented in conjunction with @_armci_serv_pendbuf_promote to - * ensure ordered processing of messages. - * @param vbuf IN Message in immediate buffer being checked - * @return 1 if the message can be progressed (either in-place or - * after copying to a pending buffer). 0 therwise. - */ -static int _armci_serv_pendbuf_can_progress(immbuf_t *vbuf) { - const request_header_t *msginfo=(request_header_t*)vbuf->buf; - const int proc = msginfo->from; - const proc_waitlist_t *info = &pbuf_proc_list_info[proc]; - - if(_pbufOrder == ONE_PBUF_MESG) { - /*Only one pending buffer used at any time*/ - if(_nPendBufsUsed>0) - return 0; - return 1; - } - if(_pbufOrder == ONE_PBUF_MESG_PER_PROC) { - /*Only one non-immediate mesg can be assigned to the pending - buffers at any time*/ - if(info->order_head - || (info->immbuf_wlist_head && info->immbuf_wlist_head!=vbuf)) { - return 0;/*other requests from this process remain*/ - } - if(!IS_IMM_MSG(*msginfo) && _nPendBufsUsed==PENDING_BUF_NUM) { - return 0; /*This buffer needs a free pending buffer*/ - } - assert(info->n_pending == 0 || info->immbuf_wlist_head==vbuf); - return 1; - } - if(_pbufOrder == ACC_NO_ORDER) { - /*Messages are processed in-place in immediate buffers or issued - into pending buffers for progress in order (like - ONE_PBUF_PER_MESG). This rule relaxes ONE_PBUF_PER_MESG by - allowing a sequence of ACCs to be processed in-place/issued - without waiting for the prior ones to complete*/ - int i, nwaiting_on, nacc; - pendbuf_t *ptr; - if(!IS_IMM_MSG(*msginfo) && _nPendBufsUsed==PENDING_BUF_NUM) { -/* printf("%d(s): op=%d from=%d datalen=%d waiting for pending buffers\n",armci_me,msginfo->operation,msginfo->from,msginfo->tag.data_len); */ -/* fflush(stdout); */ - return 0; /*This buffer needs a free pending buffer*/ - } -#if 1 /*commented for now: it does work*/ - if(IS_IMM_MSG(*msginfo) && ARMCI_ACC(msginfo->operation)) { - return 1; - } -#endif - if(info->immbuf_wlist_head && info->immbuf_wlist_head!=vbuf) { -/* printf("%d(s): op=%d from=%d datalen=%d not queue head\n",armci_me,msginfo->operation,msginfo->from,msginfo->tag.data_len); */ -/* fflush(stdout); */ - return 0; /*in order issue*/ - } - - if(!ARMCI_ACC(msginfo->operation)) { - if(info->order_head) - return 0; - return 1; - } - - int check = ARMCI_ACC(msginfo->operation); - assert(check); - for(ptr=info->order_head; ptr!=NULL; ptr=ptr->order_next) { - request_header_t *m = (request_header_t *)ptr->buf; - assert(m->from == msginfo->from); - if(!ARMCI_ACC(m->operation)) - break; - } - if(ptr != NULL) - return 0; - return 1; - } - if(_pbufOrder == PUTACC_SPLIT_ORDER) { - if(!IS_IMM_MSG(*msginfo) && _nPendBufsUsed==PENDING_BUF_NUM) { - return 0; /*This buffer needs a free pending buffer*/ - } - if(info->immbuf_wlist_head && info->immbuf_wlist_head!=vbuf) { - return 0; - } - if(msginfo->operation!=PUT && !ARMCI_ACC(msginfo->operation)) { - if(info->order_head) - return 0; - return 1; - } -#if 1 - if(IS_IMM_MSG(*msginfo) && ARMCI_ACC(msginfo->operation)) { - return 1; - } -#endif - if(IS_IMM_MSG(*msginfo) && info->order_head) - return 0; - return 1; - } - armci_die("Unknown pbuf ordering rule",_pbufOrder); - return 0; -} - -/** Goes through the set of immediate buffers waiting to be processed - * and completed, and identifies a buffer that can be processed - * now. Removes it from the list and returns it. Promote also - * considers availability of pending buffers if need be. - * @return Pointer to buffer that can be processed now. NULL if none exists. - */ -static immbuf_t* _armci_serv_pendbuf_promote() { - immbuf_t *immbuf = NULL; - proc_waitlist_t *info; - - ARMCI_PR_DBG("enter",0); - - assert(_nPendBufsUsed>=0); - if(!pbuf_ordering_plist_head) { - return NULL; /*nothing to promote*/ - } - - info = pbuf_ordering_plist_head; - do { - if(info->immbuf_wlist_head==NULL) { - printf("%d(s): Why is info->immbuf_wlist_head NULL\n", armci_me); - fflush(stdout); - pause(); - } - assert(info->immbuf_wlist_head!=NULL); - assert(info->n_pending>0); - if(_armci_serv_pendbuf_can_progress(info->immbuf_wlist_head)) { - immbuf = info->immbuf_wlist_head; - info->immbuf_wlist_head = immbuf->immbuf_list_next; - info->n_pending -= 1; - immbuf->immbuf_list_next = NULL; - if(!info->immbuf_wlist_head) { - info->immbuf_wlist_tail = NULL; - /*remove this proc from proc list*/ - info->prev->next = info->next; - info->next->prev = info->prev; - if(pbuf_ordering_plist_head == info) { - pbuf_ordering_plist_head = (info->next==info)?NULL:info->next; - } - info->prev = info->next = NULL; - } - break; - } - info = info->next; - } while(info != pbuf_ordering_plist_head); - - if(DEBUG_SERVER) if(immbuf) { - request_header_t *msginfo=(request_header_t*)immbuf->buf; - printf("%d:: promoting a buffer immbuf=%p op=%d from=%d n_pending=%d\n", - armci_me,immbuf,msginfo->operation,msginfo->from,info->n_pending); - fflush(stdout); - } - ARMCI_PR_DBG("exit",0); - return immbuf; -} - -/** Enqueue a message. It could be an immediate message that cannot - * make progress or a non-immediate message that cannot make progress - * either due to ordering constraints or lack of pending buffers. - * @param vbuf IN Immediate buffer to be enqueud - * @return Pending buffer into which the message was enqueued. NULL - * if no pending buffer was allocated (which is always the case for - * immediate messages) - */ -static pendbuf_t* _armci_serv_pendbuf_enqueue(immbuf_t *vbuf) { - request_header_t *msginfo=(request_header_t *)vbuf->buf; - int from = msginfo->from; - pendbuf_t *pbuf; - proc_waitlist_t *info = &pbuf_proc_list_info[msginfo->from]; - ARMCI_PR_DBG("enter",0); - -/* printf("%d: Entered serv_pbuf_enqueue\n", armci_me); */ - - pbuf=NULL; - if(msginfo->tag.imm_msg) { - assert(!_armci_serv_pendbuf_can_progress(vbuf)); - } - else if(_armci_serv_pendbuf_can_progress(vbuf)) { - pbuf = _armci_serv_pendbuf_assignbuf(vbuf); - assert(pbuf != NULL); /*can_progress() should ensure this*/ - } - if(pbuf == NULL) { -/* printf("%d(s):: Enqueing op=%d imm=%d from %d. n_pending=%d\n", armci_me, msginfo->operation, msginfo->tag.imm_msg, msginfo->from,info->n_pending); */ -/* fflush(stdout); */ - vbuf->immbuf_list_next = NULL; - assert(info->n_pending < IMM_BUF_NUM); /*How another message now?*/ - info->n_pending += 1; - - if(!info->immbuf_wlist_head) { - assert(!info->immbuf_wlist_tail); - assert(!info->prev && !info->next); - /*insert proc into proc list*/ - if(!pbuf_ordering_plist_head) { - pbuf_ordering_plist_head=info->next=info->prev=info; - } - else { - info->next = pbuf_ordering_plist_head; - info->prev = pbuf_ordering_plist_head->prev; - pbuf_ordering_plist_head->prev->next = info; - pbuf_ordering_plist_head->prev = info; - } - } - /*insert vbuf into immbuf list for this proc*/ - if(info->immbuf_wlist_tail) - info->immbuf_wlist_tail->immbuf_list_next=vbuf; - info->immbuf_wlist_tail = vbuf; - if(!info->immbuf_wlist_head) - info->immbuf_wlist_head = vbuf; - } -/* printf("%d: Leaving serv_pbuf_enqueue\n", armci_me); */ - ARMCI_PR_DBG("exit",0); - return pbuf; -} - -/** Progress GET requests. - * @param pbuf IN Pending buffer containing the GET request - * @return none - */ -static void _armci_serv_pendbuf_progress_get(pendbuf_t *pbuf) { - int index = (pbuf - serv_pendbuf_arr); - request_header_t *msginfo = (request_header_t *)pbuf->buf; - void *buffer =((char *)(msginfo+1)+msginfo->dscrlen); - int *status = &pbuf->status; - - assert(sizeof(request_header_t)+msginfo->dscrlen+msginfo->datalendscrlen <= IMM_BUF_LEN) { - /*Have the header and descriptor; go process*/ - armci_complete_pendbuf(pbuf); - *status = SEND_DATA_PENDING; - } - else { /*Need to get rest of descriptor*/ - const int bytes = sizeof(request_header_t)+msginfo->dscrlen-IMM_BUF_LEN; -#warning "PEND_BUFS: Abusing msginfo->tag.ack_ptr for GETS with large descriptors!" - assert(msginfo->tag.ack_ptr != NULL); /*sanity check. Should point to tag.ack on the client side*/ - void *lptr = ((char *)msginfo)+IMM_BUF_LEN; - void *rptr = ((char *)msginfo->tag.ack_ptr) - (int)(&((request_header_t *)0)->tag.ack) + IMM_BUF_LEN; -/* printf("%d(s):: GET getting rest of descriptor index=%d bytes=%d ptr=%p from=%d\n", */ -/* armci_me,index,bytes,rptr,msginfo->from); */ -/* fflush(stdout); */ - assert(IMM_BUF_LEN+bytes < PENDING_BUF_LEN); - armci_pbuf_start_get(rptr,lptr,bytes,msginfo->from,index); - *status = RECV_DSCR_PENDING; - } - break; - case RECV_DSCR_PENDING: - armci_die("call_data_server should set status to RECV_DSCR_DONE before calling progress",*status); - break; - case SEND_DATA_PENDING: - armci_die("call_data_server should set status to SEND_DATA_DONE before calling progress",*status); - break; - case RECV_DSCR_DONE: -/* printf("%d(s):: GET. Done recving descriptor index=%d op=%d datalen=%d from=%d\n", */ -/* armci_me,index,msginfo->operation,msginfo->datalen,msginfo->from); */ -/* fflush(stdout); */ - armci_complete_pendbuf(pbuf); - *status = SEND_DATA_PENDING; - break; - case SEND_DATA_DONE: - _armci_serv_pendbuf_freebuf(pbuf); - break; - case RECV_DATA_PENDING: - case RECV_DATA_DONE: - default: - armci_die("pendbuf_progress_get: invalid status", *status); - } -} - -/** Progress PUT/ACC requests. - * @param pbuf IN Pending buffer containing the PUT/ACC request - * @return none - */ -static void _armci_serv_pendbuf_progress_putacc(pendbuf_t *pbuf) { - int index = (pbuf - serv_pendbuf_arr); - request_header_t *msginfo = (request_header_t *)pbuf->buf; - void *buffer =((char *)(msginfo+1))+msginfo->dscrlen; - int *status = &pbuf->status; - - assert(msginfo->operation==PUT || ARMCI_ACC(msginfo->operation)); - assert(sizeof(request_header_t)+msginfo->dscrlen+msginfo->datalenoperation,msginfo->from); */ -/* fflush(stdout); */ - if(sizeof(request_header_t)+msginfo->dscrlen <= IMM_BUF_LEN) { - /*Have the header and descriptor; go process*/ - assert(sizeof(request_header_t)+msginfo->dscrlen+msginfo->tag.data_len < PENDING_BUF_LEN); - armci_pbuf_start_get(msginfo->tag.data_ptr,buffer,msginfo->tag.data_len, - msginfo->from, index); -/* printf("%d(s): PUT/ACC getting data. pbuf_num=%d data_ptr=%p data_len=%d bytes=%d\n", armci_me,index,msginfo->tag.data_ptr, msginfo->tag.data_len,msginfo->bytes); */ - *status = RECV_DATA_PENDING; - } - else { /*Need to get rest of descriptor*/ - const int bytes = sizeof(request_header_t)+msginfo->dscrlen-IMM_BUF_LEN; -#warning "PEND_BUFS: Abusing msginfo->tag.ack_ptr for GETS with large descriptors!" - assert(msginfo->tag.ack_ptr != NULL); /*sanity check. Should point to tag.ack on the client side*/ - void *lptr = ((char *)msginfo)+IMM_BUF_LEN; - void *rptr = ((char *)msginfo->tag.ack_ptr) - (int)(&((request_header_t *)0)->tag.ack) + IMM_BUF_LEN; -/* printf("%d(s):: PUT getting rest of descriptor index=%d bytes=%d ptr=%p from=%d\n", */ -/* armci_me,index,bytes,rptr,msginfo->from); */ -/* fflush(stdout); */ - assert(IMM_BUF_LEN+bytes < PENDING_BUF_LEN); - armci_pbuf_start_get(rptr,lptr,bytes,msginfo->from,index); - *status = RECV_DSCR_PENDING; - } - break; - case RECV_DSCR_PENDING: - armci_die("call_data_server should set status to RECV_DSCR_DONE before calling progress",*status); - break; - case RECV_DATA_PENDING: - armci_die("call_data_server should set status to RECV_DONE before calling progress",*status); - break; - case RECV_DSCR_DONE: - assert(sizeof(request_header_t)+msginfo->dscrlen+msginfo->tag.data_len < PENDING_BUF_LEN); - armci_pbuf_start_get(msginfo->tag.data_ptr,buffer,msginfo->tag.data_len, - msginfo->from, index); -/* printf("%d(s): PUT/ACC getting data. pbuf_num=%d data_ptr=%p data_len=%d bytes=%d\n", armci_me,index,msginfo->tag.data_ptr, msginfo->tag.data_len,msginfo->bytes); */ - *status = RECV_DATA_PENDING; - break; - case RECV_DATA_DONE: -/* printf("%d(s):: Done PUT/ACC with buf index=%d op=%d datalen=%d from=%d\n", */ -/* armci_me,index,msginfo->operation,msginfo->datalen,msginfo->from); */ -/* fflush(stdout); */ - if(msginfo->operation == PUT && pbuf->order_prev!=NULL) { - assert(pbuf->commit_me == 0); /*Why called so many times in thie - state?*/ - pbuf->commit_me = 1; - break; - } - pbuf->commit_me = 0; - armci_complete_pendbuf(pbuf); - _armci_serv_pendbuf_freebuf(pbuf); - break; - case SEND_DATA_PENDING: - case SEND_DATA_DONE: - default: - armci_die("pendbuf_progress_putacc: invalid status", *status); - } -} - - -/** Make progress on processing a pending buffer. This function, also - * ensures any other waiting messages get processed if they can - * be. Thus, progress and eventual termination is guaranteed by this - * function. - * @param _pbuf IN Pending buffer to make progress on - * @return none - */ -static void _armci_serv_pendbuf_progress(pendbuf_t *_pbuf){ - request_header_t *msginfo = (request_header_t *)_pbuf->buf; - immbuf_t *vbuf = _pbuf->vbuf; - pendbuf_t *pbuf = _pbuf; - - assert(pbuf->vbuf!=NULL); - do { - if(vbuf && !IS_IMM_MSG(*msginfo)) { assert(pbuf->vbuf == vbuf); } -/* printf("%d(s):: progressing op=%d imm=%d from=%d datalen=%d pbuf=%p vbuf=%p n_pending=%d\n", armci_me, */ -/* msginfo->operation,msginfo->tag.imm_msg,msginfo->from,msginfo->datalen, pbuf,vbuf,pbuf_proc_list_info[msginfo->from].n_pending); */ -/* fflush(stdout); */ - if(IS_IMM_MSG(*msginfo)) { - armci_complete_immbuf(vbuf); - } - else { /*non-immediate message*/ - proc_waitlist_t* info = &pbuf_proc_list_info[msginfo->from]; - - do { - assert(pbuf->vbuf == vbuf); - if(msginfo->operation == PUT || ARMCI_ACC(msginfo->operation)) { - _armci_serv_pendbuf_progress_putacc(pbuf); - } - else if (msginfo->operation == GET) { - _armci_serv_pendbuf_progress_get(pbuf); - } - else { - armci_die("pending buffer processing for this op not yet implemented", msginfo->operation); - } - pbuf = info->order_head; - vbuf = pbuf ? pbuf->vbuf : NULL; - } while(info->order_head && info->order_head->commit_me); - } -/* sleep(2); */ - vbuf = _armci_serv_pendbuf_promote(); - if(vbuf) { - msginfo = (request_header_t *)vbuf->buf; - if(!msginfo->tag.imm_msg) { - pbuf = _armci_serv_pendbuf_assignbuf(vbuf); - assert(pbuf != NULL); - } - } - } while(vbuf != NULL); -} - - - -/*----------------External functions--------------------*/ - - -/** Initialize array of pending buffers - * @return none - */ -void armci_pendbuf_init() { - int i; - - ARMCI_PR_DBG("enter",0); - -/* bzero(serv_pendbuf_arr, sizeof(pendbuf_t)*PENDING_BUF_NUM); */ - for(i=0; ibuf; - bzero(pbuf, sizeof(pendbuf_t)); - pbuf->buf = buf; - pbuf->avail=1; - } - - pbuf_ordering_plist_head=NULL; - pbuf_proc_list_info = (proc_waitlist_t *)malloc(sizeof(proc_waitlist_t)*armci_nproc); - assert(pbuf_proc_list_info != NULL); - bzero(pbuf_proc_list_info, sizeof(proc_waitlist_t)*armci_nproc); - ARMCI_PR_DBG("exit",0); -} - -void armci_pendbuf_service_req(immbuf_t *immbuf) { - pendbuf_t *pbuf; - request_header_t *msginfo=(request_header_t*)immbuf->buf; - if(IS_IMM_MSG(*msginfo) && _armci_serv_pendbuf_can_progress(immbuf)) { - /* printf("%d: msg vbuf=%p op=%d from=%d imm=%d datalen=%d bytes=%d data_ptr=%p can progress. Completing it now!\n", */ - /* armci_me, vbuf, msginfo->operation, msginfo->from, msginfo->tag.imm_msg,msginfo->datalen,msginfo->bytes,msginfo->tag.data_ptr); */ - /* fflush(stdout); */ - armci_complete_immbuf(immbuf); - } - else if(pbuf = _armci_serv_pendbuf_enqueue(immbuf)) { - /* printf("%d: msg vbuf=%p op=%d from=%d imm=%d datalen=%d bytes=%d data_ptr=%p got pending buf. Progressing it!\n", */ - /* armci_me, vbuf, msginfo->operation, msginfo->from, msginfo->tag.imm_msg,msginfo->datalen,msginfo->bytes,msginfo->tag.data_ptr); */ - /* fflush(stdout); */ - _armci_serv_pendbuf_progress(pbuf); - } - else { - /* printf("%d: msg vbuf=%p op=%d from=%d imm=%d datalen=%d bytes=%d data_ptr=%p in waitlist!\n", armci_me, vbuf, msginfo->operation, msginfo->from, msginfo->tag.imm_msg,msginfo->datalen,msginfo->bytes,msginfo->tag.data_ptr); */ - /* fflush(stdout); */ - } -} - -/**Network layer reporting to split buffers code that a put completed - * on a pending buffer. - * @param pbufid IN Pending buffer id (specified when starting a - * put). - * @return void - */ -void armci_pendbuf_done_put(int pbufid) { - assert(pbufid>=0 && pbufid=0 && pbufidstatus) { - case RECV_DSCR_PENDING: - pbuf->status = RECV_DSCR_DONE; - break; - case RECV_DATA_PENDING: - pbuf->status = RECV_DATA_DONE; - break; - default: - armci_die("Reporting get done on buf with inappropriate status",pbufid); - } - _armci_serv_pendbuf_progress(pbuf); -} - - -#endif /*PEND_BUFS*/ - diff --git a/armci/src-gemini/pendbufs.h b/armci/src-gemini/pendbufs.h deleted file mode 100644 index d754b0ec8..000000000 --- a/armci/src-gemini/pendbufs.h +++ /dev/null @@ -1,72 +0,0 @@ -/** @file Split buffer implementation. - * @author Sriram Krishnamoorthy - * - * Supports multiple short/immediate buffers posted per client and a - * client-independent number of buffers to handle large messages. - */ -#ifndef _PENDBUFS_H_ -#define _PENDBUFS_H_ - -#if defined(PEND_BUFS) - -#include "armcip.h" -#include "request.h" - - -/**The buf should be the first field in immbuf_t and pendbuf_t. For - example, look at openib.c:armci_rcv_req and maybe other places*/ -typedef struct immbuf_t { - char *buf; /*immediate buffer[IMMBUF_LEN]*/ -/* IMMBUF_NW_T fields; */ - IMMBUF_NW_T - struct immbuf_t *immbuf_list_next; -} immbuf_t; - -typedef struct pendbuf_t { - char *buf; /*pending buffer[PENDBUF_LEN]*/ -/* PENDBUF_NW_T fields; */ - PENDBUF_NW_T - int status; /*state & state) { - desc->state &= ~state; - if(desc->state == 0) desc->done = 1; - return 1; - } else { - printf("event: %s with desc state %x not %x\n",name,desc->state,state); - abort(); - return 0; - } -} - - -int -portals_wait(portals_desc_t *wait_on_desc) { - - int rc; - ptl_event_t ev; - portals_desc_t *desc = NULL; - - while(wait_on_desc->state) { - - rc = portals_eqwait(wait_on_desc->eqh, &ev); - if (rc != PTL_OK) { - printf("eq wait error in portals_wait\n"); - abort(); - } - - desc = (portals_desc_t *) ev.md.user_ptr; - - switch(ev.type) { - - case PTL_EVENT_SEND_START: - if (portals_verbose) printf("%s event: send start\n",Portals_ID()); - notify(desc, STATE_SEND_START, "send start"); - break; - - case PTL_EVENT_SEND_END: - if (portals_verbose) printf("%s event: send end\n",Portals_ID()); - notify(desc, STATE_SEND_END, "send end"); - break; - - case PTL_EVENT_REPLY_START: - if (portals_verbose) printf("%s event: reply start\n",Portals_ID()); - notify(desc, STATE_REPLY_START, "reply start"); - break; - - case PTL_EVENT_REPLY_END: - if (portals_verbose) printf("%s event: reply end\n",Portals_ID()); - notify(desc, STATE_REPLY_END, "reply end"); - break; - - case PTL_EVENT_ACK: - if (portals_verbose) printf("%s event: ack\n",Portals_ID()); - printf("%s event ack: md.threshold=%d\n",Portals_ID(),ev.md.threshold); - notify(desc, STATE_ACK, "ack"); - break; - - case PTL_EVENT_PUT_START: - if (portals_verbose) printf("%s event: put start\n",Portals_ID()); - notify(desc, STATE_PUT_START, "put start"); - break; - - case PTL_EVENT_PUT_END: - if (portals_verbose) printf("%s event: put end\n",Portals_ID()); - if (notify(desc, STATE_PUT_END, "put end")) { - // desc->len = ev.mlength; - // desc->off = ev.offset; - } - break; - - case PTL_EVENT_GET_START: - if (portals_verbose) printf("%s event: get start\n",Portals_ID()); - notify(desc, STATE_GET_START, "get start"); - break; - - case PTL_EVENT_GET_END: - if (portals_verbose) printf("%s event: get end\n",Portals_ID()); - notify(desc, STATE_GET_END, "get end"); - break; - - case PTL_EVENT_UNLINK: - if (portals_verbose) printf("%s event: unlink\n",Portals_ID()); - notify(desc, STATE_UNLINK, "unlink"); - break; - - default: - printf("%s event: %d\n",Portals_ID(), ev.type); - break; - } - - } - - return PTL_OK; -} - - -int -portals_put(portals_desc_t *desc) -{ - int rc; - int threshold = 1; - ptl_md_t md = { 0 }; - ptl_handle_md_t md_handle; - ptl_ack_req_t ack_req = PTL_NOACK_REQ; - - # ifdef PORTALS_PUT_USE_ACK - ack_req = PTL_ACK_REQ; - threshold++; - # endif - - md.start = desc->buffer; - md.length = desc->length; - md.threshold = threshold; - md.options = 0; - # ifndef PORTALS_PUT_USE_START - md.options |= PTL_MD_EVENT_START_DISABLE; - # endif - md.user_ptr = desc; - md.eq_handle = desc->eqh; - - rc = portals_md_bind(desc->nih, md, PTL_UNLINK, &md_handle); - if (rc != PTL_OK) { - printf("failed to bind local md in put; err %d",rc); - Fatal_error(rc); - } - - rc = PtlPut(md_handle, - ack_req, - desc->id, - PORTALS_INDEX, - 0, - desc->mbits, - 0, - desc->hdr); - if (rc != PTL_OK) { - printf("PtlPut err %d\n",rc); - return rc; - } - - desc->done = 0; - desc->state = STATE_SEND_END; - - # ifdef PORTALS_PUT_USE_START - desc->state |= STATE_SEND_START; - # endif - - # ifdef PORTALS_PUT_USE_ACK - desc->state |= STATE_ACK; - # endif - - return PTL_OK; -} - - -int -portals_get(portals_desc_t* desc) -{ - int rc; - ptl_md_t md = { 0 }; - ptl_handle_md_t md_handle; - - md.start = desc->buffer; - md.length = desc->length; - md.threshold = 2; - md.options = 0; - # ifndef PORTALS_GET_USE_START - md.options |= PTL_MD_EVENT_START_DISABLE; - # endif - md.user_ptr = desc; - md.eq_handle = desc->eqh; - - rc = portals_md_bind(desc->nih, md, PTL_UNLINK, &md_handle); - if (rc != PTL_OK) { - printf("failed to bind local md in get; err %d\n",rc); - Fatal_error(rc); - } - - rc = PtlGet(md_handle, - desc->id, - PORTALS_INDEX, - 0, - desc->mbits, - 0); - if (rc != PTL_OK) { - printf("PtlGet err %d\n",rc); - Fatal_error(rc); - } - - desc->done = 0; - desc->state = STATE_REPLY_END | STATE_SEND_END; - - # ifdef PORTALS_GET_USE_START - desc->state |= STATE_REPLY_START; - # endif - - return PTL_OK; -} - - -/* -portals_desc_t* -portals_get_free_desc(void) -{ - int i,rc; - portals_desc_t *desc = NULL; - - while(desc == NULL) { - for(i=0; i= 0;bit--) - { - if ((mask << bit) & (unsigned char)*(data+ptr)) - { - printf("1"); - } - else - { - printf("0"); - } - } - printf(" "); - } - printf("\n"); -} - - -void -portals_print_summary() -{ - printf("PORTALS_MAX_DESCRIPTORS = %d\n",PORTALS_MAX_DESCRIPTORS); - printf("PORTALS_MAX_BUFS = %d\n",PORTALS_MAX_BUFS); - printf("PORTALS_MAX_SMALL_BUFS = %d\n",PORTALS_MAX_SMALL_BUFS); - printf("PORTALS_BUF_SIZE = %d\n",PORTALS_BUF_SIZE); - printf("PORTALS_SMALL_BUF_SIZE = %d\n",PORTALS_SMALL_BUF_SIZE); - printf("PORTALS_NREQUEST_BUFFERS = %d\n",PORTALS_NREQUEST_BUFFERS); - printf("PORTALS_MAX_EAGER_MESSAGE_SIZE = %d\n",PORTALS_MAX_EAGER_MESSAGE_SIZE); - return; -} diff --git a/armci/src-gemini/portals.h b/armci/src-gemini/portals.h deleted file mode 100644 index fe11caebf..000000000 --- a/armci/src-gemini/portals.h +++ /dev/null @@ -1,194 +0,0 @@ -/* ---------------------------------------------------------------------------------------------- *\ - portals.h header -\* ---------------------------------------------------------------------------------------------- */ - # ifndef _PORTALS_H_ - # define _PORTALS_H_ - - # define PORTALS_INDEX 1 - - # define ONE_KB 1024 - # define ONE_MB 1048576 - - # define MAX_DS_MSG_SIZE ONE_MB - - # define PORTALS_MAX_DESCRIPTORS (MAX_BUFS+MAX_SMALL_BUFS) - # define PORTALS_MAX_BUFS MAX_BUFS - # define PORTALS_MAX_SMALL_BUFS MAX_SMALL_BUFS - # define PORTALS_BUF_SIZE MSG_BUFLEN /* defined in requesh.h */ - -/* define small buf length here - formerly request.h */ - # ifdef PORTALS_USE_RENDEZ_VOUS - # define PORTALS_SMALL_BUF_SIZE 1024 /* for use with nwchem only -- will not pass armci test.x */ - # define PORTALS_MAX_EAGER_MESSAGE_SIZE PORTALS_SMALL_BUF_SIZE - # else - # define PORTALS_SMALL_BUF_SIZE 1024 - # define PORTALS_MAX_EAGER_MESSAGE_SIZE PORTALS_BUF_SIZE - # endif - - # define PORTALS_NREQUEST_BUFFERS 40 - # define PORTALS_REQUEST_BUFFER_SIZE_WARNING (128*ONE_MB) - - # define PORTALS_READ_ACCESS 1 - # define PORTALS_WRITE_ACCESS 1000 - - # define MATCH_ALL_MBITS 0x8000000000000000 /* should be set for all data requests */ - # define MATCH_ALL_IBITS ~MATCH_ALL_MBITS /* used to mask out all other bits, but MATCH_ALL */ - - - # define STATE_SEND_START 0x1 - # define STATE_SEND_END 0x2 - # define STATE_REPLY_START 0x4 - # define STATE_REPLY_END 0x8 - # define STATE_ACK 0x10 - # define STATE_PUT_START 0x20 - # define STATE_PUT_END 0x40 - # define STATE_GET_START 0x80 - # define STATE_GET_END 0x100 - # define STATE_UNLINK 0x200 - - - # define DS_RESPONSE_ACK 0x100000000000000 - # define DS_RESPONSE_PUT 0x200000000000000 - # define DS_RESPONSE_GET 0x400000000000000 - - # define PORTALS_ALLOW_NBGETS - # define PORTALS_USE_ARMCI_CLIENT_BUFFERS - - # define PORTALS_PUT_USE_ACK_TURNED_OFF - # define PORTALS_PUT_USE_START_TURNED_OFF - # define PORTALS_GET_USE_START_TURNED_OFF - -/* ---------------------------------------------------------------------------------------------- *\ - portals types -\* ---------------------------------------------------------------------------------------------- */ - typedef struct portals_desc_s { - void* buffer; // used for the md - ptl_size_t length; // used for the md - ptl_process_id_t id; // on whom the operation is acting on - ptl_match_bits_t mbits; // operations destination mbits - ptl_hdr_data_t hdr; // used for puts/unique counter value - - ptl_handle_ni_t nih; // network interface handle - ptl_handle_eq_t eqh; // event handler - ptl_handle_me_t meh; // me handle (if necessary) - ptl_handle_md_t mdh; // md handle (if necessary) - - int state; // track outstanding events remaining on the descriptor - int done; // flag to test whether all work on the descriptor is finished - int noperations; // the number of remote operations allowed on buffer - // this is only used when preposting/pinning CP memory - // for remote operations initiated by the data server - } portals_desc_t; - - - typedef struct portals_ds_req_s { - portals_desc_t req_desc; - portals_desc_t ack_desc; - portals_desc_t data_desc; - ptl_process_id_t dsid; - size_t unique_msg_id; - int active; - int remote_node; - } portals_ds_req_t; - - -/* ---------------------------------------------------------------------------------------------- *\ - portals global variables -\* ---------------------------------------------------------------------------------------------- */ - ptl_handle_ni_t cp_nih; - ptl_handle_ni_t ds_nih; - ptl_handle_eq_t cp_eqh; - ptl_handle_eq_t ds_eqh; - ptl_process_id_t *portals_id_map; - ptl_process_id_t *portals_cloned_id_map; - - int portals_ds_ready; - int portals_cp_finished; - - size_t portalsMaxEagerMessageSize; - - -/* ---------------------------------------------------------------------------------------------- *\ - portals prototypes -\* ---------------------------------------------------------------------------------------------- */ - int portals_init(ptl_handle_ni_t*); - int portals_finalize(ptl_handle_ni_t); - int portals_getid(ptl_handle_ni_t,ptl_process_id_t *); - int portals_free_eq(ptl_handle_eq_t); - int portals_create_eq(ptl_handle_ni_t, ptl_size_t, ptl_handle_eq_t*); - int portals_create_matchall_me(ptl_handle_me_t*); - int portals_me_attach(ptl_handle_ni_t,ptl_process_id_t,ptl_match_bits_t,ptl_match_bits_t,ptl_handle_me_t*); - int portals_me_insert(ptl_handle_me_t,ptl_process_id_t,ptl_match_bits_t,ptl_match_bits_t,ptl_handle_me_t*); - int portals_me_unlink(ptl_handle_me_t); - int portals_md_attach(ptl_handle_me_t,ptl_md_t,ptl_unlink_t,ptl_handle_md_t*); - int portals_md_bind(ptl_handle_ni_t,ptl_md_t,ptl_unlink_t,ptl_handle_md_t*); - - int portals_eqwait(ptl_handle_eq_t,ptl_event_t*); - int portals_put(portals_desc_t*); - int portals_get(portals_desc_t*); - int portals_wait(portals_desc_t*); - - - void* portalsCloneDataServer(void *); - void portalsSpinLockOnInt(volatile int*, int, int); - - void portals_print_event_details(ptl_event_t *ev); - - void Fatal_error(int); - const char *Portals_ID(); - - void bit_print(const char *,int); - void hex_print(const char *,int); - void portals_print_summary(); - -/* ---------------------------------------------------------------------------------------------- *\ - portals data server prototypes -\* ---------------------------------------------------------------------------------------------- */ - void* portals_ds_thread(void* args); - int portals_ds_init(void); - int portals_ds(void); - int portal_send_test_ack(int to,int val); - int portals_ds_requeue_md(int); - - void portals_ds_get_from_cp(void*,ptl_size_t,ptl_process_id_t,ptl_match_bits_t); - - //void ds_handler(DDI_Patch*,ptl_process_id_t); - -/* ---------------------------------------------------------------------------------------------- *\ - portals compute process prototypes -\* ---------------------------------------------------------------------------------------------- */ - int portals_cp_init(void); - int portals_cp_getid(ptl_process_id_t *id); - - void portals_req_send(void *buffer, size_t size, portals_ds_req_t *req); - void portals_req_nbsend(void *buffer, size_t size, portals_ds_req_t *req); - void portals_req_wait(portals_ds_req_t *req); - - void portals_remote_get(void *buffer, request_header_t *msginfo, int remote_node); - void portals_remote_put(void *buffer, request_header_t *msginfo, int remote_node); - void portals_remote_acc(void *buffer, request_header_t *msginfo, int remote_node); - void portals_remote_rmw(void *buffer, request_header_t *msginfo, int remote_node, portals_ds_req_t *req); - void portals_remote_nbget(void *buffer, request_header_t *msginfo, int remote_node, portals_ds_req_t *req); - void portals_remote_nbput(void *buffer, request_header_t *msginfo, int remote_node, portals_ds_req_t *req); - void portals_remote_nbacc(void *buffer, request_header_t *msginfo, int remote_node, portals_ds_req_t *req); - - - void portalsRemoteOperation(void*,size_t,ptl_process_id_t,portals_ds_req_t*); - void portalsRemoteOperationToRank(void*,size_t,int,portals_ds_req_t*); - void portalsRemoteOperationToNode(void*,size_t,int,portals_ds_req_t*); - - void portalsBlockingRemoteOperationToNode(void*,size_t,int); - - -static inline unsigned int cpuid_ebx(unsigned int op) -{ - unsigned int eax, ebx; - - __asm__("cpuid" - : "=a" (eax), "=b" (ebx) - : "0" (op) - : "cx", "dx" ); - return ebx; -} - - # endif diff --git a/armci/src-gemini/portals_cp.c b/armci/src-gemini/portals_cp.c deleted file mode 100644 index fa102ae0e..000000000 --- a/armci/src-gemini/portals_cp.c +++ /dev/null @@ -1,913 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* ---------------------------------------------------------------------------------------------- *\ - portals_cp.c -- compute process portals calls - author: ryan olson - email: ryan@cray.com -\* ---------------------------------------------------------------------------------------------- */ - # include "armcip.h" - # include - # include - # include - # include - -/* ---------------------------------------------------------------------------------------------- *\ -\* ---------------------------------------------------------------------------------------------- */ - static ptl_handle_ni_t cp_nih; - static ptl_handle_eq_t cp_eqh; - static ptl_handle_eq_t cp_tx_eqh; - - static void *portals_eager_send_buffer = NULL; - static size_t portals_unique_msg_counter = 373; - - static int portals_smp_sem = -1; - static int *active_requests_by_node = NULL; - -/* ---------------------------------------------------------------------------------------------- *\ -\* ---------------------------------------------------------------------------------------------- */ - int portals_cp_finished = 0; - - -/* ---------------------------------------------------------------------------------------------- *\ - Implementation -\* ---------------------------------------------------------------------------------------------- */ - -int -portals_cp_init(void) -{ - int rc; - int me; - ptl_process_id_t id; - - rc = portals_init(&cp_nih); - if(rc != PTL_OK) { - printf("error in portals_init: err %d\n",rc); - Fatal_error(rc); - } - - rc = portals_create_eq(cp_nih,10*PORTALS_MAX_DESCRIPTORS,&cp_eqh); - if(rc != PTL_OK) { - printf("failed to create cp event queue; err %d\n",rc); - Fatal_error(911); - } - - rc = portals_create_eq(cp_nih,30,&cp_tx_eqh); - if(rc != PTL_OK) { - printf("failed to create cp_tx event queue; err %d\n",rc); - Fatal_error(911); - } - - rc = portals_cp_getid(&id); - if(rc != PTL_OK) { - printf("failed to get the portals id; err %d\n",rc); - Fatal_error(rc); - } - - /* creating an smp/intra-node communicator */ - MPI_Comm_rank(ARMCI_COMM_WORLD,&me); - MPI_Comm_split(ARMCI_COMM_WORLD,id.nid,me,&portals_smp_comm); - - /* set affinity */ - # ifdef PORTALS_AFFINITY - int smp_np, smp_me; - unsigned long mask; - unsigned int len = sizeof(mask); - unsigned long ncpus; - unsigned int nsockets, siblings; - int cores_per_socket, cps_per_socket; - int verbose = 0; - - MPI_Comm_size(portals_smp_comm,&smp_np); - MPI_Comm_rank(portals_smp_comm,&smp_me); - - - if((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) < 0) { - printf("%d [cp] sysconf(_SC_NPROCESSORS_ONLN) failed; err=%d\n", ncpus); - armci_die("sysconf in init_throttle",911); - } - - - if(sched_getaffinity(0, len, &mask) < 0) { - perror("sched_getaffinity"); - armci_die("getaffinity error in ds_init",911); - } - - if(armci_clus_me == 0 && /* verbose */ 0 ) { - printf("%d [cp]: old affinity = 0x%x, ncpus = %d\n", armci_me, mask, ncpus); - } - - if(smp_me == 0) { - mask = 1 << (ncpus-1); - if(sched_setaffinity(0, len, (cpu_set_t *) &mask) < 0) { - perror("sched_setaffinity to probe the socket count"); - armci_die("setaffinity error in ds_init",911); - } - siblings = cpuid_ebx(1) >>16 & 0xff; - nsockets = ncpus / siblings; - } - MPI_Bcast(&nsockets,1,MPI_INT,0,portals_smp_comm); - - cores_per_socket = ncpus/nsockets; - cps_per_socket = (smp_np / nsockets); - cps_per_socket += (smp_np % nsockets); - if(nsockets > 2) { - armci_die("nsockets > 2 not supported",911); - } - if(smp_me < cps_per_socket) { - mask = 1 << smp_me; - } else { - mask = 1 << (smp_me + (cores_per_socket - cps_per_socket)); - } - - if(sched_setaffinity(0, len, (cpu_set_t *) &mask) < 0) { - perror("sched_setaffinity"); - armci_die("setaffinity error in ds_init",911); - } - - if(sched_getaffinity(0, len, &mask) < 0) { - perror("sched_getaffinity"); - armci_die("getaffinity error (#2) in ds_init",911); - } - - if(armci_clus_me == 0 && verbose) { - printf("%d [cp]: new affinity = 0x%x, ncpus = %d\n", armci_me, mask, ncpus); - } - # endif - - return PTL_OK; -} - - -int -portals_cp_finalize() -{ - int rc; - - # ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - armci_semrm(portals_smp_sem); - # endif - - rc = portals_free_eq(cp_eqh); - if (rc != PTL_OK) { - printf("error freeing cp_eqh; err %d\n",rc); - } - - MPI_Barrier(ARMCI_COMM_WORLD); - MPI_Finalize(); - - portals_cp_finished = 1; - exit(0); - - return PTL_OK; -// return portals_finalize(cp_nih); -} - - -int -portals_cp_getid(ptl_process_id_t *id) -{ - return portals_getid(cp_nih, id); -} - - -static size_t -portals_get_unique_msg_id(void) { - size_t val = armci_me*1000; - portals_unique_msg_counter++; - if(portals_unique_msg_counter == 1000) portals_unique_msg_counter=1; - val += portals_unique_msg_counter; - return val; -} - - -static void -portals_req_clear(portals_ds_req_t *req) -{ - req->active = 0; - req->unique_msg_id = 0; - - req->req_desc.done = 1; - req->req_desc.state = 0; - req->req_desc.eqh = cp_tx_eqh; - - req->ack_desc.done = 1; - req->ack_desc.state = 0; - req->ack_desc.eqh = cp_eqh; - - req->data_desc.done = 1; - req->data_desc.state = 0; - req->data_desc.eqh = cp_eqh; - - req->remote_node = -1; -} - - -static ptl_process_id_t -portals_get_dsid_from_node(int remote_node) -{ - int rank = armci_clus_info[remote_node].master; - if(portals_cloned_id_map) return portals_cloned_id_map[rank]; - else return portals_id_map[rank]; -} - - -static ptl_process_id_t -portals_get_dsid_from_rank(int remote_id) -{ - if(portals_cloned_id_map) return portals_cloned_id_map[remote_id]; - else return portals_id_map[remote_id]; -} - -void -portals_req_nbsend(void *buffer, size_t size, portals_ds_req_t *req) -{ - int rc; - portals_desc_t *desc = &req->req_desc; - - assert(req->unique_msg_id); - assert(size < portalsMaxEagerMessageSize); - assert(req->remote_node >= 0); - - /* ---------------------------------------------------------------------------- *\ - if we get here, we can guarantee that where are no outstanding requests from - this PE to the remote node; however, we can not guarantee that other PEs on - this node aren't talking to the intended data server ... so now we wait on - value in the "shared" array. - \* ---------------------------------------------------------------------------- */ - # ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - int got_lock = 0; - while(!got_lock) { - portalsSpinLockOnInt(&active_requests_by_node[req->remote_node],0,1000); - semaphoreAcquire(portals_smp_sem,1,PORTALS_WRITE_ACCESS); - if(active_requests_by_node[req->remote_node] == 0) { - active_requests_by_node[req->remote_node] = 1; - got_lock = 1; - } - semaphoreRelease(portals_smp_sem,1,PORTALS_WRITE_ACCESS); - } - # endif - - desc->buffer = buffer; - desc->length = size; - desc->id = req->dsid; - desc->mbits = MATCH_ALL_MBITS; - desc->hdr = req->unique_msg_id; - desc->state = 0; - desc->eqh = cp_tx_eqh; - desc->nih = cp_nih; - - rc = portals_put(desc); - if(rc != PTL_OK) { - printf("portals_put err %d\n",rc); - Fatal_error(rc); - } -} - -void -portals_req_send(void *buffer, size_t size, portals_ds_req_t *req) -{ - int rc; - portals_desc_t *desc = &req->req_desc; - - portals_req_nbsend(buffer,size,req); - - rc = portals_wait(desc); - if(rc != PTL_OK) { - printf("portals_wait err %d\n",rc); - Fatal_error(rc); - } -} - - -static inline void -portals_req_wait(portals_ds_req_t *req) -{ - int rc; - - if(req->req_desc.state) { - rc = portals_wait( &(req->req_desc) ); - if(rc != PTL_OK) { - printf("portals wait error on req_desc in req_wait; err=%d\n",rc); - Fatal_error(rc); - } - } - - if(req->ack_desc.state) { - rc = portals_wait( &(req->ack_desc) ); - if(rc != PTL_OK) { - printf("portals wait error on ack_desc in req_wait; err=%d\n",rc); - Fatal_error(rc); - } - } - if(req->data_desc.state) { - rc = portals_wait( &(req->data_desc) ); - if(rc != PTL_OK) { - printf("portals wait error on data_desc in req_wait; err=%d\n",rc); - Fatal_error(rc); - } - } - - req->active = 0; - return; -} - - -void -portalsWaitOnRequest(portals_ds_req_t *req) { - portals_req_wait(req); -} - - -static int -portals_prepost_ack_from_ds(portals_ds_req_t *req) -{ - int rc; - ptl_md_t md; - portals_desc_t *desc = &req->ack_desc; - unsigned long mbits = req->unique_msg_id; - - assert(req->unique_msg_id); - assert(req->remote_node >= 0); - - # ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - desc->buffer = &active_requests_by_node[req->remote_node]; - desc->length = sizeof(int); - # else - desc->buffer = NULL; - desc->length = 0; - # endif - desc->id = req->dsid; - desc->mbits = mbits | DS_RESPONSE_ACK; - desc->hdr = mbits; - desc->eqh = cp_eqh; - - rc = portals_me_attach(cp_nih,desc->id,desc->mbits,0,&desc->meh); - if(rc != PTL_OK) { - printf("me failed in prepost ack\n"); - Fatal_error(rc); - } - - md.start = desc->buffer; - md.length = desc->length; - md.threshold = 1; - md.options = PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE; - md.user_ptr = desc; - md.eq_handle = cp_eqh; - - rc = portals_md_attach(desc->meh,md,PTL_UNLINK,&desc->mdh); - if(rc != PTL_OK) { - printf("md failed in prepost ack\n"); - Fatal_error(rc); - } - - // desc->state = STATE_PUT_END; - // |= needed for rendez-vous gets; put and get using the same descriptor - desc->state |= STATE_PUT_END; - desc->done = 0; -} - - -static int -portals_prepost_put_from_ds(void *buffer, size_t size, portals_ds_req_t *req) -{ - int rc; - int nputs; - ptl_md_t md; - portals_desc_t *desc = &req->data_desc; - unsigned long mbits = req->unique_msg_id; - - assert(req->unique_msg_id); - - desc->buffer = buffer; - desc->length = size; - desc->id = req->dsid; - desc->mbits = mbits | DS_RESPONSE_PUT; - desc->hdr = mbits; - desc->eqh = cp_eqh; - - rc = portals_me_attach(cp_nih,desc->id,desc->mbits,0,&desc->meh); - if(rc != PTL_OK) { - printf("me failed in prepost put\n"); - Fatal_error(rc); - } - - md.start = buffer; - md.length = size; - md.threshold = desc->noperations; - md.options = PTL_MD_OP_PUT - | PTL_MD_EVENT_AUTO_UNLINK_ENABLE - | PTL_MD_EVENT_START_DISABLE - | PTL_MD_EVENT_END_DISABLE; - md.user_ptr = (void *) desc; - md.eq_handle = cp_eqh; - - rc = portals_md_attach(desc->meh,md,PTL_UNLINK,&desc->mdh); - if(rc != PTL_OK) { - printf("md failed in prepost put\n"); - Fatal_error(rc); - } - - // desc->state = STATE_UNLINK; - // |= needed for rendez-vous gets; put and get using the same descriptor - desc->state |= STATE_UNLINK; - desc->done = 0; -} - - -static int -portals_prepost_get_from_ds(void *buffer, size_t size, portals_ds_req_t *req) { - - int rc; - ptl_md_t md; - portals_desc_t *desc = &req->data_desc; - unsigned long mbits = req->unique_msg_id; - - assert(req->unique_msg_id); - - desc->buffer = buffer; - desc->length = size; - desc->id = req->dsid; - desc->mbits = mbits | DS_RESPONSE_GET; - desc->hdr = mbits; - desc->eqh = cp_eqh; - - rc = portals_me_attach(cp_nih,desc->id,desc->mbits,0,&desc->meh); - if(rc != PTL_OK) { - printf("me failed in prepost get\n"); - Fatal_error(rc); - } - - md.start = buffer; - md.length = size; - md.threshold = desc->noperations; - md.options = PTL_MD_OP_GET - | PTL_MD_EVENT_START_DISABLE; - // | PTL_MD_EVENT_AUTO_UNLINK_ENABLE - // | PTL_MD_EVENT_START_DISABLE - // | PTL_MD_EVENT_END_DISABLE; - md.user_ptr = (void *) desc; - md.eq_handle = cp_eqh; - - rc = portals_md_attach(desc->meh,md,PTL_UNLINK,&desc->mdh); - if(rc != PTL_OK) { - printf("md failed in prepost get\n"); - Fatal_error(rc); - } - - // printf("%d: preposted get of lenght=%ld\n",armci_me,size); - // desc->state = STATE_UNLINK; - // desc->state = STATE_GET_END; - // |= needed for rendez-vous gets; put and get using the same descriptor - desc->state |= STATE_GET_END; - desc->done = 0; -} - - -void portalsBlockingRemoteOperationToNode(void *buffer, size_t length, int remote_node) { - portals_ds_req_t req; - portals_req_clear(&req); - portalsRemoteOperationToNode(buffer,length,remote_node,&req); - portalsWaitOnRequest(&req); -} - - -void portalsRemoteOperationToNode(void *buffer, size_t length, int remote_node, portals_ds_req_t *req) -{ - ptl_process_id_t id = portals_get_dsid_from_node(remote_node); - req->remote_node = remote_node; - portalsRemoteOperation(buffer,length,id,req); -} - - -/* -void portalsRemoteOperationToRank(void *buffer, size_t length, int remote_rank, portals_ds_req_t *req) { - ptl_process_id_t id = portals_get_dsid_from_rank(remote_rank); - portalsRemoteOperation(buffer,length,id,req); -} -*/ - - -void -portalsRemoteOperation(void *buffer, size_t length, ptl_process_id_t dsid, portals_ds_req_t *req) -{ - /* --------------------------------------------------------------------- *\ - initialize the data server request - \* --------------------------------------------------------------------- */ - // portals_req_clear(req); - req->active = 1; - req->unique_msg_id = portals_get_unique_msg_id(); - req->dsid = dsid; - - /* --------------------------------------------------------------------- *\ - the only response from the ds will be a 0-byte ack coming in as a put - \* --------------------------------------------------------------------- */ - portals_prepost_ack_from_ds(req); - - /* --------------------------------------------------------------------- *\ - send data request; this is a completely blocking req - \* --------------------------------------------------------------------- */ - portals_req_send(buffer,length,req); -} - - -void -portals_send_oper(int remote_node,int val, portals_ds_req_t *req) -{ - int rc; - request_header_t msg; - - /* --------------------------------------------------------------------- *\ - initialize the data server request - \* --------------------------------------------------------------------- */ - portals_req_clear(req); - req->active = 1; - req->unique_msg_id = portals_get_unique_msg_id(); - req->dsid = portals_get_dsid_from_node(remote_node); - req->remote_node = remote_node; - - /* --------------------------------------------------------------------- *\ - the only response from the ds will be a 0-byte ack coming in as a put - \* --------------------------------------------------------------------- */ - portals_prepost_ack_from_ds(req); - - /* --------------------------------------------------------------------- *\ - prepare data request and send it; this is a completely blocking req - \* --------------------------------------------------------------------- */ - msg.operation = val; - portals_req_send(&msg,sizeof(request_header_t),req); - return; -} - - -void -portals_send_QUIT(int remote_node) -{ - portals_ds_req_t req; - portals_send_oper(remote_node,QUIT,&req); - portals_req_wait(&req); -} - - -static int -portals_determine_remote_op_count(request_header_t *msg) -{ -#ifdef DDI - int nr,nc,np; - int datatype_extent = sizeof(double); - - /* --------------------------------------------------------------------- *\ - previously we have worked with words, but to provide support for - other data types, we must work with bytes. note to developers: - datatype_extent = the size in bytes of the stored datatype - \* --------------------------------------------------------------------- */ - if(msg->size*datatype_extent <= MAX_DS_MSG_SIZE) return 1; - - /* --------------------------------------------------------------------- *\ - the data must be moved in segments; determine patch dimensions - \* --------------------------------------------------------------------- */ - nr = msg->ihi - msg->ilo + 1; - nc = msg->jhi - msg->jlo + 1; - - /* --------------------------------------------------------------------- *\ - each column individually is too long to fit in the buffer - \* --------------------------------------------------------------------- */ - if(nr*datatype_extent < MAX_DS_MSG_SIZE) { - - /* ------------------------------------------------------------------ *\ - np the number of "evenly" sized passed needed to send a column - \* ------------------------------------------------------------------ */ - np = 2; - while(((nr/np)+((nr%np)?1:0)*datatype_extent)>MAX_DS_MSG_SIZE) np++; - - /* ------------------------------------------------------------------ *\ - noperations is np times the number of columns to be sent - \* ------------------------------------------------------------------ */ - return np*nc; - - } - - /* --------------------------------------------------------------------- *\ - determine the number of full columns that can be sent in one pass - break down the subpatch on this metric - \* --------------------------------------------------------------------- */ - else { - - /* ------------------------------------------------------------------ *\ - np is the number of passes needed to send the full patch which - is broken down into "evenly" sized sets of columns that fit in - the allocated buffer region - \* ------------------------------------------------------------------ */ - np = 2; - while(nr*((nc/np)+((nc%np)?1:0))*datatype_extent>MAX_DS_MSG_SIZE) np++; - - /* ------------------------------------------------------------------ *\ - noperations is np - \* ------------------------------------------------------------------ */ - return np; - - } - - assert(0); // should not happen - return -1; -#else - return 1; -#endif -} - -void -portals_remote_rmw(void *buffer, request_header_t *msginfo, int remote_node, portals_ds_req_t *req) -{ - ptl_size_t length; - - /* --------------------------------------------------------------------- *\ - initialize the data server request - \* --------------------------------------------------------------------- */ - portals_req_clear(req); - req->active = 1; - req->unique_msg_id = portals_get_unique_msg_id(); - req->dsid = portals_get_dsid_from_node(remote_node); - req->remote_node = remote_node; - - /* --------------------------------------------------------------------- *\ - prepare the buffer into which the ds will put data - \* --------------------------------------------------------------------- */ - req->data_desc.noperations=portals_determine_remote_op_count(msginfo); - portals_prepost_put_from_ds(buffer,msginfo->datalen,req); - - # ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - portals_prepost_ack_from_ds(req); - # endif - - /* --------------------------------------------------------------------- *\ - send data request - note: from armci_send_req - if get, the value of bytes (local: length) - is msginfo->dscrlen + (hdrlen=sizeof(request_header_t) ... this is - the size of the "data server request message" to be sent - \* --------------------------------------------------------------------- */ - length = sizeof(request_header_t) + msginfo->dscrlen + msginfo->datalen; - portals_req_send(msginfo,length,req); -} - -void -portals_remote_get(void *buffer, request_header_t *msginfo, int remote_node) -{ - portals_ds_req_t req; - portals_remote_nbget(buffer,msginfo,remote_node,&req); - portals_req_wait(&req); -} - -void -portals_remote_nbget(void *buffer, request_header_t *msginfo, int remote_node, portals_ds_req_t *req) -{ - ptl_size_t length; - - /* --------------------------------------------------------------------- *\ - initialize the data server request - \* --------------------------------------------------------------------- */ - portals_req_clear(req); - req->active = 1; - req->unique_msg_id = portals_get_unique_msg_id(); - req->dsid = portals_get_dsid_from_node(remote_node); - req->remote_node = remote_node; - - /* --------------------------------------------------------------------- *\ - prepare the buffer into which the ds will put data - \* --------------------------------------------------------------------- */ - req->data_desc.noperations=portals_determine_remote_op_count(msginfo); - portals_prepost_put_from_ds(buffer,msginfo->datalen,req); - - # ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - portals_prepost_ack_from_ds(req); - # endif - - /* --------------------------------------------------------------------- *\ - send data request - note: from armci_send_req - if get, the value of bytes (local: length) - is msginfo->dscrlen + (hdrlen=sizeof(request_header_t) ... this is - the size of the "data server request message" to be sent - \* --------------------------------------------------------------------- */ - length = sizeof(request_header_t) + msginfo->dscrlen; - - # if defined(PORTALS_USE_RENDEZ_VOUS) - if(length < portalsMaxEagerMessageSize) portals_req_send(msginfo,length,req); - else { - req->data_desc.noperations = 1; - portals_prepost_get_from_ds(msginfo,length,req); - - /* ------------------------------------------------------------------ *\ - send data request: branch here for eager vs. rendez-vous - \* ------------------------------------------------------------------ */ - assert(length <= PORTALS_BUF_SIZE); - portals_req_send(msginfo,sizeof(request_header_t),req); - } - # else - portals_req_send(msginfo,length,req); - # endif -} - - -void -portals_remote_put(void *buffer, request_header_t *msginfo, int remote_node) -{ - portals_ds_req_t req; - portals_remote_nbput(buffer,msginfo,remote_node,&req); - portals_req_wait(&req); -} - - -void -portals_remote_nbput(void *buffer, request_header_t *msginfo, int remote_node, portals_ds_req_t *req) -{ - char *eagerBuffer = NULL; - size_t eagerSendSize = 0; - - /* --------------------------------------------------------------------- *\ - initialize the data server request - \* --------------------------------------------------------------------- */ - portals_req_clear(req); - req->active = 1; - req->unique_msg_id = portals_get_unique_msg_id(); - req->dsid = portals_get_dsid_from_node(remote_node); - req->remote_node = remote_node; - - /* --------------------------------------------------------------------- *\ - prepost ack response from the data server - \* --------------------------------------------------------------------- */ - portals_prepost_ack_from_ds(req); - - /* --------------------------------------------------------------------- *\ - eager vs. rendez-vous messaging - eager: pack and send the message immediate (only for small messages) - developers note: since portals_eager_send_buffer only exists once, - this has to be a blocking send (ie the data is on the wire when - req_send has finished and the buffer can be reused. for greater - overlap, create a set of eager send buffers ... however they have to - be managed ... probably best to do it in a ring. - - note: armci put/acc buffer is prepacked. - \* --------------------------------------------------------------------- */ - eagerSendSize = sizeof(request_header_t) + msginfo->dscrlen + msginfo->datalen; - if(eagerSendSize < portalsMaxEagerMessageSize) { -// printf("sending eager message\n"); - # if 0 /* armci prepacked */ - eagerBuffer = (char *) portals_eager_send_buffer; - memcpy(eagerBuffer,msginfo,sizeof(request_header_t)); - eagerBuffer += sizeof(request_header_t); - memcpy(eagerBuffer,buffer,msginfo->bytes); - # endif - eagerBuffer = (char *) msginfo; /* buffer == msginfo for armci */ - portals_req_send(eagerBuffer,eagerSendSize,req); - } - - /* --------------------------------------------------------------------- *\ - rendez-vous: send the ds a request; ds will "get/pull" data - \* --------------------------------------------------------------------- */ - else { - # ifdef PORTALS_USE_RENDEZ_VOUS - /* ------------------------------------------------------------------ *\ - prepare the buffer into which the ds will put data - \* ------------------------------------------------------------------ */ - // req->data_desc.noperations=portals_determine_remote_op_count(msginfo); - req->data_desc.noperations = 1; - portals_prepost_get_from_ds(msginfo,eagerSendSize,req); - - /* ------------------------------------------------------------------ *\ - send data request: branch here for eager vs. rendez-vous - \* ------------------------------------------------------------------ */ - assert(eagerSendSize <= PORTALS_BUF_SIZE); - portals_req_send(msginfo,sizeof(request_header_t),req); - - # else - printf("%d [cp]: rendez-vous messaging not supported\n",armci_me); - abort(); - # endif - } - - -} - - -#if 0 -void -portals_remote_acc(void *buffer, request_header_t *msginfo, int remote_node) -{ - portals_ds_req_t req; - portals_remote_nbacc(buffer,msginfo,remote_node,&req); - portals_req_wait(&req); -} - - -void -portals_remote_nbacc(void *buffer, request_header_t *msginfo, int remote_node, portals_ds_req_t *req) -{ - char *eagerBuffer = NULL; - size_t eagerSendSize = 0; - - assert(msginfo->bytes); - - /* --------------------------------------------------------------------- *\ - initialize the data server request - \* --------------------------------------------------------------------- */ - portals_req_clear(req); - req->active = 1; - req->unique_msg_id = portals_get_unique_msg_id(); - req->dsid = portals_get_dsid_from_node(remote_node); - - /* --------------------------------------------------------------------- *\ - eager vs. rendez-vous messaging - eager: pack and send the message immediate (only for small messages) - \* --------------------------------------------------------------------- */ - eagerSendSize = msginfo->bytes + sizeof(request_header_t); - if(eagerSendSize < portalsMaxEagerMessageSize) { - - /* ------------------------------------------------------------------ *\ - prepost ack response from the data server - developers note: if you globally fence an array with a collective - operation prior to a section of code and defence it after, then you - don't need to micro manage the fence on a per request basis in that - section; this eliminates the need for a DS ack - \* ------------------------------------------------------------------ */ - portals_prepost_ack_from_ds(req); - - /* ------------------------------------------------------------------ *\ - pack and send eager data request - blocking for now, since portals_eager_send_buffer only exists once - create multiple eager buffers for greater overlap - \* ------------------------------------------------------------------ */ - eagerBuffer = (char *) portals_eager_send_buffer; - memcpy(eagerBuffer,msginfo,sizeof(request_header_t)); - eagerBuffer += sizeof(request_header_t); - memcpy(eagerBuffer,buffer,msginfo->bytes); - eagerBuffer = (char *) portals_eager_send_buffer; - portals_req_send(eagerBuffer,eagerSendSize,req); - } - - /* --------------------------------------------------------------------- *\ - rendez-vous: send the ds a request; ds will "get/pull" data - developers note: a ds ack is not required for a rendez-vous pull, - this is because the ds will not start the pull until a local fence - has been raised (if needed - see note above) - \* --------------------------------------------------------------------- */ - else { - /* ------------------------------------------------------------------ *\ - prepare the buffer from which the ds will pull data - \* ------------------------------------------------------------------ */ - req->data_desc.noperations=portals_determine_remote_op_count(msginfo); - portals_prepost_get_from_ds(buffer,msginfo->bytes,req); - - /* ------------------------------------------------------------------ *\ - send data request - \* ------------------------------------------------------------------ */ - portals_req_send(msginfo,sizeof(request_header_t),req); - portalsWaitOnRequest(req); - } -} -#endif - -extern int armci_shmget(size_t,char*); -extern int armci_semget(int); -extern void *shmat(int,int,int); - -void -portals_cp_init_throttle(int nnodes) -{ - int i, shmid, smp_np, smp_me; - size_t size = nnodes*sizeof(int); - char *buf = NULL; - - MPI_Comm_size(portals_smp_comm,&smp_np); - MPI_Comm_rank(portals_smp_comm,&smp_me); - - - # ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - if(armci_me == armci_master) { - if(smp_me != 0) armci_die("smp_me and armci_master are different",911); - } - - if(smp_me == 0) { - shmid = armci_shmget(size,"portals_cp_init_throttle"); - active_requests_by_node = (int *) shmat(shmid,0,0); - if(active_requests_by_node == (void *) -1) { - printf("%d [cp] shmat failed for shmid %d\n",armci_me,shmid); - armci_die("badness",911); - } - armci_shmrm(shmid); - for(i=0; i - # include - # include - # include - -static ptl_handle_ni_t ds_nih; -static ptl_handle_eq_t ds_eqh; -static ptl_handle_eq_t request_eqh; -static ptl_handle_me_t matchall_meh; - -static int request_buffer_cur_block; -static ptl_md_t request_buffer_md[PORTALS_NREQUEST_BUFFERS]; -static ptl_handle_me_t request_buffer_meh[PORTALS_NREQUEST_BUFFERS]; - -int portals_ds_ready = 0; - -// void *portals_ds_working_buffer = NULL; - -void* -portals_ds_thread(void* args) -{ - portals_ds_init(); - portals_ds(); - portals_ds_finalize(); - portalsSpinLockOnInt(&portals_cp_finished,1,1000); - exit(0); - return NULL; -} - - -int -portals_ds_init() -{ - int i,rc; - size_t bufferSize; - float warningSize; - - portals_ds_ready = 0; - - /* --------------------------------------------------------------------- *\ - unhook set affinity ... data servers can roam - \* --------------------------------------------------------------------- */ - # ifdef PORTALS_AFFINITY - int smp_np, smp_me; - unsigned long mask; - unsigned int len = sizeof(mask); - unsigned long ncpus; - int verbose = 0; - - MPI_Comm_size(portals_smp_comm,&smp_np); - MPI_Comm_rank(portals_smp_comm,&smp_me); - - if((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) < 0) { - printf("%d [ds] sysconf(_SC_NPROCESSORS_ONLN) failed; err=%d\n", armci_me, ncpus); - armci_die("sysconf in init_throttle",911); - } - - if(sched_getaffinity(0, len, &mask) < 0) { - perror("sched_getaffinity"); - armci_die("getaffinity error in ds_init",911); - } - - if(armci_clus_me == 0 && /* verbose */ 0 ) { - printf("%d [ds]: old affinity = 0x%x, ncpus = %d\n", armci_me, mask, ncpus); - } - - if(smp_np == ncpus) { - mask = (1 << ncpus) - 1; /* let the data server roam over all cores */ - } else { - mask = 1 << (ncpus - 1); /* pin the ds to the last core on the node */ - } - - if(sched_setaffinity(0, len, (cpu_set_t *) &mask) < 0) { - perror("sched_setaffinity"); - armci_die("setaffinity error in ds_init",911); - } - - if(sched_getaffinity(0, len, &mask) < 0) { - perror("sched_getaffinity"); - armci_die("getaffinity error (#2) in ds_init",911); - } - - if(armci_clus_me == 0 && verbose) { - printf("%d [ds]: new affinity = 0x%x, ncpus = %d\n", armci_me, mask, ncpus); - } - # endif - - /* --------------------------------------------------------------------- *\ - initialize the network interface - \* --------------------------------------------------------------------- */ - rc = portals_init(&ds_nih); - if (rc != PTL_OK) { - printf("failed to initialize portals on ds; err %d\n",rc); - Fatal_error(rc); - } - - /* --------------------------------------------------------------------- *\ - used for responding to data requests; this keeps the response events - in a separate queue from the multitude of incoming data requests - \* --------------------------------------------------------------------- */ - rc = portals_create_eq(ds_nih, 200, &ds_eqh); - - /* --------------------------------------------------------------------- *\ - used to process incoming data requests. at very large scale we will - have to do some sort of messaging by node group to reduce the worst - case scenario off all to one type operations. use the data server - to message forward from node groups. - \* --------------------------------------------------------------------- */ - i = ARMCI_MAX(6*PORTALS_MAX_DESCRIPTORS*armci_nproc,200); - i = ARMCI_MAX(6*armci_nproc,200); - rc = portals_create_eq(ds_nih, i, &request_eqh); - if (rc != PTL_OK) { - printf("failed to create request event queue"); - Fatal_error(rc); - } - - /* --------------------------------------------------------------------- *\ - create ME list that matches all incoming data requests - this will be a dead ME with no MD ... it will only be used as a - place holder in which the "active" me/md will be placed in front of. - \* --------------------------------------------------------------------- */ - rc = portals_create_matchall_me(&matchall_meh); - if (rc != PTL_OK) { - printf("failed to create matchall ME\n"); - Fatal_error(rc); - } - - /* --------------------------------------------------------------------- *\ - create buffer space for the ds buffer - \* --------------------------------------------------------------------- */ - assert(portalsMaxEagerMessageSize > sizeof(request_header_t)); - bufferSize = portalsMaxEagerMessageSize; - - # ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - bufferSize *= armci_nclus; - # else - bufferSize *= armci_nproc; - # endif - bufferSize = bufferSize/(PORTALS_NREQUEST_BUFFERS-2); - bufferSize = ARMCI_MAX(bufferSize,portalsMaxEagerMessageSize); - - // if(armci_me == 0) printf("%s: bufferSize=%ld\n",Portals_ID(),bufferSize); -/* - if(bufferSize*PORTALS_NREQUEST_BUFFERS > PORTALS_REQUEST_BUFFER_SIZE_WARNING) { - warningSize = (float) bufferSize * PORTALS_NREQUEST_BUFFERS; - warningSize /= ONE_MB; - printf("[data server]: internal request buffer is %.2f MB\n",warningSize); - } -*/ - for(i=0; itag.user_ptr = (void *) &ev; - - if(request->operation == PUT || ARMCI_ACC(request->operation)) { - buffersize = sizeof(request_header_t) + request->dscrlen + request->datalen; - if(buffersize >= portalsMaxEagerMessageSize) { - buffer = (char *) MessageRcvBuffer; - portals_ds_get_from_cp(buffer,buffersize,ev.initiator,ev.hdr_data); - request = (request_header_t *) buffer; - request->tag.user_ptr = (void *) &ev; - armci_data_server(buffer); - // printf("%d: FINISHED RENDEZ-VOUS!\n",armci_me); - break; - } - } - - if(request->operation == GET) { - buffersize = sizeof(request_header_t) + request->dscrlen; - if(buffersize >= portalsMaxEagerMessageSize) { - buffer = (char *) MessageRcvBuffer; - portals_ds_get_from_cp(buffer,buffersize,ev.initiator,ev.hdr_data); - request = (request_header_t *) buffer; - request->tag.user_ptr = (void *) &ev; - armci_data_server(buffer); - // printf("%d: FINISHED RENDEZ-VOUS!\n",armci_me); - break; - } - } - - /* ------------------------------------------------------------- *\ - process request - \* ------------------------------------------------------------- */ - armci_data_server(buffer); - if(request->operation == QUIT) active = 0; - break; - - case PTL_EVENT_UNLINK: -// printf("captured an unlink event!!\n"); -// portals_print_event_details(&ev); - /* - if((long) ev.md.user_ptr != request_buffer_cur_block) { - printf("sanity check failed: user_ptr=%ld; cur_block=%ld\n",(long) ev.md.user_ptr, request_buffer_cur_block); - armci_die("hummm ... unlink issue?",911); - } - */ - portals_ds_requeue_md((long) ev.md.user_ptr); - break; - - default: - printf("unexpected event type %d in recvany\n"); - Fatal_error(911); - break; - } - - } while(active); - -// flush out event q; the only thing that should remain is possibly 1 unlink event; - while( (rc=PtlEQGet(request_eqh, &ev)) != PTL_EQ_EMPTY) { - if(rc == PTL_OK) { - if(ev.type != PTL_EVENT_UNLINK) { - printf("%s: flushing request_eqh: event type=%d\n",Portals_ID(),ev.type); - } else { - portals_ds_requeue_md((long) ev.md.user_ptr); - } - } - else if(rc == PTL_EQ_DROPPED) { - printf("%s: eq dropped\n",Portals_ID()); - } - else { - printf("%s: some error in PtlEQGet; err=%d\n",Portals_ID(),rc); - Fatal_error(rc); - } - } - - return PTL_OK; -} - - - - -int -portals_ds_finalize() -{ - int i,rc; - - // unlink and request buffers - for(i=0; itype, ev->offset, ev->mlength, ev->hdr_data, (long) ev->md.user_ptr); - fflush(stdout); -} - - -int -portals_ds_requeue_md(int i) -{ - int rc; - ptl_handle_me_t meh; - ptl_handle_md_t mdh; - ptl_process_id_t match_id; - ptl_match_bits_t match_bits = MATCH_ALL_MBITS; - ptl_match_bits_t ignore_bits = MATCH_ALL_IBITS; - - match_id.nid = PTL_NID_ANY; - match_id.pid = PTL_PID_ANY; - - rc = portals_me_insert(matchall_meh,match_id,match_bits,ignore_bits,&meh); - if(rc != PTL_OK) { - printf("me insert failed in ds requeue md; err %d\n",rc); - Fatal_error(rc); - } - - rc = portals_md_attach(meh,request_buffer_md[i],PTL_UNLINK,&mdh); - if(rc != PTL_OK) { - printf("md attach failed in ds requeue md; err %d\n",rc); - Fatal_error(rc); - } - - request_buffer_meh[i] = meh; - request_buffer_cur_block++; - if(request_buffer_cur_block == PORTALS_NREQUEST_BUFFERS) request_buffer_cur_block=0; - - return PTL_OK; -} - - -int -portals_create_matchall_me(ptl_handle_me_t* me_handle) -{ - int rc; - ptl_process_id_t match_id; - ptl_match_bits_t match_bits = MATCH_ALL_MBITS; - ptl_match_bits_t ignore_bits = MATCH_ALL_IBITS; - - match_id.nid = PTL_NID_ANY; - match_id.pid = PTL_PID_ANY; - - rc = portals_me_attach(ds_nih,match_id,match_bits,ignore_bits,&matchall_meh); - - if (rc != PTL_OK) { - printf("PtlMEAttachAny err %d in portals_create_melist\n",rc); - return rc; - } - - return rc; -} - - -void -portals_ds_send_ack(ptl_process_id_t id, ptl_match_bits_t mbits) -{ - portals_desc_t desc; - # ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - static int ack = 0; - desc.buffer = &ack; - desc.length = sizeof(int); - # else - desc.buffer = NULL; - desc.length = 0; - # endif - desc.id = id; - desc.mbits = mbits | DS_RESPONSE_ACK; - desc.hdr = mbits; - desc.state = 0; - desc.eqh = ds_eqh; - desc.nih = ds_nih; - portals_put(&desc); - portals_wait(&desc); -} - - -void -portals_ds_send_put(void *buffer, ptl_size_t length, ptl_process_id_t id, ptl_match_bits_t mbits) -{ - portals_desc_t desc; - desc.buffer = buffer; - desc.length = length; - desc.id = id; - desc.mbits = mbits | DS_RESPONSE_PUT; - desc.hdr = mbits; - desc.state = 0; - desc.eqh = ds_eqh; - desc.nih = ds_nih; - portals_put(&desc); - portals_wait(&desc); -} - - -void -portals_ds_get_from_cp(void *buffer, ptl_size_t length, ptl_process_id_t id, ptl_match_bits_t mbits) -{ - portals_desc_t desc; - desc.buffer = buffer; - desc.length = length; - desc.id = id; - desc.mbits = mbits | DS_RESPONSE_GET; - desc.hdr = mbits; - desc.state = 0; - desc.eqh = ds_eqh; - desc.nih = ds_nih; - portals_get(&desc); - portals_wait(&desc); -} - - -#ifdef DDI -static void -ds_handler(DDI_Patch *request, ptl_process_id_t from) -{ - int i,j,nr,nc; - long array[10],*a; - size_t size; - char *data_ptr; - portals_desc_t desc; - ptl_event_t *ev = (ptl_event_t *) request->user_ptr; - - switch(request->oper) { - - case DDI_GET: -// printf("%s received DDI_GET request of size %d\n",Portals_ID(),request->size); - nr = request->ihi - request->ilo + 1; - nc = request->jhi - request->jlo + 1; - if(nr < 0 || nc < 0 || nr > 10 || nc > 1) { - printf("test get dimension problem\n"); - abort(); - } - - if(nr*sizeof(long) != request->size) { - printf("test get request size does not match\n"); - abort(); - } - - for(i=0,j=317; iinitiator; - desc.mbits = ev->hdr_data | DS_RESPONSE_PUT; - desc.hdr = ev->hdr_data; - desc.state = 0; - desc.eqh = ds_eqh; - desc.nih = ds_nih; - portals_put(&desc); - portals_wait(&desc); - break; - - case DDI_PUT: - nr = request->ihi - request->ilo + 1; - nc = request->jhi - request->jlo + 1; - - data_ptr = NULL; - if(ev->mlength > sizeof(DDI_Patch)) { - printf("recv'ed eager put - size %d\n",ev->mlength-sizeof(DDI_Patch)); - data_ptr = (char *) request; - data_ptr += sizeof(DDI_Patch); - } - - if(request->size != ev->mlength-sizeof(DDI_Patch)) { - printf("eager msg buffer length does not match request size %d\n",request->size); - abort(); - } - - a = (long *) data_ptr; - for(i=0; iinitiator,ev->hdr_data); - break; - - case DDI_QUIT: -// printf("%s received DDI_QUIT request\n",Portals_ID()); - portals_ds_send_ack(ev->initiator,ev->hdr_data); -/* - desc.buffer = NULL; - desc.length = 0; - desc.id = ev->initiator; - desc.mbits = ev->hdr_data | DS_RESPONSE_ACK; - desc.hdr = ev->hdr_data; - desc.state = 0; - portals_put(&desc); - portals_wait(&desc); -*/ - break; - - case DDI_MEMORY: - DDI_Memory_server(request->size); - portals_ds_send_ack(ev->initiator,ev->hdr_data); - break; - - default: - printf("%s unknown operation in request=%d\n",Portals_ID(),request->oper); - abort(); - break; - } - - return; -} -#endif diff --git a/armci/src-gemini/request.c b/armci/src-gemini/request.c deleted file mode 100644 index 1de38cc47..000000000 --- a/armci/src-gemini/request.c +++ /dev/null @@ -1,1065 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: request.c,v 1.74.2.11 2007-10-18 06:09:37 d3h325 Exp $ */ -#include "armcip.h" -#include "request.h" -#include "memlock.h" -#include "armci_shmem.h" -#include "copy.h" -#include "gpc.h" -#include -#include - -#define DEBUG_ 0 -#define DEBUG_MEM 0 - -#if 0 -# define MARK_ENTER(func_) { fprintf(stdout, "ENTERING %s\n", func_); fflush(stdout); } -# define MARK_EXIT(func_) { fprintf(stdout, "EXITING %s\n", func_); fflush(stdout); } -#else -# define MARK_ENTER(func_) -# define MARK_EXIT(func_) -#endif - -#if 0 -# define PRNDBG3(m,a1,a2,a3) \ - fprintf(stderr,"DBG %d: " m,armci_me,a1,a2,a3);fflush(stderr) -# define PRNDBG(m) PRNDBG3(m,0,0,0) -# define PRNDBG1(m,a1) PRNDBG3(m,a1,0,0) -# define PRNDBG2(m,a1,a2) PRNDBG3(m,a1,a2,0) -#else -# define PRNDBG(m) -# define PRNDBG1(m,a1) -# define PRNDBG2(m,a1,a2) -# define PRNDBG3(m,a1,a2,a3) -#endif - - -#if !defined(GM) && !defined(VIA) && !defined(LAPI) &&!defined(VAPI) - double _armci_rcv_buf[MSG_BUFLEN_DBL]; - double _armci_snd_buf[MSG_BUFLEN_DBL]; - char* MessageSndBuffer = (char*)_armci_snd_buf; - char* MessageRcvBuffer = (char*)_armci_rcv_buf; -#endif - - -#define MAX_EHLEN 248 -#define ADDBUF(buf,type,val) *(type*)(buf) = (val); (buf) += sizeof(type) -#define GETBUF(buf,type,var) (var) = *(type*)(buf); (buf) += sizeof(type) - -#define ALLIGN8(buf){size_t _adr=(size_t)(buf); \ - _adr>>=3; _adr<<=3; _adr+=8; (buf) = (char*)_adr; } - -#ifndef CLN -# define CLN 1 -#endif -#ifndef SERV -# define SERV 2 -#endif - -/*******************Routines to handle completion descriptor******************/ -/*\ - *Following the the routines to fill a completion descriptor, if necessary - *copy the data to destination based on completion descriptor - *NOTE, THE FOLLOWING ROUTINES ARE FOR CLIENTS ONLY -\*/ - - -/*\Routine to complete a vector request, data is in buf and descriptor in dscr -\*/ -extern int armci_direct_vector_get(request_header_t *msginfo , armci_giov_t darr[], int len, int proc); -static void armci_complete_vector_get(armci_giov_t darr[],int len,void *buf) -{ -int proc; -request_header_t *msginfo = (request_header_t*) buf; - proc = msginfo->to; -#if defined(USE_SOCKET_VECTOR_API) - armci_direct_vector_get(msginfo, darr, len, proc); -#else - armci_rcv_vector_data(proc, msginfo, darr, len); -#endif - FREE_SEND_BUFFER(buf); -} - - - - - - -/*\ Routine called from buffers.c to complete a request for which the buffer was - * used for, so that the buffer can be reused. -\*/ -void armci_complete_req_buf(BUF_INFO_T *info, void *buffer) -{ -request_header_t *msginfo = (request_header_t*) buffer; - ARMCI_PR_DBG("enter",0); - if(info->protocol==0)return; - else if(info->protocol==SDSCR_IN_PLACE){ - char *dscr = info->dscr; - void *loc_ptr; - int stride_levels; - int *loc_stride_arr,*count; - - loc_ptr = *(void**)dscr; dscr += sizeof(void*); - stride_levels = *(int*)dscr; dscr += sizeof(int); - loc_stride_arr = (int*)dscr; dscr += stride_levels*sizeof(int); - count = (int*)dscr; - if(0 || DEBUG_){ - if(armci_me==0){ - printf("\n%d:extracted loc_ptr=%p, stridelevels=%d\n",armci_me, - loc_ptr,stride_levels); - fflush(stdout); - } - } - armci_rcv_strided_data(msginfo->to, msginfo, msginfo->datalen, loc_ptr, - stride_levels,loc_stride_arr,count); - FREE_SEND_BUFFER(msginfo); - } - else if(info->protocol==VDSCR_IN_PLACE || info->protocol==VDSCR_IN_PTR){ - char *dscr; - int len,i; - if(info->protocol==VDSCR_IN_PLACE){ - dscr = info->dscr; - //printf("\n%d:vdscr in place\n",armci_me); - } - else { - dscr = info->ptr.dscrbuf; - //printf("\n%d:vdscr in buf\n",armci_me); - } - GETBUF(dscr, long ,len); - { - armci_giov_t *darr; - darr = (armci_giov_t *)malloc(sizeof(armci_giov_t)*len); - if(!darr)armci_die("malloc in complete_req_buf failed",len); - for(i = 0; i< len; i++){ - int parlen, bytes; - GETBUF(dscr, int, parlen); - GETBUF(dscr, int, bytes); - darr[i].ptr_array_len = parlen; - darr[i].bytes = bytes; - if(msginfo->operation==GET)darr[i].dst_ptr_array=(void **)dscr; - else darr[i].src_ptr_array=(void **)dscr; - dscr+=sizeof(void *)*parlen; - } - if (msginfo->operation==GET) armci_complete_vector_get(darr,len,buffer); - } - } - else - armci_die("armci_complete_req_buf,protocol val invalid",info->protocol); - ARMCI_PR_DBG("exit",0); -} - -extern long x_net_offset(void *,int); -/*\ save a part of strided descriptor needed to complete request - -rmo: it seems as if save_ - -\*/ -void armci_save_strided_dscr(char **bptr, void *rem_ptr,int rem_stride_arr[], - int count[], int stride_levels,int is_nb,int proc) -{ -int i; -char *bufptr=*bptr; -BUF_INFO_T *info=NULL; -long network_offset,tmpoffset; - ARMCI_PR_DBG("enter",0); - - # ifdef PORTALS_UNRESOLVED - if(!is_nb){ - network_offset=x_net_offset(rem_ptr,proc); - if(DEBUG_){printf("\n%d:rem_ptr=%p offset=%d newrem=%p",armci_me,rem_ptr,network_offset,(char *)rem_ptr+network_offset);fflush(stdout);} - rem_ptr = (char *)rem_ptr+network_offset; - } - # endif - - if(is_nb){ - info=BUF_TO_BUFINFO(*bptr); - bufptr = (info->dscr); - } - *(void**)bufptr = rem_ptr; bufptr += sizeof(void*); - *(int*)bufptr = stride_levels; bufptr += sizeof(int); - for(i=0;idscr); - if(armci_me==0) - printf("\n%d:rem_ptr %p=%p stride_levels %d=%d\n",armci_me, - *(void**)bufptr,rem_ptr, - *(int*)(bufptr + sizeof(void*)),stride_levels); - } - /*remote_strided expects the pointer to point to the end of descr hence..*/ - if(is_nb) - info->protocol=SDSCR_IN_PLACE; - else - *bptr=bufptr; - ARMCI_PR_DBG("exit",0); - -} - - -/*\ save a part of vector descriptor needed to complete request -\*/ -void armci_save_vector_dscr(char **bptr,armci_giov_t darr[],int len, - int op,int is_nb, int proc) -{ -int i,size=sizeof(int); -BUF_INFO_T *info; -char *buf,*bufptr=*bptr; -void *rem_ptr; -long offst; - ARMCI_PR_DBG("enter",0); - if(is_nb){ - for(i=0;idscr; - info->protocol=VDSCR_IN_PLACE; - } - else { - info->ptr.dscrbuf = (void *)malloc(size); - buf = (char *)info->ptr.dscrbuf; - info->protocol=VDSCR_IN_PTR; - } - } - else - buf=bufptr; - - ADDBUF(buf,long,len); /* number of sets */ - for(i=0;ibufid to val, else set it to the id of the buf -\*/ -void armci_set_nbhandle_bufid(armci_ihdl_t nb_handle,char *buf,int val) -{ -BUF_INFO_T *info; - if(buf){ - info = BUF_TO_BUFINFO(buf); - val = info->bufid; - } - nb_handle->bufid = val; -} - -/**************End--Routines to handle completion descriptor******************/ - - -/*\ send request to server to LOCK MUTEX -\*/ -void armci_rem_lock(int mutex, int proc, int *ticket) -{ -request_header_t *msginfo; -int *ibuf; -int bufsize = sizeof(request_header_t)+sizeof(int); - - msginfo = (request_header_t*)GET_SEND_BUFFER(bufsize,LOCK,proc); - bzero(msginfo,sizeof(request_header_t)); - - msginfo->datalen = sizeof(int); - msginfo->dscrlen = 0; - msginfo->from = armci_me; - msginfo->to = proc; - msginfo->operation = LOCK; - msginfo->format = mutex; - msginfo->bytes = msginfo->datalen + msginfo->dscrlen; - - ibuf = (int*)(msginfo+1); - *ibuf = mutex; - - armci_send_req(proc, msginfo, bufsize, 0); - - /* receive ticket from server */ - *ticket = *(int*)armci_rcv_data(proc,msginfo,0); - FREE_SEND_BUFFER(msginfo); - - if(DEBUG_)fprintf(stderr,"%d receiving ticket %d\n",armci_me, *ticket); -} - - - - -void armci_server_lock(request_header_t *msginfo) -{ -int *ibuf = (int*)(msginfo+1); -int proc = msginfo->from; -int mutex; -int ticket; - ARMCI_PR_DBG("enter",0); - - mutex = *(int*)ibuf; - - /* acquire lock on behalf of requesting process */ - ticket = armci_server_lock_mutex(mutex, proc, msginfo->tag); - - if(ticket >-1){ - /* got lock */ - msginfo->datalen = sizeof(int); - armci_send_data(msginfo, &ticket); - } - ARMCI_PR_DBG("exit",0); -} - - -/*\ send request to server to UNLOCK MUTEX -\*/ -void armci_rem_unlock(int mutex, int proc, int ticket) -{ -request_header_t *msginfo; -int *ibuf; -int bufsize = sizeof(request_header_t)+sizeof(ticket); - - msginfo = (request_header_t*)GET_SEND_BUFFER(bufsize,UNLOCK,proc); - bzero(msginfo,sizeof(request_header_t)); - - msginfo->dscrlen = msginfo->bytes = sizeof(ticket); - msginfo->datalen = 0; - msginfo->from = armci_me; - msginfo->to = proc; - msginfo->operation = UNLOCK; - msginfo->format = mutex; - ibuf = (int*)(msginfo+1); - *ibuf = ticket; - - if(DEBUG_)fprintf(stderr,"%d sending unlock\n",armci_me); - armci_send_req(proc, msginfo, bufsize,0); -} - - - -/*\ server unlocks mutex and passes lock to the next waiting process -\*/ -void armci_server_unlock(request_header_t *msginfo, char* dscr) -{ - int ticket = *(int*)dscr; - int mutex = msginfo->format; - int proc = msginfo->to; - int waiting; - - waiting = armci_server_unlock_mutex(mutex,proc,ticket,&msginfo->tag); - - if(waiting >-1){ /* -1 means that nobody is waiting */ - - ticket++; - /* pass ticket to the waiting process */ - msginfo->from = waiting; - msginfo->datalen = sizeof(ticket); - armci_send_data(msginfo, &ticket); - - } -} - -void armci_unlock_waiting_process(msg_tag_t tag, int proc, int ticket) -{ -request_header_t header; -request_header_t *msginfo = &header; - - msginfo->datalen = sizeof(int); - msginfo->tag = tag; - msginfo->from = proc; - msginfo->to = armci_me; - armci_send_data(msginfo, &ticket); -} - -void * armci_server_ptr(int id){ -char *buf; -int bufsize = sizeof(int); -request_header_t *msginfo = (request_header_t*)GET_SEND_BUFFER(bufsize,ATTACH,armci_me); - bzero(msginfo,sizeof(request_header_t)); - msginfo->from = armci_me; - msginfo->to = SERVER_NODE(armci_clus_me); - msginfo->dscrlen = 0; - msginfo->datalen = sizeof(int); - msginfo->operation = ATTACH; - msginfo->bytes = msginfo->dscrlen+ msginfo->datalen; - armci_copy(&id, msginfo +1, sizeof(int)); - if(DEBUG_MEM){ - printf("\n%d:attach req:sending id %d \n",armci_me,id);fflush(stdout); - } - armci_send_req(armci_master, msginfo, bufsize,0); - buf= armci_rcv_data(armci_master,msginfo,sizeof(void *));/* receive response */ - if(DEBUG_MEM){ - printf("\n%d:attach req:got %p \n",armci_me,buf);fflush(stdout); - } - FREE_SEND_BUFFER(msginfo); - ARMCI_PR_DBG("exit",0); - return (void *)buf; - -} - -/*\ control message to the server, e.g.: ATTACH to shmem, return ptr etc. -\*/ -void armci_serv_attach_req(void *info, int ilen, long size, void* resp,int rlen) -{ -char *buf; - ARMCI_PR_DBG("enter",0); -int bufsize = 2*sizeof(request_header_t)+ilen + sizeof(long)+sizeof(rlen); -long *idlist=(long *)info; -request_header_t *msginfo = (request_header_t*)GET_SEND_BUFFER(bufsize,ATTACH,armci_me); - bzero(msginfo,sizeof(request_header_t)); - - msginfo->from = armci_me; - msginfo->to = SERVER_NODE(armci_clus_me); - msginfo->dscrlen = ilen; - msginfo->datalen = sizeof(long)+sizeof(int); - msginfo->operation = ATTACH; - msginfo->bytes = msginfo->dscrlen+ msginfo->datalen; - - armci_copy(info, msginfo +1, ilen); - if(DEBUG_MEM){printf("\n%d:sending idlist+1 %d, size %d, idlist[0] %d, idlist[1] %d\n",armci_me,idlist+1,size,idlist[0],idlist[1]);} - buf = ((char*)msginfo) + ilen + sizeof(request_header_t); - *((long*)buf) =size; - *(int*)(buf+ sizeof(long)) = rlen; - armci_send_req(armci_master, msginfo, bufsize,0); - if(rlen){ - buf= armci_rcv_data(armci_master, msginfo,rlen); /* receive response */ - bcopy(buf, resp, rlen); - FREE_SEND_BUFFER(msginfo); - - if(DEBUG_MEM){printf("%d:client attaching got ptr=%p %d bytes\n",armci_me,buf,rlen); - fflush(stdout); - } - } - ARMCI_PR_DBG("exit",0); -} - - -/*\ server initializes its copy of the memory lock data structures -\*/ -static void server_alloc_memlock(void *ptr_myclus) -{ -int i; - - /* for protection, set pointers for processes outside local node NULL */ - memlock_table_array = calloc(armci_nproc,sizeof(void*)); - if(!memlock_table_array) armci_die("malloc failed for ARMCI lock array",0); - - /* set pointers for processes on local cluster node - * ptr_myclus - corresponds to the master process - */ - for(i=0; i< armci_clus_info[armci_clus_me].nslave; i++){ - memlock_table_array[armci_master +i] = ((char*)ptr_myclus) - + MAX_SLOTS*sizeof(memlock_t)*i; - } - - /* set pointer to the use flag */ -#ifdef MEMLOCK_SHMEM_FLAG - armci_use_memlock_table = (int*) (MAX_SLOTS*sizeof(memlock_t) + - (char*) memlock_table_array[armci_clus_last]); - - if(DEBUG_) - fprintf(stderr,"server initialized memlock %p\n",armci_use_memlock_table); -#endif -} - - -static int allocate_memlock=1; - -/*\ server actions triggered by client request to ATTACH -\*/ -void armci_server_ipc(request_header_t* msginfo, void* descr, - void* buffer, int buflen) -{ -double *ptr; -long *idlist = (long*)descr; -long size = *(long*)buffer; -int rlen = *(int*)(sizeof(long)+(char*)buffer); -extern int **_armci_int_mutexes; - ARMCI_PR_DBG("enter",0); - if(size<0) armci_die("armci_server_ipc: size<0",(int)size); - if(DEBUG_MEM)printf("\n%d:got idlist+1 %p, size %d, idlist[0] %d, idlist[1] %d",armci_me,idlist+1,size,idlist[0],idlist[1]); - ptr=(double*)Attach_Shared_Region(idlist+1,size,idlist[0]); - if(!ptr)armci_die("armci_server_ipc: failed to attach",0); - /* provide data server with access to the memory lock data structures */ - if(allocate_memlock){ - allocate_memlock = 0; - server_alloc_memlock(ptr); - } - if(_armci_int_mutexes==NULL){ - printf("unresolved portals external\n"); - abort(); - # ifdef PORTALS_UNRESOLVED - extern int _armci_server_mutex_ready; - extern void *_armci_server_mutex_ptr; - if(_armci_server_mutex_ready){ - _armci_int_mutexes=(int **)_armci_server_mutex_ptr; - } - # endif - } - if(size>0)armci_set_mem_offset(ptr); - - if(msginfo->datalen != sizeof(long)+sizeof(int)) - armci_die("armci_server_ipc: bad msginfo->datalen ",msginfo->datalen); - - if(rlen==sizeof(ptr)){ - msginfo->datalen = rlen; - armci_send_data(msginfo, &ptr); - } - else armci_die("armci_server_ipc: bad rlen",rlen); - ARMCI_PR_DBG("exit",0); -} - - -/*\ send RMW request to server -\*/ -void armci_rem_rmw(int op, void *ploc, void *prem, int extra, int proc) -{ -request_header_t *msginfo; -char *buf; -void *buffer; -int bufsize = sizeof(request_header_t)+sizeof(long)+sizeof(void*); -long offst; - - ARMCI_PR_DBG("enter",0); - msginfo = (request_header_t*)GET_SEND_BUFFER(bufsize,op,proc); - bzero(msginfo,sizeof(request_header_t)); - - msginfo->dscrlen = sizeof(void*); - msginfo->from = armci_me; - msginfo->to = proc; - msginfo->operation = op; - msginfo->datalen = sizeof(long); - # ifdef PORTALS_UNRESOLVED - offst=x_net_offset(prem,proc); - prem = ((char *)prem+offst); - # endif - buf = (char*)(msginfo+1); - ADDBUF(buf, void*, prem); /* pointer is shipped as descriptor */ - - /* data field: extra argument in fetch&add and local value in swap */ - if(op==ARMCI_SWAP){ - ADDBUF(buf, int, *((int*)ploc)); - }else if(op==ARMCI_SWAP_LONG) { - ADDBUF(buf, long, *((long*)ploc) ); - msginfo->datalen = sizeof(long); - }else { - ADDBUF(buf, int, extra); - } - - msginfo->bytes = msginfo->datalen+msginfo->dscrlen ; - - if(DEBUG_){ - printf("%d sending RMW request %d to %d\n",armci_me,op,proc); - fflush(stdout); - } - armci_send_req(proc, msginfo, bufsize,0); - buffer = armci_rcv_data(proc,msginfo,0); /* receive response */ - - if(op==ARMCI_FETCH_AND_ADD || op== ARMCI_SWAP) - *(int*)ploc = *(int*)buffer; - else - *(long*)ploc = *(long*)buffer; - - FREE_SEND_BUFFER(msginfo); - ARMCI_PR_DBG("exit",0); -} - - -/*\ server response to RMW -\*/ -void armci_server_rmw(request_header_t* msginfo,void* ptr, void* pextra) -{ -long lold; -int iold; -void *pold=0; -int op = msginfo->operation; - - ARMCI_PR_DBG("enter",0); - if(DEBUG_){ - printf("%d server: executing RMW from %d. op=%d pextra=%p\n",armci_me,msginfo->from, op, pextra); - fflush(stdout); - } - if(msginfo->datalen != sizeof(long)) - armci_die2("armci_server_rmw: bad datalen=",msginfo->datalen,op); - - /* for swap operations *pextra has the value to swap - * for fetc&add it carries the increment argument - */ - switch(op){ - case ARMCI_SWAP: - iold = *(int*) pextra; - case ARMCI_FETCH_AND_ADD: - pold = &iold; - break; - - case ARMCI_SWAP_LONG: - lold = *(long*) pextra; - case ARMCI_FETCH_AND_ADD_LONG: - pold = &lold; - break; - - default: - armci_die("armci_server_rmw: bad operation code=",op); - } - - armci_generic_rmw(op, pold, *(int**)ptr, *(int*) pextra, msginfo->to); - - armci_send_data(msginfo, pold); - ARMCI_PR_DBG("exit",0); -} - -extern int armci_direct_vector_snd(request_header_t *msginfo , armci_giov_t darr[], int len, int proc); -extern int armci_direct_vector(request_header_t *msginfo , armci_giov_t darr[], int len, int proc); -int armci_rem_vector(int op, void *scale, armci_giov_t darr[],int len,int proc,int flag, armci_ihdl_t nb_handle) -{ - char *buf,*buf0; - request_header_t *msginfo; - int bytes =0, s, slen=0; - size_t adr; - int bufsize = sizeof(request_header_t); - int tag=0; - - if(nb_handle)tag=nb_handle->tag; - - /* compute size of the buffer needed */ - for(s=0; stag,0); - if(nb_handle->bufid == NB_NONE) - armci_set_nbhandle_bufid(nb_handle,buf,0); - } - - buf += sizeof(request_header_t); - - /* fill vector descriptor */ - armci_save_vector_dscr(&buf,darr,len,op,0,proc); - - /* align buf for doubles (8-bytes) before copying data */ - adr = (size_t)buf; - adr >>=3; - adr <<=3; - adr +=8; - buf = (char*)adr; - - msginfo->ehlen = 0; - - /* fill message header */ - msginfo->dscrlen = buf - buf0 - sizeof(request_header_t); - msginfo->from = armci_me; - msginfo->to = proc; - msginfo->operation = op; - msginfo->format = VECTOR; - msginfo->datalen = bytes; - - /* put scale for accumulate */ - switch(op){ - case ARMCI_ACC_INT: - *(int*)buf = *(int*)scale; slen= sizeof(int); break; - case ARMCI_ACC_DCP: - ((double*)buf)[0] = ((double*)scale)[0]; - ((double*)buf)[1] = ((double*)scale)[1]; - slen=2*sizeof(double);break; - case ARMCI_ACC_DBL: - *(double*)buf = *(double*)scale; slen = sizeof(double); break; - case ARMCI_ACC_CPL: - ((float*)buf)[0] = ((float*)scale)[0]; - ((float*)buf)[1] = ((float*)scale)[1]; - slen=2*sizeof(float);break; - case ARMCI_ACC_FLT: - *(float*)buf = *(float*)scale; slen = sizeof(float); break; - default: slen=0; - } - buf += slen; - msginfo->datalen += slen; - msginfo->bytes = msginfo->datalen+msginfo->dscrlen; - - - /* for put and accumulate copy data into buffer */ - if(op != GET){ -/* fprintf(stderr,"sending %lf\n",*(double*)darr[0].src_ptr_array[0]);*/ - armci_vector_to_buf(darr, len, buf); - } - - armci_send_req(proc, msginfo, bufsize,tag); - /*x_buf_send_complete(buf0);*/ - - if(nb_handle && op==GET) armci_save_vector_dscr(&buf0,darr,len,op,1,proc); - if(op == GET&& !nb_handle){ - armci_complete_vector_get(darr,len,msginfo); - } - - return 0; -} - -#define CHUN_ (8*8096) -#define CHUN 200000 - -/*\ client version of remote strided operation -\*/ -int armci_rem_strided(int op, void* scale, int proc, - void *src_ptr, int src_stride_arr[], - void* dst_ptr, int dst_stride_arr[], - int count[], int stride_levels, - ext_header_t *h, int flag,armci_ihdl_t nb_handle) -{ - char *buf, *buf0; - request_header_t *msginfo; - int i, slen=0, bytes; - void *rem_ptr; - int *rem_stride_arr; - int bufsize = sizeof(request_header_t); - int ehlen =0; - msg_tag_t msg_tag; - int tag=0; - - /* we send ext header only for last chunk */ -#if 0 - if(h) ehlen = h->len; -#else - if(h) if(h->last) ehlen = h->len; -#endif - if(ehlen>MAX_EHLEN || ehlen <0) - armci_die2("armci_rem_strided ehlen out of range",MAX_EHLEN,ehlen); - /* calculate size of the buffer needed */ - for(i=0, bytes=1;i<=stride_levels;i++)bytes*=count[i]; - bufsize += bytes+sizeof(void*)+2*sizeof(int)*(stride_levels+1) +ehlen - +2*sizeof(double) + 16; /* +scale+alignment */ - - if (flag){ - printf("%d: flag=%d\n",armci_me,flag); - if(op==GET)bufsize -=bytes; - } - - buf = buf0= GET_SEND_BUFFER((bufsize),op,proc); - msginfo = (request_header_t*)buf; - bzero(msginfo,sizeof(request_header_t)); - - - if(nb_handle) -#ifdef ACC_SMP - if(!ARMCI_ACC(op)) -#endif - { - // printf("%s: non-blocking ops not yet supported\n",Portals_ID()); - // abort(); -/* INIT_SENDBUF_INFO(nb_handle,buf,op,proc); same as _armci_buf_set_tag, why here? */ - _armci_buf_set_tag(buf,nb_handle->tag,0); - if(nb_handle->bufid == NB_NONE) - armci_set_nbhandle_bufid(nb_handle,buf,0); - tag = nb_handle->tag; - } - - if(op == GET){ - rem_ptr = src_ptr; - rem_stride_arr = src_stride_arr; - }else{ - rem_ptr = dst_ptr; - rem_stride_arr = dst_stride_arr; - } - - msginfo->datalen=bytes; - - /* fill strided descriptor */ - buf += sizeof(request_header_t); - /*this function fills the dscr into buf and also moves the buf ptr to the - end of the dscr*/ - armci_save_strided_dscr(&buf,rem_ptr,rem_stride_arr,count,stride_levels,0,proc); - - /* align buf for doubles (8-bytes) before copying data */ - ALLIGN8(buf); - - /* fill message header */ - msginfo->from = armci_me; - msginfo->to = proc; - msginfo->format = STRIDED; - msginfo->operation = op; - - /* put scale for accumulate */ - switch(op){ - case ARMCI_ACC_INT: - *(int*)buf = *(int*)scale; slen= sizeof(int); break; - case ARMCI_ACC_DCP: - ((double*)buf)[0] = ((double*)scale)[0]; - ((double*)buf)[1] = ((double*)scale)[1]; - slen=2*sizeof(double);break; - case ARMCI_ACC_DBL: - *(double*)buf = *(double*)scale; slen = sizeof(double); break; - case ARMCI_ACC_CPL: - ((float*)buf)[0] = ((float*)scale)[0]; - ((float*)buf)[1] = ((float*)scale)[1]; - slen=2*sizeof(float);break; - case ARMCI_ACC_FLT: - *(float*)buf = *(float*)scale; slen = sizeof(float); break; - case ARMCI_ACC_LNG: - *(long*)buf = *(long*)scale; slen = sizeof(long); break; - default: slen=0; - } - - /* - if(ARMCI_ACC(op))printf("%d client len=%d alpha=%lf data=%lf,%lf\n", - armci_me, buf-(char*)msginfo,((double*)buf)[0],*((double*)src_ptr), ((double*)buf)[1]); - */ - - buf += slen; - - /**** add extended header *******/ - if(ehlen){ - bcopy(h->exthdr,buf,ehlen); - i = ehlen%8; ehlen += (8-i); /* make sure buffer is still alligned */ - buf += ehlen; - } - - msginfo->ehlen = ehlen; - msginfo->dscrlen = buf - buf0 - sizeof(request_header_t); - msginfo->bytes = msginfo->datalen+msginfo->dscrlen; - - if(op == GET){ - /* - if(nb_handle) { - printf("%s rem_strided: nb gets not yet available\n",Portals_ID()); - abort(); - } - */ - armci_send_req(proc, msginfo, bufsize,tag); - armci_save_strided_dscr(&buf0,dst_ptr,dst_stride_arr,count, - stride_levels,1,proc); - - if(!nb_handle){ - armci_rcv_strided_data(proc, msginfo, msginfo->datalen, - dst_ptr, stride_levels, dst_stride_arr, count); - FREE_SEND_BUFFER(msginfo); - } - } else { - /* for put and accumulate send data */ - armci_send_strided(proc,msginfo, buf, - src_ptr, stride_levels, src_stride_arr, count,tag); - } - - return 0; -} - - -void armci_process_extheader(request_header_t *msginfo, char *dscr, char* buf, int buflen) -{ - armci_flag_t *h; - int *flag; - - h = (armci_flag_t*)(dscr + msginfo->dscrlen - msginfo->ehlen); -#if 0 - if(msginfo->ehlen)printf("%d:server from=%d len=%d: ptr=%p val=%d\n",armci_me,msginfo->from, msginfo->ehlen,h->ptr,h->val); - fflush(stdout); -#endif - flag = (int*)(h->ptr); - *flag = h->val; -} - -void armci_server(request_header_t *msginfo, char *dscr, char* buf, int buflen) -{ -int buf_stride_arr[MAX_STRIDE_LEVEL+1]; -int *loc_stride_arr,slen; -int *count, stride_levels; -void *buf_ptr, *loc_ptr; -void *scale; -char *dscr_save = dscr; -int rc, i,proc; -int stat; - - ARMCI_PR_DBG("enter",msginfo->datalen);fflush(stdout); - /*return if using readv/socket for put*/ - if(msginfo->operation==PUT && msginfo->datalen==0){ - if(msginfo->ehlen) /* process extra header if available */ - armci_process_extheader(msginfo, dscr, buf, buflen); - return; - } - - /* unpack descriptor record */ - loc_ptr = *(void**)dscr; dscr += sizeof(void*); - stride_levels = *(int*)dscr; dscr += sizeof(int); - loc_stride_arr = (int*)dscr; dscr += stride_levels*sizeof(int); - count = (int*)dscr; - - /* compute stride array for buffer */ - buf_stride_arr[0]=count[0]; - for(i=0; i< stride_levels; i++) - buf_stride_arr[i+1]= buf_stride_arr[i]*count[i+1]; - - /* get scale for accumulate, adjust buf to point to data */ - switch(msginfo->operation){ - case ARMCI_ACC_INT: slen = sizeof(int); break; - case ARMCI_ACC_DCP: slen = 2*sizeof(double); break; - case ARMCI_ACC_DBL: slen = sizeof(double); break; - case ARMCI_ACC_CPL: slen = 2*sizeof(float); break; - case ARMCI_ACC_FLT: slen = sizeof(float); break; - case ARMCI_ACC_LNG: slen = sizeof(long); break; - default: slen=0; - } - - scale = dscr_save+ (msginfo->dscrlen - slen -msginfo->ehlen); -/* - if(ARMCI_ACC(msginfo->operation)) - fprintf(stderr,"%d in server len=%d slen=%d alpha=%lf data=%lf\n", - armci_me, msginfo->dscrlen, slen, *(double*)scale,*(double*)buf); -*/ - - buf_ptr = buf; /* data in buffer */ - - proc = msginfo->to; - - if(msginfo->operation == GET){ - armci_send_strided_data(proc, msginfo, buf, - loc_ptr, stride_levels, loc_stride_arr, count); - /* fprintf(stderr, "GET response sent with tag: %d\n, msginfo->tag", - msginfo->tag); */ - } else{ - if((rc = armci_op_strided(msginfo->operation, scale, proc, - buf_ptr, buf_stride_arr, loc_ptr, loc_stride_arr, - count, stride_levels, 1,NULL))) - armci_die("server_strided: op from buf failed",rc); - } - - if(msginfo->ehlen) /* process extra header if available */ - armci_process_extheader(msginfo, dscr_save, buf, buflen); - ARMCI_PR_DBG("exit",0); -} - - -void armci_server_vector( request_header_t *msginfo, - char *dscr, char* buf, int buflen) -{ - int proc; - long len; - void *scale; - int i,s; - char *sbuf = buf; - if(msginfo->operation==PUT && msginfo->datalen==0)return;/*return if using readv/socket for put*/ - /* unpack descriptor record */ - GETBUF(dscr, long ,len); - - /* get scale for accumulate, adjust buf to point to data */ - scale = buf; - switch(msginfo->operation){ - case ARMCI_ACC_INT: buf += sizeof(int); break; - case ARMCI_ACC_DCP: buf += 2*sizeof(double); break; - case ARMCI_ACC_DBL: buf += sizeof(double); break; - case ARMCI_ACC_CPL: buf += 2*sizeof(float); break; - case ARMCI_ACC_FLT: buf += sizeof(float); break; - } - - proc = msginfo->to; - - /*fprintf(stderr,"scale=%lf\n",*(double*)scale);*/ - /* execute the operation */ - - switch(msginfo->operation) { - case GET: -/* fprintf(stderr, "%d:: Got a vector message!!\n", armci_me); */ - if(msginfo->ehlen) { - armci_die("Unexpected vector message with non-zero ehlen. GPC call?", - msginfo->ehlen); - } - else { - for(i = 0; i< len; i++){ - int parlen, bytes; - void **ptr; - GETBUF(dscr, int, parlen); - GETBUF(dscr, int, bytes); - /* fprintf(stderr,"len=%d bytes=%d parlen=%d\n",len,bytes,parlen);*/ - ptr = (void**)dscr; dscr += parlen*sizeof(char*); - for(s=0; s< parlen; s++){ - armci_copy(ptr[s], buf, bytes); - buf += bytes; - } - } -/* fprintf(stderr,"%d:: VECTOR GET. server sending buffer %p datalen=%d\n",armci_me, sbuf, msginfo->datalen); */ - armci_send_data(msginfo, sbuf); - } - break; - - case PUT: - -/* fprintf(stderr,"received in buffer %lf\n",*(double*)buf);*/ - for(i = 0; i< len; i++){ - int parlen, bytes; - void **ptr; - GETBUF(dscr, int, parlen); - GETBUF(dscr, int, bytes); - ptr = (void**)dscr; dscr += parlen*sizeof(char*); - for(s=0; s< parlen; s++){ -/* - armci_copy(buf, ptr[s], bytes); -*/ - bcopy(buf, ptr[s], (size_t)bytes); - buf += bytes; - } - } - break; - - default: - - /* this should be accumulate */ - if(!ARMCI_ACC(msginfo->operation)) - armci_die("v server: wrong op code",msginfo->operation); - -/* fprintf(stderr,"received first=%lf last =%lf in buffer\n",*/ -/* *((double*)buf),((double*)buf)[99]);*/ - - for(i = 0; i< len; i++){ - int parlen, bytes; - void **ptr; - GETBUF(dscr, int, parlen); - GETBUF(dscr, int, bytes); - ptr = (void**)dscr; dscr += parlen*sizeof(char*); - armci_lockmem_scatter(ptr, parlen, bytes, proc); - for(s=0; s< parlen; s++){ - armci_acc_2D(msginfo->operation, scale, proc, buf, ptr[s], - bytes, 1, bytes, bytes, 0); - buf += bytes; - } - ARMCI_UNLOCKMEM(proc); - } - } -} diff --git a/armci/src-gemini/request.h b/armci/src-gemini/request.h deleted file mode 100644 index 851d1ae01..000000000 --- a/armci/src-gemini/request.h +++ /dev/null @@ -1,375 +0,0 @@ -#ifndef _REQUEST_H_ -#define _REQUEST_H_ - - -/******** client buffer managment ops ****************************/ -extern void _armci_buf_init(); -extern char* _armci_buf_get(int size, int operation, int to); -extern void _armci_buf_release(void *buf); -extern int _armci_buf_to_index(void *buf); -extern char* _armci_buf_ptr_from_id(int id); -extern void _armci_buf_ensure_one_outstanding_op_per_node(void *buf, int node); -#if defined(SERV_QUEUE) -extern void _armci_buf_ensure_pend_outstanding_op_per_node(void *buf, int node); -#endif -extern void _armci_buf_complete_nb_request(int bufid,unsigned int tag, int *retcode); -extern void _armci_buf_test_nb_request(int bufid,unsigned int tag, int *retcode); -extern void _armci_buf_set_tag(void *bufptr,unsigned int tag,short int protocol); -extern void _armci_buf_clear_all(); -extern void x_buf_send_complete(void *); - -extern INLINE char *_armci_buf_get_clear_busy(int size, int operation, int to); -extern INLINE void _armci_buf_set_busy(void *buf, int state); -extern INLINE void _armci_buf_set_busy_idx(int tbl_idx, int state); -extern INLINE int _armci_buf_cmpld(int bufid); -extern INLINE void _armci_buf_set_cmpld(void *buf, int state); -extern INLINE void _armci_buf_set_cmpld_idx(int idx, int state); - -#ifdef LAPI -# include "lapidefs.h" -#elif LIBONESIDED - typedef armci_onesided_msg_tag_t msg_tag_t; -#elif PORTALS -# include "armci_portals.h" -#elif defined(GM) -# include "myrinet.h" -#elif defined(DOELAN4) -# include "elandefs.h" -#elif defined(QUADRICS) -# include - typedef void* msg_tag_t; -# ifdef _ELAN_PUTGET_H -# define NB_CMPL_T ELAN_EVENT* -# endif -#elif defined(VIA) -# include "via.h" - typedef void* msg_tag_t; -#elif defined(VAPI) -# include "armci-vapi.h" -#elif defined(SOCKETS) -# include "sockets.h" - typedef long msg_tag_t; - typedef unsigned short msg_id_t; -# define DTAG_ ((1<<(sizeof(msg_id_t)*8))-1) -# define NB_SOCKETS_ /* define NB_SOCKETS to allow non-blocking path */ -#elif defined(HITACHI) -# include "sr8k.h" -#elif defined(BGML) -# include "bgml.h" -# include "bgmldefs.h" -# define NB_CMPL_T BG1S_t - typedef long msg_tag_t; -#elif defined(MPI_SPAWN) -# include "mpi2.h" -# define MSG_BUFLEN_DBL 500000 - typedef long msg_tag_t; -#else - typedef long msg_tag_t; -#endif - -#ifndef CLEAR_HNDL_FIELD -# define CLEAR_HNDL_FIELD(_x) -#endif - -#define ACK_QUIT 0 -#define QUIT 33 -#define ATTACH 34 -#define REGISTER 35 - -/*\ the internal request structure for non-blocking api. -\*/ -typedef struct{ - unsigned int tag; - int bufid; - int agg_flag; - int op; - int proc; - -#ifdef NB_CMPL_T - NB_CMPL_T cmpl_info; -#endif - - int onesided_direct; - cos_desc_t comm_desc[MAX_OUTSTANDING_ONESIDED_GETS]; -} armci_ireq_t; -/*\ the internal request structure for non-blocking api. -\*/ -typedef armci_ireq_t* armci_ihdl_t; -extern void armci_set_nbhandle_bufid(armci_ihdl_t nb_handle, char *buf, int val); -extern void set_nbhandle(armci_ihdl_t *nbh, armci_hdl_t *nb_handle, - int op, int proc); - -typedef struct { - int to; /* message recipient */ - int from; /* message sender */ - int operation; /* operation code */ - int format; /* data format used */ - int bytes; /* number of bytes requested */ - int datalen; /* >0 in lapi means that data is included */ - int ehlen; /* size of extra header and the end of descr */ - int dscrlen; /* >0 in lapi means that descriptor is included */ - msg_tag_t tag; /* message tag for response to this request, MUST BE LAST */ -}request_header_t; - - -typedef struct _buf_ackresp{ - long val,valc; - cos_request_t req; - struct _buf_ackresp *next, *previous; -} _buf_ackresp_t; - -/*******gpc call strctures*************/ -#include -#define MAX_GPC_REQ 1 -#define MAX_GPC_REPLY_LEN (64*1024) -#define MAX_GPC_SEND_LEN (64*1024) -#define GPC_COMPLETION_SIGNAL SIGUSR1 - -typedef struct { - int hndl; - int hlen, dlen; - void *hdr, *data; - int rhlen, rdlen; - void *rhdr, *rdata; -} gpc_call_t; - -typedef struct { - int active; -/* int zombie; */ - request_header_t msginfo; - gpc_call_t call; - char send[MAX_GPC_SEND_LEN]; - char reply[MAX_GPC_REPLY_LEN]; -} gpc_buf_t; - -/* gpc_buf_t *gpc_req; */ -extern gpc_buf_t *gpc_req; - -extern void block_pthread_signal(int signo); -extern void unblock_pthread_signal(int signo); - -/*******structures copied from async.c for storing cmpl dscr for nb req*******/ -#define UBUF_LEN 112 - -typedef struct { - unsigned int tag; /* request id*/ - _buf_ackresp_t ar; - short int bufid; /* communication buffer id */ - short int protocol; /* what does this buf hold?*/ - union { - void *dscrbuf; /*in case dscr below is not enough, do a*/ - double pad; /*malloc, save pointer in dscrbuf and use it*/ - }ptr; - char dscr[UBUF_LEN]; /*place to store the dscr*/ -}_buf_info_t; - -#define BUF_INFO_T _buf_info_t -extern BUF_INFO_T *_armci_buf_to_bufinfo(void *buf); -#define BUF_TO_BUFINFO _armci_buf_to_bufinfo - -void armci_complete_req_buf(BUF_INFO_T *info, void *buffer); -extern INLINE BUF_INFO_T *_armci_id_to_bufinfo(int bufid); - -#ifndef MAX_BUFS -#define MAX_BUFS 8 -#error "MAX_BUFS set to 8" -#endif - -#ifndef MAX_SMALL_BUFS -#define MAX_SMALL_BUFS 16 -#error "MAX_SMALL_BUFS set to 16" -#endif - - -/* tracks sockets used for receiving responces from data server (GET) */ -typedef struct { - int socks[MAX_BUFS+MAX_SMALL_BUFS]; /* sock # or -1 if not used */ - int ready[MAX_BUFS+MAX_SMALL_BUFS]; /* 1 - ready, 0 - not */ -} active_socks_t; - - - -/*valid values for the element protocol in BUF_INFO_T*/ -#define SDSCR_IN_PLACE 1 /*indicated that strided descriptor is in place*/ -#define VDSCR_IN_PLACE 2 /*indicated that vector descriptor is in place*/ -#define VDSCR_IN_PTR 3 /*indicates that the vector descriptor in allocated - and pointer stored in dscrbuf */ -/****************************************************************************/ - -/* this effects: buf_ext_t, portalsEagerMessageSendSize, portals ds buffer size */ -/* note: MSG_BUFLEN_DBL is being defined earlier in armci-portals.h */ -#ifndef MSG_BUFLEN_DBL -# error "MSG_BUFLEN_DBL not yet defined" -# if defined(HITACHI) -# define MSG_BUFLEN_DBL 0x50000 -# else -# ifdef PORTALS_USE_RENDEZ_VOUS -# define MSG_BUFLEN_DBL 50000 /* for rendez-vous, this can go bigger i think */ -# else -# define MSG_BUFLEN_DBL 8192 /* this is smaller when rendez-vous is off */ -# endif -# endif -#endif - -#define MSG_BUFLEN sizeof(double)*MSG_BUFLEN_DBL -extern char* MessageRcvBuffer; -extern char* MessageSndBuffer; - -#ifdef LAPI -# define GET_SEND_BUFFER_(_size)(MessageSndBuffer+sizeof(lapi_cmpl_t));\ - CLEAR_COUNTER(*((lapi_cmpl_t*)MessageSndBuffer));\ - SET_COUNTER(*((lapi_cmpl_t*)MessageSndBuffer),1); -# define GET_SEND_BUFFER _armci_buf_get -# define GA_SEND_REPLY armci_lapi_send -#else -# ifdef SOCKETS -# define GA_SEND_REPLY(tag, buf, len, p) armci_sock_send(p,buf,len) -# else -# define GA_SEND_REPLY(tag, buf, len, p) -# endif -#endif - -#ifdef QUADRICS_ -# define GET_SEND_BUFFER(_size,_op,_to) MessageSndBuffer;\ - while(((request_header_t*)MessageSndBuffer)->tag)\ - armci_util_spin(100, MessageSndBuffer) -# define FREE_SEND_BUFFER(_ptr) ((request_header_t*)MessageSndBuffer)->tag = (void*)0 -#endif - -#ifndef GET_SEND_BUFFER -# define GET_SEND_BUFFER(_size,_op,_to) MessageSndBuffer -#endif - -#ifndef FREE_SEND_BUFFER -#define FREE_SEND_BUFFER(_ptr) -#endif - -#ifndef INIT_SENDBUF_INFO -#define INIT_SENDBUF_INFO(_hdl,_buf,_op,_proc) -#endif - -typedef struct { - char *buf; char* buf_posted; int count; int proc; int op; int extra; -} buf_arg_t; - -/*includes for SERVER_LOCK*/ -#if defined(SERVER_THREAD) && !defined(VIA) - extern void armci_rem_lock(int mutex, int proc, int *ticket); - extern void armci_rem_unlock(int mutex, int proc, int ticket); - extern void armci_unlock_waiting_process(msg_tag_t tag,int proc, int ticket); -#endif - - -#ifdef PIPE_BUFSIZE - extern void armcill_pipe_post_bufs(void *ptr, int stride_arr[], int count[], - int strides, void* argvoid); - extern void armcill_pipe_extract_data(void *ptr,int stride_arr[],int count[], - int strides, void* argvoid); - extern void armcill_pipe_send_chunk(void *data, int stride_arr[],int count[], - int strides, void* argvoid); -#endif - -extern void armci_send_strided(int proc, request_header_t *msginfo, char *bdata, - void *ptr, int strides, int stride_arr[], int count[],int tag); - -extern void armci_rcv_hdlr(request_header_t* msginfo); - -extern char *armci_rcv_data(int proc, request_header_t *msginfo, int rcvlen); -extern void armci_rcv_strided_data_bypass(int proc, request_header_t *msginfo, - void *ptr, int stride_levels); -extern void armci_send_strided_data_bypass(int proc, request_header_t *msginfo, - void *loc_buf, int msg_buflen, void *loc_ptr, int *loc_stride_arr, - void *rem_ptr, int *rem_stride_arr, int *count, int stride_levels); - -extern void armci_rcv_strided_data(int proc, request_header_t* msginfo, - int datalen, void *ptr, int strides,int stride_arr[],int count[]); -extern void armci_send_strided_data(int proc, request_header_t *msginfo, - char *bdata, void *ptr, int strides, int stride_arr[], int count[]); -extern void armci_send_req(int proc, request_header_t* msginfo, int len,int tag); -extern void armci_server_rmw(request_header_t* msginfo,void* ptr, void* pextra); -extern int armci_rem_vector(int op, void *scale, armci_giov_t darr[],int len, - int proc,int flag,armci_ihdl_t nb_handle); -extern int armci_rem_strided(int op, void* scale, int proc, - void *src_ptr, int src_stride_arr[], - void* dst_ptr, int dst_stride_arr[], - int count[], int stride_levels, - ext_header_t *h, int lockit,armci_ihdl_t nb_handle); - -extern void armci_rem_rmw(int op, void *ploc, void *prem, int extra, int proc); -extern void armci_rem_ack(int clus); -extern void armci_server(request_header_t *msginfo, char *dscr, char* buf, - int buflen); -extern void armci_server_vector(request_header_t *msginfo, - char *dscr, char* buf, int buflen); -extern void *armci_server_ptr(int); -extern void armci_serv_attach_req(void *info, int ilen, long size, - void* resp,int rlen); -extern void armci_server_lock(request_header_t *msginfo); -extern void armci_server_unlock(request_header_t *msginfo, char* dscr); -extern void armci_create_server_thread ( void* (* func)(void*) ); -extern int armci_server_lock_mutex(int mutex, int proc, msg_tag_t tag); -extern void armci_send_data(request_header_t* msginfo, void *data); -extern int armci_server_unlock_mutex(int mutex, int p, int tkt, msg_tag_t* tag); -extern void armci_rcv_vector_data(int p, request_header_t* msginfo, armci_giov_t dr[], int len); - -#if !defined(LAPI) -extern void armci_wait_for_server(); -extern void armci_start_server(); -extern void armci_transport_cleanup(); -extern int armci_send_req_msg(int proc, void *buf, int bytes,int tag); -extern void armci_WriteToDirect(int proc, request_header_t* msginfo, void *buf); -extern char *armci_ReadFromDirect(int proc, request_header_t *msginfo, int len); -extern void armci_init_connections(); -extern void *armci_server_code(void *data); -extern void armci_rcv_req(void *mesg, void *phdr, void *pdescr, - void *pdata, int *buflen); -extern void armci_client_connect_to_servers(); -extern void armci_data_server(void *mesg); -extern void armci_server_initial_connection(); -extern void armci_call_data_server(); -#endif -#ifdef SOCKETS -extern void armci_ReadStridedFromDirect(int proc, request_header_t* msginfo, - void *ptr, int strides, int stride_arr[], int count[]); -extern void armci_WriteStridedToDirect(int proc, request_header_t* msginfo, - void *ptr, int strides, int stride_arr[], int count[]); -extern void armci_serv_quit(); -extern int armci_send_req_msg_strided(int proc, request_header_t *msginfo, - char *ptr, int strides, int stride_arr[],int count[]); -extern void armci_server_goodbye(request_header_t* msginfo); -#endif -#ifdef MPI_SPAWN -extern void armci_serv_quit(); -extern void armci_server_goodbye(request_header_t* msginfo); -#endif -#ifdef HITACHI -extern void armci_server_goodbye(request_header_t* msginfo); -extern void armci_serv_quit(); -#endif -extern void armci_server_ipc(request_header_t* msginfo, void* descr, - void* buffer, int buflen); - -#ifdef PIPE_BUFSIZE -extern void armci_pipe_prep_receive_strided(request_header_t *msginfo,char *buf, - int strides, int stride_arr[], int count[], int bufsize); -extern void armci_pipe_receive_strided(request_header_t* msginfo, void *ptr, - int stride_arr[], int count[], int strides); -extern void armci_pipe_send_req(int proc, void *buf, int bytes); -#endif - -extern void armci_rcv_strided_data_bypass_both(int, request_header_t*,void*, int*, int); -extern int armci_rem_get(int proc, void *src_ptr, int src_stride_arr[], - void* dst_ptr, int dst_stride_arr[], int count[], int stride_levels, - armci_ihdl_t nb_handle,void *mhloc,void *mhrem); - -#if defined(ALLOW_PIN) && defined(VAPI) -extern int armci_two_phase_send(int proc,void *src_ptr,int src_stride_arr[], - void *dst_ptr,int dst_stride_arr[],int count[], - int stride_levels,void ** context_ptr,armci_ihdl_t nbhandle, - ARMCI_MEMHDL_T *mhloc); -extern int armci_two_phase_get(int proc, void*src_ptr, int src_stride_arr[], - void*dst_ptr,int dst_stride_arr[], int count[], - int stride_levels, void**context_ptr, - armci_ihdl_t nbhandle, ARMCI_MEMHDL_T *mhloc); -#endif -#endif diff --git a/armci/src-gemini/rmw.c b/armci/src-gemini/rmw.c deleted file mode 100644 index daf330e3b..000000000 --- a/armci/src-gemini/rmw.c +++ /dev/null @@ -1,152 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: rmw.c,v 1.24.2.5 2007-08-29 17:32:47 manoj Exp $ */ -#include "armcip.h" -#include "locks.h" -#include "copy.h" -#include -#if (defined(__i386__) || defined(__x86_64__)) && !defined(_CRAYC) -# include "atomics-i386.h" -#endif - -#ifdef LIBELAN_ATOMICS - -ELAN_ATOMIC *a; - -int elan_int_fadd(int *target, int inc, int vp) -{ - int result; - - elan_wait(elan_atomic32(a, ELAN_ATOMIC_ADD, target, inc, 0, vp, &result), elan_base->waitType); - return(result); -} - -int elan_long_fadd(long *target, long inc, int vp) -{ - long result; - -#ifdef _LP64 - elan_wait(elan_atomic64(a, ELAN_ATOMIC_ADD, target, inc, 0, vp, &result), elan_base->waitType); -#else - elan_wait(elan_atomic32(a, ELAN_ATOMIC_ADD, target, inc, 0, vp, &result), elan_base->waitType); -#endif - - return(result); -} - -int elan_int_swap(int *target, int value, int vp) -{ - int result; - - elan_wait(elan_atomic32(a, ELAN_ATOMIC_SWAP, target, value, 0, vp, &result), elan_base->waitType); - return(result); -} - -int elan_long_swap(long *target, long value, int vp) -{ - long result; - -#ifdef _LP64 - elan_wait(elan_atomic64(a, ELAN_ATOMIC_SWAP, target, value, 0, vp, &result), elan_base->waitType); -#else - elan_wait(elan_atomic32(a, ELAN_ATOMIC_SWAP, target, value, 0, vp, &result), elan_base->waitType); -#endif - - return(result); -} -#endif /* LIBELAN_ATOMICS */ - -/* enable use of newer interfaces in SHMEM */ -#ifndef CRAY -#ifndef LIBELAN_ATOMICS -/* manpages for shmem_fadd exist on the T3E but library code does not */ -#define SHMEM_FADD -#endif -#endif - - -/* global scope to prevent compiler optimization of volatile code */ -int _a_temp; -long _a_ltemp; - -void armci_generic_rmw(int op, void *ploc, void *prem, int extra, int proc) -{ -#if defined(CLUSTER) && !defined(SGIALTIX) - int lock = (proc-armci_clus_info[armci_clus_id(proc)].master)%NUM_LOCKS; -#else - int lock = 0; -#endif - - ARMCI_PR_DBG("enter",0); - NATIVE_LOCK(lock,proc); - switch (op) { - case ARMCI_FETCH_AND_ADD: - armci_get(prem,ploc,sizeof(int),proc); - _a_temp = *(int*)ploc + extra; - armci_put(&_a_temp,prem,sizeof(int),proc); - break; - case ARMCI_FETCH_AND_ADD_LONG: - armci_get(prem,ploc,sizeof(long),proc); - _a_ltemp = *(long*)ploc + extra; - armci_put(&_a_ltemp,prem,sizeof(long),proc); - break; - case ARMCI_SWAP: -#if (defined(__i386__) || defined(__x86_64__)) && !defined(_CRAYC) - if(SERVER_CONTEXT || armci_nclus==1){ - atomic_exchange(ploc, prem, sizeof(int)); - } - else -#endif - { - armci_get(prem,&_a_temp,sizeof(int),proc); - armci_put(ploc,prem,sizeof(int),proc); - *(int*)ploc = _a_temp; - } - break; - case ARMCI_SWAP_LONG: - armci_get(prem,&_a_ltemp,sizeof(long),proc); - armci_put(ploc,prem,sizeof(long),proc); - *(long*)ploc = _a_ltemp; - break; - default: armci_die("rmw: operation not supported",op); - } - /*TODO memfence here*/ - NATIVE_UNLOCK(lock,proc); - ARMCI_PR_DBG("exit",0); -} - - -int PARMCI_Rmw(int op, void *ploc, void *prem, int extra, int proc) -{ - if(!SAMECLUSNODE(proc)){ - # if defined CRAY_REGISTER_ARMCI_MALLOC && HAVE_ONESIDED_FADD - if(op == ARMCI_FETCH_AND_ADD_LONG) { - armci_onesided_fadd(ploc, prem, extra, proc); - } else { - # endif - armci_rem_rmw(op, ploc, prem, extra, proc); - # if defined CRAY_REGISTER_ARMCI_MALLOC && HAVE_ONESIDED_FADD - } - # endif - return 0; - } - - switch (op) { - case ARMCI_FETCH_AND_ADD_LONG: - # if defined CRAY_REGISTER_ARMCI_MALLOC && HAVE_ONESIDED_FADD - armci_onesided_fadd(ploc, prem, extra, proc); - break; - # endif - case ARMCI_FETCH_AND_ADD: - case ARMCI_SWAP: - case ARMCI_SWAP_LONG: - armci_generic_rmw(op, ploc, prem, extra, proc); - break; - default: armci_die("rmw: operation not supported",op); - } - - return 0; -} - diff --git a/armci/src-gemini/rtinfo.c b/armci/src-gemini/rtinfo.c deleted file mode 100644 index 41165682f..000000000 --- a/armci/src-gemini/rtinfo.c +++ /dev/null @@ -1,26 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: rtinfo.c,v 1.1 2003-03-20 00:57:27 d3h325 Exp $ Run-time system configuration */ - -#include -#include - -/*\ determine number of CPUs on the current SMP node- Linux version for now -\*/ -int armci_getnumcpus(void) -{ -int numproc=0; -FILE* fp; -char line[80]; - fp=fopen("/proc/cpuinfo","r"); - if(fp==NULL) return -1; - while(!feof(fp)){ - fgets(line,80,fp); - if(strncmp(line,"processor",9)==0) numproc++; - } - fclose(fp); - return(numproc); -} - diff --git a/armci/src-gemini/run_test b/armci/src-gemini/run_test deleted file mode 100755 index 120190846..000000000 --- a/armci/src-gemini/run_test +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/tcsh -rm -f core* -limit coredumpsize 128megabytes -# setenv CRAY_PORTALS_USE_BLOCKING_POLL 1 -aprun -n 4 -N 2 ./test.x diff --git a/armci/src-gemini/semaphores.c b/armci/src-gemini/semaphores.c deleted file mode 100644 index cbc2b136f..000000000 --- a/armci/src-gemini/semaphores.c +++ /dev/null @@ -1,98 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: semaphores.c,v 1.12 2005-03-10 19:11:23 vinodtipparaju Exp $ */ -#include "semaphores.h" -#include -#include - -int num_sem_alloc=0; -void perror(); -#ifdef SUN -int fprintf(); -void fflush(); -int semget(),semctl(); -#endif - -extern void armci_die(char*, int); - -struct sembuf sops; -int semaphoreID; - -int SemGet(num_sem) - int num_sem; -{ - semaphoreID = semget(IPC_PRIVATE,num_sem, IPC_CREAT | 0600); - if(semaphoreID<0){ - fprintf(stderr," Semaphore Allocation Failed \nsuggestions to fix the problem: \n"); - fprintf(stderr," 1. run ipcs and ipcrm -s commands to clean any semaphore ids\n"); - fprintf(stderr," 2. verify if constant SEMMSL defined in file semaphore.h is set correctly for your system\n"); - fprintf(stderr," 3. recompile semaphore.c\n"); - sleep(1); - perror("Error message from failed semget:"); - armci_die(" exiting ...", num_sem); - } - - num_sem_alloc = num_sem; - return(semaphoreID); -} - -void SemInit(id,value) - int id,value; -{ - int i, semid, num_sem; - union semun semctl_arg; - - semctl_arg.val = value; - - if(id == ALL_SEMS){ semid = 0; num_sem = num_sem_alloc;} - else { semid = id; num_sem = 1;} - - for(i=0; i< num_sem; i++){ - if( semctl(semaphoreID, semid, SETVAL,semctl_arg )<0){ - perror((char*)0); - armci_die("SemInit error",id); - } - semid++; - } -} - - -/* release semaphore(s) */ -void SemDel() -{ - union semun dummy; - - /* this is only to avoid compiler whinning about the unitialized variable*/ - dummy.val=0; - - (void) semctl(semaphoreID,0,IPC_RMID,dummy); -} - - -void Sem_CreateInitLocks(int num, lockset_t *id) -{ - *id = SemGet(num); - SemInit(ALL_SEMS,1); -} - - -void Sem_InitLocks(int num, lockset_t id) -{ - semaphoreID = id; - num_sem_alloc = num; -} - - -void Sem_DeleteLocks(lockset_t id) -{ - union semun dummy; - - /* this is only to avoid compiler whinning about the unitialized variable*/ - dummy.val=0; - - (void) semctl(id,0,IPC_RMID,dummy); -} - - diff --git a/armci/src-gemini/semaphores.h b/armci/src-gemini/semaphores.h deleted file mode 100644 index 379f95466..000000000 --- a/armci/src-gemini/semaphores.h +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef _SEMAPHORES_H_ -#define _SEMAPHORES_H_ - -#if HAVE_SYS_TYPES_H -# include -#endif -#if HAVE_SYS_IPC_H -# include -#endif -#if HAVE_SYS_SEM_H -# include -#endif - -#if !HAVE_UNION_SEMUN -union semun { - int val; /* value for SETVAL */ - struct semid_ds *buf; /* buffer for IPC_STAT, IPC_SET */ - unsigned short int *array; /* array for GETALL, SETALL */ - struct seminfo *__buf; /* buffer for IPC_INFO */ -}; -#endif - -/* how many semaphores are available ? */ -#ifndef SEMMSL -# ifdef AIX -# define SEMMSL 8094 -# else -# define SEMMSL 16 -# endif -#endif - -/* on HPUX 10.2 SEMMSL is much bigger than realistically we can allocate */ -#ifdef HPUX -#undef SEMMSL -#define SEMMSL 64 -#endif - -extern struct sembuf sops; -extern int semaphoreID; -int semop(); -#define ALL_SEMS -1 - -#define _P_code -1 -#define _V_code 1 -#define P_semaphore(s) \ -{\ - sops.sem_num = (s);\ - sops.sem_op = _P_code;\ - sops.sem_flg = 0; \ - semop(semaphoreID,&sops,1);\ -} -#define V_semaphore(s) \ -{\ - sops.sem_num = (s);\ - sops.sem_op = _V_code;\ - sops.sem_flg = 0; \ - semop(semaphoreID,&sops,1);\ -} - -typedef int lockset_t; - -#endif diff --git a/armci/src-gemini/shmalloc.c b/armci/src-gemini/shmalloc.c deleted file mode 100644 index f5e1c4311..000000000 --- a/armci/src-gemini/shmalloc.c +++ /dev/null @@ -1,89 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: shmalloc.c,v 1.10 2002-06-20 23:34:17 vinod Exp $ */ -#include -#include -#include "armcip.h" -#include "message.h" -#include "kr_malloc.h" - -static long *offset_arr; - -void armci_shmalloc_exchange_offsets(context_t *ctx_local) -{ - void **ptr_arr; - void *ptr; - armci_size_t bytes = 128; - int i; - - ptr_arr = (void**)malloc(armci_nproc*sizeof(void*)); - offset_arr = (long*) malloc(armci_nproc*sizeof(long)); - if(!ptr_arr || !offset_arr) armci_die("armci_shmalloc_get_offsets: malloc failed", 0); - - /* get memory with same size on all procs */ - ptr = kr_malloc(bytes, ctx_local); - if(!ptr) armci_die("armci_shmalloc_get_offsets: kr_malloc failed",bytes); - - bzero((char*)ptr_arr,armci_nproc*sizeof(void*)); - ptr_arr[armci_me] = ptr; - - /* now combine individual addresses into a single array */ - armci_exchange_address(ptr_arr, armci_nproc); - - /* identify offets */ - for (i=0; i -#include -#include -#include -#include -#include -#include -#include -#include "armci_shmem.h" -#include "kr_malloc.h" -#include "shmlimit.h" -#include "message.h" -#include "armcip.h" - -#ifdef ALLOC_MUNMAP -#include -#include -static size_t pagesize=0; -static int logpagesize=0; -/* allow only that big shared memory segment (in MB)- incresed from 128 11/02 */ -#define MAX_ALLOC_MUNMAP 128 -#define MAX_ALLOC_MUNMAP_ 368 -static long max_alloc_munmap=MAX_ALLOC_MUNMAP; -#endif - -#if defined(SUN) - extern char *shmat(); -#endif - -#define SHM_UNIT (1024) - - -/* Need to determine the max shmem segment size. There are 2 alternatives: - * 1. use predefined SHMMAX if available or set some reasonable values, or - * 2. trial-and-error search for a max value (default) - * case a) fork a process to determine shmmax size (more accurate) - * case b) search w/o forking until success (less accurate) - */ - -/* under Myrinet GM, we cannot fork */ -#if defined(GM) || defined(VAPI) -# define SHMMAX_SEARCH_NO_FORK -#endif -#if defined(LAPI) || defined(AIX) || defined(SHMMAX_SEARCH_NO_FORK) || defined(CRAY_XT) || defined(CRAY_UGNI) -# define NO_SHMMAX_SEARCH -#endif - -/* Limits for the largest shmem segment are in Kilobytes to avoid passing - * Gigavalues to kr_malloc - * the limit for the KSR is lower than SHMMAX in sys/param.h because - * shmat would fail -- SHMMAX cannot be trusted (a bug) - */ -#define _SHMMAX 4*1024 - -#if defined(SUN)||defined(SOLARIS) -# undef _SHMMAX -# define _SHMMAX (1024) /* memory in KB */ -#elif defined(SGI64) || defined(AIX) || defined(CONVEX) -# undef _SHMMAX -# define _SHMMAX ((unsigned long)512*1024) -#elif defined(SGI) && !defined(SGI64) -# undef _SHMMAX -# define _SHMMAX ((unsigned long)128*1024) -#elif defined(KSR) -# undef _SHMMAX -# define _SHMMAX ((unsigned long)512*1024) -#elif defined(HPUX) -# undef _SHMMAX -# define _SHMMAX ((unsigned long)64*1024) -#elif defined(__FreeBSD__) -# undef _SHMMAX -# define _SHMMAX ((unsigned long)3*1024) -#elif defined(LINUX) -# if !defined(SHMMAX) /* Red Hat does not define SHMMAX */ -# undef _SHMMAX -# if defined(__sparc__) || defined(__powerpc__) -# define _SHMMAX ((unsigned long)16*1024) -# elif defined(__alpha__) -# define _SHMMAX ((unsigned long)4072) -# else - /* Intel */ -# define _SHMMAX ((unsigned long)32*1024) -# endif -# endif -#elif defined(SHMMAX) -# undef _SHMMAX -# define _SHMMAX (((unsigned long)SHMMAX)>>10) -#endif - -static unsigned long MinShmem_per_core = 0; -static unsigned long MaxShmem_per_core = 0; -static unsigned long MinShmem = _SHMMAX; -static unsigned long MaxShmem = MAX_REGIONS*_SHMMAX; -static context_t ctx_shmem; /* kr_malloc context */ -static context_t *ctx_shmem_global; /* kr_malloc context stored in shmem */ -static int create_call=0; - -#ifdef SHMMAX_SEARCH_NO_FORK -static char *ptr_search_no_fork = (char*)0; -static int id_search_no_fork=0; -#endif - - -#ifdef LINUX -#define CLEANUP_CMD(command) sprintf(command,"/usr/bin/ipcrm shm %d",id); -#elif defined(SOLARIS) -#define CLEANUP_CMD(command) sprintf(command,"/bin/ipcrm -m %d",id); -#elif defined(SGI) -#define CLEANUP_CMD(command) sprintf(command,"/usr/sbin/ipcrm -m %d",id); -#else -#define CLEANUP_CMD(command) sprintf(command,"/usr/bin/ipcrm -m %d",id); -#endif - - -#ifdef ALLOC_MUNMAP -#ifdef QUADRICS -# include -# include - static char *armci_elan_starting_address = (char*)0; - -# ifdef __ia64__ -# define ALLOC_MUNMAP_ALIGN 1024*1024 -# else -# define ALLOC_MUNMAP_ALIGN 64*1024 -# endif - -# define ALGN_MALLOC(s,a) elan_allocMain(elan_base->state, (a), (s)) -#else -# define ALGN_MALLOC(s,a) malloc((s)) -#endif - -static char* alloc_munmap(size_t size) -{ -char *tmp; -unsigned long iptr; -size_t bytes = size+pagesize-1; - - if(armci_elan_starting_address){ - tmp = armci_elan_starting_address; - armci_elan_starting_address += size; -# ifdef ALLOC_MUNMAP_ALIGN - armci_elan_starting_address += ALLOC_MUNMAP_ALIGN; -# endif - if(DEBUG_) {printf("%d: address for shm attachment is %p size=%ld\n", - armci_me,tmp,(long)size); fflush(stdout); } - } else { - tmp = ALGN_MALLOC(bytes, getpagesize()); - if(tmp){ - iptr = (unsigned long)tmp + pagesize-1; - iptr >>= logpagesize; iptr <<= logpagesize; - if(DEBUG_) printf("%d:unmap ptr=%p->%p size=%d pagesize=%d\n",armci_me, - tmp,(char*)iptr,(int)size,pagesize); - tmp = (char*)iptr; - if(munmap(tmp, size) == -1) armci_die("munmap failed",0); - if(DEBUG_){printf("%d: unmap OK\n",armci_me); fflush(stdout);} - }else armci_die("alloc_munmap: malloc failed",(int)size); - } - return tmp; -} -#endif - -/*\ A wrapper to shmget. Just to be sure that ID is not 0. -\*/ -int armci_shmget(size_t size,char *from) -{ -int id; - - id = shmget(IPC_PRIVATE, size, (IPC_CREAT | 00600)); - - /*attaching with id 0 somehow fails (Seen on pentium4+linux24+gm163) - *so if id=0, shmget again. */ - while(id==0){ - /* free id=0 and get a new one */ - if(shmctl((int)id,IPC_RMID,(struct shmid_ds *)NULL)) { - fprintf(stderr,"id=%d \n",id); - armci_die("allocate: failed to _delete_ shared region ",id); - } - id = shmget(IPC_PRIVATE, size, (IPC_CREAT | 00600)); - } - if(DEBUG_){ - printf("\n%d:armci_shmget sz=%ld caller=%s id=%d\n",armci_me,(long)size, - from,id); - fflush(stdout); - } - return(id); -} - -static int -Semget(key_t key,int nsems,int semflg) { - int ret; - - if((ret = semget(key,nsems,semflg)) == -1) { - switch(errno) { - case EACCES: fprintf(stdout," semget errno=EACCES.\n"); break; - case EINVAL: fprintf(stdout," semget errno=EINVAL.\n"); break; - case ENOENT: fprintf(stdout," semget errno=ENOENT.\n"); break; - case ENOSPC: fprintf(stdout," semget errno=ENOSPC -- check system limit for sysv semaphores.\n"); break; - case ENOMEM: fprintf(stdout," semget errno=ENOMEM.\n"); break; - case EEXIST: fprintf(stdout," semget errno=EEXIST.\n"); break; - default: - fprintf(stdout," unusual semget errno=%d\n",errno); break; - } - armci_die("semget failed",errno); - } - - return ret; -} - - -int armci_semget(int count) -{ - int id; - id = Semget(IPC_PRIVATE,2,0600); - return id; -} - -int armci_semrm(int id) -{ - semctl(id,0,IPC_RMID); -} - -int armci_shmrm(int id) -{ - int ret; - if((ret = shmctl(id,IPC_RMID,NULL)) != 0) { - fprintf(stdout,"[cp]: shmctl return an error.\n"); - switch(errno) { - case EINVAL: - fprintf(stdout," Error EINVAL: shmid is not a valid shared memory segment.\n"); - break; - case EFAULT: - fprintf(stdout," Error EFAULT: argument 3 is not a valid struct shmid_ds.\n"); - break; - case EPERM: - fprintf(stdout," Error EPREM: permission to access/change shared mem segment denied.\n"); - break; - default: - fprintf(stdout," unusual shmctl errno=%d\n",errno); break; - break; - } - armci_die("error deleting shmid",id); - } - return ret; -} - -/*\ test is a shared memory region of a specified size can be allocated - * return 0 (no) or 1 (yes) -\*/ -int armci_test_allocate(long size) -{ - char *ptr; - int id = armci_shmget((size_t)size,"armci_test_allocate"); - if (id <0) return 0; - - /* attach to segment */ - ptr = shmat(id, (char *) NULL, 0); - - /* delete segment id */ - if(shmctl( id, IPC_RMID, (struct shmid_ds *)NULL)) - fprintf(stderr,"failed to remove shm id=%d\n",id); - - /* test pointer */ - if (((long)ptr) == -1L) return 0; - else return 1; -} - - -/*\ try to allocate a shared memory region of a specified size; return pointer -\*/ -static int armci_shmalloc_try(long size) -{ -#ifdef SHMMAX_SEARCH_NO_FORK - char *ptr; - int id = armci_shmget((size_t) size,"armci_shmalloc_try"); - if (id <0) return 0; - - /* attach to segment */ - ptr = shmat(id, (char *) NULL, 0); - - /* test pointer */ - if (((long)ptr) == -1L) return 0; - - ptr_search_no_fork = ptr; - id_search_no_fork = id; -#endif - return 1; -} - - - - -/* parameters that define range and granularity of search for shm segment size - * UBOUND is chosen to be < 2GB to avoid overflowing on 32-bit systems - * smaller PAGE gives more accurate results but with more search steps - * LBOUND is set to minimum amount for our purposes - * change UBOUND=512MB if you need larger arrays than 512 MB - */ -#define PAGE (16*65536L) -#define LBOUND 1048576L -#if defined(MULTI_CTX) && defined(QUADRICS) -#define UBOUND (256*LBOUND) -#else -#define UBOUND (512*LBOUND) -#endif - -static long get_user_shmmax() -{ -char *uval; -long x=0; - uval = getenv("ARMCI_DEFAULT_SHMMAX"); - if(uval != NULL){ - sscanf(uval,"%ld",&x); - if(x<1L || x> 2048L){ - fprintf(stderr,"incorrect ARMCI_DEFAULT_SHMMAX should be <1,2048>mb and 2^N Found=%ld\n",x); - x=0; - } - } - return x*1048576; /* return value in bytes */ -} - -/*\ determine the max shmem segment size using bisection -\*/ -int armci_shmem_test() -{ -long x; -int i,rc; -long upper_bound=UBOUND; -long lower_bound=0; - - x = get_user_shmmax(); - if(!x) x = upper_bound; - else upper_bound =x; - - if(DEBUG_){printf("%d: x = %ld upper_bound=%ld\n",armci_me, x, upper_bound); fflush(stdout);} - - for(i=1;;i++){ - long step; - rc = armci_test_allocate(x); - if(DEBUG_) - printf("%d:test %d size=%ld bytes status=%d\n",armci_me,i,x,rc); - if(rc){ - lower_bound = x; - step = (upper_bound -x)>>1; - if(step < PAGE) break; - x += step; - }else{ - upper_bound = x; - step = (x-lower_bound)>>1; - if(step>=20; - x <<=20; - } - - if(!lower_bound){ - /* try if can get LBOUND - necessary if search starts from UBOUND */ - lower_bound=LBOUND; - rc = armci_test_allocate(lower_bound); - if(!rc) return(0); - } - - if(DEBUG_) printf("%ld bytes segment size, %d calls \n",lower_bound,i); - return (int)( lower_bound>>20); /* return shmmax in mb */ -} - - -#ifdef SHMMAX_SEARCH_NO_FORK -/*\ determine the max shmem segment size by halving -\*/ -static int armci_shmem_test_no_fork() -{ -long x; -int i,rc; -long lower_bound=_SHMMAX*SHM_UNIT; -#define UBOUND_SEARCH_NO_FORK (256*SHM_UNIT*SHM_UNIT) - - x = get_user_shmmax(); - if(!x) x = UBOUND_SEARCH_NO_FORK; - - for(i=1;;i++){ - - rc = armci_shmalloc_try(x); - if(DEBUG_) - printf("%d:test by halving size=%ld bytes rc=%d\n",armci_me,x,rc); - - if(rc){ - lower_bound = x; - break; - }else{ - x >>= 1 ; - if(x>20); /* return shmmax in mb */ -} -#endif - - -#ifdef MULTI_CTX -void armci_nattach_preallocate_info(int* segments, int *segsize) -{ - int x; - char *uval; - uval = getenv("LIBELAN_NATTACH"); - if(uval != NULL){ - sscanf(uval,"%d",&x); - if(x<2 || x>8) armci_die("Error in LIBELAN_NATTACH <8, >1 ",(int)x); - }else - armci_die("Inconsistent configuration: ARMCI needs LIBELAN_NATTACH",0); - *segments =x; - *segsize = (int) (SHM_UNIT * MinShmem); - -} -#endif - -/* Create shared region to store kr_malloc context in shared memory */ -void armci_krmalloc_init_ctxshmem() { - void *myptr=NULL; - long idlist[SHMIDLEN]; - long size; - int offset = sizeof(void*)/sizeof(int); - - /* to store shared memory context and myptr */ - size = SHMEM_CTX_MEM; - - if(armci_me == armci_master ){ - myptr = Create_Shared_Region(idlist+1,size,idlist); - if(!myptr && size>0 ) armci_die("armci_krmalloc_init_ctxshmem: could not create", (int)(size>>10)); - if(size) *(volatile void**)myptr = myptr; - if(DEBUG_){ - printf("%d:armci_krmalloc_init_ctxshmem addr mptr=%p ref=%p size=%ld\n", armci_me, myptr, *(void**)myptr, size); - fflush(stdout); - } - - /* Bootstrapping: allocate storage for ctx_shmem_global. NOTE:there is - offset,as master places its address at begining for others to see */ - ctx_shmem_global = (context_t*) ( ((int*)myptr)+offset ); - *ctx_shmem_global = ctx_shmem; /*master copies ctx into shared region */ - } - - /* broadcast shmem id to other processes on the same cluster node */ - armci_msg_clus_brdcst(idlist, SHMIDLEN*sizeof(long)); - - if(armci_me != armci_master){ - myptr=(double*)Attach_Shared_Region(idlist+1,size,idlist[0]); - if(!myptr)armci_die("armci_krmalloc_init_ctxshmem: could not attach", (int)(size>>10)); - - /* now every process in a SMP node needs to find out its offset - * w.r.t. master - this offset is necessary to use memlock table - */ - if(size) armci_set_mem_offset(myptr); - if(DEBUG_){ - printf("%d:armci_krmalloc_init_ctxshmem attached addr mptr=%p ref=%p size=%ld\n", armci_me,myptr, *(void**)myptr,size); fflush(stdout); - } - /* store context info */ - ctx_shmem_global = (context_t*) ( ((int*)myptr)+offset ); - if(DEBUG_){ - printf("%d:armci_krmalloc_init_ctxshmem: shmid=%d off=%ld size=%ld\n", armci_me, ctx_shmem_global->shmid, ctx_shmem_global->shmoffset, - (long)ctx_shmem_global->shmsize); - fflush(stdout); - } - } -} - -void armci_shmem_init() -{ - -#ifdef ALLOC_MUNMAP - -#if defined(QUADRICS) -# if (defined(__ia64__) || defined(__alpha)) && !defined(DECOSF) - - /* this is to determine size of Elan Main memory allocator for munmap */ - long x; - char *uval; - uval = getenv("LIBELAN_ALLOC_SIZE"); - if(uval != NULL){ - sscanf(uval,"%ld",&x); - if((x>80000000) && (x< 4*1024*1024*1024L)){ - max_alloc_munmap = (x>>20) - 72; - if(DEBUG_){ - printf("%d: max_alloc_munmap is %ld\n",armci_me,max_alloc_munmap); - fflush(stdout); - } - } - } - - /* an alternative approach is to use MMAP area where we get - the address from the Elan environment variable in qsnetlibs 1.4+ */ - uval = getenv("LIBELAN3_MMAPBASE"); - if(uval != NULL){ - sscanf(uval,"%p",&armci_elan_starting_address); - } - -# endif -# if defined(__ia64__) - /* need aligment on 1MB boundary rather than the actual pagesize */ - pagesize = 1024*1024; - logpagesize = 20; -# else - /* determine log2(pagesize) needed for address alignment */ - int tp=512; - logpagesize = 9; - pagesize = getpagesize(); - if(tp>pagesize)armci_die("armci_shmem_init:pagesize",pagesize); - - while(tpmax_alloc_munmap && !armci_elan_starting_address) x=max_alloc_munmap; -# else - x = 10; /* mb */ -# endif -# endif - - if(DEBUG_){ - printf("%d:shmem_init: %d mbytes max segment size\n",armci_me,x);fflush(stdout);} - - MinShmem = (long)(x<<10); /* make sure it is in kb: mb <<10 */ - MaxShmem = MAX_REGIONS*MinShmem; -# ifdef REPORT_SHMMAX - printf("%d using x=%d SHMMAX=%ldKB\n", armci_me,x, MinShmem); - fflush(stdout); -# endif -#else - - /* nothing to do here - limits were given */ - -#endif - } - - armci_krmalloc_init_ctxshmem(); - if(DEBUG_)printf("%d: out of shmem_init\n",armci_me); -} - -void armci_set_shmem_limit_per_node(int nslaves) -{ - if (MaxShmem_per_core > 0) MaxShmem = nslaves*MaxShmem_per_core; - if (MinShmem_per_core > 0) MinShmem = nslaves*MinShmem_per_core; -} - -void armci_set_shmem_limit_per_core(unsigned long shmemlimit) -{ - MaxShmem_per_core = (shmemlimit + SHM_UNIT - 1)/SHM_UNIT; - MinShmem_per_core = (shmemlimit + SHM_UNIT - 1)/SHM_UNIT; -} -/*\ application can reset the upper limit (bytes) for memory allocation -\*/ -void armci_set_shmem_limit(unsigned long shmemlimit) -{ - unsigned long kbytes; - kbytes = (shmemlimit + SHM_UNIT -1)/SHM_UNIT; - if(MaxShmem > kbytes) MaxShmem = kbytes; - if(MinShmem > kbytes) MinShmem = kbytes; -} - - -static void shmem_errmsg(size_t size) -{ -long sz=(long)size; - printf("******************* ARMCI INFO ************************\n"); - printf("The application attempted to allocate a shared memory segment "); - printf("of %ld bytes in size. This might be in addition to segments ",sz); - printf("that were allocated succesfully previously. "); - printf("The current system configuration does not allow enough "); - printf("shared memory to be allocated to the application.\n"); - printf("This is most often caused by:\n1) system parameter SHMMAX "); - printf("(largest shared memory segment) being too small or\n"); - printf("2) insufficient swap space.\n"); - printf("Please ask your system administrator to verify if SHMMAX "); - printf("matches the amount of memory needed by your application and "); - printf("the system has sufficient amount of swap space. "); - printf("Most UNIX systems can be easily reconfigured "); - printf("to allow larger shared memory segments,\n"); - printf("see https://hpc.pnl.gov/globalarrays/support.shtml\n"); - printf("In some cases, the problem might be caused by insufficient swap space.\n"); - printf("*******************************************************\n"); -} - - -static struct shm_region_list{ - char *addr; - long id; - long sz; - long attached; -}region_list[MAX_REGIONS]; -static int alloc_regions=0; -static long occup_blocks=0; - -/* Terminology - * region - actual piece of shared memory allocated from OS - * block - a part of allocated shmem that is given to the requesting process - */ - - -static int last_allocated=-1; - - -unsigned long armci_max_region() -{ - return MinShmem; -} - - -int find_regions(char *addrp, long* id, int *region) -{ -int nreg, reg; - - if(last_allocated!=-1){ - reg=last_allocated; - last_allocated = -1; - } else{ - - for(reg=-1,nreg=0;nreg= region_list[nreg].addr && - addrp < (region_list[nreg].addr + region_list[nreg].sz)) - { - reg = nreg; - break; - } - } - - if(reg == -1) - armci_die("find_regions: failed to locate shared region", 0L); - } - - *region = reg; - *id = region_list[reg].id; - - return 1; -} - -/* returns the shmem info based on the addr */ -int armci_get_shmem_info(char *addrp, int* shmid, long *shmoffset, - size_t *shmsize) -{ - int region; long id; - - find_regions(addrp, &id, ®ion); - *shmid = id; - *shmoffset = (long)(addrp - region_list[region].addr); - *shmsize = region_list[region].sz; - - return 1; -} - -long armci_shm_reg_size(int i, long id) -{ - if(i<0 || i>= MAX_REGIONS)armci_die("armci_shmem_reg_size: bad i",i); - return region_list[i].sz; -} - -void* armci_shm_reg_ptr(int i) -{ - if(i<0 || i>= MAX_REGIONS)armci_die("armci_shmem_reg_ptr: bad i",i); - return (void *)region_list[i].addr; -} - -Header *armci_get_shmem_ptr(int shmid, long shmoffset, size_t shmsize) -{ -/* returns, address of the shared memory region based on shmid, offset. - * (i.e. return_addr = stating address of shmid + offset)*/ - long idlist[SHMIDLEN]; - Header *p = NULL; - - idlist[1] = (long)shmid; - idlist[0] = shmoffset; - idlist[IDLOC+1] = shmsize; /* CHECK : idlist in CreateShmem????*/ - - if(!(p=(Header*)Attach_Shared_Region(idlist+1, shmsize, idlist[0]))) - armci_die("kr_malloc:could not attach",(int)(p->s.shmsize>>10)); -#if DEBUG_ - printf("%d: armci_get_shmem_ptr: %d %ld %ld %p\n", - armci_me, idlist[1], idlist[0], shmsize, p); - fflush(stdout); -#endif - return p; -} - - -char *Attach_Shared_Region(id, size, offset) - long *id, offset, size; -{ -int reg, found, shmflag=0; -static char *temp; - - if(alloc_regions>=MAX_REGIONS) - armci_die("Attach_Shared_Region: to many regions ",0); - - if(DEBUG_){ - printf("%d:AttachSharedRegion %d:size=%ld id=%ld\n", - armci_me, create_call++, size,*id); - fflush(stdout); - } - - - /* under Linux we can get valid id=0 */ -#ifndef LINUX - if(!*id) armci_die("Attach_Shared_Region: shmem ID=0 ",(int)*id); -#endif - - /* first time needs to initialize region_list structure */ - if(!alloc_regions){ - for(reg=0;reg= MAX_REGIONS) - armci_die("Create_Shared_Region:allocate:too many regions allocated ",0); - - last_allocated = alloc_regions; - -#ifdef SHMMAX_SEARCH_NO_FORK - if (ptr_search_no_fork){ - temp = ptr_search_no_fork; - id = id_search_no_fork; - ptr_search_no_fork = (char*)0; /* do not look at it again */ - }else -#endif - { - if ( (id = armci_shmget(sz,"armci_allocate")) < 0 ) { - fprintf(stderr,"id=%d size=%ld\n",id, size); - shmem_errmsg(sz); - armci_die("allocate: failed to create shared region ",id); - } - - if ( (long)( (temp = shmat(id, pref_addr, shmflag))) == -1L){ - char command[64]; - CLEANUP_CMD(command); - if(system(command) == -1) - printf("Please clean shared memory (id=%d): see man ipcrm\n",id); - armci_die("allocate: failed to attach to shared region id=",id); - } - if(DEBUG_){ - printf("%d:allocate:attach:id=%d paddr=%p size=%ld\n",armci_me,id,temp,size); - fflush(stdout); - } -#if 1 - /* delete segment id so that OS cleans it when all attached processes are gone */ - if(shmctl( id, IPC_RMID, (struct shmid_ds *)NULL)) - fprintf(stderr,"failed to remove shm id=%d\n",id); -#endif - - } - POST_ALLOC_CHECK(temp,sz); - - region_list[alloc_regions].addr = temp; - region_list[alloc_regions].id = id; - region_list[alloc_regions].attached=1; - region_list[alloc_regions].sz=sz; - alloc_regions++; - - if(DEBUG2_){ - printf("%d:allocate:id=%d addr=%p size=%ld\n",armci_me,id,temp,size); - fflush(stdout); - } - - return (void*) (temp); -} - -/******************** common code for the two versions *********************/ - - -/*\ Allocate a block of shared memory - called by master process -\*/ -char *Create_Shared_Region(long *id, long size, long *offset) -{ - char *temp; -int reg, refreg=0,nreg; - - if(alloc_regions>=MAX_REGIONS) - armci_die("Create_Shared_Region: to many regions ",0); - - if(DEBUG_){ - printf("%d:CreateSharedRegion %d:size=%ld\n",armci_me,create_call++,size); - fflush(stdout); - } - - /*initialization: 1st allocation request */ - if(!alloc_regions){ - for(reg=0;reg -#include -#include -#include -#include -#include -#include "shmlimit.h" - -#define DEBUG_ 0 - -#if defined(DECOSF) || defined(SOLARIS64) || defined(HPUX) -#define PIPE_AFTER_FORK_BUG -#endif - -void (*armci_sig_chld_orig)(); -static int status=0; -int armci_shmlimit_caught_sigchld=0; - -#if defined(SUN) && !defined(SOLARIS) -static void SigChldHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -static void SigChldHandler(sig) -#endif - int sig; -{ -#ifdef DISABLED - int pid; - pid = wait(&status); -#endif - armci_shmlimit_caught_sigchld=1; -} - -static void TrapSigChld() -{ - if ( (armci_sig_chld_orig = signal(SIGCHLD, SigChldHandler)) == SIG_ERR) - armci_die("TrapSigChld: error from signal setting SIGCHLD",0); -} - - -static void RestoreSigChld() -{ - if ( signal(SIGCHLD, armci_sig_chld_orig) == SIG_ERR) - armci_die("Restore_SigChld: error from restoring signal SIGChld",0); -} - - -static int child_finished() -{ - return armci_shmlimit_caught_sigchld; -} - - -int armci_child_shmem_init() -{ - pid_t pid; - int x; -#ifdef PIPE_AFTER_FORK_BUG - int i; -#endif - - int y; - int fd[2]; - - if(pipe(fd)==-1) armci_die("armci shmem_test pipe failed",0); - - TrapSigChld(); - - if ( (pid = fork() ) < 0) - - armci_die("armci shmem_test fork failed", (int)pid); - - else if(pid == 0){ - - x= armci_shmem_test(); - -#ifdef PIPE_AFTER_FORK_BUG - /* due to a bug in OSF1 V4.0/1229/alpha first item written gets hosed*/ - for(i=0;i<2;i++) -#endif - if(write(fd[1],&x,sizeof(int)) -#include -#ifndef WIN32 -#include -#include -#include -#include -#include -#endif - -#define PAUSE_ON_ERROR - -#define Error armci_die - -#if (defined(ENCORE) || defined(SEQUENT) || defined(ARDENT)) -# define SigType int -#else -# define SigType void -#endif - -#ifndef SIG_ERR -# define SIG_ERR (SigType (*)())-1 -#endif - -#if defined(SUN) || defined(ALLIANT) || defined(ENCORE) || defined(SEQUENT) || \ - defined(AIX) || defined(NEXT) -#include -#endif - -extern void Error(); -extern int armci_me; - -int AR_caught_sigint=0; -int AR_caught_sigterm=0; -int AR_caught_sigchld=0; -int AR_caught_sigsegv=0; -int AR_caught_sig=0; - -SigType (*SigChldOrig)(), (*SigIntOrig)(), (*SigHupOrig)(), (*SigTermOrig)(); -SigType (*SigSegvOrig)(); - - -/*********************** SIGINT *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigIntHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigIntHandler(sig) -#endif - int sig; -{ - AR_caught_sigint = 1; - AR_caught_sig= sig; - Error("SigIntHandler: interrupt signal was caught",(int) sig); -} - -void TrapSigInt() -/* - Trap the signal SIGINT so that we can propagate error - conditions and also tidy up shared system resources in a - manner not possible just by killing everyone -*/ -{ - if ( (SigIntOrig = signal(SIGINT, SigIntHandler)) == SIG_ERR) - Error("TrapSigInt: error from signal setting SIGINT",0); -} - -void RestoreSigInt() -/* - Restore the original signal handler -*/ -{ - if(AR_caught_sigint) SigIntOrig(SIGINT); - if ( signal(SIGINT, SigIntOrig) == SIG_ERR) - Error("RestoreSigInt: error from restoring signal SIGINT",0); -} - - -/*********************** SIGABORT *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigAbortHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigAbortHandler(sig) -#endif - int sig; -{ - AR_caught_sig= sig; - Error("SigIntHandler: abort signal was caught: cleaning up",(int) sig); -} - -void TrapSigAbort() -/* - Trap the signal SIGINT so that we can propagate error - conditions and also tidy up shared system resources in a - manner not possible just by killing everyone -*/ -{ - if ( signal(SIGINT, SigAbortHandler) == SIG_ERR) - Error("TrapSigAbort: error from signal setting SIGABORT",0); -} - - - -/*********************** SIGCHLD *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigChldHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigChldHandler(sig) -#endif - int sig; -{ - int status; -#if defined(ALLIANT) || defined(ENCORE) || defined(SEQUENT) || defined(NEXT) - union wait ustatus; -#endif - -#if defined(LINUX) - pid_t ret; - /* Trap signal as soon as possible to avoid race */ - if ( (SigChldOrig = signal(SIGCHLD, SigChldHandler)) == SIG_ERR) - Error("SigChldHandler: error from signal setting SIGCHLD",0); -#endif - -#if defined(ALLIANT) || defined(ENCORE) || defined(SEQUENT) || defined(NEXT) - -# if defined(LINUX) - ret = wait(&ustatus); - if((ret == 0) || ((ret == -1) && (errno == ECHILD))) { return; } -# else - (void) wait(&ustatus); -# endif - status = ustatus.w_status; - -#else - -# if defined(LINUX) - ret = waitpid(0, &status, WNOHANG); - if((ret == 0) || ((ret == -1) && (errno == ECHILD))) { return; } -# else - (void)wait(&status); -# endif - -#endif - AR_caught_sigchld=1; - AR_caught_sig= sig; - Error("Child process terminated prematurely, status=",(int) status); -} - -void TrapSigChld() -/* - Trap SIGCHLD so that can tell if children die unexpectedly. -*/ -{ - if ( (SigChldOrig = signal(SIGCHLD, SigChldHandler)) == SIG_ERR) - Error("TrapSigChld: error from signal setting SIGCHLD",0); -} - - -void RestoreSigChld() -/* - Restore the original signal handler -*/ -{ - if(AR_caught_sigchld) SigChldOrig(SIGCHLD); - if ( signal(SIGCHLD, SigChldOrig) == SIG_ERR) - Error("RestoreSigChld: error from restoring signal SIGChld",0); -} - - -void RestoreSigChldDfl() -{ -(void) signal(SIGCHLD, SIG_DFL); -} - - -/*********************** SIGBUS *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigBusHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigBusHandler(sig) -#endif - int sig; -{ - AR_caught_sig= sig; -#ifdef PAUSE_ON_ERROR - fprintf(stderr,"%d(%d): Bus Error ... pausing\n", - armci_me, getpid() );pause(); -#endif - Error("Bus error, status=",(int) sig); -} - -void TrapSigBus() -/* - Trap SIGBUS -*/ -{ - if ( signal(SIGBUS, SigBusHandler) == SIG_ERR) - Error("TrapSigBus: error from signal setting SIGBUS", 0); -} - - - - -/*********************** SIGFPE *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigFpeHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigFpeHandler(sig) -#endif - int sig; -{ - AR_caught_sig= sig; - Error("Floating Point Exception error, status=",(int) sig); -} - -void TrapSigFpe() -/* - Trap SIGFPE -*/ -{ - if ( signal(SIGFPE, SigFpeHandler) == SIG_ERR) - Error("TrapSigFpe: error from signal setting SIGFPE", 0); -} - - - - -/*********************** SIGILL *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigIllHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigIllHandler(sig) -#endif - int sig; -{ - AR_caught_sig= sig; - Error("Illegal Instruction error, status=",(int) sig); -} - -void TrapSigIll() -/* - Trap SIGILL -*/ -{ - if ( signal(SIGILL, SigIllHandler) == SIG_ERR) - Error("TrapSigIll: error from signal setting SIGILL", 0); -} - - - - -/*********************** SIGSEGV *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigSegvHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigSegvHandler(sig) -#endif - int sig; -{ - AR_caught_sig= sig; - AR_caught_sigsegv=1; -#ifdef PAUSE_ON_ERROR - fprintf(stderr,"%d(%d): Segmentation Violation ... pausing\n", - armci_me, getpid() );pause(); -#endif - - Error("Segmentation Violation error, status=",(int) sig); -} -#ifdef DO_CKPT -static void * signal_arr[100]; -SigType SigSegvActionSa(int sig,siginfo_t *sinfo, void *ptr) -{ - int (*func)(); - AR_caught_sig= sig; - AR_caught_sigsegv=1; - func = signal_arr[sig]; - /*printf("\n%d:in sigaction %p, %d\n",armci_me,sinfo->si_addr,sinfo->si_errno);fflush(stdout);*/ - - if(func(sinfo->si_addr,sinfo->si_errno,sinfo->si_fd)) - Error("Segmentation Violation error, status=",(int) SIGSEGV); -} - -void TrapSigSegvSigaction() -{ - struct sigaction sa; - sa.sa_sigaction = (void *)SigSegvActionSa; - sigemptyset(&sa.sa_mask); - sa.sa_flags = SA_RESTART; - sigaction(SIGSEGV, &sa, NULL); -} -#endif - -void TrapSigSegv() -/* - Trap SIGSEGV -*/ -{ - if ( (SigSegvOrig=signal(SIGSEGV, SigSegvHandler)) == SIG_ERR) - Error("TrapSigSegv: error from signal setting SIGSEGV", 0); -} - - -void RestoreSigSegv() -/* - Restore the original signal handler -*/ -{ -/* - if(AR_caught_sigsegv) SigSegvOrig(SIGSEGV); -*/ -#ifdef DO_CKPT__ - struct sigaction sa; - sa.sa_handler = (void *)SigSegvOrig; - sigemptyset(&sa.sa_mask); - sa.sa_flags = SA_RESTART; - sigaction(SIGSEGV, &sa, NULL); - sigaction(SIGSEGV,&sa,NULL); -#else - if ( signal(SIGSEGV,SigSegvOrig) == SIG_ERR) - Error("RestoreSigSegv: error from restoring signal SIGSEGV",0); -#endif -} - - -/*********************** SIGSYS *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigSysHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigSysHandler(sig) -#endif - int sig; -{ - AR_caught_sig= sig; - Error("Bad Argument To System Call error, status=",(int) sig); -} - -void TrapSigSys() -/* - Trap SIGSYS -*/ -{ -#ifndef LINUX - if ( signal(SIGSYS, SigSysHandler) == SIG_ERR) - Error("TrapSigSys: error from signal setting SIGSYS", 0); -#endif -} - - - -/*********************** SIGTRAP *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigTrapHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigTrapHandler(sig) -#endif - int sig; -{ - AR_caught_sig= sig; - Error("Trace Trap error, status=",(int) sig); -} - -void TrapSigTrap() -/* - Trap SIGTRAP -*/ -{ - if ( signal(SIGTRAP, SigTrapHandler) == SIG_ERR) - Error("TrapSigTrap: error from signal setting SIGTRAP", 0); -} - - - -/*********************** SIGHUP *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigHupHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigHupHandler(sig) -#endif - int sig; -{ - AR_caught_sig= sig; - Error("Hangup error, status=",(int) sig); -} - -void TrapSigHup() -/* - Trap SIGHUP -*/ -{ - if ( (SigHupOrig = signal(SIGHUP, SigHupHandler)) == SIG_ERR) - Error("TrapSigHup: error from signal setting SIGHUP", 0); -} - - -void RestoreSigHup() -/* - Restore the original signal handler -*/ -{ - if(AR_caught_sig== SIGHUP) SigHupOrig(SIGHUP); - if ( signal(SIGHUP, SigHupOrig) == SIG_ERR) - Error("RestoreSigHUP: error from restoring signal SIGHUP",0); -} - - - -/*********************** SIGTERM *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigTermHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigTermHandler(sig) -#endif - int sig; -{ - AR_caught_sigterm = 1; - AR_caught_sig= sig; - Error("Terminate signal was sent, status=",(int) sig); -} - -void TrapSigTerm() -/* - Trap SIGTERM -*/ -{ - if ( (SigTermOrig = signal(SIGTERM, SigTermHandler)) == SIG_ERR) - Error("TrapSigTerm: error from signal setting SIGTERM", 0); -} - -void RestoreSigTerm() -/* - Restore the original signal handler -*/ -{ - if(AR_caught_sigterm && (SigTermOrig != SIG_DFL) ) SigTermOrig(SIGTERM); - if ( signal(SIGTERM, SigTermOrig) == SIG_ERR) - Error("RestoreSigTerm: error from restoring signal SIGTerm",0); -} - - -/*********************** SIGIOT *************************************/ -#ifdef SIGIOT -#if defined(SUN) && !defined(SOLARIS) -SigType SigIotHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigIotHandler(sig) -#endif - int sig; -{ - AR_caught_sig= sig; - Error("IOT signal was sent, status=",(int) sig); -} - -void TrapSigIot() -/* - Trap SIGIOT -*/ -{ - if ( signal(SIGIOT, SigIotHandler) == SIG_ERR) - Error("TrapSigIot: error from signal setting SIGIOT", 0); -} -#endif - - - -/*********************** SIGCONT *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigContHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigContHandler(sig) -#endif - int sig; -{ -/* Error("Trace Cont error, status=",(int) sig);*/ - AR_caught_sig= sig; -} - -void TrapSigCont() -/* - Trap SIGCONT -*/ -{ - if ( signal(SIGCONT, SigContHandler) == SIG_ERR) - Error("TrapSigCont: error from signal setting SIGCONT", 0); -} - -/*********************** SIGXCPU *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigXcpuHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigXcpuHandler(sig) -#endif - int sig; -{ - AR_caught_sig= sig; - Error("Terminate signal was sent, status=",(int) sig); -} - -void TrapSigXcpu() -/* - Trap SIGXCPU -*/ -{ - if ( signal(SIGXCPU, SigXcpuHandler) == SIG_ERR) - Error("TrapSigXcpu: error from signal setting SIGXCPU", 0); -} - -/******************* external API *********************************/ - -void ARMCI_ChildrenTrapSignals() -{ -#ifndef LAPI - TrapSigBus(); -#endif - TrapSigFpe(); - TrapSigIll(); -#ifdef DO_CKPT - TrapSigSegvSigaction(); -#else - TrapSigSegv(); -#endif - TrapSigSys(); - TrapSigTrap(); - TrapSigAbort(); - TrapSigTerm(); - TrapSigInt(); - -#if defined(LAPI) || defined(SGI) - TrapSigIot(); -#endif - -#ifdef SGI - TrapSigXcpu(); -#endif - -} - - -void ARMCI_ParentTrapSignals() -{ -#ifndef LAPI - TrapSigChld(); -#endif - TrapSigHup(); -} - - -void ARMCI_RestoreSignals() -{ - RestoreSigTerm(); - RestoreSigInt(); - RestoreSigSegv(); -} - - -void ARMCI_ParentRestoreSignals() -{ -#ifndef LAPI - RestoreSigChld(); -#endif - ARMCI_RestoreSignals(); - RestoreSigHup(); -} - -#ifdef DO_CKPT -/*user can register a function with 3 parameters, 1st offending address - * 2nd err number and third file descriptor*/ -void ARMCI_Register_Signal_Handler(int sig, void (*func)()) -{ - signal_arr[sig]=func; -} -#endif diff --git a/armci/src-gemini/signaltrap.h b/armci/src-gemini/signaltrap.h deleted file mode 100644 index 7e961826e..000000000 --- a/armci/src-gemini/signaltrap.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _SIGNALTRAP_H_ -#define _SIGNALTRAP_H_ - -#ifdef SYSV - extern void ARMCI_ChildrenTrapSignals(); - extern void ARMCI_ParentTrapSignals(); - extern void ARMCI_ParentRestoreSignals(); - extern void ARMCI_RestoreSignals(); -#else -# define ARMCI_ChildrenTrapSignals() -# define ARMCI_ParentTrapSignals() -# define ARMCI_ParentRestoreSignals() -#endif - -#endif diff --git a/armci/src-gemini/sockets.h b/armci/src-gemini/sockets.h deleted file mode 100644 index e76b5d277..000000000 --- a/armci/src-gemini/sockets.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef SOCKETS_H_ -#define SOCKETS_H_ -#include "armci.h" -#ifndef WIN32 -#define USE_SOCKET_VECTOR_API -#endif -#if defined(USE_SOCKET_VECTOR_API) -# include -#endif -extern int tcp_sendrcv_bufsize; -extern int armci_PollSocket(int sock); -extern int armci_WaitSock(int *socklist, int num, int *ready); -extern int armci_ReadFromSocket(int sock, void* buffer, int lenbuf); -extern int armci_WriteToSocket (int sock, void* buffer, int lenbuf); - -#if defined(USE_SOCKET_VECTOR_API) -extern int armci_RecvStridedFromSocket(int sock,void* buffer,int *str_arr,int *cnt,int str_level,struct iovec *iov); -extern int armci_SendStridedToSocket(int sock,void* buffer,int *str_arr,int *cnt,int str_level,struct iovec *iov); -extern int armci_RecvVectorFromSocket(int sock,armci_giov_t darr[], int len,struct iovec *iov); -extern int armci_SendVectorToSocket(int sock,armci_giov_t darr[], int len,struct iovec *iov); -extern int armci_ReadVFromSocket(int sock,struct iovec *iov, int iovlength, int totalsize); -extern int armci_WriteVToSocket (int sock,struct iovec *iov, int iovlength, int totalsize); -#endif -extern void armci_ListenSockAll(int* socklist, int num); -extern void armci_AcceptSockAll(int* socklist, int num); -extern int armci_CreateSocketAndConnect(char *hostname, int port); -extern void armci_ShutdownAll(int socklist[], int num); -extern void armci_CreateSocketAndBind(int *sock, int *port); -#define PACKET_SIZE tcp_sendrcv_bufsize -#define TIMEOUT_ACCEPT 60 -#define GET_SEND_BUFFER _armci_buf_get_clear_busy -#define FREE_SEND_BUFFER _armci_buf_release - -#ifndef UIO_MAXIOV -#define MAX_IOVEC 8 -#else -#define MAX_IOVEC (UIO_MAXIOV>100?100:UIO_MAXIOV) -#endif -#endif diff --git a/armci/src-gemini/spawn.c b/armci/src-gemini/spawn.c deleted file mode 100644 index d2a605577..000000000 --- a/armci/src-gemini/spawn.c +++ /dev/null @@ -1,133 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* OS specific server process/thread creation and destruction - * JN/03.25.2000 - */ -#include -#include -#include "armcip.h" - -#ifdef WIN32 -/************************** Windows threads **************************/ -#include -#include - -thread_id_t armci_serv_tid; -unsigned long armci_serv_handle; -#ifndef NO_STDLIBC -#define NEWTHREAD CreateThread -#else -#define NEWTHREAD _beginthreadex -#endif - -unsigned __stdcall armci_wrap_func(void *arg) -{ -void (*func)(void*); - func = arg; - - /* boost the server thread priority be better responsiveness */ - (void)SetThreadPriority(GetCurrentThread(),THREAD_PRIORITY_HIGHEST); - - func(NULL); - - return (unsigned)0; -} - - -void armci_create_server_thread ( void* (* func)(void*) ) -{ - /* as we need to use std C rt library we cannot use CreateThread */ - armci_serv_handle = NEWTHREAD(NULL, 0, armci_wrap_func, (void*)func, - 0, &armci_serv_tid); - if(!armci_serv_handle) - armci_die("armci_create_server_thread: create failed",0); - - - -} - -void armci_terminate_server_thread() -{ -/*int rc;*/ -/* TerminateThread(armci_serv_handle,&rc);*/ -} - -/****************************** PTHREADS *****************************/ -#elif defined(PTHREADS) -#include - -thread_id_t armci_serv_tid; - -void armci_create_server_thread ( void* (* func)(void*) ) -{ -pthread_attr_t attr; -int rc; - - if(pthread_attr_init(&attr)) - armci_die("armci_create_server_thread: attr init failed",0); - -#if defined(AIX) || defined(SOLARIS) - pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM); -#endif - - rc = pthread_create(&armci_serv_tid, &attr, func, NULL); - if(rc) armci_die("armci_create_server_thread: create failed",errno); - - pthread_attr_destroy(&attr); -} - - -void armci_terminate_server_thread() -{ - if(pthread_join(armci_serv_tid,NULL)) - armci_die("armci_terminate_server_thread: failed",0); -} - - -#else -/**************************** Unix processes ******************************/ - -#include -#include -#include - -pid_t server_pid= (pid_t)0; - - -char child_stack[256*1024]; -char *child_stack_top = &child_stack[256*1024-1]; - -void armci_create_server_process ( void* (* func)(void*) ) -{ -pid_t pid; -/* - if ( (pid = fork() ) < 0) - armci_die("fork failed", (int)pid); -*/ - pid = clone(func, (void*)child_stack_top, - CLONE_THREAD|CLONE_SIGHAND|CLONE_VM, NULL); - - if (pid == -1) { - armci_die("fork failed", (int)pid); - } - - server_pid = pid; -} - - -void armci_wait_server_process() -{ - int stat; - pid_t rc; - - if(!server_pid) return; - rc = wait (&stat); - if (rc != server_pid){ - perror("ARMCI master: wait for child process (server) failed:"); - } - server_pid = (pid_t)0; -} - -#endif diff --git a/armci/src-gemini/spinlock.h b/armci/src-gemini/spinlock.h deleted file mode 100644 index a6b306f10..000000000 --- a/armci/src-gemini/spinlock.h +++ /dev/null @@ -1,235 +0,0 @@ -/** - * @file spinlock.h - * - * This file attempts to implement spin locks for various platforms and/or CPU - * instruction sets. - */ -#ifndef SPINLOCK_H -#define SPINLOCK_H - -#define DEBUG_SPINLOCK 0 - -#define OPENPA 0 - -#if OPENPA -# if DEBUG_SPINLOCK -# warning SPINLOCK: openpa -# endif -# define SPINLOCK -# include "opa_primitives.h" -# define LOCK_T OPA_int_t -# define TESTANDSET(x) OPA_swap_int((x), 1) -# define MEMORY_BARRIER OPA_read_write_barrier - -#elif (defined(PPC) || defined(__PPC__) || defined(__PPC)) -# if DEBUG_SPINLOCK -# warning SPINLOCK: PPC -# endif -# define SPINLOCK -# include "asm-ppc.h" -//# define TESTANDSET testandset -//# define TESTANDSET acquireLock -# define armci_acquire_spinlock acquire_spinlock -# define armci_release_spinlock release_spinlock -# define MEMORY_BARRIER memory_barrier -static int testandset(void *spinlock) { - int v=1; - atomic_exchange(&v,spinlock,sizeof(int)); - return v; -} -static void memory_barrier() { - __asm__ __volatile__ ("sync" : : : "memory"); -} - -#elif defined(__i386__) || defined(__x86_64__) -# if DEBUG_SPINLOCK -# warning SPINLOCK: x86_64 -# endif -# define SPINLOCK -# include "atomics-i386.h" -static int testandset(void *spinlock) { - int v=1; - atomic_exchange(&v,spinlock,sizeof(int)); - return v; -} -# define TESTANDSET testandset - -#elif defined(__ia64) -# if DEBUG_SPINLOCK -# warning SPINLOCK: ia64 -# endif -# define SPINLOCK -# include "atomic_ops_ia64.h" -static int testandset(void *spinlock) { - int val=1; - int res; - atomic_swap_int(spinlock, val, &res); - return res; -} -# define TESTANDSET testandset - -#elif defined(DECOSF) -# if DEBUG_SPINLOCK -# warning SPINLOCK: DECOSF -# endif -# error "no implementation" - -#elif defined(SGI) -# if DEBUG_SPINLOCK -# warning SPINLOCK: SGI -# endif -# include -# define SPINLOCK -# define TESTANDSET(x) __lock_test_and_set((x), 1) -# define RELEASE_SPINLOCK __lock_release - -/*#elif defined(AIX)*/ -#elif HAVE_SYS_ATOMIC_OP_H -# if DEBUG_SPINLOCK -# warning SPINLOCK: sys/atomic_op.h (AIX) -# endif -# include -# define SPINLOCK -# define TESTANDSET(x) (_check_lock((x), 0, 1)==TRUE) -# define RELEASE_SPINLOCK(x) _clear_lock((x),0) - -#elif defined(SOLARIS) -# if DEBUG_SPINLOCK -# warning SPINLOCK: SOLARIS -# endif -# include -# include -# define SPINLOCK -# define TESTANDSET(x) (!_lock_try((x))) -# define RELEASE_SPINLOCK _lock_clear - -#elif defined(MACX) - -#elif defined(HPUX__) -# if DEBUG_SPINLOCK -# warning SPINLOCK: HPUX__ -# endif -extern int _acquire_lock(); -extern void _release_lock(); -# define SPINLOCK -# define TESTANDSET(x) (!_acquire_lock((x))) -# define RELEASE_SPINLOCK _release_lock - -#elif defined(HPUX) && defined(__ia64) /* HPUX on IA64, non gcc */ -# if DEBUG_SPINLOCK -# warning SPINLOCK: HPUX ia64 -# endif -# define SPINLOCK -typedef unsigned int slock_t; -# include -# define TESTANDSET(lock) _Asm_xchg(_SZ_W, lock, 1, _LDHINT_NONE) -# define RELEASE_SPINLOCK(lock) (*((volatile LOCK_T *) (lock)) = 0) - -#elif defined(NEC) -# if DEBUG_SPINLOCK -# warning SPINLOCK: NEC -# endif -extern ullong ts1am_2me(); -# define LOCK_T ullong -# define _LKWD (1ULL << 63) -# define SPINLOCK -# define TESTANDSET(x) ((_LKWD & ts1am_2me(_LKWD, 0xffULL, (ullong)(x)))) -# define MEMORY_BARRIER mpisx_clear_cache -extern void mpisx_clear_cache(); -# define RELEASE_SPINLOCK(x) ts1am_2me(0ULL, 0xffULL, (ullong)x); - -#endif - -#ifdef SPINLOCK - -#if DEBUG_ -# if HAVE_STDIO_H -# include -# endif -#endif - -#if HAVE_UNISTD_H -# include -#endif - -#ifndef DBL_PAD -# define DBL_PAD 16 -#endif - -/* make sure that locks are not sharing the same cache line */ -typedef struct{ - double lock[DBL_PAD]; -}pad_lock_t; - -#ifndef LOCK_T -# define LOCK_T int -#endif -#define PAD_LOCK_T pad_lock_t - -static inline void armci_init_spinlock(LOCK_T *mutex) -{ -#if OPENPA - OPA_store_int(mutex, 0); -#else - *mutex =0; -#endif -} - -#ifdef TESTANDSET - -static inline void armci_acquire_spinlock(LOCK_T *mutex) -{ -#if defined(BGML) || defined(DCMF) - return; -#else - int loop=0, maxloop =10; - - while (TESTANDSET(mutex)){ - loop++; - if(loop==maxloop){ -# if DEBUG_ - extern int armci_me; - printf("%d:spinlock sleeping\n",armci_me); fflush(stdout); -# endif - usleep(1); - loop=0; - } - } -#endif -} - -#ifdef RELEASE_SPINLOCK -# ifdef MEMORY_BARRIER -# define armci_release_spinlock(x) MEMORY_BARRIER(); RELEASE_SPINLOCK(x) -# else -# define armci_release_spinlock(x) RELEASE_SPINLOCK(x) -# endif -#else -static inline void armci_release_spinlock(LOCK_T *mutex) -{ -#if defined(BGML) || defined(DCMF) - return; -#else -# ifdef MEMORY_BARRIER - MEMORY_BARRIER (); -# endif -#if OPENPA - OPA_store_int(mutex, 0); -#else - *mutex =0; -#endif -# ifdef MEMORY_BARRIER - MEMORY_BARRIER (); -# endif -# if (defined(MACX)||defined(LINUX)) && defined(__GNUC__) && defined(__ppc__) - __asm__ __volatile__ ("isync" : : : "memory"); -# endif -#endif -} -#endif /* RELEASE_SPINLOCK */ - -#endif /* TESTANDSET */ - -#endif /* SPINLOCK */ - -#endif /* SPINLOCK_H */ diff --git a/armci/src-gemini/strided.c b/armci/src-gemini/strided.c deleted file mode 100644 index 7a290ff72..000000000 --- a/armci/src-gemini/strided.c +++ /dev/null @@ -1,1812 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include "armcip.h" -#include "copy.h" -#include "acc.h" -#include "memlock.h" -#include -#include - -extern int cos_me; - -#define DATA_SERVER_ 1 - -// by default use the ds remote gets -#define ORNL_USE_DS_FOR_REMOTE_GETS - -#ifdef ORNL_USE_DS_FOR_REMOTE_GETS -#define DATA_SERVER_GET_ 1 -#else -#define DATA_SERVER_GET_ 0 -#endif - -#define ARMCI_OP_2D(op, scale, proc, src, dst, bytes, count, src_stride, dst_stride,lockit)\ -if(op == GET || op ==PUT)\ - armci_copy_2D(op, proc, src, dst, bytes, count, src_stride,dst_stride);\ -else if(count==1) armci_acc_1D(op, scale, proc, src, dst, bytes,lockit);\ -else\ - armci_acc_2D(op, scale, proc, src, dst, bytes, count, src_stride,dst_stride,lockit) - -/* macro supports run-time selection of request sending scheme */ -#if defined(CLIENT_BUF_BYPASS) -#define CAN_REQUEST_DIRECTLY _armci_bypass -#else -# if defined(HITACHI) -# define CAN_REQUEST_DIRECTLY 0 -# else -# define CAN_REQUEST_DIRECTLY 1 -# endif -#endif - -#define PREPROCESS_STRIDED(tmp_count) {\ - tmp_count=0;\ - if(stride_levels) \ - for(;stride_levels;stride_levels--)if(count[stride_levels]>1)break;\ - if(stride_levels&&(count[0]==src_stride_arr[0]&&count[0]==dst_stride_arr[0])){\ - tmp_count=seg_count[1];\ - count = seg_count+1;\ - seg_count[1] = seg_count[0] * seg_count[1];\ - stride_levels --;\ - src_stride_arr ++; dst_stride_arr++ ;\ - }\ -} -#define POSTPROCESS_STRIDED(tmp_count) if(tmp_count)seg_count[1]=tmp_count - -#define SERVER_PUT 1 -#define SERVER_NBPUT 2 -#define DIRECT_PUT 3 -#define DIRECT_NBPUT 4 -#define SERVER_GET 5 -#define SERVER_NBGET 6 -#define DIRECT_GET 7 -#define DIRECT_NBGET 8 -#define ONESIDED_PUT 9 -#define ONESIDED_GET 10 - - -# define DO_FENCE(__proc,__prot) \ - if(__prot==SERVER_GET); \ - else if(__prot==SERVER_PUT); \ - else if(__prot==DIRECT_GET || __prot==DIRECT_NBGET) { \ - if(armci_prot_switch_fence[__proc]==SERVER_PUT) \ - ARMCI_DoFence(__proc); \ - } \ - else if(__prot==DIRECT_PUT || __prot==DIRECT_NBPUT) { \ - if(armci_prot_switch_fence[__proc]==SERVER_PUT) \ - ARMCI_DoFence(__proc); \ - } \ - else if(__prot==ONESIDED_GET) { \ - if(armci_prot_switch_fence[__proc]==SERVER_PUT) { \ - ARMCI_DoFence(__proc); \ - } \ - } \ - else if(__prot==ONESIDED_PUT); \ - else; \ - armci_prot_switch_fence[__proc]=__prot - -#ifndef REGIONS_REQUIRE_MEMHDL -# define ARMCI_MEMHDL_T void -#endif - -ARMCI_MEMHDL_T *mhloc=NULL,*mhrem=NULL; - -#ifdef REGIONS_REQUIRE_MEMHDL - int armci_region_both_found_hndl(void *loc, void *rem, int size, int node, - ARMCI_MEMHDL_T **loc_memhdl,ARMCI_MEMHDL_T **rem_memhdl); -# define ARMCI_REGION_BOTH_FOUND(_s,_d,_b,_p) \ - armci_region_both_found_hndl((_s),(_d),(_b),(_p),&mhloc,&mhrem) -#else -# define ARMCI_REGION_BOTH_FOUND(_s,_d,_b,_p) \ - armci_region_both_found((_s),(_d),(_b),(_p)) -#endif - -#ifdef HAS_RDMA_GET - -# ifdef REGIONS_REQUIRE_MEMHDL - void armci_client_direct_get(int p, void *src_buf, void *dst_buf, int len, - void** cptr,int nbtag,ARMCI_MEMHDL_T *lochdl,ARMCI_MEMHDL_T *remhdl); -# else - void armci_client_direct_get(int p, void *src_buf, void *dst_buf, int len, - void** contextptr,int nbtag,void *mhdl,void *mhdl1); -# endif -# define ARMCI_NBREM_GET(_p,_s,_sst,_d,_dst,_cou,_lev,_hdl) \ - armci_client_direct_get((_p),(_s),(_d),(_cou)[0],&((_hdl)->cmpl_info),(_hdl)->tag,(void *)mhloc,(void *)mhrem); \ - -# define ARMCI_REM_GET(_p,_s,_sst,_d,_dst,_cou,_lev,_hdl) \ - armci_client_direct_get((_p),(_s),(_d),(_cou)[0],NULL,0,(void *)mhloc,(void *)mhrem); \ - -#else - -# define ARMCI_REM_GET(_p,_s,_sst,_d,_dst,_cou,_lev,_hdl) \ - armci_rem_get((_p),(_s),(_sst),(_d),(_dst),(_cou),(_lev),(_hdl),(void *)mhloc,(void *)mhrem) -# define ARMCI_NBREM_GET ARMCI_REM_GET - -#endif - - extern int* armci_prot_switch_fence; - extern int armci_prot_switch_preproc; - extern int armci_prot_switch_preop; - - -int armci_iwork[MAX_STRIDE_LEVEL]; - -/*\ 2-dimensional array copy -\*/ -static void armci_copy_2D(int op, int proc, void *src_ptr, void *dst_ptr, - int bytes, int count, int src_stride, int dst_stride) -{ - int armci_th_idx = ARMCI_THREAD_IDX; - -#ifdef LAPI2__ -# define COUNT 1 -#else -# define COUNT count -#endif - -#ifdef __crayx1 - int shmem = 1; -#else - int shmem = SAMECLUSNODE(proc); -#endif - if(shmem) { - /* data is in local/shared memory -- can use memcpy */ -// printf("%s: shmem==true; count==%d\n",Portals_ID(),count); - if(count==1){ - armci_copy(src_ptr, dst_ptr, bytes); -// printf("%s: shmem==true; finished\n",Portals_ID(),count); - }else { - char *ps=(char*)src_ptr; - char *pd=(char*)dst_ptr; - int j; - for (j = 0; j < count; j++){ - bcopy(ps,pd,bytes); - ps += src_stride; - pd += dst_stride; - } - } - } else { - - /* data not in local/shared memory-access through global address space*/ - - if(op==PUT){ - - // printf("%s: pre UPDATE_FENCE_STATE\n",Portals_ID()); - UPDATE_FENCE_STATE(proc, PUT, COUNT); - // printf("%s: post UPDATE_FENCE_STATE\n",Portals_ID()); -#ifdef LAPI - SET_COUNTER(ack_cntr[armci_th_idx],COUNT); -#endif - if(count==1){ - armci_put(src_ptr, dst_ptr, bytes, proc); - }else{ - armci_put2D(proc, bytes, count, src_ptr, src_stride, - dst_ptr, dst_stride); - } - - }else{ - -#ifdef LAPI - SET_COUNTER(get_cntr[armci_th_idx], COUNT); -#endif - if(count==1){ - armci_get(src_ptr, dst_ptr, bytes, proc); - }else{ - armci_get2D(proc, bytes, count, src_ptr, src_stride, - dst_ptr, dst_stride); - } - } - } -} - - -#if (defined(CRAY) && !defined(__crayx1)) || defined(FUJITSU) -#ifdef CRAY -# define DAXPY SAXPY -#else -# define DAXPY daxpy_ -#endif - -static int ONE=1; -#define THRESH_ACC 32 - -static void daxpy_2d_(void* alpha, int *rows, int *cols, void *a, int *ald, - void* b, int *bld) -{ - int c,r; - double *A = (double*)a; - double *B = (double*)b; - double Alpha = *(double*)alpha; - - if(*rows < THRESH_ACC) - for(c=0;c<*cols;c++) - for(r=0;r<*rows;r++) - A[c* *ald+ r] += Alpha * B[c* *bld+r]; - else for(c=0;c<*cols;c++) - DAXPY(rows, alpha, B + c* *bld, &ONE, A + c* *ald, &ONE); -} -#endif - - -void armci_acc_1D(int op, void *scale, int proc, void *src, void *dst, int bytes, int lockit) -{ -int rows; -void (*func)(void*, void*, void*, int*); - ARMCI_PR_DBG("enter",0); - switch (op){ - case ARMCI_ACC_INT: - rows = bytes/sizeof(int); - func = I_ACCUMULATE_1D; - break; - case ARMCI_ACC_LNG: - rows = bytes/sizeof(long); - func = L_ACCUMULATE_1D; - break; - case ARMCI_ACC_DBL: - rows = bytes/sizeof(double); - func = D_ACCUMULATE_1D; - break; - case ARMCI_ACC_DCP: - rows = bytes/(2*sizeof(double)); - func = Z_ACCUMULATE_1D; - break; - case ARMCI_ACC_CPL: - rows = bytes/(2*sizeof(float)); - func = C_ACCUMULATE_1D; - break; - case ARMCI_ACC_FLT: - rows = bytes/sizeof(float); - func = F_ACCUMULATE_1D; - break; - default: armci_die("ARMCI accumulate: operation not supported",op); - func = F_ACCUMULATE_1D; /*avoid compiler whining */ - } - - - if(lockit){ - ARMCI_LOCKMEM(dst, bytes + (char*)dst, proc); - } - func(scale, dst, src, &rows); - if(lockit)ARMCI_UNLOCKMEM(proc); - ARMCI_PR_DBG("exit",0); -} - -/*\ 2-dimensional accumulate -\*/ -void armci_acc_2D(int op, void* scale, int proc, void *src_ptr, void *dst_ptr, - int bytes, int cols, int src_stride, int dst_stride, int lockit) -{ -int rows, lds, ldd, span; -void (*func)(void*, int*, int*, void*, int*, void*, int*); - - ARMCI_PR_DBG("enter",0); - -/* - if((long)src_ptr%ALIGN)armci_die("src not aligned",(long)src_ptr); - if((long)dst_ptr%ALIGN)armci_die("src not aligned",(long)dst_ptr); -*/ - - switch (op){ - case ARMCI_ACC_INT: - rows = bytes/sizeof(int); - ldd = dst_stride/sizeof(int); - lds = src_stride/sizeof(int); - func = I_ACCUMULATE_2D; - break; - case ARMCI_ACC_LNG: - rows = bytes/sizeof(long); - ldd = dst_stride/sizeof(long); - lds = src_stride/sizeof(long); - func = L_ACCUMULATE_2D; - break; - case ARMCI_ACC_DBL: - rows = bytes/sizeof(double); - ldd = dst_stride/sizeof(double); - lds = src_stride/sizeof(double); - func = D_ACCUMULATE_2D; - break; - case ARMCI_ACC_DCP: - rows = bytes/(2*sizeof(double)); - ldd = dst_stride/(2*sizeof(double)); - lds = src_stride/(2*sizeof(double)); - func = Z_ACCUMULATE_2D; - break; - case ARMCI_ACC_CPL: - rows = bytes/(2*sizeof(float)); - ldd = dst_stride/(2*sizeof(float)); - lds = src_stride/(2*sizeof(float)); - func = C_ACCUMULATE_2D; - break; - case ARMCI_ACC_FLT: - rows = bytes/sizeof(float); - ldd = dst_stride/sizeof(float); - lds = src_stride/sizeof(float); - func = F_ACCUMULATE_2D; - break; - case ARMCI_ACC_RA: - rows = bytes/sizeof(long); - ldd = dst_stride/sizeof(long); - lds = src_stride/sizeof(long); - func = RA_ACCUMULATE_2D_; - break; - default: armci_die("ARMCI accumulate: operation not supported",op); - func = F_ACCUMULATE_2D; /*avoid compiler whining */ - } - - - if(lockit){ - span = cols*dst_stride; - ARMCI_LOCKMEM(dst_ptr, span + (char*)dst_ptr, proc); - } - func(scale, &rows, &cols, dst_ptr, &ldd, src_ptr, &lds); - if(lockit)ARMCI_UNLOCKMEM(proc); - ARMCI_PR_DBG("exit",0); - -} - - -/*\ compute range of strided data AND lock it -\*/ -static void -armci_lockmem_patch(void* dst_ptr, int dst_stride_arr[], int count[], int stride_levels, int proc) -{ - long span = count[stride_levels]; - ARMCI_PR_DBG("enter",0); - span *= dst_stride_arr[stride_levels-1]; - - /* lock region of remote memory */ - ARMCI_LOCKMEM(dst_ptr, span + (char*)dst_ptr, proc); - ARMCI_PR_DBG("exit",0); -} - - -/*\ strided accumulate on top of remote memory copy: - * copies remote data to local buffer, accumulates, puts it back - * Note: if we are here then remote patch must fit in the ARMCI buffer -\*/ -int armci_acc_copy_strided(int optype, void* scale, int proc, - void* src_ptr, int src_stride_arr[], - void* dst_ptr, int dst_stride_arr[], - int count[], int stride_levels) -{ - void *buf_ptr = armci_internal_buffer; - int rc, i, *buf_stride_arr = armci_iwork; - ARMCI_PR_DBG("enter",0); - armci_lockmem_patch(dst_ptr,dst_stride_arr, count, stride_levels, proc); - - /* setup stride array for internal buffer */ - buf_stride_arr[0]=count[0]; - for(i=0; i< stride_levels; i++) { - buf_stride_arr[i+1]= buf_stride_arr[i]*count[i+1]; - } - - /* get remote data to local buffer */ - rc = armci_op_strided(GET, scale, proc, dst_ptr, dst_stride_arr, buf_ptr, - buf_stride_arr, count, stride_levels, 0,NULL); - - if(rc) { ARMCI_UNLOCKMEM(proc); return(rc); } - - /* call local accumulate with lockit=0 (we locked it already) and proc=me */ - rc = armci_op_strided(optype, scale, armci_me, src_ptr, src_stride_arr, - buf_ptr,buf_stride_arr, count, stride_levels,0,NULL); - if(rc) { ARMCI_UNLOCKMEM(proc); return(rc); } - - /* put data back from the buffer to remote location */ - rc = armci_op_strided(PUT, scale, proc, buf_ptr, buf_stride_arr, dst_ptr, - dst_stride_arr, count, stride_levels,0,NULL); - - FENCE_NODE(proc); /* make sure put completes before unlocking */ - ARMCI_UNLOCKMEM(proc); /* release memory lock */ - ARMCI_PR_DBG("exit",0); - - return(rc); -} - - - -/*\ Strided operation -\*/ -int armci_op_strided(int op, void* scale, int proc,void *src_ptr, - int src_stride_arr[], void* dst_ptr, int dst_stride_arr[], - int count[], int stride_levels, int lockit, - armci_ihdl_t nb_handle) -{ -char *src = (char*)src_ptr, *dst=(char*)dst_ptr; -int s2, s3, i,j, unlockit=0; -int total_of_2D; -int index[MAX_STRIDE_LEVEL], unit[MAX_STRIDE_LEVEL]; - ARMCI_PR_DBG("enter",op); -# if defined(ACC_COPY) - -# ifdef ACC_SMP - if(ARMCI_ACC(op) && !(SAMECLUSNODE(proc)) ) -# else - if ( ARMCI_ACC(op) && proc!=armci_me) -# endif - /* copy remote data, accumulate, copy back*/ - return (armci_acc_copy_strided(op,scale, proc, src_ptr, src_stride_arr, - dst_ptr, dst_stride_arr, count, stride_levels)); - - else; /* do it directly through shared/local memory */ -# endif - - - if(ARMCI_ACC(op) && (stride_levels>2) && lockit){ - /* we need one lock operation only - must be done outside 2d acc */ - armci_lockmem_patch(dst_ptr,dst_stride_arr, count, stride_levels, proc); - unlockit=1; - lockit =0; - } -/* if(proc!=armci_me) INTR_OFF;*/ - if(armci_me>=0 && !SAMECLUSNODE(proc)) { - printf("%d network_strided not supported (in op_strided)\n",cos_me); - abort(); - # if 0 - armci_network_strided(op,scale,proc,src_ptr,src_stride_arr,dst_ptr, - dst_stride_arr,count,stride_levels,nb_handle); - # endif - } - else { -// printf("%s in large switch stmt in op_strided (stride_levels=%d)\n",Portals_ID(),stride_levels); - switch (stride_levels) { - case 0: /* 1D copy */ - - ARMCI_OP_2D(op, scale, proc, src_ptr, dst_ptr, count[0], 1, - count[0], count[0], lockit); - - break; - - case 1: /* 2D op */ - ARMCI_OP_2D(op, scale, proc, src_ptr, dst_ptr, count[0], count[1], - src_stride_arr[0], dst_stride_arr[0], lockit); - break; - - case 2: /* 3D op */ - for (s2= 0; s2 < count[2]; s2++){ /* 2D copy */ - ARMCI_OP_2D(op, scale, proc, src+s2*src_stride_arr[1], - dst+s2*dst_stride_arr[1], count[0], count[1], - src_stride_arr[0], dst_stride_arr[0], lockit ); - } - break; - - case 3: /* 4D op */ - for(s3=0; s3< count[3]; s3++){ - src = (char*)src_ptr + src_stride_arr[2]*s3; - dst = (char*)dst_ptr + dst_stride_arr[2]*s3; - for (s2= 0; s2 < count[2]; s2++){ /* 3D copy */ - ARMCI_OP_2D(op, scale, proc, src+s2*src_stride_arr[1], - dst+s2*dst_stride_arr[1], - count[0], count[1],src_stride_arr[0], - dst_stride_arr[0],lockit); - } - } - break; - - default: /* N-dimensional */ - { - /* stride_levels is not the same as ndim. it is ndim-1 - * For example a 10x10x10... array, suppose the datatype is byte - * the stride_arr is 10, 10x10, 10x10x10 .... - */ - index[2] = 0; unit[2] = 1; total_of_2D = count[2]; - for(j=3; j<=stride_levels; j++) { - index[j] = 0; unit[j] = unit[j-1] * count[j-1]; - total_of_2D *= count[j]; - } - - for(i=0; i= count[j]) index[j] = 0; - } - - ARMCI_OP_2D(op, scale, proc, src, dst, count[0], count[1], - src_stride_arr[0], dst_stride_arr[0], lockit); - } - - } - } - } // ends else block - -// printf("%s after switch stmt; prior to fence/lock\n",Portals_ID()); - - if(unlockit){ -# if defined(ACC_COPY) - FENCE_NODE(proc); -# endif - ARMCI_UNLOCKMEM(proc); /* release memory lock */ - } - -// printf("%s after fence/lock; leaving op_strided\n",Portals_ID()); - ARMCI_PR_DBG("exit",op); - return 0; -} - - -int PARMCI_PutS( void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int seg_count[], /* number of segments at each stride - levels: count[0]=bytes*/ - int stride_levels, /* number of stride levels */ - int proc /* remote process(or) ID */ - ) -{ -int rc=0, direct=1; -int *count=seg_count, tmp_count=0; - - ARMCI_PR_DBG("enter",proc); - if(src_ptr == NULL || dst_ptr == NULL) return FAIL; - if(seg_count[0]<0)return FAIL3; - if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4; - if(proc<0)return FAIL5; - - ORDER(PUT,proc); /* ensure ordering */ - PREPROCESS_STRIDED(tmp_count); - -#if DATA_SERVER_ - if(stride_levels) direct=SAMECLUSNODE(proc); - direct=SAMECLUSNODE(proc); -#endif - - // printf("%s direct=%d, proc=%d\n",Portals_ID(),direct,proc); - - if(!direct){ - - # ifdef CRAY_REGISTER_ARMCI_MALLOC - if (stride_levels == 0 && armci_onesided_direct_put_enabled) - { - // maybe move this to just before the operation happen of even after it happens - // it's possible we may want to skip the ONESIDED_PUT and do a SERVER_PUT if we can't find the remote mdh - DO_FENCE(proc,SERVER_PUT); - - // local variable within stride_level == 0 scope - cos_desc_t *comm_desc = &__global_1sided_direct_comm_desc; - onesided_hnd_t cp_hnd; - cos_mdesc_t local_mdh, remote_mdh; - - // find remote mdh - armci_onesided_search_remote_mdh_list(dst_ptr, proc, &remote_mdh); - - // register local memory -- this should use abhinav's dreg routines - cpMemRegister(src_ptr, count[0], &local_mdh); - // onesided_mem_register(cp_hnd, src_ptr, count[0], NULL, &local_mdh); - - // get the onesided v2.0 api handle for the compute process - cpGetOnesidedHandle(&cp_hnd); - - // initialize onesided communication descriptor - onesided_desc_init(cp_hnd, &local_mdh, &remote_mdh, 0, comm_desc); - - // initiate put - onesided_put_nb(comm_desc); - - // complete put [locally] - onesided_wait(comm_desc); - - // deregister memory -- if we were using the dreg routines, we would let the - // dreg memory do this for us "on demand" = lazy mem deregisteration - cpMemDeregister(&local_mdh); - // onesided_mem_deregister(cp_hnd, &local_mdh); - - // issue a flushing get - does nothing to fix the fence problem - /* - static long flushaddr = 911; - cpMemRegister(&flushaddr, sizeof(long), &local_mdh); - onesided_desc_init(cp_hnd, &local_mdh, &remote_mdh, NULL, comm_desc); - onesided_get_nb(comm_desc); - onesided_wait(comm_desc); - cpMemDeregister(&local_mdh); - */ - - // done! - goto fn_exit; - } - else - # endif - { - DO_FENCE(proc,SERVER_PUT); -// printf("%s calling pack_strided in PARMCI_PutS\n",Portals_ID()); - rc = armci_pack_strided(PUT, NULL, proc, src_ptr, src_stride_arr,dst_ptr, - dst_stride_arr, count, stride_levels, NULL, -1, -1, -1,NULL); - } - } - else - { - if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_PUT); -// printf("%s calling op_strided in PARMCI_PutS\n",Portals_ID()); - rc = armci_op_strided( PUT, NULL, proc, src_ptr, src_stride_arr, - dst_ptr, dst_stride_arr,count,stride_levels, - 0,NULL); - } - POSTPROCESS_STRIDED(tmp_count); - -#ifdef ARMCI_PROFILE - armci_profile_stop_strided(ARMCI_PROF_PUTS); -#endif - -fn_exit: - ARMCI_PR_DBG("exit",proc); - if(rc) return FAIL6; - else return 0; - -} - - -int PARMCI_PutS_flag( - void* src_ptr, /* pointer to 1st segment at source */ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination */ - int dst_stride_arr[], /* array of strides at destination */ - int count[], /* number of units at each stride level, - count[0] = #bytes */ - int stride_levels, /* number of stride levels */ - int *flag, /* pointer to remote flag */ - int val, /* value to set flag upon completion of - data transfer */ - int proc /* remote process(or) ID */ - ) -{ - int bytes; - /* Put local data on remote processor */ - PARMCI_PutS(src_ptr, src_stride_arr, dst_ptr, dst_stride_arr, - count, stride_levels, proc); - - /* Send signal to remote processor that data transfer has - * been completed. */ - bytes = sizeof(int); - PARMCI_Put(&val, flag, bytes, proc); - return 1; -} - - -int PARMCI_Put_flag(void *src, void* dst,int bytes,int *f,int v,int proc) { - return PARMCI_PutS_flag(src, NULL, dst, NULL, &bytes, 0, f, v, proc); -} - - -int PARMCI_PutS_flag_dir(void *src_ptr, int src_stride_arr[], - void* dst_ptr, int dst_stride_arr[], - int seg_count[], int stride_levels, - int *flag, int val, int proc) { - return PARMCI_PutS_flag(src_ptr, src_stride_arr,dst_ptr,dst_stride_arr, - seg_count, stride_levels, flag, val, proc); -} - - -int PARMCI_GetS( void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int seg_count[], /* number of segments at each stride - levels: count[0]=bytes*/ - int stride_levels, /* number of stride levels */ - int proc /* remote process(or) ID */ - ) -{ - int rc,direct=1; - int *count=seg_count, tmp_count=0; - ARMCI_PR_DBG("enter",proc); - -#ifdef ARMCI_ONESIDED_GETS_USES_NBGETS - armci_hdl_t nb_handle; - ARMCI_INIT_HANDLE(&nb_handle); - PARMCI_NbGetS(src_ptr, src_stride_arr, dst_ptr, dst_stride_arr, seg_count, - stride_levels, proc, &nb_handle); - rc = PARMCI_Wait(&nb_handle); -#else - if(src_ptr == NULL || dst_ptr == NULL) return FAIL; - if(seg_count[0]<0)return FAIL3; - if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4; - if(proc<0||proc>=armci_nproc){printf("\n%d:%s:proc=%d",armci_me,__FUNCTION__,proc);fflush(stdout);return FAIL5;} - - ORDER(GET,proc); /* ensure ordering */ - PREPROCESS_STRIDED(tmp_count); - - direct=SAMECLUSNODE(proc); - - # ifdef CRAY_REGISTER_ARMCI_MALLOC - if(!direct && armci_onesided_direct_get_enabled) - { - - onesided_hnd_t cp_hnd; - cpGetOnesidedHandle(&cp_hnd); - cos_mdesc_t local_mdh, remote_mdh, *mdh = NULL; - int node = armci_clus_id(proc); - - if(stride_levels == 0) - { - - // if a strided put/acc is outstanding to proc, then we need to ensure that is completed - // we allow the maximum possible overlap for strided puts/acc. that means they are not fully blocking - // calls. they are however, guaranteed to be complete prior to another request being sent. - DO_FENCE(proc,ONESIDED_GET); - - // local varaibles - cos_desc_t *comm_desc = &__global_1sided_direct_get_comm_desc; - // printf("[cp %d]: direct remote get - src=%p; dst=%p; tgt_rank=%d; tgt_node=%d\n",armci_me,src_ptr,dst_ptr,proc,node); - - // find remote mdh - armci_onesided_search_remote_mdh_list(src_ptr, proc, &remote_mdh); - - // register local memory -- will use UDREG if ONESIDED_USE_UDREG is active - cpMemRegister(dst_ptr, count[0], &local_mdh); - // onesided_mem_register(cp_hnd, src_ptr, count[0], NULL, &local_mdh); - - // initialize onesided communication descriptor - onesided_desc_init(cp_hnd, &local_mdh, &remote_mdh, 0, comm_desc); - - // initiate get - onesided_get_nb(comm_desc); - - // complete put [locally] - onesided_wait(comm_desc); - - // deregister memory -- if we were using the dreg routines, we would let the - // dreg memory do this for us "on demand" = lazy mem deregisteration - cpMemDeregister(&local_mdh); - // onesided_mem_deregister(cp_hnd, &local_mdh); - - // done! - rc=0; - goto fn_exit; - } - else - { - DO_FENCE(proc,ONESIDED_GET); - - int i,j,id; - long src_idx; /* index offset of the current block position to src_ptr */ - long dst_idx; /* index offset of the current block position to dst_ptr */ - int n1dim; /* number of 1-dimensional blocks to xfer */ - int bvalue[MAX_STRIDE_LEVEL]; - int bunit[MAX_STRIDE_LEVEL]; - cos_desc_t cds[MAX_OUTSTANDING_ONESIDED_GETS]; - uint64_t src_addr, dst_addr; - - n1dim = 1; - for(i=1; i<=stride_levels; i++) { - n1dim *= count[i]; - } - - bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1; - for(i=2; i<=stride_levels; i++) - { - bvalue[i] = 0; - bunit[i] = bunit[i-1] * count[i-1]; - } - - for(i=0,id=0; i (count[j]-1)) bvalue[j] = 0; - } - - src_addr = (uint64_t) ((char *) src_ptr + src_idx); - dst_addr = (uint64_t) ((char *) dst_ptr + dst_idx); - - if(i >= MAX_OUTSTANDING_ONESIDED_GETS) - { - if(id == MAX_OUTSTANDING_ONESIDED_GETS) id=0; - onesided_wait(&cds[id]); - cpMemDeregister(&cds[id].local_mdesc); - } - - armci_onesided_search_remote_mdh_list((void*)src_addr, proc, &remote_mdh); - cpMemRegister((void*)dst_addr, count[0], &local_mdh); - onesided_desc_init(cp_hnd, &local_mdh, &remote_mdh, 0, &cds[id]); - onesided_get_nb(&cds[id]); - id++; - } - - // finish up any outstanding requests - int count = n1dim; - if(MAX_OUTSTANDING_ONESIDED_GETS < n1dim) count = MAX_OUTSTANDING_ONESIDED_GETS; - for(i=0; i MAX_STRIDE_LEVEL) return FAIL4; - if(proc<0)return FAIL5; - - ORDER(optype,proc); /* ensure ordering */ - PREPROCESS_STRIDED(tmp_count); - -#if DATA_SERVER_ - direct=SAMECLUSNODE(proc); -#endif - -# if defined(ACC_COPY) && !defined(ACC_SMP) - if(armci_me != proc) direct=0; -# endif - if(direct) - rc = armci_op_strided(optype,scale, proc, src_ptr, src_stride_arr,dst_ptr, - dst_stride_arr, count, stride_levels,1,NULL); - else{ - DO_FENCE(proc,SERVER_PUT); - rc = armci_pack_strided(optype,scale,proc,src_ptr, src_stride_arr,dst_ptr, - dst_stride_arr,count,stride_levels,NULL,-1,-1,-1,NULL); - } - POSTPROCESS_STRIDED(tmp_count); - ARMCI_PR_DBG("exit",proc); - if(rc) return FAIL6; - else return 0; -} - -int PARMCI_Acc(int optype, void *scale, void *src, void* dst, int bytes, int proc) { - int rc=0; - rc = PARMCI_AccS(optype, scale, src, NULL, dst, NULL, &bytes, 0, proc); - return rc; -} - -/* - whatever original put and get functions were here have been - replaced with the proper ones from the main armci branch. - the old functions were entirely responsible for causing the - test_vector_acc test to fail in test.x -*/ - -int PARMCI_Put(void *src, void* dst, int bytes, int proc) { - int rc=0; -//ARMCI_PROFILE_START_STRIDED(&bytes, 0, proc, ARMCI_PROF_PUT); - rc = PARMCI_PutS(src, NULL, dst, NULL, &bytes, 0, proc); -//ARMCI_PROFILE_STOP_STRIDED(ARMCI_PROF_PUT); - assert(rc==0); - return rc; -} - -int PARMCI_Get(void *src, void* dst, int bytes, int proc) { - int rc=0; -//ARMCI_PROFILE_START_STRIDED(&bytes, 0, proc, ARMCI_PROF_GET); - -#ifdef __crayx1 - memcpy(dst,src,bytes); -#else - rc = PARMCI_GetS(src, NULL, dst, NULL, &bytes, 0, proc); -#endif -//ARMCI_PROFILE_STOP_STRIDED(ARMCI_PROF_GET); -//dassert(1,rc==0); - assert(rc==0); - return rc; -} - -#define PACK1D 1 - -#if PACK1D -# define armci_read_strided1 armci_read_strided -# define armci_write_strided1 armci_write_strided -#else -# define armci_read_strided2 armci_read_strided -# define armci_write_strided2 armci_write_strided -#endif - -void armci_write_strided1(void *ptr, int stride_levels, int stride_arr[], - int count[], char *buf) -{ - int i, j; - long idx; /* index offset of current block position to ptr */ - int n1dim; /* number of 1 dim block */ - int bvalue[MAX_STRIDE_LEVEL], bunit[MAX_STRIDE_LEVEL]; - int bytes = count[0]; - ARMCI_PR_DBG("enter",stride_levels); - - /* number of n-element of the first dimension */ - n1dim = 1; - for(i=1; i<=stride_levels; i++) - n1dim *= count[i]; - - /* calculate the destination indices */ - bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1; - for(i=2; i<=stride_levels; i++) { - bvalue[i] = 0; - bunit[i] = bunit[i-1] * count[i-1]; - } - - for(i=0; i (count[j]-1)) bvalue[j] = 0; - } - - armci_copy( ((char*)ptr)+idx, buf, bytes); - buf += count[0]; - } - ARMCI_PR_DBG("exit",stride_levels); -} - - -void armci_write_strided2(void *ptr, int stride_levels, int stride_arr[], - int count[], char *buf) -{ - int i, j; - int total; /* number of 2 dim block */ - int index[MAX_STRIDE_LEVEL], unit[MAX_STRIDE_LEVEL]; - ARMCI_PR_DBG("enter",stride_levels); - - if(stride_levels == 0){ - armci_copy( ptr, buf, count[0]); - }else if (count[0]%ALIGN_SIZE || (unsigned long)ptr%ALIGN_SIZE ) - armci_write_strided1(ptr,stride_levels, stride_arr,count,buf); - else { - int rows, ld, idx, ldd; - char *src; - rows = count[0]/8; - ld = stride_arr[0]/8; - switch(stride_levels){ - case 1: - DCOPY21(&rows, count+1, ptr, &ld, buf, &idx); - break; - case 2: - ldd = stride_arr[1]/stride_arr[0]; - DCOPY31(&rows, count+1, count+2, ptr, &ld, &ldd, buf,&idx); - - break; - default: - index[2] = 0; unit[2] = 1; total = count[2]; - for(j=3; j<=stride_levels; j++) { - index[j] = 0; unit[j] = unit[j-1] * count[j-1]; - total *= count[j]; - } - for(i=0; i= count[j]) index[j] = 0; - } - DCOPY21(&rows, count+1,src, &ld, buf, &idx); - buf = (char*) ((double*)buf + idx); - } - } /*switch */ - } /*else */ - ARMCI_PR_DBG("exit",stride_levels); -} - - -void armci_read_strided1(void *ptr, int stride_levels, int stride_arr[], - int count[], char *buf) -{ - int i, j; - long idx; /* index offset of current block position to ptr */ - int n1dim; /* number of 1 dim block */ - int bvalue[MAX_STRIDE_LEVEL], bunit[MAX_STRIDE_LEVEL]; - int bytes = count[0]; - - ARMCI_PR_DBG("enter",stride_levels); - /* number of n-element of the first dimension */ - n1dim = 1; - for(i=1; i<=stride_levels; i++) - n1dim *= count[i]; - - /* calculate the destination indices */ - bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1; - for(i=2; i<=stride_levels; i++) { - bvalue[i] = 0; - bunit[i] = bunit[i-1] * count[i-1]; - } - - for(i=0; i (count[j]-1)) bvalue[j] = 0; - } - - armci_copy(buf, ((char*)ptr)+idx,bytes); - buf += count[0]; - } - ARMCI_PR_DBG("exit",stride_levels); -} - - -void armci_read_strided2(void *ptr, int stride_levels, int stride_arr[], - int count[], char *buf) -{ - int i, j; - int total; /* number of 2 dim block */ - int index[MAX_STRIDE_LEVEL], unit[MAX_STRIDE_LEVEL]; - - ARMCI_PR_DBG("enter",stride_levels); - if(stride_levels == 0){ - armci_copy( buf, ptr, count[0]); - }else if (count[0]%ALIGN_SIZE || (unsigned long)ptr%ALIGN_SIZE) - armci_read_strided1(ptr,stride_levels, stride_arr,count,buf); - else { - int rows, ld, idx, ldd; - char *src; - rows = count[0]/8; - ld = stride_arr[0]/8; - switch(stride_levels){ - case 1: - DCOPY12(&rows, count+1, ptr, &ld, buf, &idx); - break; - case 2: - ldd = stride_arr[1]/stride_arr[0]; - DCOPY13(&rows, count+1, count+2, ptr, &ld, &ldd, buf,&idx); - break; - default: - index[2] = 0; unit[2] = 1; total = count[2]; - for(j=3; j<=stride_levels; j++) { - index[j] = 0; unit[j] = unit[j-1] * count[j-1]; - total *= count[j]; - } - for(i=0; i= count[j]) index[j] = 0; - } - DCOPY12(&rows, count+1,src, &ld, buf, &idx); - buf = (char*) ((double*)buf + idx); - } - } /*switch */ - } /*else */ - ARMCI_PR_DBG("exit",stride_levels); -} - -/*\Non-Blocking API -\*/ -int PARMCI_NbPutS( void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int seg_count[], /* number of segments at each stride - levels: count[0]=bytes*/ - int stride_levels, /* number of stride levels */ - int proc, /* remote process(or) ID */ - armci_hdl_t* usr_hdl /* armci non-blocking call handle*/ - ) -{ -armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl; -int *count=seg_count, tmp_count=0; -int rc=0, direct=1; - ARMCI_PR_DBG("enter",proc); - if(src_ptr == NULL || dst_ptr == NULL) return FAIL; - if(count[0]<0)return FAIL3; - if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4; - if(proc<0)return FAIL5; - - PREPROCESS_STRIDED(tmp_count); - -#if DATA_SERVER_ - if(stride_levels)direct=SAMECLUSNODE(proc); - direct=SAMECLUSNODE(proc); -#endif - - /* aggregate put */ - if(nb_handle && nb_handle->agg_flag == SET) { - if(!direct){ - rc= armci_agg_save_strided_descriptor(src_ptr, src_stride_arr, - dst_ptr, dst_stride_arr, - count, stride_levels, proc, - PUT, nb_handle); - POSTPROCESS_STRIDED(tmp_count); - return(rc); - } - } - else { - UPDATE_FENCE_INFO(proc); - - /*set tag and op in the nb handle*/ - if(nb_handle){ - nb_handle->tag = GET_NEXT_NBTAG(); - nb_handle->op = PUT; - nb_handle->proc= proc; - nb_handle->bufid=NB_NONE; - } - else - nb_handle = armci_set_implicit_handle(PUT, proc); - } - - if(!direct){ - DO_FENCE(proc,SERVER_NBPUT); - rc = armci_pack_strided(PUT, NULL, proc, src_ptr, src_stride_arr,dst_ptr, - dst_stride_arr, count, stride_levels,NULL,-1,-1,-1,nb_handle); - } - else{ - if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_NBPUT); - rc = armci_op_strided( PUT, NULL, proc, src_ptr, src_stride_arr, - dst_ptr,dst_stride_arr,count,stride_levels, 0,nb_handle); - } - - POSTPROCESS_STRIDED(tmp_count); - ARMCI_PR_DBG("exit",proc); - if(rc) return FAIL6; - else return 0; -} - -int PARMCI_NbGetS( void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int seg_count[], /* number of segments at each stride - levels: byte_count[0]=bytes*/ - int stride_levels, /* number of stride levels */ - int proc, /* remote process(or) ID */ - armci_hdl_t* usr_hdl /* armci non-blocking call handle*/ - ) -{ - int rc=0,direct=1; - int *count=seg_count, tmp_count=0; - armci_ihdl_t nb_handle = (armci_ihdl_t) usr_hdl; - ARMCI_PR_DBG("enter",proc); - - if(src_ptr == NULL || dst_ptr == NULL) return FAIL; - if(seg_count[0]<0) return FAIL3; - if(stride_levels<0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4; - if(proc<0 || proc>=armci_nproc) { - printf("\n%d:%s:proc=%d",armci_me,__FUNCTION__,proc); - fflush(stdout); - return FAIL5; - } - - // ORDER(GET,proc); /* ensure ordering */ - PREPROCESS_STRIDED(tmp_count); - direct = SAMECLUSNODE(proc); // direct ==> local on node operation - - if(nb_handle && nb_handle->agg_flag == SET) { - if(!direct){ - rc= armci_agg_save_strided_descriptor(src_ptr, src_stride_arr, - dst_ptr, dst_stride_arr, - count, stride_levels, proc, - GET, nb_handle); - POSTPROCESS_STRIDED(tmp_count); - return(rc); - } - } else { - // ORDER(GET,proc); ensure ordering - // set tag and op in the nb handle - if(nb_handle) { - nb_handle->tag = GET_NEXT_NBTAG(); - nb_handle->op = GET; - nb_handle->proc= proc; - nb_handle->bufid=NB_NONE; - } - else - nb_handle = armci_set_implicit_handle(GET, proc); - } - - if(nb_handle) { - nb_handle->onesided_direct = 0; - } - - # ifdef CRAY_REGISTER_ARMCI_MALLOC - if(!direct && armci_onesided_direct_get_enabled) - { - // set up the non-blocking descriptor - nb_handle->onesided_direct = 1; - bzero(&nb_handle->comm_desc, MAX_OUTSTANDING_ONESIDED_GETS*sizeof(cos_desc_t)); - - onesided_hnd_t cp_hnd; - cpGetOnesidedHandle(&cp_hnd); - cos_mdesc_t local_mdh, remote_mdh, *mdh = NULL; - int node = armci_clus_id(proc); - - if(stride_levels == 0) - { - - // if a strided put/acc is outstanding to proc, then we need to ensure that is completed - // we allow the maximum possible overlap for strided puts/acc. that means they are not fully blocking - // calls. they are however, guaranteed to be complete prior to another request being sent. - DO_FENCE(proc,ONESIDED_GET); - - // local varaibles - cos_desc_t *comm_desc = &nb_handle->comm_desc[0]; - // printf("[cp %d]: direct remote get - src=%p; dst=%p; tgt_rank=%d; tgt_node=%d\n",armci_me,src_ptr,dst_ptr,proc,node); - - // find remote mdh - armci_onesided_search_remote_mdh_list(src_ptr, proc, &remote_mdh); - - // register local memory -- will use UDREG if ONESIDED_USE_UDREG is active - cpMemRegister(dst_ptr, count[0], &local_mdh); - // onesided_mem_register(cp_hnd, src_ptr, count[0], NULL, &local_mdh); - - // initialize onesided communication descriptor - onesided_desc_init(cp_hnd, &local_mdh, &remote_mdh, 0, comm_desc); - - // initiate get - onesided_get_nb(comm_desc); - - // done! - rc=0; - goto fn_exit; - } - else - { - DO_FENCE(proc,ONESIDED_GET); - - int i,j,id; - long src_idx; /* index offset of the current block position to src_ptr */ - long dst_idx; /* index offset of the current block position to dst_ptr */ - int n1dim; /* number of 1-dimensional blocks to xfer */ - int bunit[MAX_STRIDE_LEVEL]; - int bvalue[MAX_STRIDE_LEVEL]; - cos_desc_t *cds = nb_handle->comm_desc; - uint64_t src_addr, dst_addr; - - n1dim = 1; - for(i=1; i<=stride_levels; i++) { - n1dim *= count[i]; - } - - bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1; - for(i=2; i<=stride_levels; i++) - { - bvalue[i] = 0; - bunit[i] = bunit[i-1] * count[i-1]; - } - - for(i=0,id=0; i (count[j]-1)) bvalue[j] = 0; - } - - src_addr = (uint64_t) ((char *) src_ptr + src_idx); - dst_addr = (uint64_t) ((char *) dst_ptr + dst_idx); -/* - if(armci_me == 0) { - printf("1dpass=%d of %d; src_idx=%d; dst_idx=%d; count[0]=%d\n",i,n1dim,src_idx, dst_idx,count[0]); - } -*/ - if(i >= MAX_OUTSTANDING_ONESIDED_GETS) - { - if(id == MAX_OUTSTANDING_ONESIDED_GETS) id=0; - onesided_wait(&cds[id]); - cpMemDeregister(&cds[id].local_mdesc); - } - - armci_onesided_search_remote_mdh_list((void*)src_addr, proc, &remote_mdh); - cpMemRegister((void*)dst_addr, count[0], &local_mdh); - onesided_desc_init(cp_hnd, &local_mdh, &remote_mdh, 0, &cds[id]); - onesided_get_nb(&cds[id]); - id++; - } - - // done - rc=0; - goto fn_exit; - } - } // end if(!direct && armci_onesided_direct_get_enabled) - # endif // CRAY_REGISTER_ARMCI_MALLOC - - if(!direct) { - DO_FENCE(proc,SERVER_NBGET); - rc = armci_pack_strided(GET, NULL, proc, src_ptr, src_stride_arr, - dst_ptr,dst_stride_arr,count,stride_levels, - NULL,-1,-1,-1,nb_handle); - } else { - // DO_FENCE(proc,DIRECT_GET); - rc = armci_op_strided(GET, NULL, proc, src_ptr, src_stride_arr, dst_ptr, - dst_stride_arr,count, stride_levels,0,nb_handle); - } - - POSTPROCESS_STRIDED(tmp_count); - -fn_exit: -#ifdef ARMCI_PROFILE - armci_profile_stop_strided(ARMCI_PROF_GETS); -#endif - ARMCI_PR_DBG("exit",proc); - if(rc) return FAIL6; - else return 0; -} - - -int PARMCI_NbAccS( int optype, /* operation */ - void *scale, /* scale factor x += scale*y */ - void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int seg_count[], /* number of segments at each stride - levels: count[0]=bytes*/ - int stride_levels, /* number of stride levels */ - int proc, /* remote process(or) ID */ - armci_hdl_t* usr_hdl /* armci non-blocking call handle*/ - ) -{ -armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl; -int *count=seg_count, tmp_count=0; -int rc, direct=1; - - ARMCI_PR_DBG("enter",proc); - if(src_ptr == NULL || dst_ptr == NULL) return FAIL; - if(src_stride_arr == NULL || dst_stride_arr ==NULL) return FAIL2; - if(count[0]<0)return FAIL3; - if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4; - if(proc<0)return FAIL5; - - UPDATE_FENCE_INFO(proc); - PREPROCESS_STRIDED(tmp_count); - -#if DATA_SERVER_ - direct=SAMECLUSNODE(proc); -#endif - -# if defined(ACC_COPY) && !defined(ACC_SMP) - if(armci_me != proc) direct=0; -# endif - - /*set tag and op in the nb handle*/ - if(nb_handle){ - nb_handle->tag = GET_NEXT_NBTAG(); - nb_handle->op = optype; - nb_handle->proc= proc; - nb_handle->bufid=NB_NONE; - } - else - nb_handle = armci_set_implicit_handle(optype, proc); - - - if(direct){ - rc = armci_op_strided(optype,scale, proc, src_ptr, src_stride_arr,dst_ptr, - dst_stride_arr, count, stride_levels,1,NULL); - } - else{ - DO_FENCE(proc,SERVER_NBPUT); - rc = armci_pack_strided(optype,scale,proc,src_ptr, src_stride_arr,dst_ptr, - dst_stride_arr,count,stride_levels,NULL,-1,-1,-1,nb_handle); - } - - POSTPROCESS_STRIDED(tmp_count); - - ARMCI_PR_DBG("exit",proc); - if(rc) return FAIL6; - else return 0; -} - - -#if !defined(ACC_COPY)&&!defined(CRAY_YMP)&&!defined(CYGNUS)&&!defined(CYGWIN) &&!defined(BGML) -# define REMOTE_OP -#endif - -#define INIT_NB_HANDLE(nb,o,p) if(nb){\ - (nb)->tag = 0;\ - (nb)->op = (o); (nb)->proc= (p);\ - (nb)->bufid=NB_NONE;}\ - else { (nb)=armci_set_implicit_handle(o, p); (nb)->tag=0; } - -void set_nbhandle(armci_ihdl_t *nbh, armci_hdl_t *nb_handle, int op, - int proc) -{ - if(nb_handle) - { - *nbh=(armci_ihdl_t)nb_handle; - } - else - { - *nbh=armci_set_implicit_handle(op, proc); - } -} - - -int PARMCI_NbPut(void *src, void* dst, int bytes, int proc,armci_hdl_t* uhandle) -{ - -int rc=0, direct=0; -armci_ihdl_t nb_handle = (armci_ihdl_t)uhandle; - ARMCI_PR_DBG("enter",proc); - - if(src == NULL || dst == NULL) return FAIL; - - direct =SAMECLUSNODE(proc); - - /* aggregate put */ - if(nb_handle && nb_handle->agg_flag == SET) { - if(direct) { armci_copy(src,dst,bytes); rc=0; } - else - rc=armci_agg_save_descriptor(src,dst,bytes,proc,PUT,0,nb_handle); - return rc; - } - - if(direct) { - /*armci_wait needs proc to compute direct*/ - INIT_NB_HANDLE(nb_handle,PUT,proc); - armci_copy(src,dst,bytes); - } - else{ - # ifdef PORTALS - rc=PARMCI_NbPutS(src, NULL,dst,NULL, &bytes,0,proc,uhandle); - # else -# ifdef ARMCI_NB_PUT - INIT_NB_HANDLE(nb_handle,PUT,proc); - UPDATE_FENCE_STATE(proc, PUT, 1); - ARMCI_NB_PUT(src, dst, bytes, proc, &nb_handle->cmpl_info); -# else - rc=PARMCI_NbPutS(src, NULL,dst,NULL, &bytes,0,proc,uhandle); -# endif - # endif - } - - ARMCI_PR_DBG("exit",proc); - return(rc); -} - - -int PARMCI_NbGet(void *src, void* dst, int bytes, int proc,armci_hdl_t* uhandle) -{ - -int rc=0, direct=0; -armci_ihdl_t nb_handle = (armci_ihdl_t)uhandle; - ARMCI_PR_DBG("enter",proc); - - if(src == NULL || dst == NULL) return FAIL; - - direct =SAMECLUSNODE(proc); - - if(nb_handle && nb_handle->agg_flag == SET) { - if(direct) { armci_copy(src,dst,bytes); rc=0; } - else - rc=armci_agg_save_descriptor(src,dst,bytes,proc,GET,0,nb_handle); - return rc; - } - - if(direct) { - /*armci_wait needs proc to compute direct*/ - INIT_NB_HANDLE(nb_handle,PUT,proc); - armci_copy(src,dst,bytes); - }else{ - - # ifdef PORTALS - rc=PARMCI_NbGetS(src, NULL,dst,NULL, &bytes,0,proc,uhandle); - # else -# ifdef ARMCI_NB_GET - /*set tag and op in the nb handle*/ - INIT_NB_HANDLE(nb_handle,GET,proc); - - ARMCI_NB_GET(src, dst, bytes, proc, &nb_handle->cmpl_info); -# else - rc=PARMCI_NbGetS(src, NULL,dst,NULL, &bytes,0,proc,uhandle); -# endif - # endif - } - ARMCI_PR_DBG("exit",proc); - return(rc); -} - - -static void _armci_rem_value(int op, void *src, void *dst, int proc, - int bytes) { - int rc=0; - int armci_th_idx = ARMCI_THREAD_IDX; - - ORDER(op,proc); /* ensure ordering */ - -#if defined(REMOTE_OP) && !defined(QUADRICS) - rc = armci_rem_strided(op, NULL, proc, src, NULL, dst, NULL, - &bytes, 0, NULL, 0, NULL); - if(rc) armci_die("ARMCI_Value: armci_rem_strided incomplete", FAIL6); -#else - - if(op==PUT) { - UPDATE_FENCE_STATE(proc, PUT, 1); -# ifdef LAPI - SET_COUNTER(ack_cntr[armci_th_idx], 1); -# endif -#if defined(BGML) - /* fprintf(stderr,"bytes: %d\n",bytes); */ - /* this call is blocking, so local count is fine */ - BG1S_t req; - unsigned count=1; - BGML_Callback_t cb_wait={wait_callback, &count}; - BG1S_Memput(&req, proc, src, 0, dst, bytes, &cb_wait, 1); - BGML_Wait(&count); -#else - - armci_put(src, dst, bytes, proc); -#endif - } - else { -# ifdef LAPI - SET_COUNTER(get_cntr[armci_th_idx], 1); -# endif -#if defined(BGML) - /* fprintf(stderr,"before memget\n"); */ - BG1S_t req; - unsigned count=1; - BGML_Callback_t cb_wait={wait_callback, &count}; - BG1S_Memget(&req, proc, dst, 0, src, bytes, &cb_wait, 1); - BGML_Wait(&count); - -#else - armci_get(src, dst, bytes, proc); -#endif - } - - /* deal with non-blocking loads and stores */ -# if defined(LAPI) || defined(_ELAN_PUTGET_H) - if(proc != armci_me){ - if(op == GET){ - WAIT_FOR_GETS; /* wait for data arrival */ - }else { - WAIT_FOR_PUTS; /* data must be copied out*/ - } - } -#endif -#endif -} - -/* non-blocking remote value put/get operation */ -static void _armci_nb_rem_value(int op, void *src, void *dst, int proc, - int bytes, armci_ihdl_t nb_handle) { - int rc=0, pv=0; - int armci_th_idx = ARMCI_THREAD_IDX; - - if(nb_handle && nb_handle->agg_flag == SET) { - if(op==PUT) pv = 1; - (void)armci_agg_save_descriptor(src,dst,bytes,proc,op,pv,nb_handle); - return; - } - else { - if(op==PUT) UPDATE_FENCE_INFO(proc); - - /*set tag and op in the nb handle*/ - if(nb_handle){ - nb_handle->tag = GET_NEXT_NBTAG(); - nb_handle->op = op; - nb_handle->proc= proc; - nb_handle->bufid=NB_NONE; - } - else - nb_handle = armci_set_implicit_handle(op, proc); - } - -#if defined(REMOTE_OP) && !defined(QUADRICS) - rc = armci_rem_strided(op, NULL, proc, src, NULL, dst, NULL, - &bytes, 0, NULL, 0, nb_handle); - if(rc) armci_die("ARMCI_Value: armci_rem_strided incomplete", FAIL6); -#else - - if(op==PUT) { - UPDATE_FENCE_STATE(proc, PUT, 1); -# ifdef LAPI - SET_COUNTER(ack_cntr[armci_th_idx], 1); -# endif - armci_put(src, dst, bytes, proc); - } - else { -# ifdef LAPI - SET_COUNTER(get_cntr[armci_th_idx], 1); -# endif - armci_get(src, dst, bytes, proc); - } - - /* deal with non-blocking loads and stores */ -# if defined(LAPI) || defined(_ELAN_PUTGET_H) -# ifdef LAPI - if(!nb_handle) -# endif - { - if(proc != armci_me){ - if(op == GET){ - WAIT_FOR_GETS; /* wait for data arrival */ - }else { - WAIT_FOR_PUTS; /* data must be copied out*/ - } - } - } -# endif -#endif -} - - -#define CHK_ERR(dst, proc) \ - if(dst==NULL) armci_die("PARMCI_PutValue: NULL pointer passed",FAIL); \ - if(proc<0) armci_die("PARMCI_PutValue: Invalid process rank", proc); - -#define CHK_ERR_GET(src, dst, proc, bytes) \ - if(src==NULL || dst==NULL) armci_die("PARMCI_GetValue: NULL pointer passed",FAIL); \ - if(proc<0) armci_die("PARMCI_GetValue: Invalid process rank", proc); \ - if(bytes<0) armci_die("PARMCI_GetValue: Invalid size", bytes); - -/** - * Register-Originated Put. - */ -int PARMCI_PutValueInt(int src, void *dst, int proc) -{ - CHK_ERR(dst, proc); - if( SAMECLUSNODE(proc) ) *(int *)dst = src; - else _armci_rem_value(PUT, &src, dst, proc, sizeof(int)); - return 0; -} - -int PARMCI_PutValueLong(long src, void *dst, int proc) -{ - CHK_ERR(dst, proc); - if( SAMECLUSNODE(proc) ) *(long *)dst = src; - else _armci_rem_value(PUT, &src, dst, proc, sizeof(long)); - return 0; -} - -int PARMCI_PutValueFloat(float src, void *dst, int proc) -{ - CHK_ERR(dst, proc); - if( SAMECLUSNODE(proc) ) *(float *)dst = src; - else _armci_rem_value(PUT, &src, dst, proc, sizeof(float)); - return 0; -} - -int PARMCI_PutValueDouble(double src, void *dst, int proc) -{ - CHK_ERR(dst, proc); - if( SAMECLUSNODE(proc) ) *(double *)dst = src; - else _armci_rem_value(PUT, &src, dst, proc, sizeof(double)); - return 0; -} - -/** - * Non-Blocking register-originated put. - */ -int PARMCI_NbPutValueInt(int src, void *dst, int proc, armci_hdl_t* usr_hdl) -{ - CHK_ERR(dst, proc); - if( SAMECLUSNODE(proc) ) *(int *)dst = src; - else _armci_nb_rem_value(PUT,&src,dst,proc,sizeof(int),(armci_ihdl_t)usr_hdl); - return 0; -} - -int PARMCI_NbPutValueLong(long src, void *dst, int proc, armci_hdl_t* usr_hdl) -{ - CHK_ERR(dst, proc); - if( SAMECLUSNODE(proc) ) *(long *)dst = src; - else _armci_nb_rem_value(PUT,&src,dst,proc,sizeof(long),(armci_ihdl_t)usr_hdl); - return 0; -} - -int PARMCI_NbPutValueFloat(float src, void *dst, int proc, armci_hdl_t* usr_hdl) -{ - CHK_ERR(dst, proc); - if( SAMECLUSNODE(proc) ) *(float *)dst = src; - else _armci_nb_rem_value(PUT,&src,dst,proc,sizeof(float),(armci_ihdl_t)usr_hdl); - return 0; -} - -int PARMCI_NbPutValueDouble(double src, void *dst, int proc, armci_hdl_t* usr_hdl) -{ - CHK_ERR(dst, proc); - if( SAMECLUSNODE(proc) ) *(double *)dst = src; - else _armci_nb_rem_value(PUT,&src,dst,proc,sizeof(double),(armci_ihdl_t)usr_hdl); - return 0; - } - -#if 1 -/** - * Register-Originated Get. - */ -int PARMCI_GetValueInt(void *src, int proc) -{ - int dst; - if( SAMECLUSNODE(proc) ) return *(int *)src; - else _armci_rem_value(GET, src, &dst, proc, sizeof(int)); - return dst; -} - -long PARMCI_GetValueLong(void *src, int proc) -{ - long dst; - if( SAMECLUSNODE(proc) ) return *(long *)src; - else _armci_rem_value(GET, src, &dst, proc, sizeof(long)); - return dst; -} - -float PARMCI_GetValueFloat(void *src, int proc) -{ - float dst; - if( SAMECLUSNODE(proc) ) return *(float *)src; - else _armci_rem_value(GET, src, &dst, proc, sizeof(float)); - return dst; -} - -double PARMCI_GetValueDouble(void *src, int proc) -{ - double dst; - if( SAMECLUSNODE(proc) ) return *(double *)src; - else _armci_rem_value(GET, src, &dst, proc, sizeof(double)); - return dst; -} - -#endif - -#if 0 -/** - * Register-Originated Get. - */ -int PARMCI_GetValue(void *src, void *dst, int proc, int bytes) -{ - CHK_ERR_GET(src, dst, proc, bytes); - if( SAMECLUSNODE(proc) ) { armci_copy(src, dst, bytes); } - else _armci_rem_value(GET, src, dst, proc, bytes); - return 0; -} - -/** - * Non-Blocking register-originated get. - */ -int PARMCI_NbGetValue(void *src, void *dst, int proc, int bytes, armci_hdl_t* usr_hdl) -{ - CHK_ERR_GET(src, dst, proc, bytes); - if( SAMECLUSNODE(proc) ) { armci_copy(src, dst, bytes); } - else _armci_nb_rem_value(GET, src, dst, proc, bytes, (armci_ihdl_t)usr_hdl); - return 0; -} -#endif - diff --git a/armci/src-gemini/threads.c b/armci/src-gemini/threads.c deleted file mode 100644 index 68857520a..000000000 --- a/armci/src-gemini/threads.c +++ /dev/null @@ -1,117 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: threads.c,v 1.1.2.5 2007-08-28 21:29:46 manoj Exp $ */ - -#if 0 -# define PRNDBG3(m,a1,a2,a3) \ - fprintf(stderr,"DBG %d: " m,armci_me,a1,a2,a3);fflush(stderr) -# define PRNDBG(m) PRNDBG3(m,0,0,0) -# define PRNDBG1(m,a1) PRNDBG3(m,a1,0,0) -# define PRNDBG2(m,a1,a2) PRNDBG3(m,a1,a2,0) -#else -# define PRNDBG(m) -# define PRNDBG1(m,a1) -# define PRNDBG2(m,a1,a2) -# define PRNDBG3(m,a1,a2,a3) -#endif - - -#include -#include "armcip.h" - -armci_user_threads_t armci_user_threads; - -void armci_init_threads() -{ - int i, bytes; - char *uval = getenv("ARMCI_MAX_THREADS"); - - armci_user_threads.max = 1; - armci_user_threads.avail = 0; - - if (uval != NULL) sscanf(uval, "%d", &armci_user_threads.max); - - if (armci_user_threads.max < 1 || - armci_user_threads.max > ARMCI_THREADS_LIMIT) - { - printf("Error: Only 1-%d threads are supported. ",ARMCI_THREADS_LIMIT); - printf("Set ARMCI_MAX_THREADS appropriately\n"); fflush(stdout); - armci_die("armci_init_threads: failed", 0); - } - - bytes = sizeof(thread_id_t) * armci_user_threads.max; - if ( !(armci_user_threads.ids = (thread_id_t*) malloc(bytes)) ) - { - armci_die("armci_init_threads: armci_user_threads.ids malloc failed", - armci_user_threads.max); - } - memset(armci_user_threads.ids, 0, bytes); - -#if 0 /* spinlock has void return value */ - if (THREAD_LOCK_INIT(armci_user_threads.lock) || - THREAD_LOCK_INIT(armci_user_threads.buf_lock) || - THREAD_LOCK_INIT(armci_user_threads.net_lock)) - armci_die("armci_init_threads:locks initialization failed", 0); -#else - THREAD_LOCK_INIT(armci_user_threads.lock); - THREAD_LOCK_INIT(armci_user_threads.buf_lock); - THREAD_LOCK_INIT(armci_user_threads.net_lock); -#endif - -#if 0 - /* using one lock per socket for now, it might be feasible (and usefull) - * to use two (one for sending and one for receiving) */ - armci_user_threads.sock_locks = malloc(armci_nclus *sizeof(thread_lock_t)); - for (i = 0; i < armci_nclus; i++) - if (THREAD_LOCK_INIT(armci_user_threads.sock_locks[i])) - armci_die("armci_init_threads:sock locks initialization failed", i); -#endif -} - -void armci_finalize_threads() -{ - THREAD_LOCK_DESTROY(armci_user_threads.lock); - THREAD_LOCK_DESTROY(armci_user_threads.net_lock); - THREAD_LOCK_DESTROY(armci_user_threads.buf_lock); - free(armci_user_threads.ids); -} - -/* calling armci_thread_idx for every function that accesses thread-private data - * might be expensive -- needs optiomization */ -INLINE int armci_thread_idx() -{ - int i, n = ARMCI_MIN(armci_user_threads.avail, armci_user_threads.max); - thread_id_t id = THREAD_ID_SELF(); - - for (i = 0; i < n; i++) if (id == armci_user_threads.ids[i]) { - /*PRNDBG2("thread id=%ld already registered, idx=%d\n", id, i);*/ - return i; - } - - /* see this thread for the first time */ - return armci_register_thread(id); -} - -INLINE int armci_register_thread(thread_id_t id) -{ - int i; - - THREAD_LOCK(armci_user_threads.lock); - - i = armci_user_threads.avail; - armci_user_threads.avail++; - - THREAD_UNLOCK(armci_user_threads.lock); - - if (i < armci_user_threads.max) - armci_user_threads.ids[i] = id; - else - armci_die("armci_thread_idx: too many threads, adjust ARMCI_MAX_THREADS", - armci_user_threads.avail); - - PRNDBG2("registered a new thread: idx=%d, id=%ld\n", i, id); - return i; -} - diff --git a/armci/src-gemini/timer.c b/armci/src-gemini/timer.c deleted file mode 100644 index 6c1a2977b..000000000 --- a/armci/src-gemini/timer.c +++ /dev/null @@ -1,43 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: timer.c,v 1.3 2004-04-09 22:03:51 manoj Exp $ */ -#ifdef WIN32 - static double msec; /* reference for timer */ -# include -#else -# include -# include - static unsigned firstsec=0; /* Reference for timer */ - static unsigned firstusec=0; /* Reference for timer */ -#endif -static int first_call=1; - -double armci_timer() -{ -#ifdef WIN32 - double t0 = (double)GetCurrentTime(); - if(first_call){ - first_call=0; msec=t0; return 0.0; - } - t0 -=msec; - if(t0<0.0)t0 += (double)0xffffffff; - return 0.01*t0; -#else - double low, high; - struct timeval tp; - struct timezone tzp; - (void) gettimeofday(&tp,&tzp); - - if (first_call) { - firstsec = tp.tv_sec; - firstusec = tp.tv_usec; - first_call = 0; - } - low = (double)(tp.tv_usec>>1) - (double) (firstusec>>1); - high = (double) (tp.tv_sec - firstsec); - return high + 1.0e-6*(low+low); -#endif -} - diff --git a/armci/src-gemini/utils.c b/armci/src-gemini/utils.c deleted file mode 100644 index df43fc60a..000000000 --- a/armci/src-gemini/utils.c +++ /dev/null @@ -1,230 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* - * A barrier causes threads to wait until a set of threads has - * all "reached" the barrier. The number of threads required is - * set when the barrier is initialized, and cannot be changed - * except by reinitializing. - * - * The barrier_init() and barrier_destroy() functions, - * respectively, allow you to initialize and destroy the - * barrier. - * - * The barrier_wait() function allows a thread to wait for a - * barrier to be completed. One thread (the one that happens to - * arrive last) will return from barrier_wait() with the status - * -1 on success -- others will return with 0. The special - * status makes it easy for the calling code to cause one thread - * to do something in a serial region before entering another - * parallel section of code. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "utils.h" - -#define DEBUG_ - -int mt_size; /* number of processes: needed for collective mt ops */ -int mt_tpp; /* number of threads used for collective ops */ -thread_barrier_t mt_barrier; /* static barrier used for multi-threaded MT_BARRIER */ - -int armci_malloc_mt(void *ptr[], int bytes) -{ - int rc, th_size, i, j; - - th_size = mt_size * mt_tpp; - if (thread_barrier_wait(&mt_barrier)==-1) { - rc = PARMCI_Malloc(ptr, bytes * mt_tpp); -#ifdef DEBUG - printf("bytes=%d\n", bytes); - for (i = 0; i < mt_size; i++) printf("ptr[%d]=%p\n",i,ptr[i]); -#endif - /* at this point proc ptrs are at beggining of the list */ - for (i = mt_size - 1; i >= 0; i--) for (j = mt_tpp - 1; j >= 0; j--) { -#ifdef DEBUG - printf("mt_size=%d,mt_tpp=%d,i=%d,j=%d,ptr[%d]=%p+%d\n", - mt_size,mt_tpp,i,j,i*mt_tpp+j,ptr[i],j*bytes); - fflush(stdout); -#endif - ptr[i * mt_tpp + j] = ((char*)ptr[i]) + j * bytes; - } - } - thread_barrier_wait(&mt_barrier); - - return rc; -} - -int armci_free_mt(void *ptr, int th_idx) -{ -} - -#ifdef POSIX_THREADS -/* - * Initialize a barrier for use. - */ -int thread_barrier_init (thread_barrier_t *barrier, int count) -{ - int status; - - barrier->threshold = barrier->counter = count; - barrier->cycle = 0; - status = pthread_mutex_init (&barrier->mutex, NULL); - if (status != 0) - return status; - status = pthread_cond_init (&barrier->cv, NULL); - if (status != 0) { - pthread_mutex_destroy (&barrier->mutex); - return status; - } - barrier->valid = BARRIER_VALID; - return 0; -} - -/* - * Destroy a barrier when done using it. - */ -int thread_barrier_destroy (thread_barrier_t *barrier) -{ - int status, status2; - - if (barrier->valid != BARRIER_VALID) - return EINVAL; - - status = pthread_mutex_lock (&barrier->mutex); - if (status != 0) - return status; - - /* - * Check whether any threads are known to be waiting; report - * "BUSY" if so. - */ - if (barrier->counter != barrier->threshold) { - pthread_mutex_unlock (&barrier->mutex); - return EBUSY; - } - - barrier->valid = 0; - status = pthread_mutex_unlock (&barrier->mutex); - if (status != 0) - return status; - - /* - * If unable to destroy either 1003.1c synchronization - * object, return the error status. - */ - status = pthread_mutex_destroy (&barrier->mutex); - status2 = pthread_cond_destroy (&barrier->cv); - return (status == 0 ? status : status2); -} - -/* - * Wait for all members of a barrier to reach the barrier. When - * the count (of remaining members) reaches 0, broadcast to wake - * all threads waiting. - */ -int thread_barrier_wait (thread_barrier_t *barrier) -{ - int status, cancel, tmp, cycle; - - if (barrier->valid != BARRIER_VALID) - return EINVAL; - - status = pthread_mutex_lock (&barrier->mutex); - if (status != 0) - return status; - - cycle = barrier->cycle; /* Remember which cycle we're on */ - - if (--barrier->counter == 0) { - barrier->cycle = !barrier->cycle; - barrier->counter = barrier->threshold; - status = pthread_cond_broadcast (&barrier->cv); - /* - * The last thread into the barrier will return status - * -1 rather than 0, so that it can be used to perform - * some special serial code following the barrier. - */ - if (status == 0) - status = -1; - } else { - /* - * Wait with cancellation disabled, because barrier_wait - * should not be a cancellation point. - */ - pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, &cancel); - - /* - * Wait until the barrier's cycle changes, which means - * that it has been broadcast, and we don't want to wait - * anymore. - */ - while (cycle == barrier->cycle) { - status = pthread_cond_wait ( - &barrier->cv, &barrier->mutex); - if (status != 0) break; - } - - pthread_setcancelstate (cancel, &tmp); - } - /* - * Ignore an error in unlocking. It shouldn't happen, and - * reporting it here would be misleading -- the barrier wait - * completed, after all, whereas returning, for example, - * EINVAL would imply the wait had failed. The next attempt - * to use the barrier *will* return an error, or hang, due - * to whatever happened to the mutex. - */ - pthread_mutex_unlock (&barrier->mutex); - return status; /* error, -1 for waker, or 0 */ -} -#endif - -#if 0 - -/*** - NAME - timing.c - PURPOSE - Timing routines for calculating the execution time: - void start_timer(void); Set the timer. - double elapsed_time(void); Return the timing elapsed since - the timer has been set. - NOTES - Jialin Ju - Oct 16, 1995 Created. -***/ - -/* Timing routines that use standard Unix gettingofday() */ -static struct timezone tz; -static struct timeval start_time, finish_time; - -/* Start measuring a time delay */ -void start_timer(void) -{ - gettimeofday( &start_time, &tz); -} - -/* Retunrn elapsed time in milliseconds */ -double elapsed_time(void) -{ - gettimeofday( &finish_time, &tz); - return(1000.0*(finish_time.tv_sec - start_time.tv_sec) + - (finish_time.tv_usec - start_time.tv_usec)/1000.0 ); -} - -/* Return the stopping time in milliseconds */ -double stop_time(void) -{ - gettimeofday( &finish_time, &tz); - return(1000.0*finish_time.tv_sec + finish_time.tv_usec/1000.0); -} - -#endif diff --git a/armci/src-gemini/utils.h b/armci/src-gemini/utils.h deleted file mode 100644 index d975e8b97..000000000 --- a/armci/src-gemini/utils.h +++ /dev/null @@ -1,116 +0,0 @@ -/* $Id: utils.h,v 1.1.2.3 2007-07-02 05:35:31 d3p687 Exp $ - * - * primitives for transparent handling of multi-threading - */ - -#ifndef UTILS_H -#define UTILS_H - -/* - * This header file describes the "barrier" synchronization - * construct. The type barrier_t describes the full state of the - * barrier including the POSIX 1003.1c synchronization objects - * necessary. - * - * A barrier causes threads to wait until a set of threads has - * all "reached" the barrier. The number of threads required is - * set when the barrier is initialized, and cannot be changed - * except by reinitializing. - */ - - -#ifdef THREAD_SAFE -# ifdef POSIX_THREADS - -# include - -#if 1 - typedef pthread_mutex_t thread_lock_t; -# define THREAD_LOCK_INIT(x) pthread_mutex_init(&x,NULL) -# define THREAD_LOCK_DESTROY(x) pthread_mutex_destroy(&x) -# define THREAD_LOCK(x) pthread_mutex_lock(&x) -# define THREAD_UNLOCK(x) pthread_mutex_unlock(&x) -#else - -#ifndef INLINE -# define INLINE -# include "spinlock.h" -# undef INLINE -#else -# include "spinlock.h" -#endif - - typedef LOCK_T thread_lock_t; -# define THREAD_LOCK_INIT(x) armci_init_spinlock(&x) -# define THREAD_LOCK_DESTROY(x) 0 -# define THREAD_LOCK(x) armci_acquire_spinlock(&x) -# define THREAD_UNLOCK(x) armci_release_spinlock(&x) -#endif - typedef pthread_t thread_t; -# define THREAD_CREATE(th_,func_,arg_) pthread_create(th_,NULL,func_,arg_) -# define THREAD_JOIN(th_,ret_) pthread_join(th_,ret_) - - /* structure describing a barrier */ - typedef struct thread_barrier_tag { - pthread_mutex_t mutex; /* Control access to barrier */ - pthread_cond_t cv; /* wait for barrier */ - int valid; /* set when valid */ - int threshold; /* number of threads required */ - int counter; /* current number of threads */ - int cycle; /* alternate wait cycles (0 or 1) */ - } thread_barrier_t; - -# define BARRIER_VALID 0xdbcafe - - /* support static initialization of barriers */ -# define BARRIER_INITIALIZER(cnt) {\ - PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER,\ - BARRIER_VALID, cnt, cnt, 0} - -# else -# error ONLY PTHREADS SUPPORT HAS BEEN IMPLEMENTED -# endif - -# define TH2PROC(th_) (th_/mt_tpp) /* computes processor from thread id */ - - /* barrier functions */ - int thread_barrier_init (thread_barrier_t *barrier, int count); - int thread_barrier_destroy (thread_barrier_t *barrier); - int thread_barrier_wait (thread_barrier_t *barrier); - - /* multi-threaded memory functions */ - int armci_malloc_mt(void *ptr[], int bytes); - int armci_free_mt(void *ptr, int th_idx); -# define ARMCI_MALLOC_MT armci_malloc_mt -# define ARMCI_FREE_MT armci_free_mt - - -# define TH_INIT(p_,t_) mt_size=p_;mt_tpp=t_;\ - thread_barrier_init(&mt_barrier,mt_tpp) -# define TH_FINALIZE() thread_barrier_destroy(&mt_barrier) -# define MT_BARRIER() if (thread_barrier_wait(&mt_barrier)==-1) armci_msg_barrier();\ - thread_barrier_wait(&mt_barrier) - - extern int mt_size; - extern int mt_tpp; - extern thread_barrier_t mt_barrier; -#else -# define THREAD_LOCK_INIT(x) -# define THREAD_LOCK_DESTROY(x) -# define THREAD_LOCK(x) -# define THREAD_UNLOCK(x) -# define TH_INIT(p_,t_) -# define TH_FINALIZE() -# define MT_BARRIER armci_msg_barrier -# define ARMCI_MALLOC_MT PARMCI_Malloc -# define ARMCI_FREE_MT(p_,th_) PARMCI_Free(p_) -#endif - - - - - - -#endif/*UTILS_H*/ - - diff --git a/armci/src-gemini/vector.c b/armci/src-gemini/vector.c deleted file mode 100644 index 052d97363..000000000 --- a/armci/src-gemini/vector.c +++ /dev/null @@ -1,605 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: vector.c,v 1.32.6.4 2007-08-29 17:32:32 manoj Exp $ */ -#include "armcip.h" -#include "copy.h" -#include "acc.h" -#include "memlock.h" -#include -#include - -#define SERVER_GET 1 -#define SERVER_NBGET 2 -#define DIRECT_GET 3 -#define DIRECT_NBGET 4 -#define SERVER_PUT 5 -#define SERVER_NBPUT 6 -#define DIRECT_PUT 7 -#define DIRECT_NBPUT 8 - - -# define DO_FENCE(__proc,__prot) if(__prot==SERVER_GET);\ - else if(__prot==SERVER_PUT);\ - else if(__prot==DIRECT_GET || __prot==DIRECT_NBGET){\ - if(armci_prot_switch_fence[__proc]==SERVER_PUT)\ - ARMCI_DoFence(__proc);\ - }\ - else if(__prot==DIRECT_PUT || __prot==DIRECT_NBPUT){\ - if(armci_prot_switch_fence[__proc]==SERVER_PUT)\ - ARMCI_DoFence(__proc);\ - }\ - else;\ - armci_prot_switch_fence[__proc]=__prot - -/* defined in acc.h so don't redefine here -#ifndef ARMCI_COMPLEX_TYPES -typedef struct { - float real; - float imag; -} complex_t; - -typedef struct { - double real; - double imag; -} dcomplex_t; -#endif -*/ - -/* -void I_ACCUMULATE(void* scale, int elems, void*src, void* dst) -{ - int j; - int *a=(int*)dst, *b=(int*)src; - int alpha = *(int*)scale; - - for(j=0;j BUFSIZE/2){ - /* for large segments use strided implementation */ - for(j=0; j< dr.ptr_array_len; j++){ - rc = armci_acc_copy_strided(op, scale,proc, - dr.src_ptr_array[j], NULL, dr.dst_ptr_array[j],NULL, - &dr.bytes, 0); - if(rc)return(rc); - } - }else{ - armci_giov_t dl; - /*lock memory:should optimize it to lock only a chunk at a time*/ - armci_lockmem_scatter(dr.dst_ptr_array, dr.ptr_array_len, dr.bytes, proc); - /* copy as many blocks as possible into the local buffer */ - dl.bytes = dr.bytes; - nb = ARMCI_MIN(PWORKLEN,BUFSIZE/dr.bytes); - for(j=0; j< dr.ptr_array_len; j+= nb){ - int nblocks = ARMCI_MIN(nb, dr.ptr_array_len -j); - int k; - /* setup vector descriptor for remote memory copy - to bring data into buffer*/ - dl.ptr_array_len = nblocks; - dl.src_ptr_array = dr.dst_ptr_array + j; /* GET destination becomes source for copy */ - for(k=0; k< nblocks; k++) pwork[k] = k*dl.bytes + (char*)armci_internal_buffer; - dl.dst_ptr_array = pwork; - /* get data to the local buffer */ - rc = armci_copy_vector(GET, &dl, 1, proc); - if(rc){ ARMCI_UNLOCKMEM(proc); return(rc);} - /* update source array for accumulate */ - dl.src_ptr_array = dr.src_ptr_array +j; - /* do scatter accumulate updating copy of data in buffer */ - armci_scatter_acc(op, scale, dl, armci_me, 0); - /* modify descriptor-now source becomes destination for PUT*/ - dl.dst_ptr_array = dr.dst_ptr_array + j; - dl.src_ptr_array = pwork; - /* put data back */ - rc = armci_copy_vector(PUT, &dl, 1, proc); - FENCE_NODE(proc); - if(rc){ ARMCI_UNLOCKMEM(proc); return(rc);} - } - ARMCI_UNLOCKMEM(proc); - } - }/*endfor*/ - } -#endif - - return 0; -} - - - - -int armci_copy_vector(int op, /* operation code */ - armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc /* remote process(or) ID */ - ) -{ - int i,s,shmem= SAMECLUSNODE(proc); - int armci_th_idx = ARMCI_THREAD_IDX; - - if(shmem){ - /* local/shared memory copy */ - for(i = 0; i< len; i++){ - for( s=0; s< darr[i].ptr_array_len; s++){ - armci_copy(darr[i].src_ptr_array[s],darr[i].dst_ptr_array[s],darr[i].bytes); - } - } - - }else { - switch(op){ - case PUT: - - for(i = 0; i< len; i++){ - - UPDATE_FENCE_STATE(proc, PUT, darr[i].ptr_array_len); - - for( s=0; s< darr[i].ptr_array_len; s++){ - armci_put(darr[i].src_ptr_array[s],darr[i].dst_ptr_array[s], - darr[i].bytes, proc); - } - } - break; - case GET: - for(i = 0; i< len; i++){ - for( s=0; s< darr[i].ptr_array_len; s++){ - armci_get(darr[i].src_ptr_array[s],darr[i].dst_ptr_array[s], - darr[i].bytes,proc); - } - } - break; - default: - armci_die("armci_copy_vector: wrong optype",op); - } - } - - return 0; -} - - -void armci_vector_to_buf(armci_giov_t darr[], int len, void* buf) -{ -int i,s; -char *ptr = (char*)buf; - for(i = 0; i< len; i++){ - for( s=0; s< darr[i].ptr_array_len; s++){ - armci_copy(darr[i].src_ptr_array[s],ptr,darr[i].bytes); - ptr += darr[i].bytes; - } - } -} - - -void armci_vector_from_buf(armci_giov_t darr[], int len, void* buf) -{ -int i,s; -char *ptr = (char*)buf; - - for(i = 0; i< len; i++){ - for( s=0; s< darr[i].ptr_array_len; s++){ - armci_copy(ptr, darr[i].dst_ptr_array[s],darr[i].bytes); - ptr += darr[i].bytes; - } - } -} - -int PARMCI_PutV( armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc /* remote process(or) ID */ - ) -{ - int rc=0, i,direct=1; - if(len<1) return FAIL; - for(i=0;i= armci_nproc)return FAIL5; - - ORDER(PUT,proc); /* ensure ordering */ - direct=SAMECLUSNODE(proc); - - if(direct){ - if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_PUT); - rc = armci_copy_vector(PUT, darr, len, proc); - } - else{ - DO_FENCE(proc,SERVER_PUT); - rc = armci_pack_vector(PUT, NULL, darr, len, proc,NULL); - } - - if(rc) return FAIL6; - else return 0; - -} - - -int PARMCI_GetV( armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc /* remote process(or) ID */ - ) -{ - int rc=0, i,direct=1; - - if(len<1) return FAIL; - for(i=0;i= armci_nproc)return FAIL5; - - ORDER(GET,proc); /* ensure ordering */ -#ifndef QUADRICS - direct=SAMECLUSNODE(proc); -#endif - - if(direct){ - if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_GET); - rc = armci_copy_vector(GET, darr, len, proc); - } - else{ - DO_FENCE(proc,SERVER_GET); - rc = armci_pack_vector(GET, NULL, darr, len, proc,NULL); - } - - if(rc) return FAIL6; - else return 0; -} - - - - -int PARMCI_AccV( int op, /* oeration code */ - void *scale, /*scaling factor for accumulate */ - armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc /* remote process(or) ID */ - ) -{ - int rc=0, i,direct=0; - - if(len<1) return FAIL; - for(i=0;i= armci_nproc)return FAIL5; - - ORDER(op,proc); /* ensure ordering */ - direct=SAMECLUSNODE(proc); -# if defined(ACC_COPY) && !defined(ACC_SMP) - if(armci_me != proc) direct=0; -# error "grrr" -# endif - if(direct) { - rc = armci_acc_vector( op, scale, darr, len, proc); - } else { - DO_FENCE(proc,SERVER_PUT); - rc = armci_pack_vector(op, scale, darr, len, proc,NULL); - } - - if(rc) return FAIL6; - else return 0; -} - - -/*****************************************************************************/ - -/*\ Non-blocking vector API -\*/ -int PARMCI_NbPutV( armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc, /* remote process(or) ID */ - armci_hdl_t* usr_hdl /*non-blocking request handle*/ - ) -{ - armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl; - int rc=0, i,direct=1; - - if(len<1) return FAIL; - for(i=0;i= armci_nproc)return FAIL5; - - direct=SAMECLUSNODE(proc); - /* aggregate put */ - if(nb_handle && nb_handle->agg_flag == SET) { - if(!direct) { - rc=armci_agg_save_giov_descriptor(darr, len, proc, PUT, nb_handle); - return rc; - } - } - else { - - /*ORDER(PUT,proc); ensure ordering */ - UPDATE_FENCE_INFO(proc); - - /*set tag and op in the nb handle*/ - if(nb_handle){ - nb_handle->tag = GET_NEXT_NBTAG(); - nb_handle->op = PUT; - nb_handle->proc= proc; - nb_handle->bufid=NB_NONE; - } - else - nb_handle = armci_set_implicit_handle(PUT, proc); - } - - if(direct){ - if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_PUT); - rc = armci_copy_vector(PUT, darr, len, proc); - } - else{ - DO_FENCE(proc,SERVER_NBPUT); - rc = armci_pack_vector(PUT, NULL, darr, len, proc,nb_handle); - } - - if(rc) return FAIL6; - else return 0; -} - -int PARMCI_NbGetV( armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc, /* remote process(or) ID */ - armci_hdl_t* usr_hdl /*non-blocking request handle*/ - ) -{ - armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl; - int rc=0, i,direct=1; - - if(len<1) return FAIL; - for(i=0;i= armci_nproc)return FAIL5; - - direct=SAMECLUSNODE(proc); - - /* aggregate get */ - if(nb_handle && nb_handle->agg_flag == SET) { - if(!direct) { - rc=armci_agg_save_giov_descriptor(darr, len, proc, GET, nb_handle); - return rc; - } - } - else { - /* ORDER(GET,proc); ensure ordering */ - if(nb_handle){ - nb_handle->tag = GET_NEXT_NBTAG(); - nb_handle->op = GET; - nb_handle->proc= proc; - nb_handle->bufid=NB_NONE; - } - else - nb_handle = armci_set_implicit_handle(GET, proc); - } - - if(direct){ - if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_GET); - rc = armci_copy_vector(GET, darr, len, proc); - } - else{ - DO_FENCE(proc,SERVER_NBGET); - rc = armci_pack_vector(GET, NULL, darr, len, proc,nb_handle); - } - - if(rc) return FAIL6; - else return 0; -} - - -int PARMCI_NbAccV( int op, /* oeration code */ - void *scale, /*scaling factor for accumulate */ - armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc, /* remote process(or) ID */ - armci_hdl_t* usr_hdl /*non-blocking request handle*/ - ) -{ - armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl; - int rc=0, i,direct=1; - - if(len<1) return FAIL; - for(i=0;i= armci_nproc)return FAIL5; - - UPDATE_FENCE_INFO(proc); - direct=SAMECLUSNODE(proc); - - if(nb_handle){ - nb_handle->tag = GET_NEXT_NBTAG(); - nb_handle->op = op; - nb_handle->proc= proc; - nb_handle->bufid=NB_NONE; - } - else - nb_handle = armci_set_implicit_handle(op, proc); - -# if defined(ACC_COPY) && !defined(ACC_SMP) - if(armci_me != proc) direct=0; -# endif - - if(direct) - rc = armci_acc_vector( op, scale, darr, len, proc); - else{ - DO_FENCE(proc,SERVER_NBPUT); - rc = armci_pack_vector(op, scale, darr, len, proc,nb_handle); - } - - if(rc) return FAIL6; - else return 0; -} -/*****************************************************************************/ diff --git a/armci/src-portals/Makefile.inc b/armci/src-portals/Makefile.inc deleted file mode 100644 index 244492e7a..000000000 --- a/armci/src-portals/Makefile.inc +++ /dev/null @@ -1,67 +0,0 @@ -libarmci_la_SOURCES += src-portals/acc.h -libarmci_la_SOURCES += src-portals/aggregate.c -libarmci_la_SOURCES += src-portals/armci.c -libarmci_la_SOURCES += src-portals/armcip.h -libarmci_la_SOURCES += src-portals/armci_portals.c -libarmci_la_SOURCES += src-portals/armci_portals.h -libarmci_la_SOURCES += src-portals/atomics-i386.h -libarmci_la_SOURCES += src-portals/buffers.c -libarmci_la_SOURCES += src-portals/caccumulate.c -libarmci_la_SOURCES += src-portals/ccopy.c -libarmci_la_SOURCES += src-portals/clusterinfo.c -libarmci_la_SOURCES += src-portals/copy.h -libarmci_la_SOURCES += src-portals/ds-shared.c -libarmci_la_SOURCES += src-portals/fence.c -libarmci_la_SOURCES += src-portals/kr_malloc.c -libarmci_la_SOURCES += src-portals/kr_malloc.h -libarmci_la_SOURCES += src-portals/locks.c -libarmci_la_SOURCES += src-portals/locks.h -libarmci_la_SOURCES += src-portals/memlock.c -libarmci_la_SOURCES += src-portals/memlock.h -libarmci_la_SOURCES += src-portals/memory.c -libarmci_la_SOURCES += src-portals/message.c -libarmci_la_SOURCES += src-portals/mutex.c -libarmci_la_SOURCES += src-portals/pack.c -libarmci_la_SOURCES += src-portals/pendbufs.h -libarmci_la_SOURCES += src-portals/portals_cp.c -libarmci_la_SOURCES += src-portals/portals_ds.c -libarmci_la_SOURCES += src-portals/portals.c -libarmci_la_SOURCES += src-portals/request.c -libarmci_la_SOURCES += src-portals/request.h -libarmci_la_SOURCES += src-portals/rmw.c -libarmci_la_SOURCES += src-portals/rtinfo.c -libarmci_la_SOURCES += src-portals/semaphores.c -libarmci_la_SOURCES += src-portals/semaphores.h -libarmci_la_SOURCES += src-portals/shmalloc.h -libarmci_la_SOURCES += src-portals/shmem.c -libarmci_la_SOURCES += src-portals/armci_shmem.h -libarmci_la_SOURCES += src-portals/shmlimit.c -libarmci_la_SOURCES += src-portals/shmlimit.h -libarmci_la_SOURCES += src-portals/signaltrap.c -libarmci_la_SOURCES += src-portals/signaltrap.h -libarmci_la_SOURCES += src-portals/sockets.h -libarmci_la_SOURCES += src-portals/spawn.c -libarmci_la_SOURCES += src-portals/spinlock.h -libarmci_la_SOURCES += src-portals/strided.c -libarmci_la_SOURCES += src-portals/utils.h -libarmci_la_SOURCES += src-portals/vector.c -if PORTALS_ENABLE_NEW_MALLOC -libarmci_la_SOURCES += src-portals/new_memory.c -endif -if MSG_COMMS_MPI -libarmci_la_SOURCES += src-portals/groups.c -endif -if ARMCI_ENABLE_GPC_CALLS -libarmci_la_SOURCES += src-portals/gpc.c -endif -if THREAD_SAFE -libarmci_la_SOURCES += src-portals/threads.c -libarmci_la_SOURCES += src-portals/utils.c -endif - -include_HEADERS += src-portals/armci.h -include_HEADERS += src-portals/gpc.h -include_HEADERS += src-portals/message.h - -AM_CPPFLAGS += -I$(top_srcdir)/src-portals -AM_CPPFLAGS += -I$(top_srcdir)/src/include diff --git a/armci/src-portals/acc.c b/armci/src-portals/acc.c deleted file mode 100644 index b86678939..000000000 --- a/armci/src-portals/acc.c +++ /dev/null @@ -1,174 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: acc.c,v 1.13 2006-09-13 23:43:36 andriy Exp $ */ - -#if defined(__crayx1) -#define MAYBE_RESTRICT restrict -#else -#define MAYBE_RESTRICT -#endif - -void L_ACCUMULATE_2D(long* MAYBE_RESTRICT alpha, - int* MAYBE_RESTRICT rows, int* MAYBE_RESTRICT cols, - long* MAYBE_RESTRICT a, int* MAYBE_RESTRICT lda, - long* MAYBE_RESTRICT b, int* MAYBE_RESTRICT ldb) -{ -int i,j; - -#ifdef __crayx1 -#pragma _CRI concurrent -#endif - - for(j=0;j< *cols; j++){ - long * MAYBE_RESTRICT aa = a + j* *lda; - long * MAYBE_RESTRICT bb = b + j* *ldb; - for(i=0;i< *rows; i++) - aa[i] += *alpha * bb[i]; - } -} - -void L_ACCUMULATE_1D(long * MAYBE_RESTRICT alpha, - long * MAYBE_RESTRICT a, long * MAYBE_RESTRICT b, - int * MAYBE_RESTRICT rows) -{ -int i; - for(i=0;i< *rows; i++) - a[i] += *alpha * b[i]; -} - - -#if defined(CRAY_T3E) || defined(CATAMOUNT) -void F_ACCUMULATE_2D_(float* alpha, int* rows, int* cols, float* a, - int* lda, float* b, int* ldb) -{ -int i,j; - for(j=0;j< *cols; j++){ - float *aa = a + j* *lda; - float *bb = b + j* *ldb; - for(i=0;i< *rows; i++) - aa[i] += *alpha * bb[i]; - } -} -#endif - -void RA_ACCUMULATE_2D_(long* alpha, int* rows, int* cols, long* a, - int* lda, long* b, int* ldb) -{ -int i,j; - for(j=0;j< *cols; j++){ - long *aa = a + j* *lda; - long *bb = b + j* *ldb; - for(i=0;i< *rows; i++) - aa[i] ^= bb[i]; - } -} - -#if NOFORT - -typedef struct { - float imag; - float real; -} cmpl_t; - -typedef struct { - double imag; - double real; -} dcmpl_t; - -void I_ACCUMULATE_2D(int* alpha, int* rows, int* cols, int* a, - int* lda, int* b, int* ldb) -{ -int i,j; - - for(j=0;j< *cols; j++){ - int *aa = a + j* *lda; - int *bb = b + j* *ldb; - for(i=0;i< *rows; i++) - aa[i] += *alpha * bb[i]; - } -} - -#if !defined(CRAY_T3E) && !defined(CATAMOUNT) -void F_ACCUMULATE_2D(float* alpha, int* rows, int* cols, float* a, - int* lda, float* b, int* ldb) -{ -int i,j; - for(j=0;j< *cols; j++){ - float *aa = a + j* *lda; - float *bb = b + j* *ldb; - for(i=0;i< *rows; i++) - aa[i] += *alpha * bb[i]; - } -} -#endif - -void D_ACCUMULATE_2D(double* alpha, int* rows, int* cols, double* a, - int* lda, double* b, int* ldb) -{ -int i,j; - - for(j=0;j< *cols; j++){ - double *aa = a + j* *lda; - double *bb = b + j* *ldb; - for(i=0;i< *rows; i++) - aa[i] += *alpha * bb[i]; - } -} - - -void C_ACCUMULATE_2D(cmpl_t* alpha, int* rows, int* cols, cmpl_t* a, - int* lda, cmpl_t* b, int* ldb) -{ -int i,j; - - for(j=0;j< *cols; j++){ - cmpl_t *aa = a + j* *lda; - cmpl_t *bb = b + j* *ldb; - for(i=0;i< *rows; i++){ - aa[i].real += alpha->real * bb[i].real - alpha->imag * bb[i].imag; - aa[i].imag += alpha->imag * bb[i].real + alpha->real * bb[i].imag; - } - } -} - - -void Z_ACCUMULATE_2D(dcmpl_t* alpha, int* rows, int* cols, dcmpl_t* a, - int* lda, dcmpl_t* b, int* ldb) -{ -int i,j; - - - for(j=0;j< *cols; j++){ - dcmpl_t *aa = a + j* *lda; - dcmpl_t *bb = b + j* *ldb; - for(i=0;i< *rows; i++){ - aa[i].real += alpha->real * bb[i].real - alpha->imag * bb[i].imag; - aa[i].imag += alpha->imag * bb[i].real + alpha->real * bb[i].imag; - } - } -} - -void FORT_DADD(int *n, double *x, double *work){ -int i; - for(i=0;i<*n;i++) - x[i] = x[i] + work[i]; -} -void FORT_DADD2(int *n, double *x, double *work, double *work2){ -int i; - for(i=0;i<*n;i++) - x[i] = work[i] + work2[i]; -} -void FORT_DMULT(int *n, double *x, double *work){ -int i; - for(i=0;i<*n;i++) - x[i] = x[i]*work[i]; -} -void FORT_DMULT2(int *n, double *x, double *work, double *work2){ -int i; - for(i=0;i<*n;i++) - x[i] = work[i]*work2[i]; -} - -#endif diff --git a/armci/src-portals/acc.h b/armci/src-portals/acc.h deleted file mode 100644 index 3e58396f2..000000000 --- a/armci/src-portals/acc.h +++ /dev/null @@ -1,406 +0,0 @@ -#ifndef _ACC_H_ -#define _ACC_H_ - -typedef struct { - float real; - float imag; -} complex_t; - -typedef struct { - double real; - double imag; -} dcomplex_t; - -void c_d_accumulate_1d_(const double* const restrict alpha, - double* const restrict A, - const double* const restrict B, - const int* const restrict rows); -void c_f_accumulate_1d_(const float* const restrict alpha, - float* const restrict A, - const float* const restrict B, - const int* const restrict rows); -void c_c_accumulate_1d_(const complex_t* const restrict alpha, - complex_t* const restrict A, - const complex_t* const restrict B, - const int* const restrict rows); -void c_z_accumulate_1d_(const dcomplex_t* const restrict alpha, - dcomplex_t* const restrict A, - const dcomplex_t* const restrict B, - const int* const restrict rows); -void c_i_accumulate_1d_(const int* const restrict alpha, - int* const restrict A, - const int* const restrict B, - const int* const restrict rows); -void c_l_accumulate_1d_(const long* const restrict alpha, - long* const restrict A, - const long* const restrict B, - const int* const restrict rows); -void c_ll_accumulate_1d_(const long long* const restrict alpha, - long long* const restrict A, - const long long* const restrict B, - const int* const restrict rows); - -void c_d_accumulate_2d_(const double* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - double* const restrict A, - const int* const restrict ald, - const double* const restrict B, - const int* const restrict bld); -void c_f_accumulate_2d_(const float* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - float* const restrict A, - const int* const restrict ald, - const float* const restrict B, - const int* const restrict bld); -void c_c_accumulate_2d_(const complex_t* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - complex_t* const restrict A, - const int* const restrict ald, - const complex_t* const restrict B, - const int* const restrict bld); -void c_z_accumulate_2d_(const dcomplex_t* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - dcomplex_t* const restrict A, - const int* const restrict ald, - const dcomplex_t* const restrict B, - const int* const restrict bld); -void c_i_accumulate_2d_(const int* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - int* const restrict A, - const int* const restrict ald, - const int* const restrict B, - const int* const restrict bld); -void c_l_accumulate_2d_(const long* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - long* const restrict A, - const int* const restrict ald, - const long* const restrict B, - const int* const restrict bld); -void c_ll_accumulate_2d_(const long long* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - long long* const restrict A, - const int* const restrict ald, - const long long* const restrict B, - const int* const restrict bld); - -void c_d_accumulate_2d_u_(const double* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - double* const restrict A, - const int* const restrict ald, - const double* const restrict B, - const int* const restrict bld); -void c_f_accumulate_2d_u_(const float* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - float* const restrict A, - const int* const restrict ald, - const float* const restrict B, - const int* const restrict bld); -void c_c_accumulate_2d_u_(const complex_t* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - complex_t* const restrict A, - const int* const restrict ald, - const complex_t* const restrict B, - const int* const restrict bld); -void c_z_accumulate_2d_u_(const dcomplex_t* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - dcomplex_t* const restrict A, - const int* const restrict ald, - const dcomplex_t* const restrict B, - const int* const restrict bld); -void c_i_accumulate_2d_u_(const int* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - int* const restrict A, - const int* const restrict ald, - const int* const restrict B, - const int* const restrict bld); -void c_l_accumulate_2d_u_(const long* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - long* const restrict A, - const int* const restrict ald, - const long* const restrict B, - const int* const restrict bld); -void c_ll_accumulate_2d_u_(const long long* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - long long* const restrict A, - const int* const restrict ald, - const long long* const restrict B, - const int* const restrict bld); - -void c_dadd_(const int* const restrict n, - double* const restrict x, - const double* const restrict work); -void c_dadd2_(const int* const restrict n, - double* const restrict x, - const double* const restrict work, - const double* const restrict work2); -void c_dmult_(const int* const restrict n, - double* const restrict x, - const double* const restrict work); -void c_dmult2_(const int* const restrict n, - double* const restrict x, - const double* const restrict work, - const double* const restrict work2); - -#if ENABLE_F77 -# ifdef WIN32 -# define ATR __stdcall -# else -# define ATR -# endif -# define i_accumulate_1d_ F77_FUNC_(i_accumulate_1d,I_ACCUMULATE_2D) -# define l_accumulate_1d_ c_l_accumulate_1d_ -# define ll_accumulate_1d_ c_ll_accumulate_1d_ -# define f_accumulate_1d_ F77_FUNC_(f_accumulate_1d,F_ACCUMULATE_2D) -# define d_accumulate_1d_ F77_FUNC_(d_accumulate_1d,D_ACCUMULATE_2D) -# define c_accumulate_1d_ F77_FUNC_(c_accumulate_1d,C_ACCUMULATE_2D) -# define z_accumulate_1d_ F77_FUNC_(z_accumulate_1d,Z_ACCUMULATE_2D) -# define i_accumulate_2d_ F77_FUNC_(i_accumulate_2d,I_ACCUMULATE_2D) -# define l_accumulate_2d_ c_l_accumulate_2d_ -# define ll_accumulate_2d_ c_ll_accumulate_2d_ -# define f_accumulate_2d_ F77_FUNC_(f_accumulate_2d,F_ACCUMULATE_2D) -# define d_accumulate_2d_ F77_FUNC_(d_accumulate_2d,D_ACCUMULATE_2D) -# define c_accumulate_2d_ F77_FUNC_(c_accumulate_2d,C_ACCUMULATE_2D) -# define z_accumulate_2d_ F77_FUNC_(z_accumulate_2d,Z_ACCUMULATE_2D) -# define i_accumulate_2d_u_ F77_FUNC_(i_accumulate_2d_u,I_ACCUMULATE_2D_U) -# define l_accumulate_2d_u_ c_l_accumulate_2d_u_ -# define ll_accumulate_2d_u_ c_ll_accumulate_2d_u_ -# define f_accumulate_2d_u_ F77_FUNC_(f_accumulate_2d_u,F_ACCUMULATE_2D_U) -# define d_accumulate_2d_u_ F77_FUNC_(d_accumulate_2d_u,D_ACCUMULATE_2D_U) -# define c_accumulate_2d_u_ F77_FUNC_(c_accumulate_2d_u,C_ACCUMULATE_2D_U) -# define z_accumulate_2d_u_ F77_FUNC_(z_accumulate_2d_u,Z_ACCUMULATE_2D_U) -# define fort_dadd_ F77_FUNC_(fort_dadd,FORT_DADD) -# define fort_dadd2_ F77_FUNC_(fort_dadd2,FORT_DADD2) -# define fort_dmult_ F77_FUNC_(fort_dmult,FORT_DMULT) -# define fort_dmult2_ F77_FUNC_(fort_dmult2,FORT_DMULT2) -void ATR d_accumulate_1d_(const double* const restrict alpha, - double* const restrict A, - const double* const restrict B, - const int* const restrict rows); -void ATR f_accumulate_1d_(const float* const restrict alpha, - float* const restrict A, - const float* const restrict B, - const int* const restrict rows); -void ATR c_accumulate_1d_(const complex_t* const restrict alpha, - complex_t* const restrict A, - const complex_t* const restrict B, - const int* const restrict rows); -void ATR z_accumulate_1d_(const dcomplex_t* const restrict alpha, - dcomplex_t* const restrict A, - const dcomplex_t* const restrict B, - const int* const restrict rows); -void ATR i_accumulate_1d_(const int* const restrict alpha, - int* const restrict A, - const int* const restrict B, - const int* const restrict rows); -void ATR l_accumulate_1d_(const long* const restrict alpha, - long* const restrict A, - const long* const restrict B, - const int* const restrict rows); -void ATR ll_accumulate_1d_(const long long* const restrict alpha, - long long* const restrict A, - const long long* const restrict B, - const int* const restrict rows); - -void ATR d_accumulate_2d_(const double* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - double* const restrict A, - const int* const restrict ald, - const double* const restrict B, - const int* const restrict bld); -void ATR f_accumulate_2d_(const float* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - float* const restrict A, - const int* const restrict ald, - const float* const restrict B, - const int* const restrict bld); -void ATR c_accumulate_2d_(const complex_t* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - complex_t* const restrict A, - const int* const restrict ald, - const complex_t* const restrict B, - const int* const restrict bld); -void ATR z_accumulate_2d_(const dcomplex_t* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - dcomplex_t* const restrict A, - const int* const restrict ald, - const dcomplex_t* const restrict B, - const int* const restrict bld); -void ATR i_accumulate_2d_(const int* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - int* const restrict A, - const int* const restrict ald, - const int* const restrict B, - const int* const restrict bld); -void ATR l_accumulate_2d_(const long* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - long* const restrict A, - const int* const restrict ald, - const long* const restrict B, - const int* const restrict bld); -void ATR ll_accumulate_2d_(const long long* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - long long* const restrict A, - const int* const restrict ald, - const long long* const restrict B, - const int* const restrict bld); - -void ATR d_accumulate_2d_u_(const double* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - double* const restrict A, - const int* const restrict ald, - const double* const restrict B, - const int* const restrict bld); -void ATR f_accumulate_2d_u_(const float* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - float* const restrict A, - const int* const restrict ald, - const float* const restrict B, - const int* const restrict bld); -void ATR c_accumulate_2d_u_(const complex_t* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - complex_t* const restrict A, - const int* const restrict ald, - const complex_t* const restrict B, - const int* const restrict bld); -void ATR z_accumulate_2d_u_(const dcomplex_t* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - dcomplex_t* const restrict A, - const int* const restrict ald, - const dcomplex_t* const restrict B, - const int* const restrict bld); -void ATR i_accumulate_2d_u_(const int* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - int* const restrict A, - const int* const restrict ald, - const int* const restrict B, - const int* const restrict bld); -void ATR l_accumulate_2d_u_(const long* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - long* const restrict A, - const int* const restrict ald, - const long* const restrict B, - const int* const restrict bld); -void ATR ll_accumulate_2d_u_(const long long* const restrict alpha, - const int* const restrict rows, - const int* const restrict cols, - long long* const restrict A, - const int* const restrict ald, - const long long* const restrict B, - const int* const restrict bld); - -void ATR fort_dadd_(const int* const restrict n, - double* const restrict x, - const double* const restrict work); -void ATR fort_dadd2_(const int* const restrict n, - double* const restrict x, - const double* const restrict work, - const double* const restrict work2); -void ATR fort_dmult_(const int* const restrict n, - double* const restrict x, - const double* const restrict work); -void ATR fort_dmult2_(const int* const restrict n, - double* const restrict x, - const double* const restrict work, - const double* const restrict work2); -#endif - -#if NOFORT -# define I_ACCUMULATE_1D c_i_accumulate_1d_ -# define L_ACCUMULATE_1D c_l_accumulate_1d_ -# define LL_ACCUMULATE_1D c_ll_accumulate_1d_ -# define D_ACCUMULATE_1D c_d_accumulate_1d_ -# define C_ACCUMULATE_1D c_c_accumulate_1d_ -# define Z_ACCUMULATE_1D c_z_accumulate_1d_ -# define F_ACCUMULATE_1D c_f_accumulate_1d_ -# define I_ACCUMULATE_2D c_i_accumulate_2d_ -# define L_ACCUMULATE_2D c_l_accumulate_2d_ -# define LL_ACCUMULATE_2D c_ll_accumulate_2d_ -# define D_ACCUMULATE_2D c_d_accumulate_2d_ -# define C_ACCUMULATE_2D c_c_accumulate_2d_ -# define Z_ACCUMULATE_2D c_z_accumulate_2d_ -# define F_ACCUMULATE_2D c_f_accumulate_2d_ -# define FORT_DADD c_dadd_ -# define FORT_DADD2 c_dadd2_ -# define FORT_DMULT c_dmult_ -# define FORT_DMULT2 c_dmult2_ -#else -# if defined(AIX) || defined(BGML) || defined(SGI_) -# define I_ACCUMULATE_2D i_accumulate_2d_u_ -# define L_ACCUMULATE_2D c_l_accumulate_2d_u_ -# define LL_ACCUMULATE_2D c_ll_accumulate_2d_u_ -# define D_ACCUMULATE_2D d_accumulate_2d_u_ -# define C_ACCUMULATE_2D c_accumulate_2d_u_ -# define Z_ACCUMULATE_2D z_accumulate_2d_u_ -# define F_ACCUMULATE_2D f_accumulate_2d_u_ -# else -# define I_ACCUMULATE_2D i_accumulate_2d_ -# define L_ACCUMULATE_2D c_l_accumulate_2d_ -# define LL_ACCUMULATE_2D c_ll_accumulate_2d_ -# define D_ACCUMULATE_2D d_accumulate_2d_ -# define C_ACCUMULATE_2D c_accumulate_2d_ -# define Z_ACCUMULATE_2D z_accumulate_2d_ -# define F_ACCUMULATE_2D f_accumulate_2d_ -# endif -# if defined(CRAY) && !defined(__crayx1) -# undef D_ACCUMULATE_2D -# define D_ACCUMULATE_2D F77_FUNC_(daxpy_2d,DAXPY_2D) -# endif -# define I_ACCUMULATE_1D i_accumulate_1d_ -# define L_ACCUMULATE_1D c_l_accumulate_1d_ -# define LL_ACCUMULATE_1D c_ll_accumulate_1d_ -# define D_ACCUMULATE_1D d_accumulate_1d_ -# define C_ACCUMULATE_1D c_accumulate_1d_ -# define Z_ACCUMULATE_1D z_accumulate_1d_ -# define F_ACCUMULATE_1D f_accumulate_1d_ -# define FORT_DADD fort_dadd_ -# define FORT_DADD2 fort_dadd2_ -# define FORT_DMULT fort_dmult_ -# define FORT_DMULT2 fort_dmult2_ -#endif /* !NOFORT */ - -// specific to src-gemini -#if defined(AIX) || defined(NOUNDERSCORE) -# define RA_ACCUMULATE_2D ra_accumulate_2d_u -#elif defined(BGML) -# define RA_ACCUMULATE_2D ra_accumulate_2d_u__ -#elif defined(SGI_) -# define RA_ACCUMULATE_2D RA_ACCUMULATE_2D_ -#elif !defined(CRAY) && !defined(WIN32) && !defined(HITACHI) ||defined(__crayx1) -# define RA_ACCUMULATE_2D RA_ACCUMULATE_2D_ -#endif - -#ifndef CRAY_T3E -void ATR RA_ACCUMULATE_2D(long*, int*, int*, long*, int*, long*, int*); -#else -#define RA_ACCUMULATE_2D RA_ACCUMULATE_2D_ -void RA_ACCUMULATE_2D_(long*, int*, int*, long*, int*, long*, int*); -#endif - -#endif /* _ACC_H_ */ diff --git a/armci/src-portals/aggregate.c b/armci/src-portals/aggregate.c deleted file mode 100644 index f945d75af..000000000 --- a/armci/src-portals/aggregate.c +++ /dev/null @@ -1,351 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/** $Id: aggregate.c,v 1.6 2003-10-22 22:12:14 d3h325 Exp $ - * Aggregate Put/Get requests - */ - -#include "armcip.h" -#include /* memcpy */ -#include - -#define _MAX_AGG_BUFFERS 32 /* Maximum # of aggregation buffers available*/ -#define _MAX_AGG_BUFSIZE 2048 /* size of each buffer. should be < 2^15 */ -#define _MAX_PTRS 256 /* < 2^15, as it is "short int" in agg_req_t */ -#define _MAX_AGG_HANDLE _MAX_AGG_BUFFERS /* Max # of aggregation handles */ - -/* aggregate request handle */ -typedef struct { - unsigned int tag; /* non-blocking request tag */ - short int proc; /* remote process id */ - short int request_len ; /* number of requests */ - short int ptr_array_len; /* pointer length for this request */ - short int buf_pos_end; /* position of buffer (from right end) */ - armci_giov_t *darr; /* giov vectors */ -}agg_req_t; -static agg_req_t *aggr[_MAX_AGG_HANDLE]; /* aggregate request handle */ - - -/* data structure for dynamic buffer management */ -typedef struct { - int size; /* represents the size of the list (not linked list) */ - int index[_MAX_AGG_HANDLE]; -} agg_list_t; -static agg_list_t ulist, alist;/*in-use & available aggr buffer index list*/ - - -/* aggregation buffer */ -static char agg_buf[_MAX_AGG_BUFFERS][_MAX_AGG_BUFSIZE]; -/* aggregation buffer to store the pointers */ -static void* agg_src_ptr[_MAX_AGG_BUFFERS][_MAX_PTRS]; -static void* agg_dst_ptr[_MAX_AGG_BUFFERS][_MAX_PTRS]; - -/** - * --------------------------------------------------------------------- - * fill descriptor from this side (left to right) - * ---> - * _______________________________________________ - * | | | |. . . . . . . . . . | | | | - * |__|__|__|_____________________________|__|__|__| - * - * <--- - * fill src and dst pointer (arrays) in this direction - * (right to left) - * - * Once they are about to cross each other (implies buffer is full), - * complete the data transfer. - * --------------------------------------------------------------------- - */ - -#define AGG_INIT_NB_HANDLE(op_type, p, nb_handle) \ - if(nb_handle->proc < 0) { \ - nb_handle->tag = GET_NEXT_NBTAG(); \ - nb_handle->op = op_type; \ - nb_handle->proc = p; \ - nb_handle->bufid= NB_NONE; \ - } \ - else if(nb_handle->op != op_type) \ - armci_die("ARMCI_NbXXX: AGG_INIT_NB_HANDLE(): Aggregate Failed, Invalid non-blocking handle", nb_handle->op); \ - else if(nb_handle->proc != p) \ - armci_die("ARMCI_NbXXX: AGG_INIT_NB_HANDLE(): Aggregate Failed, Invalid non-blocking handle", p) - - -/* initialize/set the fields in the buffer*/ -#define _armci_agg_set_buffer(index, tag, proc, len) { \ - aggr[(index)]->tag = (tag); \ - aggr[(index)]->proc = (proc); \ - aggr[(index)]->request_len = (len); \ - ulist.index[ulist.size++] = (index);/* add the new index to the in-use list and increment it's size*/ \ -} - -/* get the index of the aggregation buffer to be used */ -static int _armci_agg_get_bufferid(armci_ihdl_t nb_handle) { - int i, index, tag = nb_handle->tag, proc = nb_handle->proc; - - /* check if there is an entry for this handle in the existing list*/ - for(i=ulist.size-1; i>=0; i--) { - index = ulist.index[i]; - if(aggr[index]->tag == tag && aggr[index]->proc == proc) - return index; - } - - /* else it is a new handle, so get a aggr buffer from either - of the lists. ???? don't throw exception here */ - if(ulist.size >= _MAX_AGG_BUFFERS && alist.size == 0) - armci_die("_armci_agg_get_index: Too many outstanding aggregation requests\n", ulist.size); - - /*If there is a buffer in readily available list,use it*/ - if(alist.size > 0) index = alist.index[--alist.size]; - else { /* else use/get a buffer from the main list */ - index = ulist.size; - - /* allocate memory for aggregate request handle */ - aggr[index] = (agg_req_t *)agg_buf[index]; - - aggr[index]->request_len = 0; - aggr[index]->ptr_array_len = 0; - aggr[index]->buf_pos_end = _MAX_AGG_BUFSIZE; - - /* allocate memory for giov vector field in aggregate request handler */ - aggr[index]->darr = (armci_giov_t *)(agg_buf[index]+sizeof(agg_req_t)); - } - - _armci_agg_set_buffer(index, tag, proc, 0); - return index; -} - -static void _armci_agg_update_lists(int index) { - int i; - /* remove that index from the in-use list and bring the last element - in the in-use list to the position of the removed one. */ - for(i=0; irequest_len; /* index of giov descriptor */ - bytes_remaining = aggr[index]->buf_pos_end - - (sizeof(agg_req_t) + aggr[index]->request_len*sizeof(armci_giov_t)); - - /* extra bytes required to store registered put data */ - if(is_registered_put) bytes_needed = bytes; - - /* if (byte-)sizes are equal, use previously created descriptor - else get a new descriptor */ - if( rid && bytes==aggr[index]->darr[rid-1].bytes) --rid; - else { get_new_descr=1; bytes_needed += sizeof(armci_giov_t); } - - /* If buffer is full, then complete data transfer. After completion, - if still ptr array_len is greater than maximum limit(_MAX_PTRS), - then do it by parts. Determine new ptr_array_len that fits buffer */ - if( (bytes_needed > bytes_remaining) || - (_MAX_PTRS - aggr[index]->ptr_array_len < *ptr_array_len)) { - armci_agg_complete(nb_handle, SET); - rid = 0; get_new_descr=1; - if(*ptr_array_len > _MAX_PTRS) *ptr_array_len = _MAX_PTRS; - } - - /* if new descriptor, allocate memory for src_ptr & dst_ptr arrays */ - if(get_new_descr) { - int i = aggr[index]->ptr_array_len; - aggr[index]->darr[rid].src_ptr_array = (void **)&agg_src_ptr[index][i]; - aggr[index]->darr[rid].dst_ptr_array = (void **)&agg_dst_ptr[index][i]; - aggr[index]->darr[rid].ptr_array_len = 0; - aggr[index]->request_len++; - } - - /* store registered put data */ - if(is_registered_put) { - aggr[index]->buf_pos_end -= bytes; - memcpy(&((char *)aggr[index])[aggr[index]->buf_pos_end], - *((char **)registered_put_data), bytes); - *(char **)registered_put_data = (char *)&((char *)aggr[index])[aggr[index]->buf_pos_end]; - } - - aggr[index]->ptr_array_len += *ptr_array_len; - return (&aggr[index]->darr[rid]); -} - -int armci_agg_save_descriptor(void *src, void *dst, int bytes, int proc, int op, - int is_registered_put, armci_ihdl_t nb_handle) { - - int one=1, idx; - armci_giov_t * darr; - - /* set up the handle if it is a new aggregation request */ - AGG_INIT_NB_HANDLE(op, proc, nb_handle); - - darr = _armci_agg_get_descriptor(&one, bytes, nb_handle, - is_registered_put, &src); - idx = darr->ptr_array_len; - - darr->src_ptr_array[idx] = src; - darr->dst_ptr_array[idx] = dst; - darr->bytes = bytes; - darr->ptr_array_len += 1; - - fflush(stdout); - return 0; -} - - -int armci_agg_save_giov_descriptor(armci_giov_t dscr[], int len, int proc, - int op, armci_ihdl_t nb_handle) { - int i, j, k, idx, bytes, ptr_array_len; - armci_giov_t * darr; - - /* set up the handle if it is a new aggregation request */ - AGG_INIT_NB_HANDLE(op, proc, nb_handle); - - for(i=0; iptr_array_len; - - for(j=idx; jsrc_ptr_array[j] = dscr[i].src_ptr_array[k]; - darr->dst_ptr_array[j] = dscr[i].dst_ptr_array[k]; - } - darr->bytes = dscr[i].bytes; - darr->ptr_array_len += ptr_array_len; - - ptr_array_len = dscr[i].ptr_array_len - ptr_array_len; - if(ptr_array_len <0) armci_die("agg_save_giov_descr failed", 0L); - } while(k < darr[i].ptr_array_len); - } - return 0; -} - -int armci_agg_save_strided_descriptor(void *src_ptr, int src_stride_arr[], - void* dst_ptr, int dst_stride_arr[], - int count[], int stride_levels, int proc, - int op, armci_ihdl_t nb_handle) { - - int i, j, k, idx, ptr_array_len=1, total1D=1, num1D=0; - int offset1, offset2, factor[MAX_STRIDE_LEVEL]; - armci_giov_t * darr; - - /* set up the handle if it is a new aggregation request */ - AGG_INIT_NB_HANDLE(op, proc, nb_handle); - - for(i=1; i<=stride_levels; i++) { - total1D *= count[i]; - factor[i-1]=0; - } - ptr_array_len = total1D; - - do { - darr=_armci_agg_get_descriptor(&ptr_array_len,count[0],nb_handle,0,0); - idx = darr->ptr_array_len; - - /* converting stride into giov vector */ - for(i=idx; isrc_ptr_array[i] = (char *)src_ptr + offset1; - darr->dst_ptr_array[i] = (char *)dst_ptr + offset2; - ++factor[0]; - ++num1D; - for(j=1; jbytes = count[0]; - darr->ptr_array_len += ptr_array_len; - ptr_array_len = total1D - ptr_array_len; - if(ptr_array_len <0) armci_die("agg_save_strided_descr failed", 0L); - } while(num1D < total1D); - - return 0; -} - - -void armci_agg_complete(armci_ihdl_t nb_handle, int condition) { - int i, index=0, rc; - - /* get the buffer index for this handle */ - for(i=ulist.size-1; i>=0; i--) { - index = ulist.index[i]; - if(aggr[index]->tag == nb_handle->tag && - aggr[index]->proc == nb_handle->proc) - break; - } - if(i<0) return; /* implies this handle has no requests at all */ - -#if 0 - printf("%d: Aggregation Complete to remote process %d (%d:%d requests)\n", - armci_me, nb_handle->proc, index, aggr[index]->request_len); -#endif - - /* complete the data transfer. NOTE: in LAPI, Non-blocking calls - (followed by wait) performs better than blocking put/get */ - if(aggr[index]->request_len) { - switch(nb_handle->op) { -#ifdef LAPI - armci_hdl_t usr_hdl; - case PUT: - ARMCI_INIT_HANDLE(&usr_hdl); - if((rc=PARMCI_NbPutV(aggr[index]->darr, aggr[index]->request_len, - nb_handle->proc, (armci_hdl_t*)&usr_hdl))) - ARMCI_Error("armci_agg_complete: nbputv failed",rc); - PARMCI_Wait((armci_hdl_t*)&usr_hdl); - break; - case GET: - ARMCI_INIT_HANDLE(&usr_hdl); - if((rc=PARMCI_NbGetV(aggr[index]->darr, aggr[index]->request_len, - nb_handle->proc, (armci_hdl_t*)&usr_hdl))) - ARMCI_Error("armci_agg_complete: nbgetv failed",rc); - PARMCI_Wait((armci_hdl_t*)&usr_hdl); - break; -#else - case PUT: - if((rc=PARMCI_PutV(aggr[index]->darr, aggr[index]->request_len, - nb_handle->proc))) - ARMCI_Error("armci_agg_complete: putv failed",rc); - break; - case GET: - if((rc=PARMCI_GetV(aggr[index]->darr, aggr[index]->request_len, - nb_handle->proc))) - ARMCI_Error("armci_agg_complete: getv failed",rc); - break; -#endif - } - } - - /* setting request length to zero, as the requests are completed */ - aggr[index]->request_len = 0; - aggr[index]->ptr_array_len = 0; - aggr[index]->buf_pos_end = _MAX_AGG_BUFSIZE; - - /* If armci_agg_complete() is called PARMCI_Wait(), then unset nb_handle*/ - if(condition==UNSET) { - nb_handle->proc = -1; - _armci_agg_update_lists(index); - } -} - diff --git a/armci/src-portals/armci.c b/armci/src-portals/armci.c deleted file mode 100644 index 99d55a72e..000000000 --- a/armci/src-portals/armci.c +++ /dev/null @@ -1,1001 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: armci.c,v 1.114.2.17 2007-08-30 22:58:18 manoj Exp $ */ - -/* DISCLAIMER - * - * This material was prepared as an account of work sponsored by an - * agency of the United States Government. Neither the United States - * Government nor the United States Department of Energy, nor Battelle, - * nor any of their employees, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR - * ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY, - * COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT, - * SOFTWARE, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT - * INFRINGE PRIVATELY OWNED RIGHTS. - * - * - * ACKNOWLEDGMENT - * - * This software and its documentation were produced with United States - * Government support under Contract Number DE-AC06-76RLO-1830 awarded by - * the United States Department of Energy. The United States Government - * retains a paid-up non-exclusive, irrevocable worldwide license to - * reproduce, prepare derivative works, perform publicly and display - * publicly by or for the US Government, including the right to - * distribute to other US Government contractors. - */ - -#define EXTERN -/*#define PRINT_BT*/ - -#include -#include -#include -#if defined(CRAY) && !defined(__crayx1) -# include -# include -# include -#endif -#ifdef LAPI -# include "lapidefs.h" -#endif -#include -#include "armcip.h" -#include "copy.h" -#include "memlock.h" -#include "parmci.h" -#include "armci_shmem.h" -#include "signaltrap.h" - -#ifdef ARMCIX -#include "x/armcix.h" -#endif -#ifdef BGML -#include "bgml.h" -#include -#include "bgmldefs.h" -extern void armci_msg_barrier(void); -#endif - -#ifdef CRAY_SHMEM -# ifdef CRAY_XT -# include -# else -# include -# endif -#endif - -#include - -/* global variables */ -int armci_me, armci_Sme, armci_nproc; -int armci_clus_me, armci_nclus, armci_master; -int armci_clus_first, armci_clus_last; -int *_armci_argc=NULL; -char ***_armci_argv=NULL; -int _armci_initialized_args=0; -int _armci_initialized=0; -int _armci_terminating =0; -thread_id_t armci_usr_tid; -armci_ireq_t armci_inb_handle[ARMCI_MAX_IMPLICIT];/*implicit non-blocking handle*/ -#ifndef HITACHI -double armci_internal_buffer[BUFSIZE_DBL]; -#endif -#if defined(SYSV) || defined(WIN32) || defined(MMAP) || defined(HITACHI) || defined(CATAMOUNT) || defined(BGML) -# include "locks.h" - lockset_t lockid; -#endif - -int* armci_prot_switch_fence=NULL; -int armci_prot_switch_preproc = -1; -int armci_prot_switch_preop = -1; - -#ifdef BGML -/* void armci_allocate_locks(); */ - void armci_init_memlock(); -#endif - -#ifdef LIBELAN_ATOMICS -ELAN_ATOMIC *a; -#warning "Enabling new atomics" -#endif - -typedef struct{ - int sent; - int received; - int waited; -}armci_notify_t; - -armci_notify_t **_armci_notify_arr; - -void ARMCI_Cleanup() -{ -#if defined(DATA_SERVER) - if(armci_nclus >1){ - armci_wait_for_server(); - } -#endif - -#if (defined(SYSV) || defined(WIN32) || defined(MMAP))&& !defined(HITACHI) - Delete_All_Regions(); - if(armci_nproc>1) -#if !defined(LAPI) - DeleteLocks(lockid); -#endif - -#ifndef WIN32 - ARMCI_RestoreSignals(); -#endif - -#endif - armci_transport_cleanup(); -} - -int armci_getbufsize() -{ - return(BUFSIZE); -} - -void armci_notify_init() -{ - int rc,bytes=sizeof(armci_notify_t)*armci_nproc; - -#ifdef DOELAN4 - armci_elan_notify_init(); - return; -#endif - - _armci_notify_arr= - (armci_notify_t**)malloc(armci_nproc*sizeof(armci_notify_t*)); - if(!_armci_notify_arr)armci_die("armci_notify_ini:malloc failed",armci_nproc); - - if((rc=PARMCI_Malloc((void **)_armci_notify_arr, bytes))) - armci_die(" armci_notify_init: armci_malloc failed",bytes); - bzero(_armci_notify_arr[armci_me], bytes); -} - - -static void armci_perror_msg() -{ - char perr_str[80]; - if(!errno)return; - sprintf(perr_str,"Last System Error Message from Task %d:",armci_me); - perror(perr_str); -} - -static void armci_abort(int code) -{ - abort(); -#if !defined(BGML) - armci_perror_msg(); -#endif - ARMCI_Cleanup(); - /* data server process cannot use message-passing library to abort - * it simply exits, parent will get SIGCHLD and abort the program - */ -#if defined(DATA_SERVER) - if(armci_me<0)_exit(1); - else -#endif - armci_msg_abort(code); -} - - -void armci_die(char *msg, int code) -{ - void *bt[100]; - - if(_armci_terminating)return; - else _armci_terminating=1; - - if(SERVER_CONTEXT){ - fprintf(stdout,"%d(s):%s: %d\n",armci_me, msg, code); fflush(stdout); - fprintf(stderr,"%d(s):%s: %d\n",armci_me, msg, code); - }else{ - fprintf(stdout,"%d:%s: %d\n",armci_me, msg, code); fflush(stdout); - fprintf(stderr,"%d:%s: %d\n",armci_me, msg, code); - } - -#ifdef PRINT_BT - backtrace_symbols_fd(bt, backtrace(bt, 100), 2); -#endif - - armci_abort(code); -} - - -void armci_die2(char *msg, int code1, int code2) -{ - void *bt[100]; - - if(_armci_terminating)return; - else _armci_terminating=1; - - if(SERVER_CONTEXT){ - fprintf(stdout,"%d(s):%s: (%d,%d)\n",armci_me,msg,code1,code2); - fflush(stdout); - fprintf(stderr,"%d(s):%s: (%d,%d)\n",armci_me,msg,code1,code2); - }else{ - fprintf(stdout,"%d:%s: (%d,%d)\n",armci_me,msg,code1,code2); - fflush(stdout); - fprintf(stderr,"%d:%s: (%d,%d)\n",armci_me,msg,code1,code2); - } -#ifdef PRINT_BT - backtrace_symbols_fd(bt, backtrace(bt, 100), 2); -#endif - armci_abort(code1); -} - - -void ARMCI_Error(char *msg, int code) -{ - armci_die(msg,code); -} - - -void armci_allocate_locks() -{ - /* note that if ELAN_ACC is defined the scope of locks is limited to SMP */ -#if !defined(CRAY_SHMEM) && (defined(HITACHI) || defined(CATAMOUNT) || \ - (defined(QUADRICS) && defined(_ELAN_LOCK_H) && !defined(ELAN_ACC))) - armcill_allocate_locks(NUM_LOCKS); -#elif (defined(SYSV) || defined(WIN32) || defined(MMAP)) && !defined(HITACHI) - if(armci_nproc == 1)return; -# if defined(SPINLOCK) || defined(PMUTEX) || defined(PSPIN) - CreateInitLocks(NUM_LOCKS, &lockid); -# else - if(armci_master==armci_me)CreateInitLocks(NUM_LOCKS, &lockid); - armci_msg_clus_brdcst(&lockid, sizeof(lockid)); - if(armci_master != armci_me)InitLocks(NUM_LOCKS, lockid); -# endif -#endif -} - - -void ARMCI_Set_shm_limit(unsigned long shmemlimit) -{ -#if (defined(SYSV) || defined(WIN32) || defined(MMAP)) && !defined(HITACHI) -#define EXTRASHM 1024 /* extra shmem used internally in ARMCI */ -unsigned long limit; - limit = shmemlimit + EXTRASHM; - armci_set_shmem_limit_per_core(limit); -#endif -} - - - -/*\ allocate and initialize memory locking data structure -\*/ -void armci_init_memlock() -{ - int bytes = MAX_SLOTS*sizeof(memlock_t); - int rc, msize_per_proc=bytes; - -#ifdef MEMLOCK_SHMEM_FLAG - /* last proc on node allocates memlock flag in shmem */ - if(armci_clus_last == armci_me) bytes += sizeof(int); -#endif - - memlock_table_array = malloc(armci_nproc*sizeof(void*)); - if(!memlock_table_array) armci_die("malloc failed for ARMCI lock array",0); - - rc = PARMCI_Malloc(memlock_table_array, bytes); - if(rc) armci_die("failed to allocate ARMCI memlock array",rc); - - armci_msg_barrier(); - - bzero(memlock_table_array[armci_me],bytes); - -#ifdef BGML - bgml_init_locks ((void *) memlock_table_array[armci_me]); -#elif ARMCIX - ARMCIX_init_memlock ((memlock_t *) memlock_table_array[armci_me]); -#endif - - -#ifdef MEMLOCK_SHMEM_FLAG - /* armci_use_memlock_table is a pointer to local memory variable=1 - * we overwrite the pointer with address of shared memory variable - * armci_use_memlock_table and initialize it >0 - */ - armci_use_memlock_table = (int*) (msize_per_proc + - (char*) memlock_table_array[armci_clus_last]); - - /* printf("%d: last=%d bytes=%d ptr =(%d, %d)\n", - armci_me,armci_clus_last,bytes,armci_use_memlock_table, - memlock_table_array[armci_clus_last]); fflush(stdout); */ - - if(armci_clus_last == armci_me) *armci_use_memlock_table =1+armci_me; - -#endif - - *armci_use_memlock_table = 0; - armci_msg_barrier(); -} - - -#if defined(SYSV) || defined(WIN32) -static void armci_check_shmmax() -{ - long mylimit, limit; - mylimit = limit = (long) armci_max_region(); - armci_msg_bcast_scope(SCOPE_MASTERS, &limit, sizeof(long), 0); - if(mylimit != limit){ - printf("%d:Shared mem limit in ARMCI is %ld bytes on node %s vs %ld on %s\n", - armci_me,mylimit<<10,armci_clus_info[armci_clus_me].hostname, - limit<<10, armci_clus_info[0].hostname); - fflush(stdout); sleep(1); - armci_die("All nodes must have the same SHMMAX limit if NO_SHM is not defined",0); - } -} -#endif - -extern void armci_region_shm_malloc(void *ptr_arr[], size_t bytes); - - -void ARMCI_NetInit() -{ - /*armci_portals_net_init();*/ -} - -int PARMCI_Init_args(int *argc, char ***argv) -{ - armci_msg_init(argc,argv); - - _armci_argc = argc; - _armci_argv = argv; - _armci_initialized_args=1; - return PARMCI_Init(); -} - -extern void *sbrk(intptr_t); - -int _armci_init(MPI_Comm comm) -{ - caddr_t atbeginbrval = (caddr_t)sbrk(0); - if(_armci_initialized>0) return 0; -#ifdef NEW_MALLOC - mallopt(M_MMAP_MAX, 0); - mallopt(M_TRIM_THRESHOLD, -1); -#endif - - armci_msg_init_comm(comm); - - armci_nproc = armci_msg_nproc(); - armci_me = armci_msg_me(); - armci_usr_tid = THREAD_ID_SELF(); /*remember the main user thread id */ - armci_init_clusinfo(); - armci_prot_switch_fence = malloc(sizeof(int*)*armci_nproc); - assert(armci_prot_switch_fence !=NULL); - armci_init_portals(atbeginbrval); -#ifdef MSG_COMMS_MPI - armci_group_init(); -#endif -#ifndef NEW_MALLOC - armci_krmalloc_init_localmem(); -#endif -#if defined(SYSV) || defined(WIN32) || defined(MMAP) - if(ARMCI_Uses_shm() ) { - armci_shmem_init(); - } -#endif - armci_allocate_locks(); - armci_init_fence(); -#if ARMCI_ENABLE_GPC_CALLS - gpc_init_signals(); -#endif - armci_msg_barrier(); - armci_init_memlock(); /* allocate data struct for locking memory areas */ - armci_msg_barrier(); - //if(armci_me == 0) code_summary(); - armci_msg_barrier(); - armci_msg_gop_init(); - _armci_initialized++; - return 0; -} - - -int PARMCI_Init() -{ - return _armci_init(MPI_COMM_WORLD); -} - - -int PARMCI_Init_mpi_comm(MPI_Comm comm) -{ - return _armci_init(comm); -} - - -void PARMCI_Finalize() -{ - if(!_armci_initialized)return; - _armci_initialized--; - if(_armci_initialized)return; - - _armci_terminating =1; - armci_msg_barrier(); - if(armci_me==armci_master) ARMCI_ParentRestoreSignals(); - -#ifdef PORTALS - request_header_t msg; - portals_ds_req_t req; - ptl_process_id_t dsid = portals_id_map[armci_me]; - msg.operation = QUIT; - - if(armci_me == armci_master) { - portalsBlockingRemoteOperationToNode(&msg,sizeof(request_header_t),armci_clus_me); - } - - armci_msg_barrier(); - portals_cp_finalize(); - -#else - - ARMCI_Cleanup(); - armci_msg_barrier(); - armci_group_finalize(); - free(armci_prot_switch_fence); -#endif -#ifdef MSG_COMMS_MPI - MPI_Comm_free(&ARMCI_COMM_WORLD); -#endif -} - - -/* Indicates whether ARMCI_Init or ARMCI_Init_args has been called. */ -int PARMCI_Initialized() -{ - return (_armci_initialized > 0) ? 1 : 0; -} - - -#if !(defined(SYSV) || defined(WIN32)) -void ARMCI_Set_shmem_limit(unsigned long shmemlimit) -{ - /* not applicable here - * aborting would make user's life harder - */ -} -#endif - - - -void ARMCI_Copy(void *src, void *dst, int n) -{ - armci_copy(src,dst,n); -} - -extern void cpu_yield(); -void armci_util_wait_int(volatile int *p, int val, int maxspin) -{ -int count=0; -extern void cpu_yield(); - while(*p != val) - if((++count)proc); - - if(direct) { - return(success); - } - if(nb_handle) { - if(nb_handle->agg_flag) { - armci_agg_complete(nb_handle, UNSET); - return (success); - } - if(nb_handle->tag!=0 && nb_handle->bufid==NB_NONE){ - ARMCI_NB_WAIT(nb_handle->cmpl_info); - __asm__ __volatile__ ("mfence" ::: "memory"); - __asm__ __volatile__ ("sfence" ::: "memory"); - return(success); - } -# ifdef COMPLETE_HANDLE - COMPLETE_HANDLE(nb_handle->bufid,nb_handle->tag,(&success)); -# endif - } - - __asm__ __volatile__ ("mfence" ::: "memory"); - __asm__ __volatile__ ("sfence" ::: "memory"); - return(success); -} - -/** - * implicit handle - */ -static char hdl_flag[ARMCI_MAX_IMPLICIT]; -static int impcount=0; -armci_ihdl_t armci_set_implicit_handle (int op, int proc) { - - int i=impcount%ARMCI_MAX_IMPLICIT; - if(hdl_flag[i]=='1') - PARMCI_Wait((armci_hdl_t*)&armci_inb_handle[i]); - -#ifdef BGML - armci_inb_handle[i].count=0; -#endif - armci_inb_handle[i].tag = GET_NEXT_NBTAG(); - armci_inb_handle[i].op = op; - armci_inb_handle[i].proc = proc; - armci_inb_handle[i].bufid = NB_NONE; - armci_inb_handle[i].agg_flag = 0; - hdl_flag[i]='1'; - ++impcount; - return &armci_inb_handle[i]; -} - - -/* wait for all non-blocking operations to finish */ -int PARMCI_WaitAll () { -#ifdef BGML - BGML_WaitAll(); -#elif ARMCIX - ARMCIX_WaitAll (); -#else - int i; - if(impcount) { - for(i=0; iagg_flag = 1; - ((armci_ihdl_t)(nb_handle))->proc = -1; -} - -void ARMCI_UNSET_AGGREGATE_HANDLE(armci_hdl_t* nb_handle) { - ((armci_ihdl_t)(nb_handle))->agg_flag = 0; - ((armci_ihdl_t)(nb_handle))->proc = -1; -} - -int parmci_notify(int proc) -{ -#ifdef DOELAN4 - if(proc==armci_me){ - return 0; - } -#endif -#if defined(GM) || (defined(DOELAN4) && defined(ELAN_ACC)) - { - extern int armci_inotify_proc(int); - return(armci_inotify_proc(proc)); - } -#else - armci_notify_t *pnotify = _armci_notify_arr[armci_me]+proc; - pnotify->sent++; -# ifdef MEM_FENCE - if(SAMECLUSNODE(proc)) MEM_FENCE; -# endif - PARMCI_Put(&pnotify->sent,&(_armci_notify_arr[proc]+armci_me)->received, - sizeof(pnotify->sent),proc); - return(pnotify->sent); -#endif -} - - -/*\ blocks until received count becomes >= waited count - * return received count and store waited count in *pval -\*/ -int parmci_notify_wait(int proc,int *pval) -{ - int retval; -#ifdef DOELAN4 - if(proc==armci_me){ -#ifdef MEM_FENCE - MEM_FENCE; -#endif - return 0; - } -#endif - -#if defined(GM) || (defined(DOELAN4) && defined(ELAN_ACC)) - { - extern int armci_inotify_wait(int,int*); - retval=armci_inotify_wait(proc,pval); - } -#else - { - long loop=0; - armci_notify_t *pnotify = _armci_notify_arr[armci_me]+proc; - pnotify->waited++; - while( pnotify->waited > pnotify->received) { - if(++loop == 1000) { loop=0;cpu_yield(); } - armci_util_spin(loop, pnotify); - } - *pval = pnotify->waited; - retval=pnotify->received; - } -#endif - - return retval; -} - -long armci_util_long_getval(long* p) -{ - return *p; -} - -int armci_util_int_getval(int* p) -{ - return *p; -} - - -int PARMCI_Test(armci_hdl_t *usr_hdl) -{ -armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl; -int success=0; -#ifdef BGML - success=(int)nb_handle->count; -#else -int direct=SAMECLUSNODE(nb_handle->proc); - if(direct)return(success); - if(nb_handle) { - if(nb_handle->agg_flag) { - armci_die("test for aggregate handle not yet implemented\n",0); - } - } - if(nb_handle){ -# ifdef ARMCI_NB_TEST - if(nb_handle->tag==0){ - ARMCI_NB_TEST(nb_handle->cmpl_info,&success); - return(success); - } -# ifdef LAPI - if(nb_handle->tag!=0 && nb_handle->bufid==NB_NONE){ - ARMCI_NB_TEST(nb_handle->cmpl_info,&success); - return(success); - } -# endif -# endif -# ifdef TEST_HANDLE - TEST_HANDLE(nb_handle->bufid,nb_handle->tag,(&success)); -# endif - } -#endif - return(success); -} - -#ifdef DO_CKPT -void ARMCI_Ckpt_create_ds(armci_ckpt_ds_t *ckptds, int count) -{ - armci_create_ckptds(ckptds,count); -} - -int ARMCI_Ckpt_init(char *filename, ARMCI_Group *grp, int savestack, int saveheap, armci_ckpt_ds_t *ckptds) -{ -int rid; - rid = armci_icheckpoint_init(filename,grp,savestack,saveheap,ckptds); - return(rid); -} - -int ARMCI_Ckpt(int rid) -{ - return(armci_icheckpoint(rid)); -} - -void ARMCI_Ckpt_Recover(int rid, int iamreplacement) -{ - armci_irecover(rid, iamreplacement); -} -void ARMCI_Ckpt_finalize(int rid) -{ - armci_icheckpoint_finalize(rid); -} -#endif -#if ARMCI_ENABLE_GPC_CALLS -int armci_gpc(int hndl, int proc, void *hdr, int hlen, void *data, int dlen, - void *rhdr, int rhlen, void *rdata, int rdlen, - armci_hdl_t* nbh) { -armci_ihdl_t nb_handle = (armci_ihdl_t)nbh; -armci_giov_t darr[2]; /* = {{&rhdr, &rhdr, 1, rhlen}, {&rdata, &rdata, 1, rdlen}};*/ -gpc_send_t send; -char *ptr; - - /* initialize giov */ - darr[0].src_ptr_array = &rhdr; - darr[0].dst_ptr_array = &rhdr; - darr[0].ptr_array_len = 1; - darr[0].bytes = rhlen; - - darr[1].src_ptr_array = &rdata; - darr[1].dst_ptr_array = &rdata; - darr[1].ptr_array_len = 1; - darr[1].bytes = rdlen; - - -/* if(hlen<0 || hlen>=ARMCI_Gpc_get_hlen()) */ -/* return FAIL2; */ -/* if(rhlen<0 || rhlen>=ARMCI_Gpc_get_hlen()) */ -/* return FAIL2; */ -/* if(dlen<0 || dlen>=ARMCI_Gpc_get_dlen()) */ -/* return FAIL2; */ -/* if(rdlen<0 || rdlen>=ARMCI_Gpc_get_dlen()) */ -/* return FAIL2; */ - - if(hlen>0 && hdr==NULL) - return FAIL3; - if(rhlen>0 && rhdr==NULL) - return FAIL3; - if(dlen>0 && data==NULL) - return FAIL3; - if(rdlen>0 && rdata==NULL) - return FAIL3; - - if(proc<0 || proc >= armci_nproc) - return FAIL4; - - send.hndl = hndl; - send.hlen = hlen; - send.dlen = dlen; - send.hdr = hdr; - send.data = data; - - if(nb_handle){ - nb_handle->tag = GET_NEXT_NBTAG(); - nb_handle->op = GET; - nb_handle->proc= proc; - nb_handle->bufid=NB_NONE; - } - else { - ORDER(GET,proc); /*ensure ordering */ - nb_handle = NULL; - } - -#if defined(LAPI) || defined(GM) || defined(VAPI) || defined(QUADRICS) - if(armci_rem_gpc(GET, darr, 2, &send, proc, 1, nb_handle)) -#endif - return FAIL2; - return 0; -} - -int armci_sameclusnode(int proc) { - return SAMECLUSNODE(proc); -} -#endif - -void _armci_init_handle(armci_hdl_t *hdl) -{ - ((double *)((hdl)->data))[0]=0; - ((double *)((hdl)->data))[1]=0; -} - -static inline int val_to_char(int v) -{ - if (v >= 0 && v < 10) - return '0' + v; - else if (v >= 10 && v < 16) - return ('a' - 10) + v; - else - return -1; -} -static const char *nexttoken(const char *q, int sep) -{ - if (q) - q = strchr(q, sep); - if (q) - q++; - return q; -} - -int cstr_to_cpuset(cpu_set_t * mask, const char *str) -{ -const char *p, *q; -q = str; - CPU_ZERO(mask); - - while (p = q, q = nexttoken(q, ','), p) { - unsigned int a; /* beginning of range */ - unsigned int b; /* end of range */ - unsigned int s; /* stride */ - const char *c1, *c2; - if (sscanf(p, "%u", &a) < 1) - return 1; - b = a; - s = 1; - c1 = nexttoken(p, '-'); - c2 = nexttoken(p, ','); - if (c1 != NULL && (c2 == NULL || c1 < c2)) { - if (sscanf(c1, "%u", &b) < 1) - return 1; - c1 = nexttoken(c1, ':'); - if (c1 != NULL && (c2 == NULL || c1 < c2)) - if (sscanf(c1, "%u", &s) < 1) { - return 1; - } - } - if (!(a <= b)) - return 1; - while (a <= b) { - CPU_SET(a, mask); - a += s; - } - } - return 0; -} - -char *cpuset_to_cstr(cpu_set_t * mask, char *str) -{ -int i; -char *ptr = str; -int entry_made = 0; - for (i = 0; i < CPU_SETSIZE; i++) { - if (CPU_ISSET(i, mask)) { - int j; - int run = 0; - entry_made = 1; - for (j = i + 1; j < CPU_SETSIZE; j++) { - if (CPU_ISSET(j, mask)) - run++; - else - break; - } - if (!run) - sprintf(ptr, "%d,", i); - else if (run == 1) { - sprintf(ptr, "%d,%d,", i, i + 1); - i++; - } else { - sprintf(ptr, "%d-%d,", i, i + run); - i += run; - } - while (*ptr != 0) - ptr++; - } - } - ptr -= entry_made; - *ptr = 0; - return str; -} - -char *cpuset_to_str(cpu_set_t * mask, char *str) -{ -int base; -char *ptr = str; -char *ret = 0; - for (base = CPU_SETSIZE - 4; base >= 0; base -= 4) { - char val = 0; - if (CPU_ISSET(base, mask)) - val |= 1; - if (CPU_ISSET(base + 1, mask)) - val |= 2; - if (CPU_ISSET(base + 2, mask)) - val |= 4; - if (CPU_ISSET(base + 3, mask)) - val |= 8; - if (!ret && val) - ret = ptr; - *ptr++ = val_to_char(val); - } - *ptr = 0; - return ret ? ret : ptr - 1; -} - -long armci_cksm_copy(char *src, char *dst, size_t bytes) -{ -long sum = 0; -size_t count=bytes; - while( count > 1 ) { - sum += * (unsigned int *) src++; - count -= 4; - } - - if( count > 0 ){ - printf("\nblistering barnicles"); - sum += * (unsigned char *) src; - } - - while (sum>>32) - sum = (sum & 0xffffffff) + (sum >> 32); - return(~sum); -} - -void code_summary() { - printf("\nActive #defines that could affect ARMCI"); - printf("\n----------------------------------------"); -# ifdef ORNL_USE_DS_FOR_REMOTE_GETS - printf("\n#define ORNL_USE_DS_FOR_REMOTE_GETS"); -# endif - -# ifdef PORTALS_USE_RENDEZ_VOUS - printf("\n#define PORTALS_USE_RENDEZ_VOUS"); -# endif - -# ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - printf("\n#define PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE"); -# endif - -# ifdef PORTALS_AFFINITY - printf("\n#define PORTALS_AFFINITY"); -# endif - -/* -# ifdef CRAY_USE_MDMD_COPY - printf("\n#define CRAY_USE_MDMD_COPY"); -# endif -*/ - printf("\n----------------------------------------"); - printf("\nInfo @ armci/src/code_options.h"); - printf("\n----------------------------------------\n"); - -# ifdef PORTALS - portals_print_summary(); -# endif -} diff --git a/armci/src-portals/armci.h b/armci/src-portals/armci.h deleted file mode 100644 index fe5fef018..000000000 --- a/armci/src-portals/armci.h +++ /dev/null @@ -1,414 +0,0 @@ -/*$id$*/ -/* ARMCI header file */ -#ifndef _ARMCI_H -#define _ARMCI_H - -/* for size_t */ -#include - -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif - -typedef unsigned long long u64Int; -typedef long long s64Int; - -extern int armci_sameclusnode(int proc); - -typedef struct { - void **src_ptr_array; - void **dst_ptr_array; - int ptr_array_len; - int bytes; -} armci_giov_t; -typedef long armci_size_t; -extern int armci_notify(int proc); -extern int armci_notify_wait(int proc,int *pval); -extern int ARMCI_Init(void); /* initialize ARMCI */ -extern int ARMCI_Init_mpi_comm(MPI_Comm comm); /* initialize ARMCI */ -extern int ARMCI_Init_args(int *argc, char ***argv); -extern void ARMCI_Barrier(void); /* ARMCI Barrier*/ - -extern int ARMCI_Put(void *src, void* dst, int bytes, int proc); -extern int ARMCI_Put_flag(void *src, void* dst,int bytes,int *f,int v,int proc); - -#define ARMCI_Put1(_s,_d,_b,_p) memcpy(_d,_s,_b), 0 - -extern int ARMCI_PutS( /* strided put */ - void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int count[], /* number of units at each stride level count[0]=bytes */ - int stride_levels, /* number of stride levels */ - int proc /* remote process(or) ID */ - ); - -extern int ARMCI_PutS_flag_dir( /* put with flag that uses direct put */ - void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int count[], /* number of segments at each stride - levels: count[0]=bytes*/ - int stride_levels, /* number of stride levels */ - int *flag, /* pointer to remote flag */ - int val, /* value to set flag upon completion of - data transfer */ - int proc /* remote process(or) ID */ - ); - -extern int ARMCI_PutS_flag( - void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int count[], /* number of segments at each stride - levels: count[0]=bytes*/ - int stride_levels, /* number of stride levels */ - int *flag, /* pointer to remote flag */ - int val, /* value to set flag upon completion of - data transfer */ - int proc /* remote process(or) ID */ - ); - -extern int ARMCI_Acc(int optype, void *scale, void *src, void *dst, int bytes, int proc); - -extern int ARMCI_AccS( /* strided accumulate */ - int optype, /* operation */ - void *scale, /* scale factor x += scale*y */ - void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int count[], /* number of units at each stride level count[0]=bytes */ - int stride_levels, /* number of stride levels */ - int proc /* remote process(or) ID */ - ); - - -extern int ARMCI_Get(void *src, void* dst, int bytes, int proc); - -extern int ARMCI_GetS( /* strided get */ - void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int count[], /* number of units at each stride level count[0]=bytes */ - int stride_levels, /* number of stride levels */ - int proc /* remote process(or) ID */ - ); - -extern int ARMCI_GetV( armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc /* remote process(or) ID */ - ); - -extern int ARMCI_PutV( armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc /* remote process(or) ID */ - ); - -extern int ARMCI_AccV( int op, /* operation code */ - void *scale, /* scaling factor for accumulate */ - armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc /* remote process(or) ID */ - ); - -extern int ARMCI_PutValueInt(int src, /* value in a register to put */ - void *dst, /* dest starting addr to put data */ - int proc /* remote process (or) ID */ - ); - -extern int ARMCI_PutValueLong(long src, /* value in a register to put */ - void *dst, /* dest starting addr to put data */ - int proc /* remote process (or) ID */ - ); - -extern int ARMCI_PutValueFloat(float src, /* value in a register to put */ - void *dst, /* dest starting addr to put data */ - int proc /* remote process (or) ID */ - ); - -extern int ARMCI_PutValueDouble(double src,/* value in a register to put */ - void *dst, /* dest starting addr to put data */ - int proc /* remote process (or) ID */ - ); - -extern int ARMCI_GetValueInt(void *src, int proc); -extern long ARMCI_GetValueLong(void *src, int proc); -extern float ARMCI_GetValueFloat(void *src, int proc); -extern double ARMCI_GetValueDouble(void *src, int proc); - - -extern int ARMCI_Malloc(void* ptr_arr[], armci_size_t bytes); -extern int ARMCI_Malloc_memdev(void* ptr_arr[], armci_size_t bytes, const char *device); -extern int ARMCI_Free(void *ptr); -extern int ARMCI_Free_memdev(void *ptr); -extern void* ARMCI_Malloc_local(armci_size_t bytes); -extern int ARMCI_Free_local(void *ptr); -extern int ARMCI_Same_node(int proc); - -extern void ARMCI_Finalize(); /* terminate ARMCI */ -extern void ARMCI_Error(char *msg, int code); -extern void ARMCI_Fence(int proc); -extern void ARMCI_DoFence(int proc); -extern void ARMCI_AllFence(void); -extern int ARMCI_Rmw(int op, void *ploc, void *prem, int extra, int proc); -extern void ARMCI_Cleanup(void); -extern int ARMCI_Create_mutexes(int num); -extern int ARMCI_Destroy_mutexes(void); -extern void ARMCI_Lock(int mutex, int proc); -extern void ARMCI_Unlock(int mutex, int proc); -extern void ARMCI_Set_shm_limit(unsigned long shmemlimit); -extern int ARMCI_Uses_shm(); -extern void ARMCI_Copy(void *src, void *dst, int n); - -#define FAIL -1 -#define FAIL2 -2 -#define FAIL3 -3 -#define FAIL4 -4 -#define FAIL5 -5 -#define FAIL6 -6 -#define FAIL7 -7 -#define FAIL8 -8 - -#define ARMCI_SWAP 10 -#define ARMCI_SWAP_LONG 11 -#define ARMCI_FETCH_AND_ADD 12 -#define ARMCI_FETCH_AND_ADD_LONG 13 - -#define ARMCI_ACC_OFF 36 -#define ARMCI_ACC_INT (ARMCI_ACC_OFF + 1) -#define ARMCI_ACC_DBL (ARMCI_ACC_OFF + 2) -#define ARMCI_ACC_FLT (ARMCI_ACC_OFF + 3) -#define ARMCI_ACC_CPL (ARMCI_ACC_OFF + 4) -#define ARMCI_ACC_DCP (ARMCI_ACC_OFF + 5) -#define ARMCI_ACC_LNG (ARMCI_ACC_OFF + 6) -#define ARMCI_ACC_RA (ARMCI_ACC_OFF + 7) - -#define ARMCI_MAX_STRIDE_LEVEL 8 - -#ifdef BGML -#define ARMCI_CRITICAL_SECTION_ENTER() BGML_CriticalSection_enter(); -#define ARMCI_CRITICAL_SECTION_EXIT() BGML_CriticalSection_exit(); -#else -#define ARMCI_CRITICAL_SECTION_ENTER() -#define ARMCI_CRITICAL_SECTION_EXIT() -#endif - -/************ locality information **********************************************/ -typedef int armci_domain_t; -#define ARMCI_DOMAIN_SMP 0 /* SMP node domain for armci_domain_XXX calls */ -extern int armci_domain_nprocs(armci_domain_t domain, int id); -extern int armci_domain_id(armci_domain_t domain, int glob_proc_id); -extern int armci_domain_glob_proc_id(armci_domain_t domain, int id, int loc_proc_id); -extern int armci_domain_my_id(armci_domain_t domain); -extern int armci_domain_count(armci_domain_t domain); -extern int armci_domain_same_id(armci_domain_t domain, int proc); -extern int armci_smp_master(int); - - -/* PVM group - * On CrayT3E: the default group is the global group which is (char *)NULL - * It is the only working group. - * On Workstations: the default group is "mp_working_group". User can set - * the group name by calling the ARMCI_PVM_init (defined - * in message.c) and passing the group name to the library. - */ - -extern char *mp_group_name; - -/*********************stuff for non-blocking API******************************/ -/*\ the request structure for non-blocking api. -\*/ -typedef struct{ -#ifdef BGML - int data[4]; /* tag, bufid, agg_flag, op, proc */ - double dummy[72]; /* bg1s_t, count, extra */ -#else - int data[4]; -#if defined(_AIX) -# if defined(__64BIT__) - double dummy[27]; /*lapi_cntr_t is 200 bytes, using 216 just to be safe*/ -# else - double dummy[24]; /*lapi_cntr_t is 148 bytes, using 166 just to be safe*/ -# endif -#elif defined(ALLOW_PIN) - void *dummy[2];/*2 cause itshould be aligned after we cast hdl_t to ihdl_t*/ -#else - double dummy; -#endif -#endif -} armci_hdl_t; - -#define armci_req_t armci_hdl_t - -typedef int ARMCI_Group; - -extern void ARMCI_Group_create(int n, int *pid_list, ARMCI_Group *group_out); -extern void ARMCI_Group_create_child(int n, int *pid_list, - ARMCI_Group *group_out, ARMCI_Group *group_parent); -extern void ARMCI_Group_free(ARMCI_Group *group); -extern int ARMCI_Group_rank(ARMCI_Group *group, int *rank); -extern void ARMCI_Group_size(ARMCI_Group *group, int *size); -extern void ARMCI_Group_set_default(ARMCI_Group *group); -extern void ARMCI_Group_get_default(ARMCI_Group *group_out); -extern void ARMCI_Group_get_world(ARMCI_Group *group_out); - -extern int ARMCI_Malloc_group(void *ptr_arr[], armci_size_t bytes,ARMCI_Group *group); -extern int ARMCI_Malloc_group_memdev(void *ptr_arr[], armci_size_t bytes,ARMCI_Group *group, const char *device); -extern int ARMCI_Free_group(void *ptr, ARMCI_Group *group); - -extern int ARMCI_NbPut(void *src, void* dst, int bytes, int proc,armci_hdl_t* nb_handle); - -extern int ARMCI_NbPutS( /* strided put */ - void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int count[], /* number of units at each stride level count[0]=bytes */ - int stride_levels, /* number of stride levels */ - int proc, /* remote process(or) ID */ - armci_hdl_t* nb_handle /*armci_non-blocking request handle*/ - ); - -extern int ARMCI_NbAccS( /* strided accumulate */ - int optype, /* operation */ - void *scale, /* scale factor x += scale*y */ - void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int count[], /* number of units at each stride level count[0]=bytes */ - int stride_levels, /* number of stride levels */ - int proc, /* remote process(or) ID */ - armci_hdl_t* nb_handle /*armci_non-blocking request handle*/ - ); - -extern int ARMCI_NbGet(void *src, void* dst, int bytes, int proc,armci_hdl_t* nb_handle); - -extern int ARMCI_NbGetS( /* strided get */ - void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int count[], /* number of units at each stride level count[0]=bytes */ - int stride_levels, /* number of stride levels */ - int proc, /* remote process(or) ID */ - armci_hdl_t* nb_handler/*armci_non-blocking request handle*/ - ); - -extern int ARMCI_NbGetV( armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc, /* remote process(or) ID */ - armci_hdl_t* nb_handle /*armci_non-blocking request handle*/ - ); - -extern int ARMCI_NbPutV( armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc, /* remote process(or) ID */ - armci_hdl_t* nb_handle /*armci_non-blocking request handle*/ - ); - -extern int ARMCI_NbAccV( int op, /* operation code */ - void *scale, /* scaling factor for accumulate */ - armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc, /* remote process(or) ID */ - armci_hdl_t* nb_handle /*armci_non-blocking request handle*/ - ); - -extern int ARMCI_NbPutValueInt(int src, /* value in a register to put */ - void *dst, /* dest starting addr to put data */ - int proc, /* remote process (or) ID */ - armci_hdl_t* nb_handle /*armci_non-blocking - request handle */ - ); - -extern int ARMCI_NbPutValueLong(long src, /* value in a register to put */ - void *dst, /* dest starting addr to put data */ - int proc, /* remote process (or) ID */ - armci_hdl_t* nb_handle /*armci_non-blocking - request handle */ - ); - -extern int ARMCI_NbPutValueFloat(float src,/* value in a register to put */ - void *dst,/* dest starting addr to put data */ - int proc, /* remote process (or) ID */ - armci_hdl_t* nb_handle /*armci_non-blocking - request handle */ - ); - -extern int ARMCI_NbPutValueDouble(double src,/* value in a register to put */ - void *dst,/* dest starting addr to put data*/ - int proc, /* remote process (or) ID */ - armci_hdl_t* nb_handle /*armci_non-blocking - request handle */ - ); - -extern int ARMCI_Wait(armci_hdl_t* nb_handle); /*non-blocking request handle*/ - -extern int ARMCI_Test(armci_hdl_t* nb_handle); /*non-blocking request handle*/ - -extern int ARMCI_WaitAll (void); - -extern int ARMCI_WaitProc (int proc); - -extern void ARMCI_SET_AGGREGATE_HANDLE(armci_hdl_t* nb_handle); - -extern void ARMCI_UNSET_AGGREGATE_HANDLE(armci_hdl_t* nb_handle); - -#define ARMCI_INIT_HANDLE(hdl) do {((double *)((hdl)->data))[0]=0; \ - ((double *)((hdl)->data))[1]=0; }while(0) - -/* -------------- ARMCI Non-collective memory allocator ------------- */ -typedef struct armci_meminfo_ds { - char * armci_addr; /* remote address of the creator which can be - used in ARMCI communication */ - char *addr; /* local address of creator which can be used in - to set SMP memoffset, armci_set_mem_offset() */ - size_t size; /* size of remote pid's segment (bytes) */ - int cpid; /* armci pid of creator */ - long idlist[64]; -} armci_meminfo_t; - -extern void ARMCI_Memget(size_t bytes, armci_meminfo_t *meminfo, int memflg); - -extern void* ARMCI_Memat(armci_meminfo_t *meminfo, long offset); - -extern void ARMCI_Memdt(armci_meminfo_t *meminfo, long offset); - -extern void ARMCI_Memctl(armci_meminfo_t *meminfo); - -/* ------------------- ARMCI Checkpointing/Recovery ----------------- */ -#ifdef DO_CKPT -#define ARMCI_CKPT 0 -#define ARMCI_RESTART 1 -typedef struct { - void **ptr_arr; - size_t *sz; - int *saveonce; - int count; -}armci_ckpt_ds_t; -void ARMCI_Ckpt_create_ds(armci_ckpt_ds_t *ckptds, int count); -int ARMCI_Ckpt_init(char *filename, ARMCI_Group *grp, int savestack, int saveheap, armci_ckpt_ds_t *ckptds); -int ARMCI_Ckpt(int rid); -void ARMCI_Ckpt_finalize(int rid); -#define ARMCI_Restart_simulate armci_irecover -# ifdef MSG_COMMS_MPI - ARMCI_Group * ARMCI_Get_ft_group(); -# endif -#endif - -/* ------------------------------------------------------------------ */ - - -#if defined(__cplusplus) || defined(c_plusplus) -} -#endif - -#endif /* _ARMCI_H */ - - diff --git a/armci/src-portals/armci_portals.c b/armci/src-portals/armci_portals.c deleted file mode 100644 index ba2ed515f..000000000 --- a/armci/src-portals/armci_portals.c +++ /dev/null @@ -1,2229 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - /*$id:$*/ -/*#define _GNU_SOURCE*/ -#include -#include -#include "armcip.h" -#include "message.h" -#include -#include -#include -#include -#include - -#define DEBUG_COMM 0 -#define DEBUG_INIT 0 -#define DEBUG_SERV 0 -#define PUT_LOCAL_ONLY_COMPLETION__ - -typedef struct arminfo{ - caddr_t ptr[MAX_DS]; - size_t size[MAX_DS]; - long serv_offs[MAX_DS]; - int cur_ds; -}rm_info_t; - -static rm_info_t *all_meminfo; - -static int client_md_count=0,serv_md_count=0; - -typedef struct arns{ - long data; - long data1; - struct arns *next; -} arnode; - -#ifdef ARMCI_CHECK_STATE -arnode * arlist_add(arnode **p, long i,long j) -{ - arnode *n = (arnode *)malloc(sizeof(arnode)); - if(n == NULL) - return NULL; - n->next = *p; - *p = n; - n->data = i; - n->data1 = j; - return *p; -} - -void arlist_remove(arnode **p) -{ - if(*p != NULL){ - arnode *n = *p; - *p = (*p)->next; - free(n); - } -} - -arnode **arlist_search(arnode **n, long i) -{ - while (*n != NULL){ - if ((*n)->data == i){ - return n; - } - n = &(*n)->next; - } - return NULL; -} - -void arlist_print(arnode *n) -{ - if (n == NULL){ - /*printf("arlist is empty\n");*/ - } - while (n != NULL){ - printf("%d:%d %d next=%d\n", armci_me,n->data,n->data1,(n->next==NULL)?0:1); - n = n->next; - } -} -#endif - -extern void armci_util_wait_int(volatile int *, int , int ); -extern void armci_util_wait_long(volatile long *, long, int ); - -int _armci_portals_server_ready=0; -int _armci_portals_client_ready=0; -int _armci_server_mutex_ready=0; -void *_armci_server_mutex_ptr = NULL; - -#ifdef ARMCI_REGISTER_SHMEM -typedef struct { - void *base_ptr; - void *serv_ptr; - size_t size; - int islocal; - int valid; -}aptl_reginfo_t; - -typedef struct { - aptl_reginfo_t reginfo[MAX_MEM_REGIONS]; - int reg_count; -} rem_meminfo_t; -#endif - -typedef struct serv_buf_t{ - ptl_handle_md_t md_h; - ptl_handle_me_t me_h; - ptl_md_t md; - char *buf; - char *bufend; -} serv_buf_t; - -char **client_buf_ptrs; -static int armci_server_terminating=0; - -serv_buf_t *serv_bufs; -long servackval=ARMCI_STAMP,*serv_ack_ptr=&servackval; -ptl_handle_md_t serv_ack_md_h,serv_response_md_h; - -static armci_portals_proc_t _armci_portals_proc_struct; -static armci_portals_serv_t _armci_portals_serv_struct; -static armci_portals_proc_t *portals = &_armci_portals_proc_struct; -static armci_portals_serv_t *serv_portals = &_armci_portals_serv_struct; -/*static */comp_desc _compdesc_array[NUM_COMP_DSCR]; - -static arnode *arn = NULL; - -#ifdef ARMCI_REGISTER_SHMEM -static rem_meminfo_t *_rem_meminfo; -static aptl_reginfo_t *_tmp_rem_reginfo; - -#define IN_REGION(_ptr__,_reg__) ((_reg__.valid) && (_ptr__)>=(_reg__.serv_ptr) \ - && (_ptr__) <= ( (char *)(_reg__.serv_ptr)+_reg__.size)) -#endif - -static int ptl_initialized = 0; -extern pid_t server_pid; - -ptl_ni_limits_t armci_ptl_nilimits; -ptl_ni_limits_t armci_ptl_Snilimits; - -void armci_portals_init_ptl() -{ -int rc; -int npes,i; - ARMCI_PR_DBG("enter",0); - - /*initialize data structures*/ - portals->ptl = ARMCI_PORTALS_PTL_NUMBER; /* our own ptl number */ - - rc=PtlNIInit(IFACE_FROM_BRIDGE_AND_NALID(PTL_BRIDGE_UK,PTL_IFACE_SS), - PTL_PID_ANY, NULL, &armci_ptl_nilimits, &(portals->ni_h)); - switch(rc) { - case PTL_OK: - /*printf("\n%d:ok for nii\n",armci_me);*/ - break; - case PTL_IFACE_DUP: - /*printf("\n%d:dup for nii\n",armci_me);*/ - break; - default: - printf( "PtlNIInit() failed %d error=%s\n",rc,ARMCI_NET_ERRTOSTR(rc) ); - exit(1); - } - - if((rc=PtlGetId(portals->ni_h,&portals->rank)) !=PTL_OK) { - printf("%s: PtlGetId failed: %d(%d)\n",FUNCTION_NAME, rc, server_pid); - exit(1); - } - ARMCI_PR_DBG("exit",0); -} - -static inline void init_serv_buf(serv_buf_t *tmp) -{ -int rc; -ptl_match_bits_t ignbits = 0xFFFFFFFFF00000FF; -ptl_match_bits_t mbits; -ptl_process_id_t match_id; -ptl_md_t *md_ptr,md; - - ARMCI_PR_DBG("enter",0); - tmp->md.user_ptr=tmp; - tmp->md.start=tmp->buf; - tmp->md.length=armci_nproc*NUM_SERV_BUFS*VBUF_DLEN; - tmp->md.eq_handle=portals->Seq_h; - tmp->md.max_size=0; - tmp->md.threshold=PTL_MD_THRESH_INF; - tmp->md.options=PTL_MD_OP_GET | PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE; - { - match_id.nid = PTL_NID_ANY; - match_id.pid = PTL_PID_ANY; - mbits = 16<<8; - - rc = PtlMEAttach(portals->Sni_h,portals->ptl,match_id,mbits,ignbits, - PTL_RETAIN,PTL_INS_AFTER,&(tmp->me_h)); - if (rc != PTL_OK) { - printf("(%d):PtlMEAttach: %s\n", portals->Srank,ARMCI_NET_ERRTOSTR(rc)); - armci_die("portals attach error isb",rc); - } - tmp->md.options=PTL_MD_OP_GET | PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE | PTL_MD_MANAGE_REMOTE; - - rc = PtlMDAttach((tmp->me_h),tmp->md,PTL_RETAIN,&(tmp->md_h)); - if (rc != PTL_OK) { - printf("%d:PtlMDAttach: %s %d\n", portals->Srank, ARMCI_NET_ERRTOSTR(rc),(serv_md_count+client_md_count) ); - exit(1); - } - serv_md_count++; - } - /*set up for sending acks */ - md_ptr = &(md); - md_ptr->start = serv_ack_ptr; - md_ptr->length = sizeof(long); - md_ptr->threshold = PTL_MD_THRESH_INF; - md_ptr->options = PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE; - md_ptr->user_ptr = NULL; - md_ptr->max_size = sizeof(long); - md_ptr->eq_handle = portals->Seq_h; - - rc = PtlMDBind(portals->Sni_h,md,PTL_RETAIN,&serv_ack_md_h); - if (rc != PTL_OK){ - fprintf(stderr, "%d:PtlMDBindxn: %s %d\n", portals->Srank.nid, - ARMCI_NET_ERRTOSTR(rc),(serv_md_count+client_md_count)); - armci_die("ptlmdbind failed",0); - } - serv_md_count++; - /*set up for sending response */ - md_ptr = &(md); - md_ptr->start = tmp->buf; - md_ptr->length = tmp->md.length; - md_ptr->threshold = PTL_MD_THRESH_INF; - md_ptr->options = PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE; - md_ptr->user_ptr = NULL; - md_ptr->max_size = tmp->md.length; - md_ptr->eq_handle = portals->Seq_h; - - rc = PtlMDBind(portals->Sni_h, md, PTL_RETAIN, &serv_response_md_h); - if (rc != PTL_OK){ - fprintf(stderr, "%d:PtlMDBindxn: %s %d\n", portals->Srank.nid, - ARMCI_NET_ERRTOSTR(rc),(serv_md_count+client_md_count)); - armci_die("ptlmdbind failed",0); - } - serv_md_count++; - ARMCI_PR_DBG("exit",0); -} - -void armci_portals_wait_for_client() -{ -int rc; -int *procidinfo; -extern armci_clus_t *armci_clus_info; -ptl_process_id_t *tmp; - ARMCI_PR_SDBG("enter",0); - //printf(" "); - armci_util_wait_int(&_armci_portals_client_ready,1,1000); - if((armci_me)!=armci_master){ - exit(0); - } - else{ - if(DEBUG_SERV){ - printf("\n%d:chosen one nid,pid=%d,%d\n",armci_me,portals->Srank.nid,portals->Srank.pid); - } - } - ARMCI_PR_SDBG("exit",0); -} - - -void armci_portals_prepare_server() -{ -int rc,i,j; - ARMCI_PR_SDBG("enter",0); - serv_bufs=(serv_buf_t *)malloc(sizeof(serv_buf_t)); - bzero(serv_bufs,sizeof(serv_buf_t)); - assert(serv_bufs); - serv_bufs->buf=(char *)malloc((NUM_SERV_BUFS*armci_nproc*VBUF_DLEN)); - bzero(serv_bufs->buf,(NUM_SERV_BUFS*armci_nproc*VBUF_DLEN)); - assert(serv_bufs->buf); - serv_bufs->bufend=(char *)serv_bufs->buf+(NUM_SERV_BUFS*armci_nproc*VBUF_DLEN); - rc = PtlEQAlloc(portals->Sni_h,4*(NUM_SERV_BUFS*armci_nproc),NULL, &(portals->Seq_h)); - if (rc != PTL_OK) { - printf("(%d):Ptleaalloc() failed: %s %d (%d)\n",portals->Srank, - ARMCI_NET_ERRTOSTR(rc),(NUM_SERV_BUFS*armci_nproc),rc); - armci_die("EQ Alloc failed",rc); - } - init_serv_buf(serv_bufs); - _armci_portals_server_ready=1; - ARMCI_PR_SDBG("exit",0); -} - - -void *armci_server_code(void *data) -{ -int rc,num_interface; - ARMCI_PR_SDBG("enter",0); - if(DEBUG_INIT) - printf("%d: in server after creating thread.\n",armci_me); - - rc = PtlInit(&num_interface); - if (rc != PTL_OK) { - printf("PtlInit() failed %d %s\n",rc, ARMCI_NET_ERRTOSTR(rc) ); - exit(1); - } - - rc=PtlNIInit(IFACE_FROM_BRIDGE_AND_NALID(PTL_BRIDGE_UK,PTL_IFACE_SS), - PTL_PID_ANY, NULL, &armci_ptl_Snilimits, &(portals->Sni_h)); - switch(rc) { - case PTL_OK: - //printf("\n(%d):ok for serv nii\n",armci_me); - break; - case PTL_IFACE_DUP: - //printf("\n(%d):dup for serv nii\n",armci_me); - break; - default: - printf( "PtlNIInit() serv failed %d error=%s\n",rc,ARMCI_NET_ERRTOSTR(rc) ); - exit(1); - } - - if((rc=PtlGetId(portals->Sni_h,&portals->Srank)) !=PTL_OK) { - printf("%s: PtlGetId failed: %d(%d)\n",FUNCTION_NAME, rc, server_pid); - exit(1); - } - /*printf("\n(%d):server nid=%d pid=%d\n",armci_me,portals->Srank.nid,portals->Srank.pid);*/ - - armci_portals_wait_for_client(); - armci_portals_prepare_server(); - - if(DEBUG_INIT) { - printf("(%d): connected to all computing processes\n",armci_me); - fflush(stdout); - } - armci_call_data_server(); - - armci_transport_cleanup(); - ARMCI_PR_SDBG("exit",0); - return(NULL); -} - - -void armci_client_connect_to_servers() -{ -int rc; -ptl_size_t offset_local = 0, offset_remote=0; -ptl_md_t md_local; -ptl_handle_md_t md_hdl_local; -ptl_process_id_t *tmp; -int *procidinfo; -int c_info; -int *flag,shmid; -void *addr; -char *buf; -extern int _armci_server_started; - ARMCI_PR_DBG("enter",0); - - _armci_portals_client_ready=1; - if(armci_me==armci_master){ - armci_util_wait_int(&_armci_portals_server_ready,1,1000); - } - - armci_msg_barrier(); - _armci_server_started=1; - - if(armci_me==armci_master){ - portals->servid_map[armci_clus_me].pid=portals->Srank.pid; - portals->servid_map[armci_clus_me].nid=portals->Srank.nid; - } - - armci_msg_gop_scope(SCOPE_ALL,portals->servid_map,(sizeof(ptl_process_id_t)*armci_nclus)/sizeof(int),"+",ARMCI_INT); - - ARMCI_PR_DBG("exit",0); -} - -static int check_meminfo(void *ptr, long size, int proc) -{ - int i; - for(i=0;i=0) && (right>=size)) - return(i+1); - } - return 0; -} - - -static void add_meminfo(void *ptr, size_t size, int proc) -{ - if(check_meminfo(ptr,(long)size,proc)!=0)armci_die("repeat add request for dss",proc); - all_meminfo[proc].cur_ds++; - all_meminfo[proc].ptr[all_meminfo[proc].cur_ds]=ptr; - all_meminfo[proc].size[all_meminfo[proc].cur_ds]=size; -#ifdef DEBUG_MEM - printf("\n%d:%s:adding %p %ld %d at %d",armci_me,FUNCTION_NAME,ptr,size,proc,all_meminfo[proc].cur_ds); -#endif -} - - -typedef struct{ - void *ptr; - size_t size; - size_t serv_offs; -} meminfo_t; - - -void armci_exchange_meminfo(void *ptr, size_t size,size_t off) -{ - int i; - meminfo_t *exng; - exng = (meminfo_t*)malloc(armci_nproc*sizeof(meminfo_t)); - assert(exng != NULL); - bzero(exng,sizeof(meminfo_t)*armci_nproc); - exng[armci_me].ptr=ptr; exng[armci_me].size=size; exng[armci_me].serv_offs = off; - armci_msg_gop_scope(SCOPE_ALL,exng,(sizeof(meminfo_t)*armci_nproc)/sizeof(int),"+",ARMCI_INT); - for(i=0;ibrval[portals->cur_ds]){ - ptl_md_t *md_ptr; - ptl_match_bits_t ignbits = 0xFFFFFFFFFFFFFF00; - ptl_process_id_t match_id; - int rc,cds=++portals->cur_ds; - if(cds>=MAX_DS)armci_die("increase MAX_CDS",cds); - portals->dsbase[cds]=portals->brval[cds-1]; - //portals->dsbase[cds]=sbrk(0); - ptr = portals->brval[cds] = br_val; - size = portals->dssizes[cds]=((caddr_t)portals->brval[cds] - portals->dsbase[cds]); - portals->serv_offs[cds] = serv_offset; - printf("\n%d:%s:base=%p brval=%p dslen=%ld %p end=%p",armci_me,FUNCTION_NAME,portals->dsbase[cds],br_val, - portals->dssizes[cds],portals->brval[cds],get_heap_bottom_addr()); - - md_ptr = &(portals->heap_md[cds]); - md_ptr->start = portals->dsbase[cds]; - md_ptr->length = portals->dssizes[cds]; - md_ptr->threshold = PTL_MD_THRESH_INF; - md_ptr->options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE; - md_ptr->user_ptr = NULL; - md_ptr->max_size = 0; - md_ptr->eq_handle = PTL_EQ_NONE; - - portals->heap_mb[cds]=cds+1; - - match_id.nid = PTL_NID_ANY; - match_id.pid = PTL_PID_ANY; - rc = PtlMEAttach(portals->ni_h,portals->ptl,match_id, - portals->heap_mb[cds], - ignbits, - PTL_RETAIN,PTL_INS_AFTER, - &(portals->heap_me_h[cds])); - if (rc != PTL_OK) { - printf("%d:PtlMEAttach: %s\n", portals->rank, ARMCI_NET_ERRTOSTR(rc) ); - armci_die("portals attach error reg",rc); - } - - rc = PtlMDAttach((portals->heap_me_h[cds]), - *md_ptr,PTL_RETAIN, - &(portals->heap_md_h[cds])); - if (rc != PTL_OK) { - printf("%d:PtlMDAttach: %s %d\n", portals->rank, ARMCI_NET_ERRTOSTR(rc),(client_md_count+serv_md_count)); - armci_die("portals attach error reg",rc); - } - } - else{ -#ifdef DEBUG_MEM_ - extern caddr_t _end; - printf("\n%d:%s:curds=%d brvalin=%p curbrval=%p _end=%p &_end=%p",armci_me,FUNCTION_NAME,portals->cur_ds,portals->brval[portals->cur_ds],br_val,_end,&_end); -#endif - } - armci_exchange_meminfo(ptr,size,serv_offset); -} - - -#ifndef PMI_SUCCESS -#define PMI_SUCCESS 0 -#endif - -static int *_client_servbuf_count; -int armci_init_portals(caddr_t atbeginbrval) -{ -#ifndef OLD_PORTALS_CODE - int i,rc,np,me; - ptl_process_id_t id; - ptl_process_id_t clone_id; - - MPI_Comm_size(ARMCI_COMM_WORLD,&np); - MPI_Comm_rank(ARMCI_COMM_WORLD,&me); - - if(armci_me != me) { - printf("[mpi %d]: armci_me=%d ... this is a problem\n",me,armci_me); - armci_die("mpi rank mismatch",911); - } - - portals_cp_init(); - - MPI_Barrier(ARMCI_COMM_WORLD); - - portals_ds_ready = 0; - if(armci_me == armci_master) { - portalsCloneDataServer( portals_ds_thread ); - portalsSpinLockOnInt( &portals_ds_ready,1,10000 ); - } - MPI_Barrier(ARMCI_COMM_WORLD); - - i=0; - if((rc=PMI_Initialized(&i))!=PMI_SUCCESS){ - printf("PMI_Initialized failed\n"); - } - - if(i==0){ - if((rc==PMI_Init(&i))!=PMI_SUCCESS){ - printf("MPI_Init failed (npes=%d)\n", armci_nproc); - } - } - - if((rc=PMI_CNOS_Get_nidpid_map(&portals_id_map))!=PMI_SUCCESS){ - printf("Getting proc map failed (npes=%d)\n", armci_nproc); - } - - /* create intra-node communicator */ - MPI_Barrier(ARMCI_COMM_WORLD); - portals_cp_init_throttle(armci_nclus); - MPI_Barrier(ARMCI_COMM_WORLD); - - /* stuff from old code ... */ - bzero(portals,sizeof(armci_portals_proc_t)); - // note: i got rid of this rem_meminfo stuff with the gemini version - // see that code to see how to remove it here - # ifdef ARMCI_REGISTER_SHMEM - _rem_meminfo = (rem_meminfo_t *)calloc(armci_nproc,sizeof(rem_meminfo_t)); - _tmp_rem_reginfo = (aptl_reginfo_t *)malloc(sizeof(aptl_reginfo_t)*armci_nproc); - if( _rem_meminfo==NULL || _tmp_rem_reginfo ==NULL) - armci_die("malloc failed in init_portals",0); - //if(armci_me == 0) { - // printf("sizeof(rem_meminfo_t)=%ld\n",sizeof(rem_meminfo_t)); - //} - # endif - # ifdef CRAY_USE_ARMCI_CLIENT_BUFFERS - client_buf_ptrs = (char **) calloc(armci_nproc,sizeof(char *)); - assert(client_buf_ptrs); - armci_msg_barrier(); - _armci_buf_init(); - # endif - /* end old stuff */ - - return 0; - -#else - -int num_interface; -int rc; -int npes,i; - ARMCI_PR_DBG("enter",0); - - bzero(portals,sizeof(armci_portals_proc_t)); - - _rem_meminfo = (rem_meminfo_t *)calloc(armci_nproc,sizeof(rem_meminfo_t)); - _tmp_rem_reginfo = (aptl_reginfo_t *)malloc(sizeof(aptl_reginfo_t)*armci_nproc); - if( _rem_meminfo==NULL || _tmp_rem_reginfo ==NULL) - armci_die("malloc failed in init_portals",0); - - portals->servid_map=(ptl_process_id_t *)calloc(armci_nclus,sizeof(ptl_process_id_t)); - if(portals->servid_map==NULL)armci_die("calloc of servidmap failed",0); - - rc = PtlInit(&num_interface); - if (rc != PTL_OK) { - printf("PtlInit() failed %d %s\n",rc, ARMCI_NET_ERRTOSTR(rc) ); - exit(1); - } - armci_portals_init_ptl(); - -#if 1 - i=0; - if((rc=PMI_Initialized(&i))!=PMI_SUCCESS){ - printf("PMI_Initialized failed\n"); - } - - if(i==0){ - if((rc==PMI_Init(&i))!=PMI_SUCCESS){ - printf("MPI_Init failed (npes=%d)\n", armci_nproc); - } - } - - if((rc=PMI_CNOS_Get_nidpid_map(&portals->procid_map))!=PMI_SUCCESS){ - printf("Getting proc map failed (npes=%d)\n", armci_nproc); - } - //printf(" "); -# else - portals->procid_map = (ptl_process_id_t *) calloc(armci_nproc,sizeof(ptl_process_id_t)); - portals->procid_map[armci_me]=portals->rank; - armci_msg_gop_scope(SCOPE_ALL,portals->procid_map,(sizeof(ptl_process_id_t)*armci_nproc)/sizeof(int),"+",ARMCI_INT); - //printf(" "); -#endif - - client_buf_ptrs = (char **) calloc(armci_nproc,sizeof(char *)); - assert(client_buf_ptrs); - armci_msg_barrier(); - - if(armci_me==armci_master)armci_create_server_process( armci_server_code ); - - rc = PtlEQAlloc(portals->ni_h,16*NUM_COMP_DSCR,NULL, &(portals->eq_h)); - if (rc != PTL_OK) { - printf("%d:Ptleqalloc() failed: %s (%d)\n", - armci_me, ARMCI_NET_ERRTOSTR(rc) , rc); - armci_die("EQ Alloc failed",rc); - } - /*printf("\n%d:client nid=%d pid=%d\n",armci_me,portals->rank.nid,portals->rank.pid);*/ - - _armci_buf_init(); - - for(i=0;ieq_h; - _compdesc_array[i].mem_dsc.max_size=0; - /*_compdesc_array[i].mem_dsc.threshold=PTL_MD_THRESH_INF;*/ - _compdesc_array[i].mem_dsc.threshold=2; - _compdesc_array[i].mem_dsc.options=PTL_MD_OP_GET | PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE; - } - - ptl_initialized = 1; - portals->free_comp_desc_index=0; - /*for(i=0;iprocid_map[i].nid,portals->procid_map[i].pid);*/ - _client_servbuf_count = calloc(armci_nclus,sizeof(int)); - armci_msg_barrier(); - armci_client_connect_to_servers(); - armci_msg_barrier(); - if(DEBUG_COMM){ - cpu_set_t mycpuid,new_mask; - char cid[8],*cidptr; - int rrr; - extern char * cpuset_to_cstr(cpu_set_t *mask, char *str); - rrr=sched_getaffinity(0, sizeof(mycpuid), &mycpuid); - if(rrr)perror("sched_getaffinity"); - cidptr = cpuset_to_cstr(&mycpuid,cid); - printf("%d:my affinity is to %s\n",armci_me,cid); - } -#ifdef NEW_MALLOC - /*post entire heap wildcard for direct communication*/ - { - ptl_md_t *md_ptr; - ptl_match_bits_t ignbits = 0xFFFFFFFFFFFFFF00; - ptl_process_id_t match_id; - portals->cur_ds = 0; - portals->dsbase[0]=get_heap_bottom_addr(); - //portals->brval[0] = sbrk(0); - portals->brval[0] = atbeginbrval; - portals->dssizes[0]=((caddr_t)portals->brval[0] - portals->dsbase[0]); - printf("\n%d:base=%p dslen=%ld %p",armci_me,portals->dsbase[0], - portals->dssizes[0],portals->brval[0]); - - md_ptr = &(portals->heap_md[0]); - md_ptr->start = portals->dsbase[0]; - md_ptr->length = portals->dssizes[0]; - md_ptr->threshold = PTL_MD_THRESH_INF; - md_ptr->options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE; - md_ptr->user_ptr = NULL; - md_ptr->max_size = 0; - md_ptr->eq_handle = PTL_EQ_NONE; - - portals->heap_mb[0]=1; - - match_id.nid = PTL_NID_ANY; - match_id.pid = PTL_PID_ANY; - rc = PtlMEAttach(portals->ni_h,portals->ptl,match_id, - portals->heap_mb[0], - ignbits, - PTL_RETAIN,PTL_INS_AFTER, - &(portals->heap_me_h[0])); - if (rc != PTL_OK) { - printf("%d:PtlMEAttach: %s\n", portals->rank, ARMCI_NET_ERRTOSTR(rc) ); - armci_die("portals attach error reg",rc); - } - - rc = PtlMDAttach((portals->heap_me_h[0]), - *md_ptr,PTL_RETAIN, - &(portals->heap_md_h[0])); - if (rc != PTL_OK) { - printf("%d:PtlMDAttach: %s %d\n", portals->rank, ARMCI_NET_ERRTOSTR(rc),(client_md_count+serv_md_count)); - armci_die("portals attach error reg",rc); - } - all_meminfo = (rm_info_t *)malloc(sizeof(rm_info_t)*armci_nproc); - all_meminfo[armci_me].cur_ds = -1; - armci_exchange_meminfo(portals->dsbase[0],portals->dssizes[0],0); - } -#endif - ARMCI_PR_DBG("exit",0); - return 0; -#endif -} - - -void armci_fini_portals() -{ - ARMCI_PR_DBG("enter",0); - if(DEBUG_INIT){ - printf("ENTERING ARMCI_FINI_PORTALS\n");fflush(stdout); - } -#ifdef ARMCI_CHECK_STATE - arlist_print(arn); -#endif - PtlNIFini(portals->ni_h); - /*PtlFini();*/ - if(DEBUG_INIT){ - printf("LEAVING ARMCI_FINI_PORTALS\n");fflush(stdout); - } - ARMCI_PR_DBG("exit",0); -} - - -void armci_pin_contig1(void *start,size_t bytes) -{ - -} - -#ifdef ARMCI_REGISTER_SHMEM -#ifndef NEW_MALLOC -void armci_register_req(void *start,int bytes, int ID) -{ -int rc; -ptl_md_t *md_ptr; -ptl_match_bits_t ignbits = 0xFFFFFFFFFFFFFF00; -ptl_process_id_t match_id; - - ARMCI_PR_DBG("enter",serv_portals->reg_count); - -#ifdef DEBUG_MEM - printf("\n(%d):armci_register_req start=%p bytes=%d\n", - armci_me,start,bytes);fflush(stdout); -#endif - - md_ptr = &(serv_portals->meminfo[serv_portals->reg_count].md); - md_ptr->start = start; - md_ptr->length = bytes; - md_ptr->threshold = PTL_MD_THRESH_INF; - md_ptr->options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE; - md_ptr->user_ptr = NULL; - md_ptr->max_size = 0; - md_ptr->eq_handle = PTL_EQ_NONE; - - serv_portals->meminfo[serv_portals->reg_count].mb=serv_portals->reg_count+1; - - match_id.nid = PTL_NID_ANY; - match_id.pid = PTL_PID_ANY; - rc = PtlMEAttach(portals->ni_h,portals->ptl,match_id, - serv_portals->meminfo[serv_portals->reg_count].mb, - ignbits, - PTL_RETAIN,PTL_INS_AFTER, - &(serv_portals->meminfo[serv_portals->reg_count].me_h)); - if (rc != PTL_OK) { - printf("%d:PtlMEAttach: %s\n", portals->rank, ARMCI_NET_ERRTOSTR(rc) ); - armci_die("portals attach error reg",rc); - } - - rc = PtlMDAttach((serv_portals->meminfo[serv_portals->reg_count].me_h), - *md_ptr,PTL_RETAIN, - &serv_portals->meminfo[serv_portals->reg_count].md_h); - if (rc != PTL_OK) { - printf("%d:PtlMDAttach: %s %d\n", portals->rank, ARMCI_NET_ERRTOSTR(rc),(client_md_count+serv_md_count)); - armci_die("portals attach error reg",rc); - } - client_md_count++; - serv_portals->reg_count++; - - ARMCI_PR_DBG("exit",serv_portals->reg_count); -} -#endif -#endif - -int armci_must_remotecomplete=1; -extern _buf_ackresp_t *_buf_ackresp_first,*_buf_ackresp_cur; -int x_net_wait_ackresp(_buf_ackresp_t *ar) -{ -int rc; -ptl_event_t ev_t; -ptl_event_t *ev=&ev_t; -comp_desc *temp_comp = NULL; -int loop=1; -int temp_proc; - ARMCI_PR_DBG("enter",0); - while(ar->val){ - ev->type=0; - if((rc = PtlEQWait(portals->eq_h, ev)) != PTL_OK){ - printf("%d:PtlEQWait(): %d %s\n", portals->rank,rc, - ARMCI_NET_ERRTOSTR(rc) ); - armci_die("EQWait problem",rc); - } - if (ev->ni_fail_type != PTL_NI_OK) { - temp_comp = (comp_desc *)ev->md.user_ptr; - printf("%d:NI sent %d in event %d,%d.\n", - armci_me,portals->rank.nid, portals->rank.pid, ev->ni_fail_type); - armci_die("event failure problem",temp_comp->dest_id); - } - if(DEBUG_COMM){ - printf("\n%d:net_wait_ackresp:done waiting type=%d\n",armci_me, - ev->type); - fflush(stdout); - } - if (ev->type == PTL_EVENT_SEND_END){ - if(DEBUG_COMM){ - printf("\n%d:net_wait_ackresp:event send end\n",armci_me); - fflush(stdout); - } - temp_comp = (comp_desc *)ev->md.user_ptr; - if(temp_comp->type==ARMCI_PORTALS_GETPUT || temp_comp->type==ARMCI_PORTALS_NBGETPUT){ - temp_comp->active=0; - temp_comp->tag=-1; - continue; - } - if(!armci_must_remotecomplete){ - if(temp_comp->type==ARMCI_PORTALS_PUT || temp_comp->type==ARMCI_PORTALS_NBPUT){ - temp_comp->active=0; - temp_comp->tag=-1; - } - else - continue; - } - else{ - temp_comp->active++; - continue; - } - - - } - - else if (ev->type == PTL_EVENT_REPLY_END){ - temp_comp = (comp_desc *)ev->md.user_ptr; - if(DEBUG_COMM){ - printf("\n%d:net_wait_ackresp:reply end tag=%d\n",armci_me,temp_comp->tag); - fflush(stdout); - } - temp_comp->active = 0; /*this was a get request, so we are done*/ - temp_comp->tag=-1; - continue; - } - else if (ev->type == PTL_EVENT_ACK){ - temp_comp = (comp_desc *)ev->md.user_ptr; - if(DEBUG_COMM){ - printf("\n%d:net_wait_ackresp:event ack tag=%d\n",armci_me,temp_comp->tag); - fflush(stdout); - } - temp_comp->active=0; - temp_comp->tag=-1; - portals->outstanding_puts--; - } - else if (ev->type==PTL_EVENT_PUT_END){ - _buf_ackresp_t *sweep=_buf_ackresp_first; - if(DEBUG_COMM){printf("\n%d:put end offset=%d",armci_me,ev->offset);fflush(stdout);} - if(ar->val==ev->offset){ - /*bingo!*/ - ar->val=0; - } - else{ - while(sweep!=NULL){ - if(sweep->val==ev->offset){ - sweep->val=0; - break; - } - sweep=sweep->next; - } - /*if(sweep==NULL)armci_die("server wrote data at unexpected offset",ev->offset);*/ - if(sweep==NULL){ - int y; - printf("%d:server wrote data at unexpected offset %d",armci_me,ev->offset);fflush(stdout); - abort(); -# ifdef ARMCI_CHECK_STATE - for(y=0;yservid_map[y].pid==ev->initiator.pid && portals->servid_map[y].nid==ev->initiator.nid)break; - assert(y!=armci_nclus); - arlist_print(arn); - armci_rem_state(y); -# endif - } - } - } - else - armci_die("in net_wait_ackresp unknown event",ev->type); - } - -# ifdef ARMCI_CHECK_STATE - arlist_remove(arlist_search(&arn, ar->valc)); -# endif - ar->valc=0; - if(ar==_buf_ackresp_first)_buf_ackresp_first=ar->next; - if(ar->next!=NULL){ - ar->next->previous=ar->previous; - } - if(ar->previous!=NULL){ - /*printf("\n%d:prev=%p %d %p %p\n",armci_me,ar->previous, ar->val,ar->next,ar);fflush(stdout);*/ - ar->previous->next=ar->next; - if(_buf_ackresp_cur==ar)_buf_ackresp_cur=ar->previous; - } - if(_buf_ackresp_cur==ar)_buf_ackresp_cur=NULL; - - ar->previous=ar->next=NULL; - - ARMCI_PR_DBG("exit",0); - - return rc; -} -int armci_client_complete(ptl_event_kind_t evt,int proc_id, int nb_tag, - comp_desc *cdesc) -{ -int rc; -ptl_event_t ev_t; -ptl_event_t *ev=&ev_t; -comp_desc *temp_comp = NULL; -int loop=1; -int temp_proc; - ARMCI_PR_DBG("enter",0); - if(DEBUG_COMM){ - printf("\n%d:enter:client_complete active=%d tag=%d %d\n",armci_me, - cdesc->active,cdesc->tag,nb_tag);fflush(stdout); - } - if(nb_tag>0){ - if(cdesc->tag!=nb_tag)return 0; - } - while(cdesc->active!=0){ - ev->type=0; - if((rc = PtlEQWait(portals->eq_h, ev)) != PTL_OK){ - printf("%d:PtlEQWait(): %d %s\n", portals->rank,rc, - ARMCI_NET_ERRTOSTR(rc) ); - armci_die("EQWait problem",rc); - } - if (ev->ni_fail_type != PTL_NI_OK) { - temp_comp = (comp_desc *)ev->md.user_ptr; - printf("%d:NI sent %d in event %d,%d.\n", - armci_me,portals->rank.nid, portals->rank.pid, ev->ni_fail_type); - armci_die("event failure problem",temp_comp->dest_id); - } - if(DEBUG_COMM){ - printf("\n%d:armci_client_complete:done waiting type=%d\n",armci_me, - ev->type); - fflush(stdout); - } - if(cdesc!=ev->md.user_ptr){ - /*printf("\n%d:expecting desc %p completing %p\n",armci_me,cdesc,ev->md.user_ptr);*/ - } - if (ev->type == PTL_EVENT_SEND_END){ - if(DEBUG_COMM){ - printf("\n%d:armci_client_complete:event send end\n",armci_me); - fflush(stdout); - } - temp_comp = (comp_desc *)ev->md.user_ptr; - if(temp_comp->type==ARMCI_PORTALS_GETPUT || temp_comp->type==ARMCI_PORTALS_NBGETPUT){ - temp_comp->active=0; - temp_comp->tag=-1; - continue; - } - if(!armci_must_remotecomplete){ - if(temp_comp->type==ARMCI_PORTALS_PUT || temp_comp->type==ARMCI_PORTALS_NBPUT){ - temp_comp->active=0; - temp_comp->tag=-1; - } - else - continue; - } - else{ - temp_comp->active++; - continue; - } - } - - else if (ev->type == PTL_EVENT_REPLY_END){ - temp_comp = (comp_desc *)ev->md.user_ptr; - if(DEBUG_COMM){ - printf("\n%d:client_send_complete:reply end tag=%d\n",armci_me,temp_comp->tag); - fflush(stdout); - } - temp_comp->active = 0; /*this was a get request, so we are done*/ - temp_comp->tag=-1; - continue; - } - else if (ev->type == PTL_EVENT_ACK){ - temp_comp = (comp_desc *)ev->md.user_ptr; - if(DEBUG_COMM){ - printf("\n%d:client_send_complete:event ack tag=%d\n",armci_me,temp_comp->tag); - fflush(stdout); - } - temp_comp->active=0; - temp_comp->tag=-1; - portals->outstanding_puts--; - } - else if (ev->type==PTL_EVENT_PUT_END){ - _buf_ackresp_t *ar=_buf_ackresp_first; - while(ar!=NULL){ - if(ar->val==ev->offset){ - ar->val=0; - break; - } - ar=ar->next; - } - if(ar==NULL)armci_die("server wrote data at unexpected offset",ev->offset); - if(DEBUG_COMM){printf("\n%d:put end offset=%d",armci_me,ev->offset);fflush(stdout);} - } - else - armci_die("in client_complete unknown event",ev->type); - } - if(DEBUG_COMM){ - printf("\n%d:exit:client_complete active=%d tag=%d %d\n",armci_me, - cdesc->active,cdesc->tag,nb_tag);fflush(stdout); - } - - ARMCI_PR_DBG("exit",0); - - return rc; -} - - -comp_desc * get_free_comp_desc(int * comp_id) -{ -comp_desc * c; -int rc = PTL_OK; - - ARMCI_PR_DBG("enter",0); - - c = &(_compdesc_array[portals->free_comp_desc_index]); - if(c->active!=0 && c->tag>0) - armci_client_complete(0,c->dest_id,c->tag,c); - else{ - /* - if(c->active!=0) - printf("\n%d:potential problem:active completion descriptor but tag=%d",armci_me,c->tag); - else - printf("\n%d:potential problem:active completion descriptor with tag=%d",armci_me,c->tag); - */ - } - if(!armci_must_remotecomplete){ - do{ - rc = PtlMDUnlink(c->mem_dsc_hndl); - }while(rc==PTL_MD_IN_USE); - } - - *comp_id = portals->free_comp_desc_index; - if(DEBUG_COMM){ - printf("\nthe value of comp_desc_id is %d\n",*comp_id); - fflush(stdout); - } - portals->free_comp_desc_index = (portals->free_comp_desc_index+1) % NUM_COMP_DSCR; - - ARMCI_PR_DBG("exit",0); - - return c; -} - - -void print_mem_desc(ptl_md_t * md) -{ - printf("%d:%p:start=%p length=%d threshold=%d max_size=%d options=%d eq_handle=%d\n",armci_me,md,md->start, md->length,md->threshold,md->max_size,md->options,md->eq_handle); - fflush(stdout); -} - - -#ifndef NEW_MALLOC -#if 0 -void armci_unregister_shmem(void *my_ptr, long size) -{ -int i=0,dst,found=0; -long id ; -long reg_size=0; -int reg_num = _rem_meminfo[armci_me].reg_count; -void *tptr; - - ARMCI_PR_DBG("enter",reg_num); -#ifdef DEBUG_MEM - printf("%d:%s:got size=%ld myptr %p\n",armci_me,FUNCTION_NAME,size,my_ptr); - fflush(stdout); -#endif - bzero(_tmp_rem_reginfo,sizeof(aptl_reginfo_t)*armci_nproc); - if(reg_num>=MAX_MEM_REGIONS) - armci_die("reg_num corrupted",reg_num); - for(i=0;i=MAX_MEM_REGIONS) - armci_die("reg_num corrupted",reg_num); - for(i=0;i=MAX_MEM_REGIONS-1){ - printf("\n%d:more than expected regions -- %d, increase MAX_MEM_REGIONS",armci_me,_rem_meminfo[i].reg_count++);fflush(stdout); - armci_die2("more than expected regions",_rem_meminfo[i].reg_count,MAX_MEM_REGIONS); - } - } -#ifdef DEBUG_MEM - printf("%d: regist id=%ld found=%d size=%ld reg_num=%d\n", - armci_me,id,found,reg_size,reg_num); - fflush(stdout); -#endif - ARMCI_PR_DBG("exit",0); -} - -void armci_register_shmem_grp(void *my_ptr, long size, long *idlist, long off, - void *sptr,ARMCI_Group *group) -{ -ARMCI_Group orig_group; - ARMCI_PR_DBG("enter",0); - ARMCI_Group_get_default(&orig_group); - ARMCI_Group_set_default(group); - armci_register_shmem(my_ptr,size,idlist,off,sptr); - ARMCI_Group_set_default(&orig_group); - ARMCI_PR_DBG("enter",0); -} -#endif -#endif // end #ifdef ARMCI_REGISTER_SHMEM - -static int _get_rem_servinfo(int serv,size_t bytes, size_t* offset) -{ -int i; - ARMCI_PR_DBG("enter",0); - i = 16<<8; - *offset=(armci_me*NUM_SERV_BUFS+_client_servbuf_count[serv])*VBUF_DLEN; - _client_servbuf_count[serv] = (_client_servbuf_count[serv]+1)%NUM_SERV_BUFS; - ARMCI_PR_DBG("exit",i); - return i; -} - -static int _get_rem_info(int proc, void *ptr,size_t bytes, size_t* offset) -{ -#ifdef ARMCI_REGISTER_SHMEM -int i; - ARMCI_PR_DBG("enter",0); -#ifdef NEW_MALLOC - i = check_meminfo(ptr,(long)bytes,proc); - if(i==0){ - printf("\n%d:ptr=%p bytes=%d proc=%d",armci_me,ptr,bytes,proc); - armci_die("region not found",proc); - } - *offset = (size_t)((caddr_t)ptr-(caddr_t)portals->dsbase[i-1]); - printf("\n%d:ptr=%p dsbase[0]=%p offs=%ld",armci_me,ptr,portals->dsbase[0],*offset);fflush(stdout); - if(*offset>=0){ - ARMCI_PR_DBG("exit A",(i+1)); - return(i); - } -#else -rem_meminfo_t *mem_info=&(_rem_meminfo[proc]); -aptl_reginfo_t *memreg = mem_info->reginfo; - for(i=0;ireg_count;i++){ - /*for now size is not verified*/ - if(DEBUG_COMM){ - printf("\n%d:proc=%d regcount=%d reg=%d base=%p size=%d end=%p checkptr=%p\n",armci_me,proc,mem_info->reg_count,i,memreg[i].base_ptr,memreg[i].size, ((char *)memreg[i].base_ptr+memreg[i].size), ptr);fflush(stdout); - } - if((memreg[i].valid) && ptr>= memreg[i].base_ptr && - ptr< ((char *)memreg[i].base_ptr+memreg[i].size)){ - *offset = ((char *)ptr-(char *)memreg[i].base_ptr); - ARMCI_PR_DBG("exit A",(i+1)); - return (i+1); - } - } -#endif - ARMCI_PR_DBG("exit B",i); - armci_die("_get_rem_info, rem memory region not found",bytes); -#else - printf("_get_rem_info called ... this shouldn't happen"); abort(); -#endif -} - -void armci_client_direct_get(ptl_process_id_t dest_proc, - ptl_size_t offset_remote, ptl_match_bits_t mb, size_t bytes, - ptl_md_t *md_local, - ptl_handle_md_t *md_hdl_local) -{ -int rc; -ptl_size_t offset_local = 0; - - ARMCI_PR_DBG("enter",0); - - if(DEBUG_COMM){ - printf("\n%d:armci_client_direct_get:BYTES = %d\n",armci_me,bytes); - printf("\n%d:offr=%ld offl=%ld\n",armci_me,offset_remote,offset_local); - fflush(stdout); - } - - rc = PtlMDBind(portals->ni_h,*md_local, PTL_UNLINK, md_hdl_local); - if (rc != PTL_OK){ - printf("%d:PtlMDBind: %s\n", portals->rank, ARMCI_NET_ERRTOSTR(rc) ); - armci_die("ptlmdbind get failed",0); - } - -#ifdef CRAY_USE_MDMD_COPY - if (dest_proc.nid == portals->rank.nid) { - rc = PtlMDMDCopy(*md_hdl_local, dest_proc, - portals->ptl, - 0, - mb, - offset_remote); - } else { -#endif - rc = PtlGetRegion(*md_hdl_local,offset_local,bytes,dest_proc, - portals->ptl, - 0, - mb, - offset_remote); -#ifdef CRAY_USE_MDMD_COPY - } -#endif - - if (rc != PTL_OK){ - printf("%d:PtlGetRegion: %s\n", portals->rank, ARMCI_NET_ERRTOSTR(rc) ); - armci_die("PtlGetRegion failed",0); - } - - if(DEBUG_COMM){ - printf("\n%d:issued get\n",armci_me);fflush(stdout); - } - - ARMCI_PR_DBG("exit",0); -} - -void armci_portals_get(int proc, void *src_buf, void *dst_buf, int bytes, - void** cptr,int tag) -{ -int rc; -ptl_size_t offset_local = 0, offset_remote=0; -ptl_md_t *md_local; -ptl_handle_md_t *md_hdl_local; -int rem_info; -comp_desc *cdesc; -ptl_process_id_t dest_proc; -int c_info; -int cluster = armci_clus_id(proc); - - ARMCI_PR_DBG("enter",0); - - /*first remote process information*/ - /*dest_proc.nid = portals->procid_map[proc].nid; - dest_proc.pid = portals->procid_map[proc].pid;*/ - dest_proc.nid = portals->servid_map[cluster].nid; - dest_proc.pid = portals->servid_map[cluster].pid; - - /*create local xfer info*/ - cdesc = get_free_comp_desc(&c_info); - md_local = &cdesc->mem_dsc; - md_hdl_local = &cdesc->mem_dsc_hndl; - md_local->length=bytes; - md_local->start=dst_buf; - md_local->user_ptr = (void *)cdesc; - md_local->options = PTL_MD_OP_GET | PTL_MD_EVENT_START_DISABLE; - - /*get remote info*/ - rem_info = _get_rem_info(proc,src_buf,bytes,&offset_remote); - - cdesc->dest_id = proc; - if (tag){ - *((comp_desc **)cptr) = cdesc; - cdesc->tag = tag; - cdesc->type = ARMCI_PORTALS_NBGET; - /*printf("\n%d:get tag=%d c_info=%d - * %p",armci_me,tag,c_info,cdesc);fflush(stdout);*/ - } - else{ - cdesc->tag = 0; - cdesc->type = ARMCI_PORTALS_GET; - } - - cdesc->active = 1; - armci_client_direct_get(dest_proc,offset_remote,(ptl_match_bits_t)rem_info, - bytes,md_local,md_hdl_local); - - if(!tag){ - armci_client_complete(0,proc,0,cdesc); /* check this later */ - } - - ARMCI_PR_DBG("exit",0); -} - - -void armci_client_nb_get(int proc, void *src_buf, int *src_stride_arr, - void *dst_buf, int *dst_stride_arr, int bytes, - void** cptr,int tag) -{ -} - -void armci_client_direct_send(ptl_process_id_t dest_proc, - ptl_size_t offset_remote, ptl_match_bits_t mb, size_t bytes, - ptl_md_t *md_local, - ptl_handle_md_t *md_hdl_local) -{ -int rc; -ptl_size_t offset_local = 0; - - ARMCI_PR_DBG("enter",0); - - if(DEBUG_COMM){ - printf("%d:armci_client_direct_send:BYTES = %d\n",armci_me,bytes); - printf("\n%d:offr=%ld offl=%ld\n",armci_me,offset_remote,offset_local); - fflush(stdout); - } - /*print_mem_desc(md_local);*/ - rc = PtlMDBind(portals->ni_h,*md_local, PTL_UNLINK, md_hdl_local); - if (rc != PTL_OK){ - fprintf(stderr, "%d:PtlMDBind: %s\n", portals->rank, - ARMCI_NET_ERRTOSTR(rc)); - armci_die("ptlmdbind send failed",0); - } - if(armci_must_remotecomplete){ - rc = PtlPutRegion(*md_hdl_local,offset_local,bytes, - PTL_ACK_REQ, - dest_proc,portals->ptl,0, mb,offset_remote, 0); - } - else{ - rc = PtlPutRegion(*md_hdl_local,offset_local,bytes, - PTL_NOACK_REQ, - dest_proc,portals->ptl,0, mb,offset_remote, 0); - } - - if (rc != PTL_OK){ - fprintf(stderr, "%d:PtlPutRegion: %s\n", portals->rank, - ARMCI_NET_ERRTOSTR(rc) ); - armci_die("PtlPutRegion failed",0); - } - - ARMCI_PR_DBG("exit",0); -} - - -void armci_portals_put(int proc, void *src_buf, void *dst_buf, int bytes, - void** cptr,int tag) -{ -int rc; -ptl_size_t offset_local = 0, offset_remote=0; -ptl_md_t *md_local; -ptl_handle_md_t *md_hdl_local; -int rem_info; -comp_desc *cdesc; -ptl_process_id_t dest_proc; -int c_info; -int cluster = armci_clus_id(proc); - - ARMCI_PR_DBG("enter",0); - - /*first process information*/ - dest_proc.nid = portals->servid_map[cluster].nid; - dest_proc.pid = portals->servid_map[cluster].pid; - /*dest_proc.nid = portals->procid_map[proc].nid; - dest_proc.pid = portals->procid_map[proc].pid;*/ - - /*create local xfer info*/ - cdesc = get_free_comp_desc(&c_info); - md_local = &cdesc->mem_dsc; - md_hdl_local = &cdesc->mem_dsc_hndl; - md_local->length=bytes; - md_local->start=src_buf; - md_local->user_ptr = (void *)cdesc; - md_local->options = PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE; - - /*get remote info*/ - rem_info = _get_rem_info(proc,dst_buf,bytes,&offset_remote); - - - if(DEBUG_COMM){ - printf("\n%d:offr=%ld offl=%ld\n",armci_me,offset_remote,offset_local); - } - - cdesc->dest_id = proc; - if (tag){ - *((comp_desc **)cptr) = cdesc; - cdesc->tag = tag; - cdesc->type = ARMCI_PORTALS_NBPUT; - /*printf("\n%d:put tag=%d c_info=%d - * %p",armci_me,tag,c_info,cdesc);fflush(stdout);*/ - } - else{ - cdesc->tag = 0; - cdesc->type = ARMCI_PORTALS_PUT; - } - - cdesc->active = 1; - - armci_client_direct_send(dest_proc,offset_remote,(ptl_match_bits_t)rem_info, - bytes,md_local,md_hdl_local); - - if(!tag){ - armci_client_complete(0,proc,0,cdesc); /* check this later */ - } - else - portals->outstanding_puts++; - - - ARMCI_PR_DBG("exit",0); - -} - -void armci_client_nb_send(int proc, void *src_buf, int *src_stride_arr, - void *dst_buf, int *dst_stride_arr, int bytes, - void** cptr,int tag) - -{ -} - -/*using non-blocking for multiple 1ds inside a 2d*/ -void armci_network_strided(int op, void* scale, int proc,void *src_ptr, - int src_stride_arr[], void* dst_ptr, int dst_stride_arr[], - int count[], int stride_levels, armci_ihdl_t nb_handle) -{ -int i, j,tag=0; -long idxs,idxd; /* index offset of current block position to ptr */ -int n1dim; /* number of 1 dim block */ -int bvalue_s[MAX_STRIDE_LEVEL], bunit[MAX_STRIDE_LEVEL]; -int bvalue_d[MAX_STRIDE_LEVEL]; -int bytes = count[0]; -void *sptr,*dptr; -NB_CMPL_T cptr; -ptl_process_id_t dest_proc; -ptl_size_t offset_remote; -comp_desc *cdesc; -int c_info; -ptl_md_t *md_local; -int rem_info; -int cluster = armci_clus_id(proc); - - ARMCI_PR_DBG("enter",0); - - printf("%s calling abort ... network_strided not implemented\n",Portals_ID()); - abort(); - - if(nb_handle)tag=nb_handle->tag; - - /*first remote process information*/ - dest_proc.nid = portals->servid_map[cluster].nid; - dest_proc.pid = portals->servid_map[cluster].pid; - /*dest_proc.nid = portals->procid_map[proc].nid; - dest_proc.pid = portals->procid_map[proc].pid;*/ - - rem_info = _get_rem_info(proc,(op==GET)?src_ptr:dst_ptr,bytes,&offset_remote); - - /* number of n-element of the first dimension */ - n1dim = 1; - for(i=1; i<=stride_levels; i++) - n1dim *= count[i]; - - /* calculate the destination indices */ - bvalue_s[0] = 0; bvalue_s[1] = 0; bunit[0] = 1; - bvalue_d[0] = 0; bvalue_d[1] = 0; bunit[1] = 1; - for(i=2; i<=stride_levels; i++) { - bvalue_s[i] = bvalue_d[i] = 0; - bunit[i] = bunit[i-1] * count[i-1]; - } - - if(ARMCI_ACC(op)){ /*for now die for acc*/ - /*lock here*/ -# ifdef ARMCI_CHECK_STATE - arlist_print(arn); - armci_rem_state(armci_clus_info[proc].master%armci_clus_info[0].nslave); -# endif - printf("\nSHOULD NOT DO NETWORK_STRIDED FOR ACCS \n",armci_me); - fflush(stdout); - armci_die("network_strided called for acc",proc); - } - - /*loop over #contig chunks*/ - for(i=0; i (count[j]-1)) bvalue_s[j] = 0; - if(bvalue_d[j] > (count[j]-1)) bvalue_d[j] = 0; - } - sptr = ((char *)src_ptr)+idxs; - dptr = ((char *)dst_ptr)+idxd; - cdesc = get_free_comp_desc(&c_info); - md_local = &cdesc->mem_dsc; - md_hdl_local = &cdesc->mem_dsc_hndl; - md_local->length=bytes; - md_local->start=(op==GET)?dptr:sptr; - md_local->user_ptr = (void *)cdesc; - cdesc->dest_id = proc; - cdesc->tag = tag; - - if(op==GET){ - md_local->options = PTL_MD_OP_GET | PTL_MD_EVENT_START_DISABLE; - cdesc->active = 1; - cdesc->type = ARMCI_PORTALS_NBGET; - /* - printf("\n%d:reminfo=%d off=%d idxs=%d idxd=%d",armci_me, rem_info, - offset_remote, idxs, idxd); - */ - armci_client_direct_get( dest_proc,offset_remote+idxs,rem_info, - bytes,md_local,md_hdl_local); - } - else if(op==PUT){ - cdesc->active = 1; - cdesc->type = ARMCI_PORTALS_NBPUT; - md_local->options = PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE; - armci_client_direct_send(dest_proc,offset_remote+idxd,rem_info, - bytes,md_local,md_hdl_local); - if(op==PUT)portals->outstanding_puts++; - } - else if(ARMCI_ACC(op)){ - assert(0); - } - else{ - ARMCI_PR_DBG("exit",0); - armci_die("in network_strided unknown opcode",op); - } - armci_client_complete(0,proc,tag,cdesc); - } - - if(ARMCI_ACC(op)){ - /*unlock here*/ - } - - if(nb_handle){ - /* completing the last call is sufficient, given ordering semantics*/ - nb_handle->tag=tag; - nb_handle->cmpl_info=cdesc; - } - else{ - /*completing the last call ensures everything before it is complete this - * is one of the main reasons why dataserver is necessary*/ - /*armci_client_complete(0,proc,tag,cdesc);*/ - } - ARMCI_PR_DBG("exit",0); -} - -void armci_client_direct_getput(ptl_process_id_t dest_proc, - ptl_size_t offset_remote, ptl_match_bits_t mb, size_t bytes, - ptl_md_t *md_local_get,ptl_md_t *md_local_put, - ptl_handle_md_t *md_hdl_local_get, ptl_handle_md_t - *md_hdl_local_put) -{ -int rc; -ptl_size_t offset_get = 0; -ptl_size_t offset_put = 0; - - ARMCI_PR_DBG("enter",0); - - if(DEBUG_COMM){ - printf("%d:armci_client_direct_getput:BYTES = %d\n",armci_me,bytes); - printf("\n%d:offr=%ld\n",armci_me,offset_remote);fflush(stdout); - } - - rc = PtlGetPutRegion(*md_hdl_local_get, offset_get, *md_hdl_local_put, - offset_put,bytes,dest_proc, portals->ptl,0,mb, - offset_remote,0); - if (rc != PTL_OK){ - printf("%d:PtlGetPutRegion: %s\n", portals->rank, ARMCI_NET_ERRTOSTR(rc) ); - armci_die("PtlGetPutRegion failed",0); - } - - ARMCI_PR_DBG("exit",0); - -} - - -long a_p_putfrom; -long a_p_getinto; - - -int armci_portals_rmw_(int op, int *ploc, int *prem, int extra, int proc) -{ - printf("error rmw"); - return(0); -} - -void armci_portals_shmalloc_allocate_mem(int num_lks) -{ -void **ptr_arr; -void *ptr; -armci_size_t bytes = 128; -int i; - - ARMCI_PR_DBG("enter",0); - ptr_arr = (void**)malloc(armci_nproc*sizeof(void*)); - if(!ptr_arr) armci_die("armci_shmalloc_get_offsets: malloc failed", 0); - bzero((char*)ptr_arr,armci_nproc*sizeof(void*)); - - ARMCI_Malloc(ptr_arr,bytes); - ARMCI_PR_DBG("exit",0); - - return; -} - - -void armci_wait_for_server() -{ - ARMCI_PR_DBG("enter",0); - armci_server_terminating=1; - armci_serv_quit(); - ARMCI_PR_DBG("exit",0); -} - -/*client buffers info*/ -void armci_portals_client_buf_info(char *buf, ptl_match_bits_t *mb, ptl_size_t *offset,int proc) -{ - ARMCI_PR_DBG("enter",0); - *mb = (1<<30); - *offset = buf-client_buf_ptrs[proc]; - if(DEBUG_SERV){printf("\n(%d):serv writing to ofset %d on %d\n",armci_me,*offset,proc);fflush(stdout);} - ARMCI_PR_DBG("exit",0); -} - -/*memory for client buffers*/ -char *armci_portals_client_buf_allocate(int bytes) -{ -void *ptr; -ptl_match_bits_t ignbits = 0xFFFFFFFF0FFFFFFF; -ptl_match_bits_t mbits = 1; -ptl_md_t *md_ptr,md; -ptl_process_id_t match_id; -ptl_handle_me_t me_h; -ptl_handle_md_t md_h; -int rc; - ARMCI_PR_DBG("enter",sizeof(ptl_match_bits_t)); - ptr = malloc(bytes); - bzero(ptr,bytes); - assert(ptr); - - mbits = (1<<30); - md_ptr = &(md); - md_ptr->start = ptr; - md_ptr->length = bytes; - md_ptr->threshold = PTL_MD_THRESH_INF; - md_ptr->options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE | PTL_MD_EVENT_START_DISABLE; - md_ptr->user_ptr = NULL; - md_ptr->max_size = 0; - /*logic that says, eq_h is now recieving data for the buffers, including acks! */ - md_ptr->eq_handle = portals->eq_h; - match_id.nid = PTL_NID_ANY; - match_id.pid = PTL_PID_ANY; - rc = PtlMEAttach(portals->ni_h,portals->ptl,match_id, - mbits,ignbits,PTL_RETAIN,PTL_INS_AFTER,&(me_h)); - if (rc != PTL_OK){ - printf("%d:PtlMEAttach: %s\n", portals->rank, ARMCI_NET_ERRTOSTR(rc) ); - armci_die("portals attach error2",rc); - } - rc = PtlMDAttach(me_h,md,PTL_RETAIN,&md_h); - if (rc != PTL_OK) { - printf("%d:PtlMDAttach: %s %d\n", portals->rank, ARMCI_NET_ERRTOSTR(rc),(client_md_count+serv_md_count) ); - armci_die("portals attach error CBA",rc); - } - client_md_count++; - - client_buf_ptrs[armci_me]=ptr; - armci_msg_barrier(); - armci_exchange_address(client_buf_ptrs,armci_nproc); - - ARMCI_PR_DBG("exit",0); - return(ptr); -} - -void armci_transport_cleanup() -{ - /*for i=0tomaxpendingclean*/ - ARMCI_PR_DBG("enter",0); - free(client_buf_ptrs); - ARMCI_PR_DBG("exit",0); -} - -void free_serv_bufs() -{ - if(serv_bufs) free(serv_bufs); -} - - -int armci_send_req_msg(int proc, void *buf, int bytes,int tag) -{ -#ifndef OLD_PORTALS_CODE - int cluster = armci_clus_id(proc); - int serv = armci_clus_info[cluster].master; - char *buffer = NULL; - request_header_t *msginfo = (request_header_t *) buf; - -// # ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - _armci_buf_ensure_one_outstanding_op_per_node(buf,cluster); -// # endif - - # ifdef PORTALS_USE_ARMCI_CLIENT_BUFFERS - BUF_INFO_T *bufinfo=_armci_buf_to_bufinfo(msginfo); - _buf_ackresp_t *ar = &bufinfo->ar; - portals_ds_req_t *req = &ar->req; - # endif - - if(msginfo->operation == PUT || ARMCI_ACC(msginfo->operation)) { - // printf("%s cp: sending packed put\n",Portals_ID()); - # ifdef PORTALS_USE_ARMCI_CLIENT_BUFFERS - portals_remote_nbput(buf, buf, cluster, req); - // portalsWaitOnRequest(req); - # else - portals_remote_put(buf, buf, cluster); - # endif - // printf("%s cp: finished packed put\n",Portals_ID()); - } - - else if(msginfo->operation == GET) { - buffer = (char *) buf; - buffer += sizeof(request_header_t); - buffer += msginfo->dscrlen; - // printf("%s cp: sending blocking get request\n",Portals_ID()); - # ifdef PORTALS_USE_ARMCI_CLIENT_BUFFERS - portals_remote_nbget(buffer, msginfo, cluster, req); - // portalsWaitOnRequest(req); - # else - portals_remote_get(buffer, msginfo, cluster); - # endif - // printf("%s cp: get request finished\n",Portals_ID()); - } - - else if(msginfo->operation == ACK) { - # ifdef PORTALS_USE_ARMCI_CLIENT_BUFFERS - portalsRemoteOperationToNode(buf, bytes, cluster, req); - // portalsWaitOnRequest(req); - # else - portalsBlockingRemoteOperationToNode(buf, bytes, cluster); - # endif - } - - else if(msginfo->operation == ARMCI_SWAP || msginfo->operation == ARMCI_SWAP_LONG || - msginfo->operation == ARMCI_FETCH_AND_ADD || msginfo->operation == ARMCI_FETCH_AND_ADD_LONG) { - buffer = (char *) buf; - buffer += sizeof(request_header_t); - buffer += msginfo->dscrlen; - portals_remote_rmw(buffer, msginfo, cluster, req); - # ifndef PORTALS_USE_ARMCI_CLIENT_BUFFERS - portalsWaitOnOperation(req); - # endif - } - - else if(msginfo->operation!=ATTACH) { - printf("%s cp: msginfo->operation=%d not supported yet\n",Portals_ID(),msginfo->operation); - abort(); - } - - # ifdef PORTALS_USE_ARMCI_CLIENT_BUFFERS -/* for now, clear the ackresp structure because the call had to have been blocking - later, we will allow a modified x_net_wait_ackresp clear it */ - ar->val = ar->valc = 0; - if(ar==_buf_ackresp_first)_buf_ackresp_first=ar->next; - if(ar->next!=NULL){ - ar->next->previous=ar->previous; - } - if(ar->previous!=NULL){ - ar->previous->next=ar->next; - if(_buf_ackresp_cur==ar)_buf_ackresp_cur=ar->previous; - } - if(_buf_ackresp_cur==ar)_buf_ackresp_cur=NULL; - ar->previous=ar->next=NULL; - # endif - - return 0; - -#else - -int rc; -ptl_size_t offset_local = 0, offset_remote=0; -ptl_md_t *md_local; -ptl_handle_md_t *md_hdl_local; -int rem_info; -comp_desc *cdesc; -void *cptr; -ptl_process_id_t dest_proc; -int c_info; -int cluster = armci_clus_id(proc); -int serv = armci_clus_info[cluster].master; -request_header_t *msginfo = (request_header_t *)buf; - - ARMCI_PR_DBG("enter",0); - if(msginfo->operation==GET && msginfo->dscrlen<=msginfo->datalen){ - *(long *)((char *)(msginfo+1)+msginfo->datalen)=0; - } - - /*badbadbad*/ - msginfo->tag.ack_ptr=&(msginfo->tag.ack); - cptr = (void *)((double *)buf-1); - /*first process information*/ - dest_proc.nid = portals->servid_map[cluster].nid; - dest_proc.pid = portals->servid_map[cluster].pid; - /*create local xfer info*/ - cdesc = get_free_comp_desc(&c_info); - md_local = &cdesc->mem_dsc; - md_hdl_local = &cdesc->mem_dsc_hndl; - md_local->length=bytes; - md_local->start=buf; - md_local->user_ptr = (void *)cdesc; - md_local->options = PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE; - - /*get remote info*/ - rem_info = _get_rem_servinfo(cluster,(size_t)bytes,&offset_remote); - - if(DEBUG_COMM){ - printf("\n%d:offr=%ld offl=%ld\n",armci_me,offset_remote,offset_local); - } - - cdesc->dest_id = serv; - *((comp_desc **)cptr) = cdesc; - if(tag==0)tag=GET_NEXT_NBTAG(); - cdesc->tag = tag; - cdesc->type = ARMCI_PORTALS_NBPUT; - /*printf("\n%d:put tag=%d c_info=%d - * %p",armci_me,tag,c_info,cdesc);fflush(stdout);*/ - cdesc->active = 1; - - if(msginfo->operation==PUT || msginfo->operation == UNLOCK || ARMCI_ACC(msginfo->operation)){ - _buf_ackresp_cur->valc = _buf_ackresp_cur->val = (char *)msginfo->tag.ack_ptr-client_buf_ptrs[armci_me]; -# ifdef ARMCI_CHECK_STATE - arlist_add(&arn,_buf_ackresp_cur->val,msginfo->operation); -# endif - } - else { - _buf_ackresp_cur->valc = _buf_ackresp_cur->val = (char *)msginfo->tag.data_ptr-client_buf_ptrs[armci_me]; -# ifdef ARMCI_CHECK_STATE - arlist_add(&arn,_buf_ackresp_cur->val,msginfo->operation); -# endif - } - - if(DEBUG_COMM){printf("\n%d:registered %d in val to %d at %d %d\n",armci_me,_buf_ackresp_cur->val,serv,offset_remote,msginfo->operation);fflush(stdout);} - _armci_buf_ensure_pend_outstanding_op_per_node(buf,cluster); - armci_client_direct_send(dest_proc,offset_remote,(ptl_match_bits_t)rem_info, - bytes,md_local,md_hdl_local); - /*if(msginfo->operation==GET){ - BUF_INFO_T *info=((char *)msginfo-sizeof(BUF_EXTRA_FIELD_T)-sizeof(BUF_INFO_T)); - armci_client_complete(0,proc,cdesc->tag,cdesc); - }*/ - /*armci_client_complete(0,proc,cdesc->tag,cdesc);*/ - - portals->outstanding_puts++; - - ARMCI_PR_DBG("exit",0); - return 0; -#endif - -} - - -char *armci_ReadFromDirect(int proc, request_header_t *msginfo, int len) -{ -#ifndef OLD_PORTALS_CODE - # ifdef PORTALS_USE_ARMCI_CLIENT_BUFFERS - BUF_INFO_T *bufinfo = _armci_buf_to_bufinfo(msginfo); - portals_ds_req_t *req = &bufinfo->ar.req; - portalsWaitOnRequest(req); - # endif - char *ret = (char *) msginfo; - ret += sizeof(request_header_t); - ret += msginfo->dscrlen; - return ret; -#else -long *flag; -int loop; -BUF_INFO_T *bufinfo=_armci_buf_to_bufinfo(msginfo); - - ARMCI_PR_DBG("enter",0); - if(len) - flag = (long *)((char *)(msginfo+1)+len); - else - flag = (long *)((char *)(msginfo+1)+msginfo->datalen); - x_net_wait_ackresp(&(bufinfo->ar)); - - while(armci_util_long_getval(flag) != ARMCI_TAIL){ - loop++; - loop %=100000; - if(loop==0){ - if(DEBUG_COMM){ - printf("%d: client flag(%p)=%ld off=%d %d\n", - armci_me,flag,*flag,msginfo->datalen,*((int*)(msginfo+1))); - fflush(stdout); - } - } - } - *flag=0; - ARMCI_PR_DBG("exit",0); - return (msginfo+1); -#endif -} - -#ifdef ARMCI_CHECK_STATE -extern void sarlist_add(int,int,long); -#endif - -void armci_WriteToDirect(int proc, request_header_t* msginfo, void *buf) -{ -#ifndef OLD_PORTALS_CODE - ptl_size_t bytes = (ptl_size_t) msginfo->datalen; - ptl_event_t *ev = (ptl_event_t *) msginfo->tag.user_ptr; - portals_ds_send_put(buf, msginfo->datalen, ev->initiator, ev->hdr_data); - // you could do an assertion that the portals_id_map of proc == ev->initiator -#else -long *tail; -int bytes; -void *dst_addr = msginfo->tag.data_ptr; -ptl_match_bits_t ignbits = 0xFFFFFFFF0FFFFFFF; -ptl_match_bits_t mbits = 1; -ptl_md_t *md_ptr,md; -ptl_process_id_t match_id; -ptl_handle_me_t me_h; -ptl_size_t offst,localoffset; -int rc; - - /* set tail ack, make sure it is alligned */ - ARMCI_PR_SDBG("enter",0); - bytes = msginfo->datalen+sizeof(long); - if(!(buf>=serv_bufs->buf && bufbufend)){ - bcopy(buf,(msginfo+1),bytes); - buf=(msginfo+1); - } - tail = (long*)(buf + msginfo->datalen); - *tail = ARMCI_TAIL; - - armci_portals_client_buf_info((char *)dst_addr,&mbits,&offst,proc); - -# ifdef ARMCI_CHECK_STATE - sarlist_add(proc,msginfo->operation,offst); -# endif - - match_id.nid = portals->procid_map[proc].nid; - match_id.pid = portals->procid_map[proc].pid; - localoffset=(char *)buf-(char *)serv_bufs->buf; - if(DEBUG_COMM){ - printf("\n(%d):dst=%p,mbits=%d,localoffset=%d,offst=%d,proc=%d,nid=%d,pid=%d len=%d\n",armci_me, - dst_addr,mbits,localoffset,offst,proc,portals->procid_map[proc].nid, - portals->procid_map[proc].pid,bytes);fflush(stdout); - } - rc = PtlPutRegion(serv_response_md_h,localoffset,bytes,PTL_NOACK_REQ, - match_id,portals->ptl,0,mbits,offst,0); - if (rc != PTL_OK){ - fprintf(stderr, "%d:PtlPutRegion: %s\n", portals->Srank, - ARMCI_NET_ERRTOSTR(rc) ); - armci_die("PtlPutRegion failed",0); - } - ARMCI_PR_SDBG("exit",0); -#endif -} - - -void armci_rcv_req(void *mesg,void *phdr,void *pdescr,void *pdata,int *buflen) -{ -int i,na; -char *a; -double *tmp; -request_header_t *msginfo = (request_header_t *)mesg; - ARMCI_PR_SDBG("enter",msginfo->operation); - *(void **)phdr = msginfo; - if(0){ - printf("%s [ds]: got %d req (hdrlen=%d dscrlen=%d datalen=%d %d) from %d\n", - Portals_ID(), msginfo->operation, sizeof(request_header_t), msginfo->dscrlen, - msginfo->datalen, msginfo->bytes,msginfo->from); - fflush(stdout); - } - /* we leave room for msginfo on the client side */ - *buflen = MSG_BUFLEN - sizeof(request_header_t); - - if(msginfo->bytes) { - *(void **)pdescr = msginfo+1; - *(void **)pdata = msginfo->dscrlen + (char*)(msginfo+1); - - if(msginfo->operation == GET) { - // the descriptor will exists after the request header - // but there will be no data buffer - // use the MessageRcvBuffer - *(void**) pdata = MessageSndBuffer; -// printf("%s (server) overriding pdata in rcv_req\n",Portals_ID()); - } - } - else { - *(void**)pdescr = NULL; - *(void**)pdata = MessageRcvBuffer; - } - ARMCI_PR_SDBG("exit",msginfo->operation); -} - -void armci_call_data_server() -{ -int rc; -ptl_event_t ev_t; -ptl_event_t *ev=&ev_t; -serv_buf_t *compbuf = NULL; -int loop=1; -int temp_proc; -int ccc=2,rrr; -cpu_set_t mycpuid,new_mask; -char str[CPU_SETSIZE]; -char ncid[8],*cidptr,cid[8]; -extern char * cpuset_to_cstr(cpu_set_t *mask, char *str); - ARMCI_PR_SDBG("enter",0); - //if(armci_me==0)unsetenv("CRAY_PORTALS_USE_BLOCKING_POLL"); - sprintf (cid, "%d", ccc); - rrr = cstr_to_cpuset(&new_mask,cid); - -/* ------------------------------------------------------------ *\ - Change affinity for the data server -\* ------------------------------------------------------------ */ - if(sched_setaffinity(0, sizeof (new_mask), &new_mask)) { - perror("sched_setaffinity"); - printf("failed to set pid %d's affinity.\n", getpid()); - } - if(DEBUG_SERV){ - rrr=sched_getaffinity(0, sizeof(mycpuid), &mycpuid); - if(rrr)perror("sched_getaffinity"); - cidptr = cpuset_to_cstr(&mycpuid,ncid); - printf("(%d):my affinity is to %s\n",armci_me,ncid); - fflush(stdout); - } - -/* ------------------------------------- *\ - Main data server loop -\* ------------------------------------- */ - while(armci_server_terminating==0){ - - /* ------------------------------------------------------------ *\ - check event queue for incoming data requests from remote CPs - \* ------------------------------------------------------------ */ - ev->type=0; - if((rc = PtlEQWait(portals->Seq_h, ev)) != PTL_OK){ - printf("(%d):PtlEQWait(): %d %s\n", armci_me,rc,ARMCI_NET_ERRTOSTR(rc) ); - armci_die("EQWait problem",rc); - } - if (ev->ni_fail_type != PTL_NI_OK) { - printf("(%d)%d,%d:NI sent %d in event.\n", - armci_me,portals->Srank.nid, portals->Srank.pid,ev->ni_fail_type); - fflush(stdout); - armci_die2("event failure problem",ev->initiator.nid,ev->initiator.pid); - } - if(DEBUG_SERV){ - printf("\n(%d):armci_call_data_server: ptl event detected=%d\n",armci_me,ev->type); - fflush(stdout); - } - - /* ------------------------------------------------------------ *\ - PTL_EVENT_SEND_END: is ignored. This event is triggered as - the DS returns data to a remote CP via a PtlPut. This event - signals that that PtlPut has complete. - \* ------------------------------------------------------------ */ - if(ev->type == PTL_EVENT_SEND_END) continue; - - - /* ------------------------------------------------------------ *\ - PTL_EVENT_PUT_END: this is the key portals event for the DS. - PUT_END signifies that a remote data request has come in - from a remote CP. This data request will be handled by the - data server: armci_data_server - \* ------------------------------------------------------------ */ - else if(ev->type == PTL_EVENT_PUT_END) { - if(DEBUG_SERV) { - printf("\n(%d):ev->offset=%d from %d%d",armci_me,ev->offset, - ev->initiator.pid,ev->initiator.nid); - fflush(stdout); - } - armci_data_server(((char *)serv_bufs->buf+ev->offset)); - } - - /* ------------------------------------------------------------ *\ - Unexpected Portals Event -- Panic! - \* ------------------------------------------------------------ */ - else { - armci_die("unexpected event in data server",ev->type); - } - } - ARMCI_PR_SDBG("exit",0); -} - -void x_buf_wait_ack(request_header_t *msginfo, BUF_INFO_T *bufinfo) -{ - ARMCI_PR_DBG("enter",bufinfo->ar.val); - if(DEBUG_COMM){printf("\n%d:waiting for ack at %p",armci_me,&(msginfo->tag.ack));fflush(stdout);} - x_net_wait_ackresp(&(bufinfo->ar)); - armci_util_wait_long(&(msginfo->tag.ack),ARMCI_STAMP,10000); - if(DEBUG_COMM){printf("\n%d:done waiting for ack at %p",armci_me,&(msginfo->tag.ack));fflush(stdout);} - msginfo->tag.ack=0; - ARMCI_PR_DBG("exit",0); -} - -void x_net_send_ack(request_header_t *msginfo, int proc,void *dst,void *src) -{ -long *tail; -int bytes=sizeof(long); -ptl_size_t offst; -ptl_match_bits_t ignbits = 0xFFFFFFFF0FFFFFFF; -ptl_match_bits_t mbits = 1; -ptl_process_id_t match_id; -int rc; - - /* set tail ack, make sure it is alligned */ - ARMCI_PR_SDBG("enter",0); - - - armci_portals_client_buf_info((char *)dst,&mbits,&offst,proc); - -# ifdef ARMCI_CHECK_STATE - sarlist_add(proc,msginfo->operation,offst); -# endif - - match_id.nid = portals->procid_map[proc].nid; - match_id.pid = portals->procid_map[proc].pid; - if(DEBUG_SERV){ - printf("\n(%d):dst=%p,mbits=%d,offst=%d,proc=%d,nid=%d,pid=%d len=%d\n",armci_me, - dst,mbits,offst,proc,portals->procid_map[proc].nid, - portals->procid_map[proc].pid,bytes);fflush(stdout); - } - - rc = PtlPutRegion(serv_ack_md_h,0,bytes,PTL_NOACK_REQ, - match_id,portals->ptl,0,mbits,offst,0); - if (rc != PTL_OK){ - fprintf(stderr, "%d:PtlPutRegion: %s\n", portals->Srank, - ARMCI_NET_ERRTOSTR(rc) ); - armci_die("PtlPutRegion failed",0); - } - ARMCI_PR_SDBG("exit",0); -} - -long x_net_offset(char *buf,int proc) -{ -#ifdef ARMCI_REGISTER_SHMEM -int i; -#if NEW_MALLOC - if((i=check_meminfo(buf,1,proc))==0) - armci_die("x_net_offset,reg not found",proc); - return(all_meminfo[proc].serv_offs[i]); -#else - ARMCI_PR_DBG("enter",_rem_meminfo[proc].reg_count); - if(DEBUG_COMM){printf("\n%d:%s:buf=%p",armci_me,FUNCTION_NAME,buf);fflush(stdout); } - for(i=0;i<_rem_meminfo[proc].reg_count;i++){ - if(IN_REGION(buf,_rem_meminfo[proc].reginfo[i])){ -#ifdef DEBUG_MEM - {printf("\n%d:found it in reg=%d (%p,%d) for proc=%d",armci_me,i,_rem_meminfo[proc].reginfo[i].base_ptr,_rem_meminfo[proc].reginfo[i].size,proc);} -#endif - return((long)((char *)_rem_meminfo[proc].reginfo[i].serv_ptr-(char *)_rem_meminfo[proc].reginfo[i].base_ptr)); - } - } -#endif - ARMCI_PR_DBG("exit",0); -#else - printf("x_net_offset called; this shouldn't happen ...\n"); abort(); -#endif -} - -void armci_set_serv_mutex_arr(void *ptr) -{ -int i; -long offset; - ARMCI_PR_DBG("enter",0); - offset=x_net_offset(ptr,armci_me); - - _armci_server_mutex_ready=1; - _armci_server_mutex_ptr = (char *)ptr+offset; - ARMCI_PR_DBG("exit",0); - -} - diff --git a/armci/src-portals/armci_portals.h b/armci/src-portals/armci_portals.h deleted file mode 100644 index 8fed1dac7..000000000 --- a/armci/src-portals/armci_portals.h +++ /dev/null @@ -1,150 +0,0 @@ -#ifndef ARMCI_PORTALS_H -#define ARMCI_PORTALS_H - -/* portals header file */ - -#include -#include -#include - -#define NUM_COMP_DSCR 4 - -#define ARMCI_PORTALS_PTL_NUMBER 37 - -#define HAS_RDMA_GET -#define NUM_SERV_BUFS 1 - -/*corresponds to num of different armci mem regions*/ -#define MAX_MEM_REGIONS 10 - -#define VBUF_DLEN_ORG 4*64*1024 -#define VBUF_DLEN 16*1024 -#define MSG_BUFLEN_DBL_VT ((VBUF_DLEN)>>3) - -/* VBUF_DLEN are only used in Vinod's code */ - -#ifdef PORTALS_USE_RENDEZ_VOUS -# define MSG_BUFLEN_DBL 262144 /* for rendez-vous, this can go bigger i think */ -#else -# define MSG_BUFLEN_DBL 1280 /* this is smaller when rendez-vous is off */ -#endif - - - -#define ARMCI_NET_ERRTOSTR(__ARMCI_ERC_) ptl_err_str[__ARMCI_ERC_] - -typedef enum op { - ARMCI_PORTALS_PUT, - ARMCI_PORTALS_NBPUT, - ARMCI_PORTALS_GET, - ARMCI_PORTALS_NBGET, - ARMCI_PORTALS_ACC, - ARMCI_PORTALS_NBACC, - ARMCI_PORTALS_GETPUT, - ARMCI_PORTALS_NBGETPUT -} armci_portals_optype; - -typedef struct { - void *data_ptr; /* pointer where the data should go */ - long ack; /* header ack */ - void *ack_ptr; /* pointer where the data should go */ - void *user_ptr; -#if defined(SERV_QUEUE) - int imm_msg; - size_t data_len; -#endif -} msg_tag_t; - -typedef struct armci_portals_desc{ - int active; - int tag; - int dest_id; - armci_portals_optype type; - ptl_md_t mem_dsc; - ptl_handle_md_t mem_dsc_hndl; - char *bufptr; -}comp_desc; - -/*for buffers*/ -extern char *armci_portals_client_buf_allocate(int); -#define BUF_ALLOCATE armci_portals_client_buf_allocate -#define BUF_EXTRA_FIELD_T comp_desc* - -#define INIT_SEND_BUF(_field,_snd,_rcv) _snd=1;_rcv=1;_field=NULL - -#define GET_SEND_BUFFER _armci_buf_get -#define FREE_SEND_BUFFER _armci_buf_release - -#define CLEAR_SEND_BUF_FIELD(_field,_snd,_rcv,_to,_op) if((_op==UNLOCK || _op==PUT || ARMCI_ACC(_op)) && _field!=NULL)x_buf_wait_ack((request_header_t *)((void **)&(_field)+1),((char *)&(_field)-sizeof(BUF_INFO_T)));_field=NULL; -#define TEST_SEND_BUF_FIELD(_field,_snd,_rcv,_to,_op,_ret) -#define COMPLETE_HANDLE _armci_buf_complete_nb_request - -#define NB_CMPL_T comp_desc* -#define ARMCI_NB_WAIT(_cntr) if(_cntr){\ - int rc;\ - if(nb_handle->tag)\ - if(nb_handle->tag==_cntr->tag)\ - rc = armci_client_complete(0,nb_handle->proc,nb_handle->tag,_cntr);\ -} else{\ -printf("\n%d:wait null ctr\n",armci_me);} - -#ifndef MAX_DS -#define MAX_DS 16 -#endif - -/* structure of computing process */ -typedef struct { - ptl_pt_index_t ptl; - ptl_process_id_t rank; - ptl_handle_ni_t ni_h; - ptl_handle_eq_t eq_h; - ptl_process_id_t Srank; - ptl_handle_ni_t Sni_h; - ptl_handle_eq_t Seq_h; - int outstanding_puts; - int outstanding_gets; - ptl_process_id_t *procid_map; - ptl_process_id_t *servid_map; - int free_comp_desc_index; - caddr_t dsbase[MAX_DS]; - size_t dssizes[MAX_DS]; - ptl_match_bits_t heap_mb[MAX_DS]; - ptl_md_t heap_md[MAX_DS]; - ptl_handle_me_t heap_me_h[MAX_DS]; - ptl_handle_md_t heap_md_h[MAX_DS]; - void *brval[MAX_DS]; - long serv_offs[MAX_DS]; - int cur_ds; -}armci_portals_proc_t; - -typedef struct { - ptl_match_bits_t mb; - ptl_md_t md; - ptl_handle_me_t me_h; - ptl_handle_md_t md_h; -}armci_portals_serv_mem_t; - -typedef struct { - int reg_count; - int outstanding_puts; - int outstanding_gets; - armci_portals_serv_mem_t meminfo[MAX_MEM_REGIONS]; -}armci_portals_serv_t; - - -extern void print_mem_desc_table(void); -extern int armci_init_portals(caddr_t); -extern void armci_fini_portals(void); -extern int armci_post_descriptor(ptl_md_t *md); -extern int armci_prepost_descriptor(void* start, long bytes); -extern ptl_size_t armci_get_offset(ptl_md_t md, void *ptr,int proc); -extern int armci_get_md(void * start, int bytes , ptl_md_t * md, ptl_match_bits_t * mb); -extern void armci_portals_put(int,void *,void *,int,void **,int ); -extern void armci_portals_get(int,void *,void *,int,void **,int ); -extern void comp_desc_init(); -extern int armci_client_complete(ptl_event_kind_t evt,int proc_id, int nb_tag ,comp_desc * cdesc); -extern void armci_portals_memsetup(long); - -extern MPI_Comm portals_smp_comm; - -#endif /* ARMCI_PORTALS_H */ diff --git a/armci/src-portals/armci_shmem.h b/armci/src-portals/armci_shmem.h deleted file mode 100644 index 5db18fff9..000000000 --- a/armci/src-portals/armci_shmem.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _SHMEM_H_ -#define _SHMEM_H_ -extern void Set_Shmem_Limit(unsigned long shmemlimit); -extern void Delete_All_Regions(); -extern char* Create_Shared_Region(long idlist[], long size, long *offset); -extern char* Attach_Shared_Region(long idlist[], long size, long offset); -extern void Free_Shmem_Ptr(long id, long size, char* addr); -extern long armci_shmem_reg_size(int i, long id); -extern char* armci_shmem_reg_ptr(int i); - -#define POST_ALLOC_CHECK(temp,size) ; - -#define MAX_REGIONS 64 - -#if defined(WIN32) -#define SHMIDLEN 3 -#else -#define SHMIDLEN (MAX_REGIONS + 2) -#endif - -#define IDLOC (SHMIDLEN - 3) - -#endif diff --git a/armci/src-portals/armcip.h b/armci/src-portals/armcip.h deleted file mode 100644 index e8528f8df..000000000 --- a/armci/src-portals/armcip.h +++ /dev/null @@ -1,505 +0,0 @@ -/* $Id: armcip.h,v 1.82.2.9 2007-08-29 17:32:31 manoj Exp $ */ -/* armci private header file */ -#ifndef _ARMCI_P_H - -#define _ARMCI_P_H -#include -#include "armci.h" -#include "message.h" -#include "code_options.h" -#if 0 -#define ARMCI_PR_DBG(__ARMCI_ST,__ARMCI_NU) \ - printf("\n%d:%s:%d:%s:%s:%d",armci_me,__FILE__,__LINE__,FUNCTION_NAME,__ARMCI_ST,__ARMCI_NU);fflush(stdout) -#define ARMCI_PR_SDBG(__ARMCI_ST,__ARMCI_NU) \ - printf("\n(%d):%s:%d:%s:%s:%d",armci_me,__FILE__,__LINE__,FUNCTION_NAME,__ARMCI_ST,__ARMCI_NU);fflush(stdout) -#else -#define ARMCI_PR_DBG(__ARMCI_ST,__ARMCI_NU) -#define ARMCI_PR_SDBG(__ARMCI_ST,__ARMCI_NU) -#endif - -/*#define ARMCI_CHECK_STATE*/ - -#define ARMCI_STAMP 11214 -#define ARMCI_TAIL 31121 -#ifdef QUADRICS -#include -#ifdef QSNETLIBS_VERSION_CODE -#ifndef DECOSF -# define ELAN_ACC -# define PENDING_OPER(x) ARMCI_ACC_INT -#endif - -# if QSNETLIBS_VERSION_CODE > QSNETLIBS_VERSION(1,5,0) -# define LIBELAN_ATOMICS -# endif - -#endif -extern void armci_elan_fence(int p); -#endif - -/* we got problems on IA64/Linux64 with Elan if inlining is used */ -#if defined(__GNUC__) && !defined(QUADRICS) -# define INLINE inline -#else -# define INLINE -#endif - -#ifdef WIN32 -#include -#define sleep(x) Sleep(100*(x)) -#else -#include -#endif - -#if (defined(SYSV) || defined(WIN32)|| defined(MMAP)) && !defined(NO_SHM) && !defined(HITACHI) && !defined(CATAMOUNT) -#define CLUSTER - -#ifdef SERVER_THREAD -# define SERVER_NODE(c) (armci_clus_info[(c)].master); -# define NODE_SERVER(c) (c); -#else -# define SOFFSET -1000000 -# define SERVER_NODE(c) ((int)(SOFFSET -armci_clus_info[(c)].master)); -# define NODE_SERVER(c) ((int)(SOFFSET - c)) -#endif - -#endif - - -/*\GPC call stuff -\*/ -typedef struct { - int hndl, hlen, dlen; - void *hdr, *data; -}gpc_send_t; - - -/*\ Stuff for non-blocking API -\*/ -#define NB_MULTI -1 /*more than one armci buffer(buffers.c) used for nbcall*/ -#define NB_NONE -2 /*no armci buffer(buffers.c) used for nbcall*/ -extern unsigned int _armci_get_next_tag(); -#define GET_NEXT_NBTAG _armci_get_next_tag -#define ARMCI_MAX_IMPLICIT 15 - -typedef struct{ - int len; - int last; - void *exthdr; -} ext_header_t; - -typedef struct{ -int val; -void *ptr; -} armci_flag_t; - - -#if defined(LAPI) || defined(PTHREADS) || defined(POSIX_THREADS) -# include - typedef pthread_t thread_id_t; -# define THREAD_ID_SELF pthread_self -#elif defined(WIN32) -# include - typedef DWORD thread_id_t; -# define THREAD_ID_SELF GetCurrentThreadId -#else - typedef int thread_id_t; -# define THREAD_ID_SELF() 1 -#endif - -extern thread_id_t armci_usr_tid; -#ifdef SERVER_THREAD -# define SERVER_CONTEXT (armci_usr_tid != THREAD_ID_SELF()) -#else -# define SERVER_CONTEXT (armci_me<0) -#endif - -#if defined(LAPI) || defined(CLUSTER) || defined(CRAY) \ - || defined(CRAY_SHMEM) || defined(BGML) || defined(DCMF) -# include "request.h" -#endif - -/* ------------------------ ARMCI threads support ------------------------- */ -#define ARMCI_THREADS_LIMIT 32 - -#include "utils.h" -#if defined(THREAD_SAFE) -typedef struct { - int max; /* max # of threads per proc */ - int avail; /* next available position */ - thread_id_t *ids; /* list of threads' ids */ - thread_lock_t lock; /* general case lock */ - thread_lock_t buf_lock; /* lock for buffer access */ - thread_lock_t net_lock; /* lock for network accees */ -} armci_user_threads_t; - -extern armci_user_threads_t armci_user_threads; - -extern void armci_init_threads(); -extern void armci_finalize_threads(); -extern int armci_thread_idx(); -extern INLINE int armci_register_thread(thread_id_t id); - -#define ARMCI_THREAD_IDX armci_thread_idx() /* needs to be optimized */ - -#else -# define ARMCI_THREAD_IDX 0 -#endif - -/* ------------------------------------------------------------------------ */ - -/* min amount of data in strided request to be sent in single TCP/IP message*/ -#if defined(SOCKETS) || defined(MPI_SPAWN_ZEROCOPY) -# define TCP_PAYLOAD 128 -# define LONG_GET_THRESHOLD TCP_PAYLOAD -# define LONG_GET_THRESHOLD_STRIDED LONG_GET_THRESHOLD -# define LONG_PUT_THRESHOLD 128 -#endif - -#ifdef WIN32 -# define bzero(a,len){\ - int _i;\ - char *_c = (char*)(a);\ - for(_i=0; _i< (int)(len); _i++)_c[_i]=(char)0;\ - } -# define bcopy(a,b,len) memcpy(b,a,len) -#else -# include -#endif - -/*#define ACC_COPY*/ -#if defined(CRAY_T3E) || defined(FUJITSU)\ - || defined(HITACHI) || (defined(QUADRICS) && !defined(ELAN_ACC)) -#define ACC_COPY -#endif - -#ifndef FATR -# ifdef WIN32 -# define FATR __stdcall -# else -# define FATR -# endif -#endif - -#define MAX_PROC 8096 -#define MAX_STRIDE_LEVEL ARMCI_MAX_STRIDE_LEVEL - -/* msg tag ARMCI uses in collective ops */ -#define ARMCI_TAG 30000 - -#ifndef EXTRA_MSG_BUFLEN_DBL -# define RESERVED_BUFLEN ((sizeof(request_header_t)>>3)+3*MAX_STRIDE_LEVEL) -#else -# define RESERVED_BUFLEN ((sizeof(request_header_t)>>3)+3*MAX_STRIDE_LEVEL +\ - EXTRA_MSG_BUFLEN_DBL) -#endif - -#if defined(HITACHI) -# define BUFSIZE ((0x50000) * sizeof(double)) -#else - /* packing algorithm for double complex numbers requires even number */ -# ifdef MSG_BUFLEN_DBL -# define BUFSIZE_DBL (MSG_BUFLEN_DBL - RESERVED_BUFLEN) -# else -# define BUFSIZE_DBL 32768 -# endif -# define BUFSIZE (BUFSIZE_DBL * sizeof(double)) -#endif - -/* note opcodes must be lower than ARMCI_ACC_OFF !!! */ -#define PUT 1 -#define GET 2 -#define RMW 3 -#define LOCK 4 -#define UNLOCK 5 -#define ACK 6 -#define STATE 11214 - -/* must fit in two bits, see msginfo->format in request.h */ -#define STRIDED 1 -#define VECTOR 2 - -extern int armci_me, armci_nproc; -extern int _armci_initialized; -#ifdef HITACHI - extern int sr8k_server_ready; - extern double *armci_internal_buffer; -#else - extern double armci_internal_buffer[BUFSIZE_DBL]; -#endif -extern int armci_getbufsize(); -extern void armci_shmem_init(); -extern void armci_krmalloc_init_localmem(); -extern void armci_die(char *msg, int code); -extern void armci_die2(char *msg, int code1, int code2); -extern void armci_write_strided(void *ptr, int stride_levels, - int stride_arr[], int count[], char *buf); -extern void armci_read_strided(void *ptr, int stride_levels, - int stride_arr[], int count[], char *buf); -extern int armci_op_strided(int op, void* scale, int proc,void *src_ptr, - int src_stride_arr[], void* dst_ptr, - int dst_stride_arr[], int count[], - int stride_levels, int lockit,armci_ihdl_t nb_handle); -extern int armci_copy_vector(int op, /* operation code */ - armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc /* remote process(or) ID */ - ); - -extern int armci_acc_vector(int op, /* operation code */ - void *scale, /* scale factor */ - armci_giov_t darr[],/* descriptor array */ - int len, /* length of descriptor array */ - int proc /* remote process(or) ID */ - ); - -extern int armci_pack_strided(int op, void* scale, int proc, - void *src_ptr, int src_stride_arr[], - void* dst_ptr, int dst_stride_arr[], - int count[], int stride_levels, ext_header_t *hdr, - int fit_level, int nb, int last,armci_ihdl_t nb_handle); - -extern int armci_pack_vector(int op, void *scale, - armci_giov_t darr[],int len,int proc,armci_ihdl_t nb_handle); - -extern void armci_lockmem(void *pstart, void* pend, int proc); -extern void armci_unlockmem(int proc); - -extern int armci_acc_copy_strided(int optype, void* scale, int proc, - void* src_ptr, int src_stride_arr[], - void* dst_ptr, int dst_stride_arr[], - int count[], int stride_levels); - -extern void armci_vector_to_buf(armci_giov_t darr[], int len, void* buf); -extern void armci_vector_from_buf(armci_giov_t darr[], int len, void* buf); -extern void armci_init_fence(); - -#ifdef SOCKETS -#ifdef SERVER_THREAD - extern void armci_create_server_thread ( void* (* func)(void*) ); - extern void armci_terminate_server_thread(); -#else - extern void armci_create_server_process ( void* (* func)(void*) ); - extern void armci_wait_server_process(); - extern void RestoreSigChldDfl(); -#endif -#endif - -#ifdef MPI_SPAWN - extern void armci_create_server_MPIprocess (); -#endif - - -#define ARMCI_MAX(a,b) (((a)>(b))?(a):(b)) -#define ARMCI_MIN(a,b) (((a)<(b))?(a):(b)) -#define ARMCI_ABS(a) (((a) >= 0) ? (a) : (-(a))) -#define ARMCI_ACC(op) ((((int)(op))-ARMCI_ACC_INT)>=0) - -#ifdef CLUSTER - extern char *_armci_fence_arr; -# define FENCE_ARR(p_) (_armci_fence_arr[p_]) - -# define SAMECLUSNODE(p)\ - ( ((p) <= armci_clus_last) && ((p) >= armci_clus_first) ) -#elif defined(__crayx1) -# define SAMECLUSNODE(p) 1 -#elif defined(ARMCIX) -# define SAMECLUSNODE(p) 0 -#else -# define SAMECLUSNODE(p) ((p)==armci_me) -#endif - - -#if defined(LAPI) || defined(ELAN_ACC) -# define ORDER(op,proc)\ - if( proc == armci_me || ( ARMCI_ACC(op) && ARMCI_ACC(PENDING_OPER(proc))) );\ - else FENCE_NODE(proc) -# define UPDATE_FENCE_INFO(proc_) -#elif defined(CLUSTER) && !defined(QUADRICS) && !defined(HITACHI)\ - && !defined(CRAY_SHMEM) -# define ORDER(op_,proc_)\ - if(!SAMECLUSNODE(proc_) && op_ != GET )FENCE_ARR(proc_)=1 -# define UPDATE_FENCE_INFO(proc_) if(!SAMECLUSNODE(proc_))FENCE_ARR(proc_)=1 -#else -# if defined(GM) && defined(ACK_FENCE) -# define ORDER(op,proc) -# else -# define ORDER(op,proc) if(proc != armci_me) FENCE_NODE(proc) -# endif -# define UPDATE_FENCE_INFO(proc_) -#endif - -typedef struct { - int ptr_array_len; - int bytes; - void **ptr_array; -} armci_riov_t; - -/*\ consider up to HOSTNAME_LEN characters in host name - * we can truncate names of the SP nodes since it is not used - * to establish socket communication like on the networks of workstations - * SP node names must be distinct within first HOSTNAME_LEN characters -\*/ -#if defined(LAPI) && defined(AIX) -# define HOSTNAME_TRUNCATE -# define HOSTNAME_LEN 12 -#else -# define HOSTNAME_LEN 64 -#endif - -typedef struct { - int master; - int nslave; - char hostname[HOSTNAME_LEN]; -} armci_clus_t; - -extern armci_clus_t *armci_clus_info; -extern int armci_nclus, armci_clus_me, armci_master; -extern int armci_clus_first, armci_clus_last; -extern int armci_clus_id(int p); -extern void armci_init_clusinfo(); -extern void armci_set_mem_offset(void *ptr); -extern int _armci_terminating; -extern void armci_acc_2D(int op, void* scale, int proc, void *src_ptr, - void *dst_ptr, int bytes, int cols, int src_stride, - int dst_stride, int lockit); -extern void armci_lockmem_scatter(void *ptr_array[], int len, int bytes, int p); -extern void armci_generic_rmw(int op, void *ploc, void *prem, int extra, int p); -extern unsigned long armci_max_region(); -extern void armci_dispatch_strided(void *ptr, int stride_arr[], int count[], - int strides, int fit_level, int nb, int bufsize, - void (*fun)(void*,int*,int*,int,void*), void *arg); -extern void armci_msg_gop_init(); -extern void armci_util_spin(int n, void *notused); - -#if defined(SYSV) || defined(WIN32) -extern void armci_shmem_init(); -extern void armci_set_shmem_limit_per_core(unsigned long shmemlimit); -extern void armci_set_shmem_limit_per_node(int nslaves); -extern void armci_set_shmem_limit(unsigned long shmemlimit); -#endif - -#define ALIGN_PTR_LONG(type, x) if( ((long)(x)) % sizeof(long)) { long _y = (long)(x);\ - if(sizeof(long)==8){_y>>=3; _y<<=3; }\ - else { _y>>=2; _y<<=2; }\ - _y += sizeof(long); (x) = (type*)_y; } - -#define SIXTYFOUR 64 -#define ALIGN64ADD(buf) (SIXTYFOUR-(((ssize_t)(buf))%SIXTYFOUR)) -#define ALIGNLONGADD(buf) ((((ssize_t)(buf))%sizeof(long))?(sizeof(long)-(((ssize_t)(buf))%sizeof(long))):0) - -#define SET 1 -#define UNSET 0 - -extern int armci_agg_save_strided_descriptor(void *src_ptr, - int src_stride_arr[], - void* dst_ptr, - int dst_stride_arr[], - int count[], - int stride_levels, int proc, - int op, armci_ihdl_t nb_handle); - -extern int armci_agg_save_giov_descriptor(armci_giov_t darr[], int len, - int proc, int op, - armci_ihdl_t nb_handle); - -extern int armci_agg_save_descriptor(void *src, void *dst, int bytes, - int proc, int op, int is_registered_put, - armci_ihdl_t nb_handle); - -extern void armci_agg_complete(armci_ihdl_t nb_handle, int condition); - -extern armci_ihdl_t armci_set_implicit_handle (int op, int proc); - -extern int armci_getnumcpus(void); -extern long armci_util_long_getval(long* p); -extern int armci_util_int_getval(int* p); -extern void armci_region_register_shm(void *start, long size); -extern void armci_region_register_loc(void *start, long size); -extern void armci_region_clus_record(int node, void *start, long size); -extern void armci_region_init(); -extern int armci_region_clus_found(int node, void *start, int size); -extern int armci_region_loc_found(void *start, int size); -extern int armci_region_both_found(void *loc, void *rem, int size, int node); -#ifdef REGIONS_REQUIRE_MEMHDL -extern int get_armci_region_local_hndl(void *loc, int node, ARMCI_MEMHDL_T **loc_memhdl); -#endif -extern void armci_region_exchange(void *start, long size); -extern void cpu_yield(); - -#ifdef ALLOW_PIN -extern void armci_global_region_exchange(void *, long); -#endif - - -/* -------------------- ARMCI Groups ---------------------- */ -/* data structure that caches a group's attribute */ -#ifdef BGML -#define PCLASS 3 -#endif -#ifdef MSG_COMMS_MPI -typedef int ARMCI_Datatype; - -extern int ATTR_KEY; /* attribute key */ - -/* #define ARMCI_GROUP /\*Generic ARMCI implementation*\/ */ - -typedef struct { - armci_clus_t *grp_clus_info; - int grp_me; /* my group id */ - int grp_nclus; /* number of cluster nodes */ - int grp_clus_me; /* my cluster node id */ - int mem_offset; /* memory offset */ -#ifdef ARMCI_GROUP - int nproc; /* #procs in this group*/ - int *proc_list; /* Ids of procs in this group - (w.r.t. MPI_COMM_WORLD)*/ -#endif -}armci_grp_attr_t; - -#include "mpi.h" - -/**dup of MPI_COMM_WORLD for internal MPI communication*/ -extern MPI_Comm ARMCI_COMM_WORLD; - -#ifdef PORTALS -#include "portals.h" -#endif - -typedef MPI_Comm ARMCI_Comm; -typedef struct { -#ifndef ARMCI_GROUP - MPI_Comm icomm; - MPI_Group igroup; -#endif - armci_grp_attr_t grp_attr; -}ARMCI_iGroup; - -armci_grp_attr_t *ARMCI_Group_getattr(ARMCI_Group *grp); -extern void armci_group_init(); -extern void armci_group_finalize(); -extern ARMCI_iGroup* armci_get_igroup_from_group(ARMCI_Group *group); - -#endif /* ifdef MSG_COMMS_MPI */ -/* -------------------------------------------------------- */ - -/* ------------ ARMCI Chekcpointing/Recovery -------------- */ -#ifdef DO_CKPT -extern int armci_init_checkpoint(); -extern void armci_create_ckptds(armci_ckpt_ds_t *ckptds, int count); -extern int armci_icheckpoint_init(char *filename, ARMCI_Group *grp, - int savestack, int saveheap, - armci_ckpt_ds_t *ckptds); -extern int armci_icheckpoint(int rid); -extern int armci_irecover(int rid,int iamreplacement); -extern void armci_icheckpoint_finalize(int rid); - - -#endif /* ifdef DO_CKPT */ -/* -------------------------------------------------------- */ - -/* portals only */ -void armci_register_shmem(void *my_ptr, long size, long *idlist, long off, void *sptr); -void armci_register_shmem_grp(void *my_ptr, long size, long *idlist, long off, void *sptr,ARMCI_Group *group); - -#endif diff --git a/armci/src-portals/atomics-i386.h b/armci/src-portals/atomics-i386.h deleted file mode 100644 index dd5899a0e..000000000 --- a/armci/src-portals/atomics-i386.h +++ /dev/null @@ -1,22 +0,0 @@ -/** Atomic instructions for i386. To be populated as need arises. - * @author Sriram Krishnamoorthy - */ -#ifndef __ATOMICS_I386__ -#define __ATOMICS_I386__ - -#include - -#define v4b (volatile unsigned int *) - -static inline void atomic_exchange(void *val, void *ptr, int size) { - assert(size == 4); - __asm__ __volatile__ ("xchgl %0, %1" - : "=r"(*v4b(val)), "+m"(*v4b(ptr)) - : "0"(*v4b(val)) - : "memory"); -} - -#undef v4b - -#endif /*__ATOMICS_I386__*/ - diff --git a/armci/src-portals/bufalloc.c b/armci/src-portals/bufalloc.c deleted file mode 100644 index 8c6458eed..000000000 --- a/armci/src-portals/bufalloc.c +++ /dev/null @@ -1,436 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: bufalloc.c,v 1.2 2001-06-07 23:23:23 d3h325 Exp $ - * storage manager for a chunk of memory passed by user in armci_init_buf_alloc - * derived from K&R that manages a chunk of memory - */ - -#include - -#define USAGE_ - -extern char *buf_allocate(); /* Used to get memory from the system */ -extern void armci_die(); - -#define VALID1 0xaaaaaaaa /* For validity check on headers */ -#define VALID2 0x55555555 -#define LOG_ALIGN 6 -#define ALIGNMENT (1 << LOG_ALIGN) -#define DEFAULT_NALLOC (1024 - ALIGNMENT) - -#ifdef USAGE -static struct shmalloc_struct { - size_t total; /* Amount request from system in units */ - long nchunk; /* No. of chunks of system memory */ - long inuse; /* Amount in use in units */ - long maxuse; /* Maximum value of inuse */ - long nfrags; /* No. of fragments divided into */ - long nmcalls; /* No. of calls to shmalloc */ - long nfcalls; /* No. of calls to buf_free */ -} usage; -#endif - -union header{ - struct { - unsigned valid1; /* Token to check if is not overwritten */ - union header *ptr; /* next block if on free list */ - size_t size; /* size of this block*/ - unsigned valid2; /* Another token acting as a guard */ - } s; - char align[ALIGNMENT]; /* Align to ALIGNMENT byte boundary */ -}; -typedef union header Header; - -static Header base; /* empty list to get started */ -static Header *freep = NULL; /* start of free list */ -static Header *usedp = NULL; /* start of used list */ -static size_t nalloc = DEFAULT_NALLOC; -static size_t max_nalloc = DEFAULT_NALLOC; -static int do_verify = 0; /* Flag for automatic heap verification */ -static int initialized=0; - - -static void buf_error(char* s, unsigned long i) -{ - void buf_alloc_print_stats(); - fflush(stdout); - fprintf(stderr,"buf_alloc error: %s %ld(0x%lx)\n", s, i, i); - fflush(stderr); -#ifdef USAGE - buf_alloc_print_stats(); -#endif - armci_die("buf_alloc: fatal error", i); -} - -void armci_buf_alloc_request(size_t size, size_t maxsize) -{ - nalloc = (size+ALIGNMENT-1) >> LOG_ALIGN; - max_nalloc = (maxsize+ALIGNMENT-1) >> LOG_ALIGN; -} - -void armci_buf_alloc_debug(int code) -{ - do_verify = code; -} - - -void armci_buf_alloc_verify() -{ - Header *p; - - if ( freep ) { - - /* Check the used list */ - for (p=usedp; p; p=p->s.ptr) { - if (p->s.valid1 != VALID1 || p->s.valid2 != VALID2) - buf_error("invalid header on usedlist", (unsigned long) p->s.valid1); - -#ifdef USAGE - if (p->s.size > usage.total) - buf_error("invalid size in header usedlist",(unsigned long)p->s.size); -#endif - } - - /* Check the free list */ - p = base.s.ptr; - while (p != &base) { - if (p->s.valid1 != VALID1 || p->s.valid2 != VALID2) - buf_error("invalid header on freelist", (unsigned long) p->s.valid1); - -#ifdef USAGE - if (p->s.size > usage.total) - buf_error("invalid size in header freelist",(unsigned long)p->s.size); -#endif - - p = p->s.ptr; - } - } /* end if */ -} - - -static void addtofree(char* ap) -{ - Header *bp, *p, **up; - -#ifdef USAGE - usage.nfcalls++; -#endif - if (do_verify) armci_buf_alloc_verify(); - - /* only do something if pointer is not NULL */ - if ( ap ) { - - bp = (Header *) ap - 1; /* Point to block header */ - - if (bp->s.valid1 != VALID1 || bp->s.valid2 != VALID2) - buf_error("buf_free: pointer not from buf_alloc", (unsigned long) ap); - -#ifdef USAGE - usage.inuse -= bp->s.size; /* Decrement memory usage */ -#endif - - /* Extract the block from the used linked list ... for debug only */ - for (up=&usedp; ; up = &((*up)->s.ptr)) { - if (!*up) - buf_error("buf_free:block not found in used list\n",(unsigned long)ap); - if (*up == bp) { - *up = bp->s.ptr; - break; - } - } - - /* Join the memory back into the free linked list */ - for (p=freep; !(bp > p && bp < p->s.ptr); p = p->s.ptr) - if (p >= p->s.ptr && (bp > p || bp < p->s.ptr)) - break; /* Freed block at start or end of arena */ - - if (bp + bp->s.size == p->s.ptr) {/* join to upper neighbour */ - bp->s.size += p->s.ptr->s.size; - bp->s.ptr = p->s.ptr->s.ptr; -#ifdef USAGE - usage.nfrags--; /* Lost a fragment */ -#endif - } else - bp->s.ptr = p->s.ptr; - - if (p + p->s.size == bp) { /* Join to lower neighbour */ - p->s.size += bp->s.size; - p->s.ptr = bp->s.ptr; -#ifdef USAGE - usage.nfrags--; /* Lost a fragment */ -#endif - } else - p->s.ptr = bp; - - freep = p; - - } /* end if on ap */ -} - - -void armci_init_buf_alloc(size_t len, void* buffer) -{ - char *cp; - Header *up, *prevp; - size_t nu; - /* need to initialize the free list */ - if (sizeof(Header) != ALIGNMENT) - buf_error("Alignment is not valid", (unsigned long) ALIGNMENT); - - if(initialized)armci_die("armci_init_buf_alloc: already initialized",0); - -#ifdef USAGE - usage.total = 0; /* Initialize statistics */ - usage.nchunk = 0; - usage.inuse = 0; - usage.nfrags = 0; - usage.maxuse = 0; - usage.nmcalls= 0; - usage.nfcalls= 0; -#endif - - base.s.ptr = freep = prevp = &base; /* Initialize linked list */ - base.s.size = 0; - base.s.valid1 = VALID1; - base.s.valid2 = VALID2; - - nu = len/sizeof(Header); /* nu must by a multiplicity of nalloc */ - max_nalloc = nu*nalloc; - if(nu<1) armci_die("buffer less than nalloc",(int)len); - cp = (char*)buffer; - -#ifdef USAGE - usage.total += nu; /* Have just got nu more units */ - usage.nchunk++; /* One more chunk */ - usage.nfrags++; /* Currently one more frag */ - usage.inuse += nu; /* Inuse will be decremented by buf_free */ -#endif - - up = (Header *) cp; - up->s.size = nu; - up->s.valid1 = VALID1; - up->s.valid2 = VALID2; - - /* Insert into linked list of blocks in use so that buf_free works - ... for debug only */ - up->s.ptr = usedp; - usedp = up; - - addtofree((char *)(up+1)); /* Try to join into the free list */ -} - - -/*\ return a chunk memory of given size -\*/ -char *armci_buf_alloc(size_t nbytes) -{ - Header *p, *prevp; - size_t nunits; - char *return_ptr; - - /* need to initialize the free list */ - if ((prevp = freep) == NULL) armci_die("not initialized", 0); - -#ifdef USAGE - usage.nmcalls++; -#endif - - if (do_verify) armci_buf_alloc_verify(); - - /* Rather than divide make the alignment a known power of 2 */ - nunits = ((nbytes + sizeof(Header) - 1)>>LOG_ALIGN) + 1; - - for (p=prevp->s.ptr; ; prevp = p, p = p->s.ptr) { - if (p->s.size >= nunits) { /* Big enuf */ - if (p->s.size == nunits) /* exact fit */ - prevp->s.ptr = p->s.ptr; - else { /* allocate tail end */ - p->s.size -= nunits; - p += p->s.size; - p->s.size = nunits; - p->s.valid1 = VALID1; - p->s.valid2 = VALID2; -#ifdef USAGE - usage.nfrags++; /* Have just increased the fragmentation */ -#endif - } - - /* Insert into linked list of blocks in use ... for debug only */ - p->s.ptr = usedp; - usedp = p; - -#ifdef USAGE - usage.inuse += nunits; /* Record usage */ - if (usage.inuse > usage.maxuse) - usage.maxuse = usage.inuse; -#endif - freep = prevp; - return_ptr = (char *) (p+1); - break; - } - - if (p == freep){ /* wrapped around the free list */ - return_ptr = (char *) NULL; - break; - } - } - return return_ptr; -} - - -void armci_buf_free(char *ap) -{ - Header *bp, *p, **up; -#ifdef USAGE - usage.nfcalls++; -#endif - if (do_verify) armci_buf_alloc_verify(); - - /* only do something if pointer is not NULL */ - - if ( ap ) { - - bp = (Header *) ap - 1; /* Point to block header */ - - if (bp->s.valid1 != VALID1 || bp->s.valid2 != VALID2) - buf_error("buf_free: pointer not from buf_alloc", (unsigned long) ap); - -#ifdef USAGE - usage.inuse -= bp->s.size; /* Decrement memory usage */ -#endif - - /* Extract the block from the used linked list for debug only */ - for (up=&usedp; ; up = &((*up)->s.ptr)) { - if (!*up) - buf_error("buf_free:block not found in used list\n",(unsigned long)ap); - if (*up == bp) { - *up = bp->s.ptr; - break; - } - } - - /* Join the memory back into the free linked list */ - for (p=freep; !(bp > p && bp < p->s.ptr); p = p->s.ptr) - if (p >= p->s.ptr && (bp > p || bp < p->s.ptr)) - break; /* Freed block at start or end of arena */ - - if (bp + bp->s.size == p->s.ptr) {/* join to upper neighbour */ - bp->s.size += p->s.ptr->s.size; - bp->s.ptr = p->s.ptr->s.ptr; -#ifdef USAGE - usage.nfrags--; /* Lost a fragment */ -#endif - } else - bp->s.ptr = p->s.ptr; - - if (p + p->s.size == bp) { /* Join to lower neighbour */ - p->s.size += bp->s.size; - p->s.ptr = bp->s.ptr; -#ifdef USAGE - usage.nfrags--; /* Lost a fragment */ -#endif - } else - p->s.ptr = bp; - - freep = p; - - } /* end if on ap */ -} - - -#ifdef USAGE -/* - Return stats on buf_alloc performance. Use arg list instead of - returning structure so that FORTRAN can eventually use it -*/ -void buf_alloc_stats(size_t *total, long* nchunk, size_t * inuse, - size_t * maxuse, long* nfrags, long* nmcalls,long* nfcalls) -{ - *total = usage.total * sizeof(Header); - *nchunk = usage.nchunk; - *inuse = (size_t)usage.inuse * sizeof(Header); - *maxuse = (size_t)usage.maxuse* sizeof(Header); - *nfrags = usage.nfrags; - *nmcalls= usage.nmcalls; - *nfcalls= usage.nfcalls; -} - -/* - Print to standard output the usage statistics. -*/ -void buf_alloc_print_stats() -{ - size_t total, inuse, maxuse; - long nchunk, nfrags, nmcalls, nfcalls; - - buf_alloc_stats(&total, &nchunk, &inuse, &maxuse, &nfrags, - &nmcalls, &nfcalls); - - fflush(stderr); - printf("\nbuf_alloc statistics\n-------------------\n\n"); - printf("Total memory from system ... %ld bytes\n", (long)total); - printf("Current memory usage ....... %ld bytes\n", (long)inuse); - printf("Maximum memory usage ....... %ld bytes\n", (long)maxuse); - printf("No. chunks from system ..... %ld\n", nchunk); - printf("No. of fragments ........... %ld\n", nfrags); - printf("No. of calls to buf_alloc ... %ld\n", nmcalls); - printf("No. of calls to buf_free ..... %ld\n", nfcalls); - printf("\n"); - fflush(stdout); -} -#endif - - -#if 0 -void armci_die(char *str, int c) -{ -fprintf(stderr,"%s %d\n",str,c); -_exit(1); -} - - -#define LEN (16*1024) -char buf[LEN]; - -main (int argc, char **argv) -{ -int i,k,total=0,size=1024; -char *ar[100]; - armci_init_buf_alloc(LEN, buf); - - for(i=0; i<100; i++)ar[i]=(char*)0; - - for(i=0; i<100; i++){ - ar[i] =armci_buf_alloc(size); - if(!ar[i]){ - printf("i =%d total=%d\n", i, total); - buf_alloc_print_stats(); - k=i; - break; - } - total+=size; - } - for(i=0; i -#include -#include -#include "armcip.h" -#include "request.h" -#ifdef WIN32 -# include - typedef unsigned long ssize_t; -#else -# include -#endif - -# define EQ_TAGS(a_, b_) !memcmp(&(a_), &(b_), sizeof(a_)) - -#define ALIGN64ADD(buf) (SIXTYFOUR-(((ssize_t)(buf))%SIXTYFOUR)) -/* the following symbols should be defined if needed in protocol specific - header file: BUF_EXTRA_FIELD, BUF_ALLOCATE -*/ - -#ifndef BUF_ALLOCATE -# define BUF_ALLOCATE malloc -#endif -#if defined PORTALS -# define SMALL_BUF_LEN PORTALS_SMALL_BUF_SIZE -#else -# if defined(SERV_QUEUE) -# define SMALL_BUF_LEN 4096 -# else -# define SMALL_BUF_LEN 2048 -# endif -#endif - -#ifndef MSG_BUFLEN_SMALL -# define MSG_BUFLEN_SMALL (MSG_BUFLEN >>0) -#endif - -#define LEFT_GUARD 11.11e11 -#define RIGHT_GUARD 22.22e22 -#define CLEAR_TABLE_SLOT(idx) *((int*)(_armci_buf_state->table+(idx))) =0 - -#ifndef BUF_NET_INIT -#define BUF_NET_INIT(x,xX,Xx) -#endif -_buf_ackresp_t *_buf_ackresp_first,*_buf_ackresp_cur; -/* we allow multiple buffers (up to 15) per single request - * adjacent buffers can be coalesced into a large one - */ -typedef struct { - int op; /* pending operation code */ - int snd; /* if 1 then buffer is used for sending request */ - int rcv; /* if 1 then buffer is used for receiving data */ - int async; /* if 1 then request is nonblocking */ - int first; /* id of the 1st buffer in the set in same request */ - int count; /* count is not used and is always 1 (or 0???) */ - /*unsigned int count:4; \* how many buffers used for this request 8 possible */ - int busy; /* if 1 buffer is used and cannot be completed */ - int cmpl; /* set to 1 if buffer was completed and can be released */ - int to; /* serv/proc to which request was sent, 8172 possible */ -}buf_state_t; - - -#ifndef BUFID_PAD_T -#define BUFID_PAD_T BUF_INFO_T -#endif - -/* message send buffer data structure */ -typedef struct { - BUF_INFO_T id; -# ifdef BUF_EXTRA_FIELD_T - BUF_EXTRA_FIELD_T field; -# endif - char buffer[MSG_BUFLEN_SMALL]; -} buf_ext_t; - -/* message send buffer data structure */ -typedef struct { - BUF_INFO_T id; -# ifdef BUF_EXTRA_FIELD_T - BUF_EXTRA_FIELD_T field; -# endif - char buffer[SMALL_BUF_LEN]; -} buf_smext_t; - -/* we keep table and buffer pointer together for better locality */ -typedef struct { - double left_guard; /* stamp to verify if array was corrupted */ - buf_state_t table[MAX_BUFS+MAX_SMALL_BUFS]; /*array with state of buffer */ - buf_ext_t *buf; /* address of buffer pool */ - buf_smext_t *smallbuf; /* address of the large buffer pool */ - int avail; - int smavail; - int pad; - double right_guard; /* stamp to verify if array was corrupted */ - - unsigned buf_bitmap; /* bitmaps to track available buffers: */ - unsigned smbuf_bitmap;/* 1 - available, 0 - not available */ -} reqbuf_pool_t; - -#ifndef BUF_EXTRA_FIELD_T -# define SIZE_BUF_EXTRA_FIELD 0 -# define BUF_TO_EBUF(buf) (buf_ext_t*)(((char*)buf) - sizeof(BUFID_PAD_T) -\ - SIZE_BUF_EXTRA_FIELD) -# define BUF_TO_SMEBUF(buf) (buf_smext_t*)(((char*)buf)- sizeof(BUFID_PAD_T) -\ - SIZE_BUF_EXTRA_FIELD) -#else -# define BUF_TO_EBUF(buf) (buf_ext_t*)(((char*)buf) - sizeof(BUFID_PAD_T) -\ - sizeof(BUF_EXTRA_FIELD_T)) -# define BUF_TO_SMEBUF(buf) (buf_smext_t*)(((char*)buf)- sizeof(BUFID_PAD_T) -\ - sizeof(BUF_EXTRA_FIELD_T)) -#endif - -#define BUF_TO_BUFINDEX(buf) (BUF_TO_EBUF((buf)))->id.bufid -#define BUF_TO_SMBUFINDEX(buf) (BUF_TO_SMEBUF((buf)))->id.bufid - - -buf_ext_t *_armci_buffers; /* these are the actual buffers */ -buf_smext_t *_armci_smbuffers; /* no, these are the actual buffers */ -reqbuf_pool_t* _armci_buf_state; /* array that describes state of each buf */ - -extern active_socks_t *_armci_active_socks; - -/* returns bufinfo, given bufid */ -INLINE BUF_INFO_T *_armci_id_to_bufinfo(int bufid) { - if (bufid < 0 || bufid >= (MAX_BUFS+MAX_SMALL_BUFS)) - armci_die2("_armci_id_to_bufinfo: bad id",bufid,MAX_BUFS); - - return bufid < MAX_BUFS ? &(_armci_buf_state->buf[bufid].id) : - &(_armci_buf_state->smallbuf[bufid-MAX_BUFS].id); -} - - - -/*\ we allocate alligned buffer space - * this operation can be implemented in platform specific files -\*/ -void _armci_buf_init() -{ -char *tmp; -int extra=0; -int smallbuf_size = sizeof(buf_smext_t)*(MAX_SMALL_BUFS); - // tmp = (char *) BUF_ALLOCATE((MAX_BUFS*sizeof(buf_ext_t) + 64 + smallbuf_size + 64)); - tmp = (char *) malloc((MAX_BUFS*sizeof(buf_ext_t) + 64 + smallbuf_size + 64)); - bzero(tmp,MAX_BUFS*sizeof(buf_ext_t) + 64 + smallbuf_size + 64); - extra= ALIGN64ADD(tmp); - - _armci_buffers = (buf_ext_t *) (tmp + extra); - - tmp = (char *)(_armci_buffers + MAX_BUFS); - extra = ALIGN64ADD(tmp); - _armci_smbuffers = (buf_smext_t *) (tmp + extra); - - - if(DEBUG2_){ - printf("%d:armci_init_bufs: pointer %p, before align ptr=%p bufptr=%p end of region is %p size=%d extra=%d\n", - armci_me,_armci_buffers,tmp,_armci_buffers->buffer,(_armci_buffers+MAX_BUFS), - MAX_BUFS*sizeof(buf_ext_t),extra); - fflush(stdout); - } - - /* now allocate state array */ - tmp = malloc(sizeof(reqbuf_pool_t) + 64); - bzero(tmp,sizeof(reqbuf_pool_t) + 64); - if(!tmp)armci_die("_armci_buf_init calloc failed",0); - extra= ALIGN64ADD(tmp); - _armci_buf_state = (reqbuf_pool_t*)(tmp + extra); - - /* initialize it */ - _armci_buf_state->left_guard = LEFT_GUARD; - _armci_buf_state->right_guard = RIGHT_GUARD; - _armci_buf_state->avail =0; - _armci_buf_state->smavail =MAX_BUFS; - _armci_buf_state->buf = _armci_buffers; - _armci_buf_state->smallbuf = _armci_smbuffers; - - _buf_ackresp_first=_buf_ackresp_cur=NULL; - - if(BUF_TO_EBUF(_armci_buf_state->buf[0].buffer)!=_armci_buf_state->buf) - armci_die("buffers.c, internal structure alignment problem",0); -} - - -/*\ convert buffer pointer to index (in state array) -\*/ -int _armci_buf_to_index(void *buf) -{ -int index; -char *ptr = (char*)buf; - - if(DEBUG2_){ - printf("%d: in _armci_buf_to_index %p\n",armci_me, buf); - fflush(stdout); - } - if(buf > (void *)_armci_buffers && buf < (void *)(_armci_buffers+MAX_BUFS)){ - index = BUF_TO_BUFINDEX(ptr); - if((index >= MAX_BUFS)|| (index<0)) - armci_die2("armci_buf_to_index: bad index:",index,MAX_BUFS); - return(index); - } - else if(buf > (void *)_armci_smbuffers && buf < (void *)(_armci_smbuffers+MAX_SMALL_BUFS)){ - index = BUF_TO_SMBUFINDEX(ptr); - if((index >= MAX_BUFS+MAX_SMALL_BUFS)|| (indextable + index; - ARMCI_PR_DBG("enter",0); - if(index>=MAX_BUFS){ - int relidx; - relidx = index-MAX_BUFS; - CLEAR_SEND_BUF_FIELD(_armci_buf_state->smallbuf[relidx].field,buf_state->snd,buf_state->rcv,buf_state->to,buf_state->op); - } - else - CLEAR_SEND_BUF_FIELD(_armci_buf_state->buf[index].field,buf_state->snd,buf_state->rcv,buf_state->to,buf_state->op); - ARMCI_PR_DBG("exit",0); -} - -/*\ complete outstanding operation that uses the specified buffer -\*/ -void _armci_buf_complete_index(int idx, int called) -{ -int count; -buf_state_t *buf_state = _armci_buf_state->table +idx; -portals_ds_req_t *req = NULL; - - count = buf_state->count; - if(DEBUG_ || 0) { - printf("%d:buf_complete_index:%d op=%d first=%d count=%d called=%d\n", - armci_me,idx,buf_state->op,buf_state->first,buf_state->count, - called); - fflush(stdout); - } - - if(buf_state->first != (unsigned int)idx){ - armci_die2("complete_buf_index:inconsistent Index:",idx,buf_state->first); - } - - /* need to call platform specific function */ - if(idx>=MAX_BUFS){ - int relidx,rr; - relidx = idx-MAX_BUFS; - //printf("\n%d:in clear idx=%d %d",armci_me,idx,_armci_buf_state->smallbuf[relidx].id.tag);fflush(stdout); - /* ------------------------------------------------------------------------------------------- *\ - active buffers need to be completed - \* ------------------------------------------------------------------------------------------- */ - # ifdef PORTALS - req = &_armci_buf_state->smallbuf[relidx].id.ar.req; - if(req->active) { - // printf("%s [cp buf_complete_index] waiting on request %p\n",Portals_ID(),req); - portalsWaitOnRequest(req); - // printf("%s [cp buf_complete_index] request %p completed\n",Portals_ID(),req); - } else { - // printf("%s [cp buf_complete_index] request %p already completed\n",Portals_ID(),req); - } - - # else - - if(_armci_buf_state->smallbuf[relidx].id.tag && (_armci_buf_state->smallbuf[relidx].field)->tag>0) { - printf("%s [cp] calling armci_client_complete\n",Portals_ID()); - rr=armci_client_complete(0,buf_state->to,_armci_buf_state->smallbuf[relidx].id.tag,_armci_buf_state->smallbuf[relidx].field); - } - CLEAR_SEND_BUF_FIELD(_armci_buf_state->smallbuf[relidx].field,buf_state->snd,buf_state->rcv,buf_state->to,buf_state->op); - - # endif - - /*later, we might just need to do this for all operations, not just get*/ - # ifdef PORTALS_ALLOW_NBGETS - if(_armci_buf_state->smallbuf[relidx].id.tag!=0 &&(buf_state->op == GET)){ - armci_complete_req_buf(&(_armci_buf_state->smallbuf[relidx].id), - _armci_buf_state->smallbuf[relidx].buffer); - } - # endif - _armci_buf_state->smallbuf[relidx].id.tag=0; - } - else { - int rr; - - /* ------------------------------------------------------------------------------------------- *\ - active buffers need to be completed - \* ------------------------------------------------------------------------------------------- */ - # ifdef PORTALS - req = &_armci_buf_state->buf[idx].id.ar.req; - if(req->active) portalsWaitOnRequest(req); - - # else - - if(_armci_buf_state->buf[idx].id.tag && (_armci_buf_state->buf[idx].field)->tag>0 ) - rr=armci_client_complete(0,buf_state->to,_armci_buf_state->buf[idx].id.tag,_armci_buf_state->buf[idx].field); - CLEAR_SEND_BUF_FIELD(_armci_buf_state->buf[idx].field,buf_state->snd,buf_state->rcv,buf_state->to,buf_state->op); - //printf("\n%d:in clear large idx=%d %d",armci_me,idx,_armci_buf_state->buf[idx].id.tag);fflush(stdout); - # endif - - /*later, we might just need to do this for all operations, not just get*/ - # ifdef PORTALS_ALLOW_NBGETS - if(_armci_buf_state->buf[idx].id.tag!=0 &&(buf_state->op == GET)){ - armci_complete_req_buf(&(_armci_buf_state->buf[idx].id), - _armci_buf_state->buf[idx].buffer); - } - # endif - _armci_buf_state->buf[idx].id.tag=0; - } - /* clear table slots for all the buffers in the set for this request */ - for(; count; count--, buf_state++) *(int*)buf_state = 0; -} - - -/*\ test outstanding operation that uses the specified buffer for complete - * It is important not to change the state of the buffer, the buffer has - * to remain as it was, only completion has to be indicated -\*/ -int _armci_buf_test_index(int idx, int called) -{ -int count,retval=0; -buf_state_t *buf_state = _armci_buf_state->table +idx; - count = buf_state->count; - if(DEBUG_ ){ - printf("%d:buf_test_index:%d op=%d first=%d count=%d called=%d\n", - armci_me,idx,buf_state->op,buf_state->first,buf_state->count, - called); - fflush(stdout); - } - if(buf_state->first != (unsigned int)idx){ - armci_die2("_buf_test_index:inconsistent index:",idx,buf_state->first); - } -# ifdef BUF_EXTRA_FIELD_T - /* need to call platform specific function */ - if(idx>=MAX_BUFS){ - int relidx; - relidx = idx-MAX_BUFS; - /*printf("\n%d:relidx=%d \n",armci_me,relidx);fflush(stdout);*/ - TEST_SEND_BUF_FIELD(_armci_buf_state->smallbuf[relidx].field,buf_state->snd,buf_state->rcv,buf_state->to,buf_state->op,&retval); - - } - else { - TEST_SEND_BUF_FIELD(_armci_buf_state->buf[idx].field,buf_state->snd,buf_state->rcv,buf_state->to,buf_state->op,&retval); - } -# endif - if(DEBUG_ ){ - printf("%d:buf_test_index:%d op=%d first=%d count=%d called=%d ret=%d\n", - armci_me,idx,buf_state->op,buf_state->first,buf_state->count, - called,retval); - fflush(stdout); - } - return(retval); -} - -/** -an addition to the below operation to allow for multiple outstanding operations -per server node -*/ -void _armci_buf_ensure_pend_outstanding_op_per_node(void *buf, int node) -{ -int i; -int index =_armci_buf_to_index(buf); -int this = _armci_buf_state->table[index].first; -int nfirst, nlast; -void _armci_buf_release_index(int i); -int buf_pend_count=0; -int changeid=0; - nfirst=armci_clus_info[node].master; - nlast = nfirst+armci_clus_info[node].nslave-1; - if(_armci_buf_state->table[index].to<0){ - _armci_buf_state->table[index].to = 0-1e6-_armci_buf_state->table[index].to; - changeid=1; - } - - if((_armci_buf_state->table[index].to<(unsigned int) nfirst) || - (_armci_buf_state->table[index].to>(unsigned int) nlast)) - armci_die2("_armci_buf_ensure_pend_outstanding_op_per_node: bad to",node, - (int)_armci_buf_state->table[index].to); - - buf_pend_count=0; - for(i=0;itable +i; - if((buf_state->to >= nfirst) && (buf_state->to<= (unsigned int) nlast)) - if( (buf_state->first != (unsigned int) this) && (buf_state->first==(unsigned int) i) && buf_state->op){ - buf_pend_count++; - if(buf_pend_count == NUM_SERV_BUFS){ - _armci_buf_complete_index(i,0); - _armci_buf_release_index(i); - break; - } - } - } - if(changeid)_armci_buf_state->table[index].to = 0-1e6-_armci_buf_state->table[index].to; -} - -/*\ make sure that there are no other pending operations to that smp node - * this operation is called from platforms specific routine that sends - * request - * we could have accomplished the same in armci_buf_get but as Vinod - * is pointing out, it is better to delay completing outstanding - * calls to overlap memcpy for the current buffer with communication -\*/ -void _armci_buf_ensure_one_outstanding_op_per_node(void *buf, int node) -{ - int i; - int index =_armci_buf_to_index(buf); - int this = _armci_buf_state->table[index].first; - int nfirst, nlast; - void _armci_buf_release_index(int i); - - nfirst=armci_clus_info[node].master; - nlast = nfirst+armci_clus_info[node].nslave-1; - if((_armci_buf_state->table[index].to<(unsigned int) nfirst) || - (_armci_buf_state->table[index].to>(unsigned int) nlast)) - armci_die2("_armci_buf_ensure_one_outstanding_op_per_node: bad to",node, - (int)_armci_buf_state->table[index].to); - - for(i=0;itable +i; - if((buf_state->to >= nfirst) && (buf_state->to<= (unsigned int) nlast)) { - if((buf_state->first != (unsigned int) this)&&(buf_state->first==(unsigned int) i) && buf_state->op) { - _armci_buf_complete_index(i,0); - _armci_buf_release_index(i); - } - } - } -} - -/*\ same as above but for process -\*/ -void _armci_buf_ensure_one_outstanding_op_per_proc(void *buf, int proc) -{ - int i; - int index = _armci_buf_to_index(buf); - int this = _armci_buf_state->table[index].first; - void _armci_buf_release_index(int i); - - if(_armci_buf_state->table[index].to !=(unsigned int) proc ) - armci_die2("_armci_buf_ensure_one_outstanding_op_per_proc: bad to", proc, - (int)_armci_buf_state->table[index].to); - - for(i=0;itable +i; - if(buf_state->to == (unsigned int) proc) { - if((buf_state->first != (unsigned int) this)&&(buf_state->first==(unsigned int) i) && buf_state->op) { - _armci_buf_complete_index(i,0); - _armci_buf_release_index(i); - } - } - } -} - - -#define HISTORY__ -#ifdef HISTORY -typedef struct{ int size; int op; int count; int id; } history_t; -history_t history[100]; -int h=0; - -void print_history() -{ -int i; - fflush(stdout); - printf("%d records\n",h); - for(i=0; ibuf[history[i].id].buffer, history[i].count, - history[i].op); - - fflush(stdout); -} -#endif - -/*\ call corresponding to GET_SEND_BUF -\*/ -char *_armci_buf_get_small(int size, int operation, int to) -{ -int avail=_armci_buf_state->smavail,i; -_buf_ackresp_t *ar; - if(_armci_buf_state->table[avail].op || - _armci_buf_state->table[avail].first || - _armci_buf_state->smallbuf[avail-MAX_BUFS].id.ar.req.active) { - - for(i=MAX_BUFS;itable[i].op && - !_armci_buf_state->table[i].first && - !_armci_buf_state->smallbuf[i-MAX_BUFS].id.ar.req.active) - break; - } - if(i<(MAX_SMALL_BUFS+MAX_BUFS))avail = i; - else { - _armci_buf_complete_index(avail,1); - } - } - _armci_buf_state->table[avail].op = operation; - _armci_buf_state->table[avail].to = to; - _armci_buf_state->table[avail].count= 1; - _armci_buf_state->table[avail].first = avail; - _armci_buf_state->smallbuf[avail-MAX_BUFS].id.tag=0; - _armci_buf_state->smallbuf[avail-MAX_BUFS].id.bufid= avail; - _armci_buf_state->smallbuf[avail-MAX_BUFS].id.protocol=0; - ar=&_armci_buf_state->smallbuf[avail-MAX_BUFS].id.ar; - assert(ar->val==0);assert(ar->next==NULL);assert(ar->previous==NULL); - # ifdef PORTALS - assert(ar->req.active == 0); - # endif - ar->req.active = 1; - if(_buf_ackresp_cur!=NULL) - _buf_ackresp_cur->next=ar; - if(_buf_ackresp_first==NULL) - _buf_ackresp_first=ar; - ar->previous=_buf_ackresp_cur; - ar->next=NULL; - _buf_ackresp_cur=ar; - - if(DEBUG_ || 0) { - printf("%d:buf_get_sm1:size=%d max=%d got %d ptr=%p op=%d to=%d count=%d first=%d\n", - armci_me,size,SMALL_BUF_LEN,avail, - _armci_buf_state->smallbuf[avail-MAX_BUFS].buffer,operation,to, - (int)_armci_buf_state->table[avail].count,(int)_armci_buf_state->table[avail].first); - fflush(stdout); - } - -# ifdef BUF_EXTRA_FIELD_T - INIT_SEND_BUF(_armci_buf_state->smallbuf[avail-MAX_BUFS].field,_armci_buf_state->table[avail].snd,_armci_buf_state->table[avail].rcv); -#endif - - _armci_buf_state->smavail = (avail+1-MAX_BUFS)%MAX_SMALL_BUFS + MAX_BUFS; - - if(DEBUG_ || 0) { - printf("%d:buf_get_sm:size=%d max=%d got %d ptr=%p op=%d to=%d count=%d first=%d\n", - armci_me,size,SMALL_BUF_LEN,avail, - _armci_buf_state->smallbuf[avail-MAX_BUFS].buffer,operation,to, - _armci_buf_state->table[avail].count,_armci_buf_state->table[avail].first); - fflush(stdout); - } - - return(_armci_buf_state->smallbuf[avail-MAX_BUFS].buffer); - -} - -/*\ call corresponding to GET_SEND_BUF -\*/ -static char *rmo_buffer = NULL; - -char *_armci_buf_get(int size, int operation, int to) -{ -#ifndef PORTALS_USE_ARMCI_CLIENT_BUFFERS - if(rmo_buffer) return rmo_buffer; - rmo_buffer = (char *) valloc(MSG_BUFLEN); - return rmo_buffer; -#else -int avail=_armci_buf_state->avail; -int count=1, i; -_buf_ackresp_t *ar; - - /*if small buffer, we go to another routine that gets smallbuf*/ - if(size MSG_BUFLEN_SMALL) ){ - double val = (double)size; /* use double due to a bug in gcc */ - val /= MSG_BUFLEN_SMALL; - count=(int)val; - if(size%MSG_BUFLEN_SMALL) count++; - assert(0); - } - /* start from 0 if there is not enough bufs available from here */ - if((avail+count) > MAX_BUFS)avail = 0; - - /* avail should never point to buffer in a middle of a set of used bufs */ - if(_armci_buf_state->table[avail].op && - (_armci_buf_state->table[avail].first != (unsigned int) avail)){ sleep(1); - printf("%d: inconsistent first. avail=%d count=%d first=%d size=%d\n", - armci_me, avail, count, _armci_buf_state->table[avail].first, size); - armci_die2("armci_buf_get: inconsistent first", avail, - _armci_buf_state->table[avail].first); - } - - /* we need complete "count" number of buffers */ - for(i=0;itable[cur].op && - _armci_buf_state->table[cur].first==(unsigned int) cur) || - _armci_buf_state->buf[cur].id.ar.req.active) { - _armci_buf_complete_index(cur,1); - } - } - - for(i=0; itable[avail+i].op = operation; - _armci_buf_state->table[avail+i].to = to; - _armci_buf_state->table[avail+i].count= count; - _armci_buf_state->table[avail+i].first = avail; - } - - _armci_buf_state->buf[avail].id.tag=0; - _armci_buf_state->buf[avail].id.bufid=avail; - _armci_buf_state->buf[avail].id.protocol=0; - ar=&_armci_buf_state->buf[avail].id.ar; - - assert(ar->val==0);assert(ar->next==NULL);assert(ar->previous==NULL); - assert(ar->req.active == 0); - - ar->req.active = 1; - - if(_buf_ackresp_cur!=NULL) - _buf_ackresp_cur->next=ar; - if(_buf_ackresp_first==NULL) - _buf_ackresp_first=ar; - ar->previous=_buf_ackresp_cur; - ar->next=NULL; - _buf_ackresp_cur = ar; - -# ifdef BUF_EXTRA_FIELD_T - INIT_SEND_BUF(_armci_buf_state->buf[avail].field,_armci_buf_state->table[avail].snd,_armci_buf_state->table[avail].rcv); -#endif - -#ifdef HISTORY - history[h].size=size; - history[h].op=operation; - history[h].count=count; - history[h].id = avail; - h++; -#endif - - if(DEBUG_ || 0) { - printf("%d:buf_get:size=%d max=%d got %d ptr=%p count=%d op=%d to=%d\n", - armci_me,size,MSG_BUFLEN_SMALL,avail, - _armci_buf_state->buf[avail].buffer, count,operation,to); - fflush(stdout); - } - - /* select candidate buffer for next allocation request */ - _armci_buf_state->avail = avail+count; - _armci_buf_state->avail %= MAX_BUFS; - - return(_armci_buf_state->buf[avail].buffer); -#endif -} - - -void _armci_buf_release_index(int index) { - int count; - buf_state_t *buf_state = _armci_buf_state->table +index; - char *_armci_buf_ptr_from_id(int id); - - if((index >= MAX_BUFS+MAX_SMALL_BUFS)|| (index<0)) - armci_die2("armci_buf_release: bad index:",index,MAX_BUFS); - - count = _armci_buf_state->table[index].count; - - if(DEBUG_ || 0) { - printf("%d:_armci_buf_release_index %d ptr=%p count=%d op=%d smavail=%d\n", - armci_me,index,_armci_buf_ptr_from_id(index),count, _armci_buf_state->table[index].op,_armci_buf_state->smavail); - fflush(stdout); - } - - /* clear table slots for all the buffers in the set for this request */ - for(; count; count--, buf_state++) *(int*)buf_state = 0; - if(index >= MAX_BUFS){ - _armci_buf_state->smallbuf[index-MAX_BUFS].id.tag=0; - //_armci_buf_state->smavail = index; - } - else{ - _armci_buf_state->buf[index].id.tag=0; - // _armci_buf_state->avail = index; - } - /* the current buffer is prime candidate to satisfy next buffer request */ -} - - -/*\ release buffer when it becomes free -\*/ -void _armci_buf_release(void *buf) { -#ifdef PORTALS_USE_ARMCI_CLIENT_BUFFERS - _armci_buf_release_index(_armci_buf_to_index(buf)); -#endif -} - - -/*\ return pointer to buffer number id -\*/ -char *_armci_buf_ptr_from_id(int id) -{ - if(id <0 || id >=(MAX_BUFS+MAX_SMALL_BUFS)) - armci_die2("armci_buf_ptr_from_id: bad id",id,MAX_BUFS); - if(id >=MAX_BUFS)return(_armci_buf_state->smallbuf[id-MAX_BUFS].buffer); - return(_armci_buf_state->buf[id].buffer); -} - - - -/*\function called from PARMCI_Wait to wait for non-blocking ops -\*/ -void _armci_buf_complete_nb_request(int bufid,unsigned int tag, int *retcode) -{ -int i=0; -#if 0 - printf("\n%d:wait called with bufid=%d tag=%d \n",armci_me,bufid,tag); - fflush(stdout); -#endif - - if(bufid == NB_NONE) *retcode=0; - else if(bufid == NB_MULTI) { - for(i=0;ibuf[i].id.tag) - _armci_buf_complete_index(i,1); - } - for(i=0;ismallbuf[i].id.tag) - _armci_buf_complete_index(i+MAX_BUFS,1); - } - *retcode=0; - } - else { - if(bufidbuf[bufid].id.tag) - _armci_buf_complete_index(bufid,1); - } - else{ - if(tag && tag==_armci_buf_state->smallbuf[bufid-MAX_BUFS].id.tag) - _armci_buf_complete_index(bufid,1); - } - *retcode=0; - } -} - - -/*\function called from PARMCI_Test to test completion of non-blocking ops -\*/ -void _armci_buf_test_nb_request(int bufid,unsigned int tag, int *retcode) -{ -int i; - if(bufid == NB_NONE) *retcode=0; - else if(bufid == NB_MULTI) { - for(i=0;ibuf[i].id.tag){ - if(_armci_buf_test_index(i,1)){ - *retcode=1; - break; - } - } - } - for(i=0;ismallbuf[i].id.tag) - if(_armci_buf_test_index(i+MAX_BUFS,1)){ - *retcode=1; - break; - } - } - } - else { - if(bufidbuf[bufid].id.tag) - *retcode = _armci_buf_test_index(bufid,1); - } - else{ - if(tag && tag==_armci_buf_state->smallbuf[bufid-MAX_BUFS].id.tag) - *retcode = _armci_buf_test_index(bufid,1); - } - } -} - -/*\function to set the buffer tag and the protocol -\*/ -void _armci_buf_set_tag(void *bufptr,unsigned int tag,short int protocol) -{ -int index = _armci_buf_to_index(bufptr); - /*_armci_buf_state->table[index].async=1;*/ - if(indexbuf[index].id.tag=tag; - _armci_buf_state->buf[index].id.protocol=protocol; - } - else{ - _armci_buf_state->smallbuf[index-MAX_BUFS].id.tag=tag; - _armci_buf_state->smallbuf[index-MAX_BUFS].id.protocol=protocol; - } -} - -int _armci_buf_get_tag(void *bufptr) -{ -int index = _armci_buf_to_index(bufptr); - if(indexbuf[index].id.tag); - else - return(_armci_buf_state->smallbuf[index-MAX_BUFS].id.tag); -} - -/*\function to return bufinfo, given buf ptr -\*/ -BUF_INFO_T *_armci_buf_to_bufinfo(void *buf){ - if(buf > (void *)_armci_buffers && buf < (void *)(_armci_buffers+MAX_BUFS)){ - return(&((BUF_TO_EBUF(buf))->id)); - } - else if(buf > (void *)_armci_smbuffers && buf < (void *)(_armci_smbuffers+MAX_SMALL_BUFS)){ - return(&((BUF_TO_SMEBUF(buf))->id)); - } - else { - armci_die("armci_buf_to_index: bad pointer",0); - return(0); - } -} - -/*\function to clear all buffers -\*/ -void _armci_buf_clear_all() -{ -int i; - for(i=0;itable[i].op || _armci_buf_state->table[i].first) - CLEAR_SEND_BUF_FIELD(_armci_buf_state->buf[i].field,_armci_buf_state->table[i].snd,_armci_buf_state->table[i].rcv,_armci_buf_state->table[i].to,_armci_buf_state->table[i].op); -#endif - } - for(i=MAX_BUFS;itable[i].op || _armci_buf_state->table[i].first) - CLEAR_SEND_BUF_FIELD(_armci_buf_state->smallbuf[i-MAX_BUFS].field,_armci_buf_state->table[i].snd,_armci_buf_state->table[i].rcv,_armci_buf_state->table[i].to,_armci_buf_state->table[i].op); -#endif - } -} - -/* function to return bufinfo, given buf tag */ -BUF_INFO_T *_armci_tag_to_bufinfo(msg_tag_t tag) { - int idx; - - for (idx=0; idx < MAX_BUFS; idx++) - if (EQ_TAGS(_armci_buffers[idx].id.tag, tag)) break; - - if (idx == MAX_BUFS) {/* not found is regular buffers */ - for (idx = 0; idx < MAX_SMALL_BUFS; idx++) - if (EQ_TAGS(_armci_smbuffers[idx].id.tag, tag)) break; - if (idx == MAX_SMALL_BUFS) /* not found at all */ - armci_die("_armci_tag_to_bufinfo: bad tag",0); - - return &(_armci_smbuffers[idx].id); - } else return &(_armci_buffers[idx].id); -} - - -/* inline primitives for buffer state management */ -INLINE char *_armci_buf_get_clear_busy(int size, int operation, int to) { - char *buf = _armci_buf_get(size, operation, to); - _armci_buf_set_busy(buf, 0); - return buf; -} - -INLINE void _armci_buf_set_busy(void *buf, int state) { - _armci_buf_state->table[_armci_buf_to_index(buf)].busy = state; -} - -INLINE void _armci_buf_set_busy_idx(int idx, int state) { - _armci_buf_state->table[idx].busy = state; -} - -#if 0 -INLINE int _armci_buf_cmpld(void *buf) { - return _armci_buf_state->table[_armci_buf_to_index(buf)].cmpl; -} -#else -INLINE int _armci_buf_cmpld(int bufid) { - return _armci_buf_state->table[bufid].cmpl; -} -#endif - - -INLINE void _armci_buf_set_cmpld(void *buf, int state) { - _armci_buf_state->table[_armci_buf_to_index(buf)].cmpl = state; -} - -INLINE void _armci_buf_set_cmpld_idx(int idx, int state) { - _armci_buf_state->table[idx].cmpl = state; -} - - diff --git a/armci/src-portals/caccumulate.c b/armci/src-portals/caccumulate.c deleted file mode 100644 index c4a05324c..000000000 --- a/armci/src-portals/caccumulate.c +++ /dev/null @@ -1,798 +0,0 @@ -/*************************************************************************** - - COPYRIGHT - -The following is a notice of limited availability of the code, and disclaimer -which must be included in the prologue of the code and in all source listings -of the code. - -Copyright Notice - + 2009 University of Chicago - -Permission is hereby granted to use, reproduce, prepare derivative works, and -to redistribute to others. This software was authored by: - -Jeff R. Hammond -Leadership Computing Facility -Argonne National Laboratory -Argonne IL 60439 USA -phone: (630) 252-5381 -e-mail: jhammond@anl.gov - - GOVERNMENT LICENSE - -Portions of this material resulted from work developed under a U.S. -Government Contract and are subject to the following license: the Government -is granted for itself and others acting on its behalf a paid-up, nonexclusive, -irrevocable worldwide license in this computer software to reproduce, prepare -derivative works, and perform publicly and display publicly. - - DISCLAIMER - -This computer code material was prepared, in part, as an account of work -sponsored by an agency of the United States Government. Neither the United -States, nor the University of Chicago, nor any of their employees, makes any -warranty express or implied, or assumes any legal liability or responsibility -for the accuracy, completeness, or usefulness of any information, apparatus, -product, or process disclosed, or represents that its use would not infringe -privately owned rights. - - ***************************************************************************/ - -/*********************************************************************** - * accumulate operation for the following datatypes: - * real, double precision, complex, double complex, integer - * - * WARNING: This file must be compiled WITH optimization under AIX. - * IBM fortran compilers generate bad code with -g option. - * - * Two versions of each routine are provided: - * original and unrolled loops. - * - ***********************************************************************/ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include "acc.h" - -#if 0 - subroutine d_accumulate_1d(alpha, A, B, rows) - integer rows, r - double precision A(*), B(*), alpha -ccdir$ no_cache_alloc a,b - do r = 1, rows - A(r) = A(r)+ alpha*B(r) - enddo - end -#endif - -void c_d_accumulate_1d_(const double* const restrict alpha, - double* restrict A, - const double* const restrict B, - const int* const restrict rows) -{ - int i; - for ( i = 0 ; i < (*rows) ; i++ ){ - A[i] += (*alpha) * B[i]; - } - return; -} - - -#if 0 - subroutine d_accumulate_2d(alpha, rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - double precision A(ald,*), B(bld,*), alpha -ccdir$ no_cache_alloc a,b - do c = 1, cols - do r = 1, rows - A(r,c) = A(r,c)+ alpha*B(r,c) - enddo - enddo - end -#endif - -void c_d_accumulate_2d_(const double* const alpha, - const int* const rows, - const int* const cols, - double* restrict A, - const int* const ald, - const double* const B, - const int* const bld) -{ - int r, c; - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - } - } - return; -} - -#if 0 - subroutine f_accumulate_1d(alpha, A, B, rows) - integer rows, r - real A(*), B(*), alpha - do r = 1, rows - A(r) = A(r)+ alpha*B(r) - enddo - end -#endif - -void c_f_accumulate_1d_(const float* const restrict alpha, - float* const restrict A, - const float* const restrict B, - const int* const restrict rows) -{ - int i; - for ( i = 0 ; i < (*rows) ; i++ ){ - A[i] += (*alpha) * B[i]; - } - return; -} - -#if 0 - subroutine f_accumulate_2d(alpha, rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - real A(ald,*), B(bld,*), alpha - do c = 1, cols - do r = 1, rows - A(r,c) = A(r,c)+ alpha*B(r,c) - enddo - enddo - end -#endif - -void c_f_accumulate_2d_(const float* const alpha, - const int* const rows, - const int* const cols, - float* restrict A, - const int* const ald, - const float* const B, - const int* const bld) -{ - int r, c; - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - } - } - return; -} - -#if 0 - subroutine z_accumulate_1d(alpha, A, B, rows) - integer rows, r - double complex A(*), B(*), alpha - do r = 1, rows - A(r) = A(r)+ alpha*B(r) - enddo - end -#endif - -void c_c_accumulate_1d_(const complex_t* const restrict alpha, - complex_t* const restrict A, - const complex_t* const restrict B, - const int* const restrict rows) -{ - int i; - for ( i = 0 ; i < (*rows) ; i++ ){ - A[i].real += (*alpha).real * B[i].real - (*alpha).imag * B[i].imag; - A[i].imag += (*alpha).imag * B[i].real + (*alpha).real * B[i].imag; - } - return; -} - -#if 0 - subroutine z_accumulate_2d(alpha, rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - double complex A(ald,*), B(bld,*), alpha - do c = 1, cols - do r = 1, rows - A(r,c) = A(r,c)+ alpha*B(r,c) - enddo - enddo - end -#endif - -void c_c_accumulate_2d_(const complex_t* const alpha, - const int* const rows, - const int* const cols, - complex_t* restrict A, - const int* const ald, - const complex_t* const B, - const int* const bld) -{ - int r, c; - for ( c = 0 ; c < (*cols) ; c++ ) { - for ( r = 0 ; r < (*rows) ; r++ ) { - A[ c * (*ald) + r ].real += (*alpha).real * B[ c * (*bld) + r ].real - (*alpha).imag * B[ c * (*bld) + r ].imag; - A[ c * (*ald) + r ].imag += (*alpha).imag * B[ c * (*bld) + r ].real + (*alpha).real * B[ c * (*bld) + r ].imag; - } - } - return; -} - -#if 0 - subroutine c_accumulate_1d(alpha, A, B, rows) - integer rows, r - complex A(*), B(*), alpha - do r = 1, rows - A(r) = A(r)+ alpha*B(r) - enddo - end -#endif - -void c_z_accumulate_1d_(const dcomplex_t* const restrict alpha, - dcomplex_t* const restrict A, - const dcomplex_t* const restrict B, - const int* const restrict rows) -{ - int i; - for ( i = 0 ; i < (*rows) ; i++ ){ - A[i].real += (*alpha).real * B[i].real - (*alpha).imag * B[i].imag; - A[i].imag += (*alpha).imag * B[i].real + (*alpha).real * B[i].imag; - } - return; -} - -#if 0 - subroutine c_accumulate_2d(alpha, rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - complex A(ald,*), B(bld,*), alpha - do c = 1, cols - do r = 1, rows - A(r,c) = A(r,c)+ alpha*B(r,c) - enddo - enddo - end -#endif - -void c_z_accumulate_2d_(const dcomplex_t* const alpha, - const int* const rows, - const int* const cols, - dcomplex_t* restrict A, - const int* const ald, - const dcomplex_t* const B, - const int* const bld) -{ - int r, c; - for ( c = 0 ; c < (*cols) ; c++ ) { - for ( r = 0 ; r < (*rows) ; r++ ) { - A[ c * (*ald) + r ].real += (*alpha).real * B[ c * (*bld) + r ].real - (*alpha).imag * B[ c * (*bld) + r ].imag; - A[ c * (*ald) + r ].imag += (*alpha).imag * B[ c * (*bld) + r ].real + (*alpha).real * B[ c * (*bld) + r ].imag; - } - } - return; -} - -#if 0 - subroutine i_accumulate_2d(alpha, rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - integer A(ald,*), B(bld,*), alpha - do c = 1, cols - do r = 1, rows - A(r,c) = A(r,c)+ alpha*B(r,c) - enddo - enddo - end -#endif - -void c_i_accumulate_1d_(const int* const restrict alpha, - int* const restrict A, - const int* const restrict B, - const int* const restrict rows) -{ - int i; - for ( i = 0 ; i < (*rows) ; i++ ){ - A[i] += (*alpha) * B[i]; - } - return; -} - -void c_l_accumulate_1d_(const long* const restrict alpha, - long* const restrict A, - const long* const restrict B, - const int* const restrict rows) -{ - int i; - for ( i = 0 ; i < (*rows) ; i++ ){ - A[i] += (*alpha) * B[i]; - } - return; -} - -void c_ll_accumulate_1d_(const long long* const restrict alpha, - long long* const restrict A, - const long long* const restrict B, - const int* const restrict rows) -{ - int i; - for ( i = 0 ; i < (*rows) ; i++ ){ - A[i] += (*alpha) * B[i]; - } - return; -} - -#if 0 - subroutine i_accumulate_1d(alpha, A, B, rows) - integer rows, r - integer A(*), B(*), alpha - do r = 1, rows - A(r) = A(r)+ alpha*B(r) - enddo - end -#endif - -void c_i_accumulate_2d_(const int* const alpha, - const int* const rows, - const int* const cols, - int* restrict A, - const int* const ald, - const int* const B, - const int* const bld) -{ - int r, c; - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - } - } - return; -} - -void c_l_accumulate_2d_(const long* const alpha, - const int* const rows, - const int* const cols, - long* restrict A, - const int* const ald, - const long* const B, - const int* const bld) -{ - int r, c; - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - } - } - return; -} - -void c_ll_accumulate_2d_(const long long* const alpha, - const int* const rows, - const int* const cols, - long long* restrict A, - const int* const ald, - const long long* const B, - const int* const bld) -{ - int r, c; - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - } - } - return; -} - -#if 0 - subroutine d_accumulate_2d_u(alpha, rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - double precision A(ald,*), B(bld,*), alpha - integer r1 - doubleprecision d1, d2, d3, d4 - do c = 1, cols - r1 = iand(max0(rows,0),3) - do r = 1, r1 - a(r,c) = a(r,c) + alpha*b(r,c) - end do - do r = r1 + 1, rows, 4 - d1 = a(r,c) + alpha*b(r,c) - d2 = a(r+1,c) + alpha*b(r+1,c) - d3 = a(r+2,c) + alpha*b(r+2,c) - d4 = a(r+3,c) + alpha*b(r+3,c) - a(r,c) = d1 - a(r+1,c) = d2 - a(r+2,c) = d3 - a(r+3,c) = d4 - enddo - enddo - end -#endif - -void c_d_accumulate_2d_u_(const double* const alpha, - const int* const rows, - const int* const cols, - double* restrict A, - const int* const ald, - const double* const B, - const int* const bld) -{ - int r, c; - int m = (*rows) - ((*rows)%4); - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < m ; r+=4 ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - A[ c * (*ald) + r+1 ] += (*alpha) * B[ c * (*bld) + r+1 ]; - A[ c * (*ald) + r+2 ] += (*alpha) * B[ c * (*bld) + r+2 ]; - A[ c * (*ald) + r+3 ] += (*alpha) * B[ c * (*bld) + r+3 ]; - } - for ( r = m ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - } - } - return; -} - -#if 0 - subroutine f_accumulate_2d_u(alpha, rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - real A(ald,*), B(bld,*), alpha - integer r1 - real d1, d2, d3, d4 - do c = 1, cols - r1 = iand(max0(rows,0),3) - do r = 1, r1 - a(r,c) = a(r,c) + alpha*b(r,c) - end do - do r = r1 + 1, rows, 4 - d1 = a(r,c) + alpha*b(r,c) - d2 = a(r+1,c) + alpha*b(r+1,c) - d3 = a(r+2,c) + alpha*b(r+2,c) - d4 = a(r+3,c) + alpha*b(r+3,c) - a(r,c) = d1 - a(r+1,c) = d2 - a(r+2,c) = d3 - a(r+3,c) = d4 - enddo - enddo - end -#endif - -void c_f_accumulate_2d_u_(const float* const alpha, - const int* const rows, - const int* const cols, - float* restrict A, - const int* const ald, - const float* const B, - const int* const bld) -{ - int r, c; - int m = (*rows) - ((*rows)%4); - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < m ; r+=4 ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - A[ c * (*ald) + r+1 ] += (*alpha) * B[ c * (*bld) + r+1 ]; - A[ c * (*ald) + r+2 ] += (*alpha) * B[ c * (*bld) + r+2 ]; - A[ c * (*ald) + r+3 ] += (*alpha) * B[ c * (*bld) + r+3 ]; - } - for ( r = m ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - } - } - return; -} - -#if 0 - subroutine z_accumulate_2d_u(alpha, rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - double complex A(ald,*), B(bld,*), alpha - integer r1 - double complex x1, x2, x3, x4 - do c = 1, cols - r1 = iand(max0(rows,0),3) - do r = 1, r1 - a(r,c) = a(r,c) + alpha*b(r,c) - end do - do r = r1 + 1, rows, 4 - x1 = a(r,c) + alpha*b(r,c) - x2 = a(r+1,c) + alpha*b(r+1,c) - x3 = a(r+2,c) + alpha*b(r+2,c) - x4 = a(r+3,c) + alpha*b(r+3,c) - a(r,c) = x1 - a(r+1,c) = x2 - a(r+2,c) = x3 - a(r+3,c) = x4 - enddo - enddo - end -#endif - -void c_c_accumulate_2d_u_(const complex_t* const alpha, - const int* const rows, - const int* const cols, - complex_t* restrict A, - const int* const ald, - const complex_t* const B, - const int* const bld) -{ - int r, c; - int jA, jB; - int m = (*rows) - ((*rows)%4); - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < m ; r+=4 ){ - jA = c * (*ald) + r; - jB = c * (*bld) + r; - A[ jA ].real += (*alpha).real * B[ jB ].real - (*alpha).imag * B[ jB ].imag; - A[ jA ].imag += (*alpha).imag * B[ jB ].real + (*alpha).real * B[ jB ].imag; - A[ jA+1 ].real += (*alpha).real * B[ jB+1 ].real - (*alpha).imag * B[ jB+1 ].imag; - A[ jA+1 ].imag += (*alpha).imag * B[ jB+1 ].real + (*alpha).real * B[ jB+1 ].imag; - A[ jA+2 ].real += (*alpha).real * B[ jB+2 ].real - (*alpha).imag * B[ jB+2 ].imag; - A[ jA+2 ].imag += (*alpha).imag * B[ jB+2 ].real + (*alpha).real * B[ jB+2 ].imag; - A[ jA+3 ].real += (*alpha).real * B[ jB+3 ].real - (*alpha).imag * B[ jB+3 ].imag; - A[ jA+3 ].imag += (*alpha).imag * B[ jB+3 ].real + (*alpha).real * B[ jB+3 ].imag; - } - for ( r = m ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ].real += (*alpha).real * B[ c * (*bld) + r ].real - (*alpha).imag * B[ c * (*bld) + r ].imag; - A[ c * (*ald) + r ].imag += (*alpha).imag * B[ c * (*bld) + r ].real + (*alpha).real * B[ c * (*bld) + r ].imag; - } - } - return; -} - -#if 0 - subroutine c_accumulate_2d_u(alpha, rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - complex A(ald,*), B(bld,*), alpha - integer r1 - complex x1, x2, x3, x4 - do c = 1, cols - r1 = iand(max0(rows,0),3) - do r = 1, r1 - a(r,c) = a(r,c) + alpha*b(r,c) - end do - do r = r1 + 1, rows, 4 - x1 = a(r,c) + alpha*b(r,c) - x2 = a(r+1,c) + alpha*b(r+1,c) - x3 = a(r+2,c) + alpha*b(r+2,c) - x4 = a(r+3,c) + alpha*b(r+3,c) - a(r,c) = x1 - a(r+1,c) = x2 - a(r+2,c) = x3 - a(r+3,c) = x4 - enddo - enddo - end -#endif - -void c_z_accumulate_2d_u_(const dcomplex_t* const alpha, - const int* const rows, - const int* const cols, - dcomplex_t* restrict A, - const int* const ald, - const dcomplex_t* const B, - const int* const bld) -{ - int r, c; - int jA, jB; - int m = (*rows) - ((*rows)%4); - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < m ; r+=4 ){ - jA = c * (*ald) + r; - jB = c * (*bld) + r; - A[ jA ].real += (*alpha).real * B[ jB ].real - (*alpha).imag * B[ jB ].imag; - A[ jA ].imag += (*alpha).imag * B[ jB ].real + (*alpha).real * B[ jB ].imag; - A[ jA+1 ].real += (*alpha).real * B[ jB+1 ].real - (*alpha).imag * B[ jB+1 ].imag; - A[ jA+1 ].imag += (*alpha).imag * B[ jB+1 ].real + (*alpha).real * B[ jB+1 ].imag; - A[ jA+2 ].real += (*alpha).real * B[ jB+2 ].real - (*alpha).imag * B[ jB+2 ].imag; - A[ jA+2 ].imag += (*alpha).imag * B[ jB+2 ].real + (*alpha).real * B[ jB+2 ].imag; - A[ jA+3 ].real += (*alpha).real * B[ jB+3 ].real - (*alpha).imag * B[ jB+3 ].imag; - A[ jA+3 ].imag += (*alpha).imag * B[ jB+3 ].real + (*alpha).real * B[ jB+3 ].imag; - } - for ( r = m ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ].real += (*alpha).real * B[ c * (*bld) + r ].real - (*alpha).imag * B[ c * (*bld) + r ].imag; - A[ c * (*ald) + r ].imag += (*alpha).imag * B[ c * (*bld) + r ].real + (*alpha).real * B[ c * (*bld) + r ].imag; - } - } - return; -} - -#if 0 - subroutine i_accumulate_2d_u(alpha, rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - integer A(ald,*), B(bld,*), alpha - - integer r1, j2, j3, j4, j5 - do c = 1, cols - r1 = iand(max0(rows,0),3) - do r = 1, r1 - a(r,c) = a(r,c) + alpha*b(r,c) - end do - do r = r1 + 1, rows, 4 - j2 = a(r,c) + alpha*b(r,c) - j3 = a(r+1,c) + alpha*b(r+1,c) - j4 = a(r+2,c) + alpha*b(r+2,c) - j5 = a(r+3,c) + alpha*b(r+3,c) - a(r,c) = j2 - a(r+1,c) = j3 - a(r+2,c) = j4 - a(r+3,c) = j5 - enddo - enddo - end -#endif - -void c_i_accumulate_2d_u_(const int* const alpha, - const int* const rows, - const int* const cols, - int* restrict A, - const int* const ald, - const int* const B, - const int* const bld) -{ - int r, c; - int m = (*rows) - ((*rows)%4); - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < m ; r+=4 ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - A[ c * (*ald) + r+1 ] += (*alpha) * B[ c * (*bld) + r+1 ]; - A[ c * (*ald) + r+2 ] += (*alpha) * B[ c * (*bld) + r+2 ]; - A[ c * (*ald) + r+3 ] += (*alpha) * B[ c * (*bld) + r+3 ]; - } - for ( r = m ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - } - } - return; -} - -void c_l_accumulate_2d_u_(const long* const alpha, - const int* const rows, - const int* const cols, - long* restrict A, - const int* const ald, - const long* const B, - const int* const bld) -{ - int r, c; - int m = (*rows) - ((*rows)%4); - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < m ; r+=4 ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - A[ c * (*ald) + r+1 ] += (*alpha) * B[ c * (*bld) + r+1 ]; - A[ c * (*ald) + r+2 ] += (*alpha) * B[ c * (*bld) + r+2 ]; - A[ c * (*ald) + r+3 ] += (*alpha) * B[ c * (*bld) + r+3 ]; - } - for ( r = m ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - } - } - return; -} - -void c_ll_accumulate_2d_u_(const long long* const alpha, - const int* const rows, - const int* const cols, - long long* restrict A, - const int* const ald, - const long long* const B, - const int* const bld) -{ - int r, c; - int m = (*rows) - ((*rows)%4); - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < m ; r+=4 ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - A[ c * (*ald) + r+1 ] += (*alpha) * B[ c * (*bld) + r+1 ]; - A[ c * (*ald) + r+2 ] += (*alpha) * B[ c * (*bld) + r+2 ]; - A[ c * (*ald) + r+3 ] += (*alpha) * B[ c * (*bld) + r+3 ]; - } - for ( r = m ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ]; - } - } - return; -} - -#if 0 -c---------- operations used in armci gops -------------- -c - subroutine fort_dadd(n, x, work) - integer n,i - double precision x(n), work(n) - do i= 1,n - x(i) = x(i) + work(i) - enddo - end -#endif - -void c_dadd_(const int* const restrict n, - double* const restrict x, - const double* const restrict work) -{ - int i; - for ( i = 0 ; i < (*n) ; i++ ){ - x[i] += work[i]; - } - return; -} - -#if 0 - subroutine fort_dadd2(n, x, work, work2) - integer n,i - double precision x(n), work(n), work2(n) - do i= 1,n - x(i) = work(i) + work2(i) - enddo - end -#endif - -void c_dadd2_(const int* const restrict n, - double* const restrict x, - const double* const restrict work, - const double* const restrict work2) -{ - int i; - for ( i = 0 ; i < (*n) ; i++ ){ - x[i] = work[i] + work2[i]; - } - return; -} - -#if 0 - subroutine fort_dmult(n, x, work) - integer n,i - double precision x(n), work(n) - do i= 1,n - x(i) = x(i) * work(i) - enddo - end -#endif - -void c_dmult_(const int* const restrict n, - double* const restrict x, - const double* const restrict work) -{ - int i; - for ( i = 0 ; i < (*n) ; i++ ){ - x[i] *= work[i]; - } - return; -} - -#if 0 - subroutine fort_dmult2(n, x, work,work2) - integer n,i - double precision x(n), work(n) - do i= 1,n - x(i) = work(i)*work2(i) - enddo - end -#endif - -void c_dmult2_(const int* const restrict n, - double* const restrict x, - const double* const restrict work, - const double* const restrict work2) -{ - int i; - for ( i = 0 ; i < (*n) ; i++ ){ - x[i] = work[i] * work2[i]; - } - return; -} - - -// specific to src-portals && to src-gemini -void RA_ACCUMULATE_2D_(long* alpha, int* rows, int* cols, long* a, - int* lda, long* b, int* ldb) -{ -int i,j; - for(j=0;j< *cols; j++){ - long *aa = a + j* *lda; - long *bb = b + j* *ldb; - for(i=0;i< *rows; i++) - aa[i] ^= bb[i]; - } -} diff --git a/armci/src-portals/capi.c b/armci/src-portals/capi.c deleted file mode 100644 index 007618131..000000000 --- a/armci/src-portals/capi.c +++ /dev/null @@ -1,438 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - - -#if HAVE_STDIO_H -# include -#endif -#include "armci.h" -#include "parmci.h" - -int -ARMCI_AccV (int op, void *scale, armci_giov_t * darr, int len, int proc) -{ - int rval; - rval = PARMCI_AccV (op, scale, darr, len, proc); - return rval; -} - -void -ARMCI_Barrier () -{ - PARMCI_Barrier (); -} - -int -ARMCI_AccS (int optype, void *scale, void *src_ptr, int *src_stride_arr, - void *dst_ptr, int *dst_stride_arr, int *count, int stride_levels, - int proc) -{ - int rval; - rval = - PARMCI_AccS (optype, scale, src_ptr, src_stride_arr, dst_ptr, - dst_stride_arr, count, stride_levels, proc); - return rval; -} - -void -ARMCI_Finalize () -{ - PARMCI_Finalize (); -} - -int -ARMCI_NbPut (void *src, void *dst, int bytes, int proc, - armci_hdl_t * nb_handle) -{ - int rval; - rval = PARMCI_NbPut (src, dst, bytes, proc, nb_handle); - return rval; -} - -int -ARMCI_GetValueInt (void *src, int proc) -{ - int rval; - rval = PARMCI_GetValueInt (src, proc); - return rval; -} - -int -ARMCI_Put_flag (void *src, void *dst, int bytes, int *f, int v, int proc) -{ - int rval; - rval = PARMCI_Put_flag (src, dst, bytes, f, v, proc); - return rval; -} - -int -ARMCI_NbGetS (void *src_ptr, int *src_stride_arr, void *dst_ptr, - int *dst_stride_arr, int *count, int stride_levels, int proc, - armci_hdl_t * nb_handle) -{ - int rval; - rval = - PARMCI_NbGetS (src_ptr, src_stride_arr, dst_ptr, dst_stride_arr, count, - stride_levels, proc, nb_handle); - return rval; -} - -void * -ARMCI_Malloc_local (armci_size_t bytes) -{ - void *rval; - rval = PARMCI_Malloc_local (bytes); - return rval; -} - -int -ARMCI_Free_local (void *ptr) -{ - int rval; - rval = PARMCI_Free_local (ptr); - return rval; -} - -int -ARMCI_Get (void *src, void *dst, int bytes, int proc) -{ - int rval; - rval = PARMCI_Get (src, dst, bytes, proc); - return rval; -} - -int -ARMCI_Put (void *src, void *dst, int bytes, int proc) -{ - int rval; - rval = PARMCI_Put (src, dst, bytes, proc); - return rval; -} - -int -ARMCI_Destroy_mutexes () -{ - int rval; - rval = PARMCI_Destroy_mutexes (); - return rval; -} - -int -ARMCI_GetS (void *src_ptr, int *src_stride_arr, void *dst_ptr, - int *dst_stride_arr, int *count, int stride_levels, int proc) -{ - int rval; - rval = - PARMCI_GetS (src_ptr, src_stride_arr, dst_ptr, dst_stride_arr, count, - stride_levels, proc); - return rval; -} - -int -ARMCI_NbAccV (int op, void *scale, armci_giov_t * darr, int len, int proc, - armci_hdl_t * nb_handle) -{ - int rval; - rval = PARMCI_NbAccV (op, scale, darr, len, proc, nb_handle); - return rval; -} - -float -ARMCI_GetValueFloat (void *src, int proc) -{ - float rval; - rval = PARMCI_GetValueFloat (src, proc); - return rval; -} - -int -ARMCI_Malloc (void **ptr_arr, armci_size_t bytes) -{ - int rval; - rval = PARMCI_Malloc (ptr_arr, bytes); - return rval; -} - -int -ARMCI_Malloc_memdev (void **ptr_arr, armci_size_t bytes, const char *device) -{ - int rval; - rval = PARMCI_Malloc (ptr_arr, bytes); - return rval; -} - - -int -ARMCI_NbAccS (int optype, void *scale, void *src_ptr, int *src_stride_arr, - void *dst_ptr, int *dst_stride_arr, int *count, - int stride_levels, int proc, armci_hdl_t * nb_handle) -{ - int rval; - rval = - PARMCI_NbAccS (optype, scale, src_ptr, src_stride_arr, dst_ptr, - dst_stride_arr, count, stride_levels, proc, nb_handle); - return rval; -} - -int -ARMCI_PutS (void *src_ptr, int *src_stride_arr, void *dst_ptr, - int *dst_stride_arr, int *count, int stride_levels, int proc) -{ - int rval; - rval = - PARMCI_PutS (src_ptr, src_stride_arr, dst_ptr, dst_stride_arr, count, - stride_levels, proc); - return rval; -} - -void * -ARMCI_Memat (armci_meminfo_t * meminfo, long offset) -{ - void *rval; - rval = PARMCI_Memat (meminfo, offset); - return rval; -} - -int -ARMCI_PutV (armci_giov_t * darr, int len, int proc) -{ - int rval; - rval = PARMCI_PutV (darr, len, proc); - return rval; -} - -int -ARMCI_Free (void *ptr) -{ - int rval; - rval = PARMCI_Free (ptr); - return rval; -} - -int -ARMCI_Free_memdev (void *ptr) -{ - int rval; - rval = PARMCI_Free_memdev (ptr); - return rval; -} - -int -ARMCI_Init_args (int *argc, char ***argv) -{ - int rval; - rval = PARMCI_Init_args (argc, argv); - return rval; -} - -int -ARMCI_PutValueInt (int src, void *dst, int proc) -{ - int rval; - rval = PARMCI_PutValueInt (src, dst, proc); - return rval; -} - -void -ARMCI_Memget (size_t bytes, armci_meminfo_t * meminfo, int memflg) -{ - PARMCI_Memget (bytes, meminfo, memflg); -} - -void -ARMCI_AllFence () -{ - PARMCI_AllFence (); -} - -int -ARMCI_NbPutV (armci_giov_t * darr, int len, int proc, armci_hdl_t * nb_handle) -{ - int rval; - rval = PARMCI_NbPutV (darr, len, proc, nb_handle); - return rval; -} - -int -ARMCI_PutValueDouble (double src, void *dst, int proc) -{ - int rval; - rval = PARMCI_PutValueDouble (src, dst, proc); - return rval; -} - -int -ARMCI_GetV (armci_giov_t * darr, int len, int proc) -{ - int rval; - rval = PARMCI_GetV (darr, len, proc); - return rval; -} - -int -ARMCI_Test (armci_hdl_t * nb_handle) -{ - int rval; - rval = PARMCI_Test (nb_handle); - return rval; -} - -void -ARMCI_Unlock (int mutex, int proc) -{ - PARMCI_Unlock (mutex, proc); -} - -void -ARMCI_Fence (int proc) -{ - PARMCI_Fence (proc); -} - -int -ARMCI_Create_mutexes (int num) -{ - int rval; - rval = PARMCI_Create_mutexes (num); - return rval; -} - -int -ARMCI_PutS_flag (void *src_ptr, int *src_stride_arr, void *dst_ptr, - int *dst_stride_arr, int *count, int stride_levels, - int *flag, int val, int proc) -{ - int rval; - rval = - PARMCI_PutS_flag (src_ptr, src_stride_arr, dst_ptr, dst_stride_arr, count, - stride_levels, flag, val, proc); - return rval; -} - -int -ARMCI_WaitProc (int proc) -{ - int rval; - rval = PARMCI_WaitProc (proc); - return rval; -} - -void -ARMCI_Lock (int mutex, int proc) -{ - PARMCI_Lock (mutex, proc); -} - -double -ARMCI_GetValueDouble (void *src, int proc) -{ - double rval; - rval = PARMCI_GetValueDouble (src, proc); - return rval; -} - -int -ARMCI_NbGetV (armci_giov_t * darr, int len, int proc, armci_hdl_t * nb_handle) -{ - int rval; - rval = PARMCI_NbGetV (darr, len, proc, nb_handle); - return rval; -} - -int -ARMCI_Rmw (int op, int *ploc, int *prem, int extra, int proc) -{ - int rval; - rval = PARMCI_Rmw (op, ploc, prem, extra, proc); - return rval; -} - -int -ARMCI_Init () -{ - int rval; - rval = PARMCI_Init (); - return rval; -} - -int -ARMCI_Init_mpi_comm (MPI_Comm comm) -{ - int rval; - rval = PARMCI_Init_mpi_comm (comm); - return rval; -} - -int -ARMCI_WaitAll () -{ - int rval; - rval = PARMCI_WaitAll (); - return rval; -} - -int -ARMCI_NbGet (void *src, void *dst, int bytes, int proc, - armci_hdl_t * nb_handle) -{ - int rval; - rval = PARMCI_NbGet (src, dst, bytes, proc, nb_handle); - return rval; -} - -int -ARMCI_PutValueFloat (float src, void *dst, int proc) -{ - int rval; - rval = PARMCI_PutValueFloat (src, dst, proc); - return rval; -} - -int -ARMCI_NbPutS (void *src_ptr, int *src_stride_arr, void *dst_ptr, - int *dst_stride_arr, int *count, int stride_levels, int proc, - armci_hdl_t * nb_handle) -{ - int rval; - rval = - PARMCI_NbPutS (src_ptr, src_stride_arr, dst_ptr, dst_stride_arr, count, - stride_levels, proc, nb_handle); - return rval; -} - -int -ARMCI_PutS_flag_dir (void *src_ptr, int *src_stride_arr, void *dst_ptr, - int *dst_stride_arr, int *count, int stride_levels, - int *flag, int val, int proc) -{ - int rval; - rval = - PARMCI_PutS_flag_dir (src_ptr, src_stride_arr, dst_ptr, dst_stride_arr, - count, stride_levels, flag, val, proc); - return rval; -} - -int -ARMCI_PutValueLong (long src, void *dst, int proc) -{ - int rval; - rval = PARMCI_PutValueLong (src, dst, proc); - return rval; -} - -int -ARMCI_Wait (armci_hdl_t * nb_handle) -{ - int rval; - rval = PARMCI_Wait (nb_handle); - return rval; -} - -long -ARMCI_GetValueLong (void *src, int proc) -{ - long rval; - rval = PARMCI_GetValueLong (src, proc); - return rval; -} diff --git a/armci/src-portals/ccopy.c b/armci/src-portals/ccopy.c deleted file mode 100644 index e99f8f025..000000000 --- a/armci/src-portals/ccopy.c +++ /dev/null @@ -1,337 +0,0 @@ -/*************************************************************************** - - COPYRIGHT - -The following is a notice of limited availability of the code, and disclaimer -which must be included in the prologue of the code and in all source listings -of the code. - -Copyright Notice - + 2009 University of Chicago - -Permission is hereby granted to use, reproduce, prepare derivative works, and -to redistribute to others. This software was authored by: - -Jeff R. Hammond -Leadership Computing Facility -Argonne National Laboratory -Argonne IL 60439 USA -phone: (630) 252-5381 -e-mail: jhammond@anl.gov - - GOVERNMENT LICENSE - -Portions of this material resulted from work developed under a U.S. -Government Contract and are subject to the following license: the Government -is granted for itself and others acting on its behalf a paid-up, nonexclusive, -irrevocable worldwide license in this computer software to reproduce, prepare -derivative works, and perform publicly and display publicly. - - DISCLAIMER - -This computer code material was prepared, in part, as an account of work -sponsored by an agency of the United States Government. Neither the United -States, nor the University of Chicago, nor any of their employees, makes any -warranty express or implied, or assumes any legal liability or responsibility -for the accuracy, completeness, or usefulness of any information, apparatus, -product, or process disclosed, or represents that its use would not infringe -privately owned rights. - - ***************************************************************************/ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include "copy.h" - -/* ONE-DIMENSIONAL COPY OPERATIONS */ - -#if 0 - subroutine dcopy1d_n(A, B, n) - integer n,i - double precision A(n), B(n) -ccdir$ no_cache_alloc a,b - do i = 1, n - B(i) = A(i) - end do - end -#endif - -void c_dcopy1d_n_(const double* const restrict A, - double* const restrict B, - const int* const restrict n) -{ - int i; - for ( i = 0 ; i < (*n) ; i++ ){ - B[i] = A[i]; - } - return; -} - -#if 0 - subroutine dcopy1d_u(A, B, n) - integer n,n1,i - double precision A(n), B(n) - double precision d1, d2, d3, d4 - n1 = iand(max0(n,0),3) - do i = 1, n1 - B(i) = A(i) - end do - do i = n1+1, n, 4 - d1 = a(i) - d2 = a(i+1) - d3 = a(i+2) - d4 = a(i+3) - b(i) = d1 - b(i+1) = d2 - b(i+2) = d3 - b(i+3) = d4 - end do - end -#endif - -void c_dcopy1d_u_(const double* const restrict A, - double* const restrict B, - const int* const restrict n) -{ - int i; - int m = (*n) - ((*n)%4); - for ( i = 0 ; i < m ; i+=4 ){ - B[i ] = A[i ]; - B[i+1] = A[i+1]; - B[i+2] = A[i+2]; - B[i+3] = A[i+3]; - } - for ( i = m ; i < (*n) ; i++ ){ - B[i] = A[i]; - } - return; -} - -/* TWO-DIMENSIONAL COPY OPERATIONS */ - -#if 0 - subroutine dcopy21(rows, cols, A, ald, buf, cur) - integer rows, cols - integer c, r, ald, cur - double precision A(ald,*), buf(ald) - cur = 0 - do c = 1, cols - do r = 1, rows - cur = cur+1 - buf(cur) = A(r,c) - end do - end do - end -#endif - -void c_dcopy21_(const int* const restrict rows, - const int* const restrict cols, - const double* const restrict A, - const int* const restrict ald, - double* const restrict buf, - int* const restrict cur) -{ - int r, c, i=0; - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < (*rows) ; r++ ){ - buf[i++] = A[ c * (*ald) + r ]; - } - } - (*cur) = i; - return; -} - -#if 0 - subroutine dcopy12(rows, cols, A, ald, buf, cur) - integer rows, cols - integer c, r, ald, cur - double precision A(ald,*), buf(ald) - cur = 0 - do c = 1, cols - do r = 1, rows - cur = cur+1 - A(r,c) = buf(cur) - end do - end do - end -#endif - -void c_dcopy12_(const int* const restrict rows, - const int* const restrict cols, - double* const restrict A, - const int* const restrict ald, - const double* const restrict buf, - int* const restrict cur) -{ - int r, c, i=0; - i = 0; - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < (*rows) ; r++ ){ - A[ c * (*ald) + r ] = buf[i++]; - } - } - (*cur) = i; - return; -} - -#if 0 - subroutine dcopy2d_n(rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - double precision A(ald,*), B(bld,*) - do c = 1, cols - do r = 1, rows - B(r,c) = A(r,c) - end do - end do - end -#endif - -void c_dcopy2d_n_(const int* const restrict rows, - const int* const restrict cols, - const double* const restrict A, - const int* const restrict ald, - double* const restrict B, - const int* const restrict bld) -{ - int r, c; - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < (*rows) ; r++ ){ - B[ c * (*bld) + r ] = A[ c * (*ald) + r ]; - } - } - return; -} - -#if 0 - subroutine dcopy2d_u(rows, cols, A, ald, B, bld) - integer rows, cols - integer c, r, ald, bld - double precision A(ald,*), B(bld,*) - integer r1 - double precision d1, d2, d3, d4 - do c = 1, cols - r1 = iand(max0(rows,0),3) - do r = 1, r1 -c$$$ b(r,c) = a(r,c) + b(r,c) * 0 - b(r,c) = a(r,c) - end do - do r = r1 + 1, rows, 4 - d1 = a(r,c) - d2 = a(r+1,c) - d3 = a(r+2,c) - d4 = a(r+3,c) - b(r,c) = d1 - b(r+1,c) = d2 - b(r+2,c) = d3 - b(r+3,c) = d4 -c$$$ b(r,c) = a(r,c) + b(r,c) * 0 -c$$$ b(r+1,c) = a(r+1,c) + b(r+1,c) * 0 -c$$$ b(r+2,c) = a(r+2,c) + b(r+2,c) * 0 -c$$$ b(r+3,c) = a(r+3,c) + b(r+3,c) * 0 - enddo - enddo - end -#endif - -void c_dcopy2d_u_(const int* const restrict rows, - const int* const restrict cols, - const double* const restrict A, - const int* const restrict ald, - double* const restrict B, - const int* const restrict bld) -{ - int r, c; - for ( c = 0 ; c < (*cols) ; c++ ){ - int m = (*rows) - ((*rows)%4); - for ( r = 0 ; r < m ; r+=4 ){ - B[ c * (*bld) + r ] = A[ c * (*ald) + r ]; - B[ c * (*bld) + r+1 ] = A[ c * (*ald) + r+1 ]; - B[ c * (*bld) + r+2 ] = A[ c * (*ald) + r+2 ]; - B[ c * (*bld) + r+3 ] = A[ c * (*ald) + r+3 ]; - } - for ( r = m ; r < (*rows) ; r++ ){ - B[ c * (*bld) + r ] = A[ c * (*ald) + r ]; - } - } - return; -} - -/* THREE-DIMENSIONAL COPY OPERATIONS */ - -#if 0 - subroutine dcopy31(rows, cols, planes, A, aldr, aldc, buf, cur) - integer rows, cols, planes - integer c, r, p, aldr, aldc, cur - double precision A(aldr, aldc, *), buf(aldr) - cur = 0 - do p = 1, planes - do c = 1, cols - do r = 1, rows - cur = cur+1 - buf(cur) = A(r,c,p) - end do - end do - end do - end -#endif - -void c_dcopy31_(const int* const restrict rows, - const int* const restrict cols, - const int* const restrict plns, - const double* const restrict A, - const int* const restrict aldr, - const int* const restrict aldc, - double* const restrict buf, - int* const restrict cur) -{ - int r, c, p, i=0; - for ( p = 0 ; p < (*plns) ; p++ ){ - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < (*rows) ; r++ ){ - buf[i++] = A[ p * (*aldc) * (*aldr) + c * (*aldr) + r ]; - } - } - } - (*cur) = i; - return; -} - -#if 0 - subroutine dcopy13(rows, cols, planes, A, aldr, aldc, buf, cur) - integer rows, cols, planes - integer c, r, p, aldr, aldc, cur - double precision A(aldr, aldc, *), buf(aldr) - cur = 0 - do p = 1, planes - do c = 1, cols - do r = 1, rows - cur = cur+1 - A(r,c,p) = buf(cur) - end do - end do - end do - end -#endif - -void c_dcopy13_(const int* const restrict rows, - const int* const restrict cols, - const int* const restrict plns, - double* const restrict A, - const int* const restrict aldr, - const int* const restrict aldc, - const double* const restrict buf, - int* const restrict cur) -{ - int r, c, p, i=0; - for ( p = 0 ; p < (*plns) ; p++ ){ - for ( c = 0 ; c < (*cols) ; c++ ){ - for ( r = 0 ; r < (*rows) ; r++ ){ - A[ p * (*aldc) * (*aldr) + c * (*aldr) + r ] = buf[i++]; - } - } - } - (*cur) = i; - return; -} diff --git a/armci/src-portals/clusterinfo.c b/armci/src-portals/clusterinfo.c deleted file mode 100644 index 1beaa8ced..000000000 --- a/armci/src-portals/clusterinfo.c +++ /dev/null @@ -1,497 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: clusterinfo.c,v 1.36.2.3 2007-06-13 00:46:13 vinod Exp $ */ -/****************************************************************************** -* file: cluster.c -* purpose: Determine cluster info i.e., number of machines and processes -* running on each of them. -* -*******************************************************************************/ - -#include -#include -#include -#ifdef unix -#include -#endif -#include "message.h" -#include "armcip.h" - -#ifdef WIN32 - /* this is where gethostbyname is declared */ -# include -#endif - -/* NO_SHMEM enables to simulate cluster environment on a single workstation. - * Must define NO_SHMMAX_SEARCH in shmem.c to prevent depleting shared memory - * due to a gready shmem request by the master process on cluster node 0. - */ -#if defined(DECOSF) && defined(QUADRICS) -# if !defined(REGION_ALLOC) -# define NO_SHMEM - extern int armci_enable_alpha_hack(); -# endif -#else -# define armci_enable_alpha_hack() 1 -#endif - -#define DEBUG 0 -#define MAX_HOSTNAME 80 -#define CHECK_NODE_NAMES - -/* print info on how many cluster nodes detected */ -#ifdef CLUSTER -# define PRINT_CLUSTER_INFO 1 -#else -# define PRINT_CLUSTER_INFO 0 -#endif - -#if defined(GM) - static char *network_protocol="Myrinet GM"; -#elif defined(VIA) - static char *network_protocol="VIA"; -#elif defined(MELLANOX) - static char *network_protocol="Mellanox Verbs API"; -#elif defined(OPENIB) - static char *network_protocol="OpenIB Verbs API"; -#elif defined(DOELAN4) - static char *network_protocol="Quadrics ELAN-4"; -#elif defined(QUADRICS) - static char *network_protocol="Quadrics ELAN-3"; -#elif defined(PM) - static char *network_protocol="Score PM"; -#elif defined(PORTALS) - static char *network_protocol="PORTALS"; -#elif defined(MPI_SPAWN) - static char *network_protocol="MPI-SPAWN"; -#else - static char *network_protocol="TCP/IP Sockets"; -#endif - - -/*** stores cluster configuration ***/ -armci_clus_t *armci_clus_info; - -#ifdef HITACHI -#include -# define GETHOSTNAME sr_gethostname -ndes_t _armci_group; - -static int sr_gethostname(char *name, int len) -{ -int no; -pid_t ppid; - - if(hmpp_nself (&_armci_group,&no,&ppid,0,NULL) <0) - return -1; - - if(len<6)armci_die("len too small",len); - if(no>1024)armci_die("expected node id <1024",no); - sprintf(name,"n%d",no); - return 0; -} -#elif defined(SGIALTIX) -# define GETHOSTNAME altix_gethostname -static int altix_gethostname(char *name, int len) { - sprintf(name,"altix"); - return 0; -} -#elif defined(CRAY_XT) && !defined(PORTALS) -#define GETHOSTNAME cnos_gethostname -static int cnos_gethostname(char *name, int len) -{ - sprintf(name,"%d",cnos_get_rank()); -} -#else -# define GETHOSTNAME gethostname -#endif - -static char* merge_names(char *name) -{ - int jump = 1, rem, to, from; - int lenmes, lenbuf, curlen, totbuflen= armci_nproc*HOSTNAME_LEN; - int len = strlen(name); - char *work = malloc(totbuflen); - - if(!work)armci_die("armci: merge_names: malloc failed: ",totbuflen); - - strcpy(work, name); - curlen = len+1; - - /* prefix tree merges names in the order of process numbering in log(P)time - * result = name_1//name_2//...//name_P-1 - */ - do { - jump *= 2; rem = armci_me%jump; - if(rem){ - to = armci_me - rem; - armci_msg_snd(ARMCI_TAG, work, curlen, to); - break; - }else{ - from = armci_me + jump/2; - if(from < armci_nproc){ - lenbuf = totbuflen - curlen; - armci_msg_rcv(ARMCI_TAG, work+curlen, lenbuf, &lenmes, from); - curlen += lenmes; - } - } - }while (jump < armci_nproc); - return(work); -} - - -static void process_hostlist(char *names) -{ -#ifdef CLUSTER - - int i, cluster=0; - char *s,*master; - int len, root=0; - - /******** inspect list of machine names to determine locality ********/ - if (armci_me==0){ - - /* first find out how many cluster nodes we got */ - armci_nclus =1; s=master=names; - for(i=1; i < armci_nproc; i++){ - s += strlen(s)+1; - if(strcmp(s,master)){ - /* we found a new machine name on the list */ - master = s; - armci_nclus++; - /*fprintf(stderr,"new name %s len =%d\n",master, strlen(master));*/ - - } - } - - /* allocate memory */ - armci_clus_info = (armci_clus_t*)malloc(armci_nclus*sizeof(armci_clus_t)); - if(!armci_clus_info)armci_die("malloc failed for clusinfo",armci_nclus); - - /* fill the data structure -- go through the list again */ - s=names; - master="*-"; /* impossible hostname */ - cluster =0; - for(i=0; i < armci_nproc; i++){ - if(strcmp(s,master)){ - /* we found a new machine name on the list */ - master = s; - armci_clus_info[cluster].nslave=1; - armci_clus_info[cluster].master=i; - strcpy(armci_clus_info[cluster].hostname, master); - -#ifdef CHECK_NODE_NAMES - /* need consecutive task id allocated on the same node - * the current test only compares hostnames against first cluster */ - if(cluster) if(!strcmp(master,armci_clus_info[0].hostname)){ - /* we have seen that hostname before */ - fprintf(stderr, "\nIt appears that tasks allocated on the same"); - fprintf(stderr, " host machine do not have\n"); - fprintf(stderr, "consecutive message-passing IDs/numbers. "); - fprintf(stderr,"This is not acceptable \nto the ARMCI library "); - fprintf(stderr,"as it prevents SMP optimizations and would\n"); - fprintf(stderr,"lead to poor resource utilization.\n\n"); - fprintf(stderr,"Please contact your System Administrator "); - fprintf(stderr,"or, if you can, modify the "); -# if defined(MSG_COMMS_MPI) - fprintf(stderr,"MPI"); -# elif defined(TCGMSG) - fprintf(stderr,"TCGMSG"); -# elif defined(PVM) - fprintf(stderr,"PVM"); -# endif - fprintf(stderr,"\nmessage-passing job startup configuration.\n\n"); -#ifdef HITACHI - fprintf(stderr,"On Hitachi it can be done by setting environment variable MPIR_RANK_NO_ROUND, for example\n setenv MPIR_RANK_NO_ROUND yes\n\n"); -#endif - sleep(1); - armci_die("Cannot run: improper task to host mapping!",0); - } -#endif - cluster++; - - }else{ - /* the process is still on the same host */ - armci_clus_info[cluster-1].nslave++; - } - s += strlen(s)+1; - } - - if(armci_nclus != cluster) - armci_die("inconsistency processing clusterinfo",armci_nclus); - - } - /******** process 0 got all data ********/ - - /* now broadcast locality info struct to all processes - * two steps are needed because of the unknown length of hostname list - */ - len = sizeof(int); - armci_msg_brdcst(&armci_nclus, len, root); - - if(armci_me){ - /* allocate memory */ - armci_clus_info = (armci_clus_t*)malloc(armci_nclus*sizeof(armci_clus_t)); - if(!armci_clus_info)armci_die("malloc failed for clusinfo",armci_nclus); - } - - len = sizeof(armci_clus_t)*armci_nclus; - armci_msg_brdcst(armci_clus_info, len, root); - - /******** all processes 0 got all data ********/ - - /* now determine current cluster node id by comparing me to master */ - armci_clus_me = armci_nclus-1; - for(i =0; i< armci_nclus-1; i++) - if(armci_me < armci_clus_info[i+1].master){ - armci_clus_me=i; - break; - } -#else - - armci_clus_me=0; - armci_nclus=1; - armci_clus_info = (armci_clus_t*)malloc(armci_nclus*sizeof(armci_clus_t)); - if(!armci_clus_info)armci_die("malloc failed for clusinfo",armci_nclus); - strcpy(armci_clus_info[0].hostname, names); - armci_clus_info[0].master=0; - armci_clus_info[0].nslave=armci_nproc; -#endif - - armci_clus_first = armci_clus_info[armci_clus_me].master; - armci_clus_last = armci_clus_first +armci_clus_info[armci_clus_me].nslave-1; - -} - - -/*\ Substring Replacement: replace needle with nail in a haystack -\*/ -static char *substr_replace(char *haystack, char *needle, char *nail) -{ -char *tmp, *pos, *first; -size_t len=strlen(needle), nlen=strlen(nail),bytes; -size_t left; - - pos = strstr(haystack,needle); - if (pos ==NULL) return NULL; - first= tmp = calloc(strlen(haystack)+nlen-len+1+1,1); - if(first==NULL) return(NULL); - bytes = pos - haystack; - while(bytes){ *tmp = *haystack; tmp++; haystack++; bytes--;} - while(nlen) { *tmp = *nail; tmp++; nail++; nlen--;} - haystack += len; - left = strlen(haystack); - while(left>0){*tmp = *haystack; tmp++; haystack++; left --;} - *tmp='\0'; - return(first); -} - - -/*\ ARMCI_HOSTNAME_REPLACE contains "needle/nail" string to derive new hostname -\*/ -static char *new_hostname(char *host) -{ - char *tmp, *needle, *nail; - if((tmp =getenv("ARMCI_HOSTNAME_REPLACE"))){ - needle = strdup(tmp); - if(needle== NULL) return NULL; - nail = strchr(needle,'/'); - if(nail == NULL) return NULL; - *nail = '\0'; - nail++; - if(nail == (needle+1)){ - char* tmp1 = calloc(strlen(host)+strlen(nail)+1,1); - if(tmp1 == NULL) return NULL; - strcpy(tmp1,host); - strcat(tmp1,nail); - return tmp1; - } - return substr_replace(host,needle,nail); - } else return NULL; -} - - -static void print_clus_info() -{ -int i; - - if(PRINT_CLUSTER_INFO && armci_nclus >1 && armci_me ==0){ -#if defined(DATA_SERVER) || defined(SERVER_THREAD) - printf("ARMCI configured for %d cluster nodes. Network protocol is '%s'.\n", - armci_nclus, network_protocol); -#else - printf("ARMCI configured for %d cluster nodes\n", armci_nclus); -#endif - fflush(stdout); - } - - if(armci_me==0 && DEBUG) for(i=0;i= MAX_HOSTNAME) - armci_die("armci: hostname too long",strlen(tmp)); - strcpy(name,tmp); - printf("%d using %s hostname\n",armci_me, name); - fflush(stdout); - } - len = strlen(name); - /*a simple way to run as many servers as compute processes*/ - enval = getenv("ARMCI_NSERV_EQ_NPROC"); - if(enval != NULL){ - sprintf(name+len,"n%d",getpid()); - len = strlen(name); - printf("\n%s\n",name); - } - - -#ifdef HOSTNAME_TRUNCATE - { - /* in some cases (e.g.,SP) when name is used to determine - * cluster structure but not to establish communication - * we can truncate hostnames to save memory */ - int i; - limit = HOSTNAME_LEN-1; - for(i=0; i",i+1); - } - if(len>limit)name[limit]='\0'; - len =limit; - } -#else - if(len >= HOSTNAME_LEN-1) - armci_die("armci: gethostname overrun name string length",len); -#endif - -#ifdef NO_SHMEM - if(armci_enable_alpha_hack()) { - name[len]='0'+armci_me; - name[len+1]='\0'; - len++; - } -#endif - - if(DEBUG) - fprintf(stderr,"%d: %s len=%d\n",armci_me, name,(int)strlen(name)); - -#ifdef CLUSTER - merged = merge_names(name); /* create hostname list */ - process_hostlist(merged); /* compute cluster info */ - free(merged); -#else - process_hostlist(name); /* compute cluster info */ -#endif - -#ifndef NO_SHMEM - armci_set_shmem_limit_per_node(armci_clus_info[0].nslave); -#endif - armci_master = armci_clus_info[armci_clus_me].master; - -#ifdef NO_SHMEM - if(armci_enable_alpha_hack()) { - int i; - for(i=0;i= armci_nproc)armci_die("armci_clus_id: out of range",p); - - if(p < armci_clus_first){ from = 0; to = armci_clus_me;} - else {from = armci_clus_me; to = armci_nclus;} - - found = to-1; - for(c = from; c< to-1; c++) - if(p < armci_clus_info[c+1].master){ - found=c; - break; - } - - return (found); -} - -int armci_smp_master(int i) -{ -return(armci_clus_info[i].master); -} -/*\ return number of processes in the domain represented by id; id<0 means my node -\*/ -int armci_domain_nprocs(armci_domain_t domain, int id) -{ - if(id>= armci_nclus) armci_die2("armci domain error",id,armci_nclus); - if(id<0) id = armci_clus_me; - return armci_clus_info[id].nslave; -} - -/*\ return number of nodes in diven domain -\*/ -int armci_domain_count(armci_domain_t domain) -{ - return armci_nclus; -} - -/*\ return domain ID of the specified process -\*/ -int armci_domain_id(armci_domain_t domain, int glob_proc_id) -{ -int id = glob_proc_id; - if(id<0 || id>= armci_nproc) armci_die2("armci domain error",id,armci_nproc); - return armci_clus_id(glob_proc_id); -} - -/*\ return global ID of a process loc_proc_id in domain identified by id - * armci_domain_nproc(id)< loc_proc_id >=0 -\*/ -int armci_domain_glob_proc_id(armci_domain_t domain, int id, int loc_proc_id) -{ - if(id<0 || id>= armci_nclus) armci_die2("armci domain error",id,armci_nclus); - if(loc_proc_id<0 || loc_proc_id>= armci_clus_info[id].nslave) - armci_die2("armci domain proc error",loc_proc_id,armci_clus_info[id].nslave); - return (armci_clus_info[id].master + loc_proc_id); -} - -/*\ return ID of domain that the calling process belongs to -\*/ -int armci_domain_my_id(armci_domain_t domain) -{ - return(armci_clus_me); -} - -int armci_domain_same_id (armci_domain_t domain, int proc) -{ - int rc = SAMECLUSNODE(proc); - return(rc); -} diff --git a/armci/src-portals/code_options.h b/armci/src-portals/code_options.h deleted file mode 100644 index 4a30cd4fc..000000000 --- a/armci/src-portals/code_options.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - Questions: - ORNL - tipparajuv@ornl.gov - CRAY - ryan@cray.com -*/ - -/* --------------------------------------------------------------------------- *\ - PORTALS_USE_RENDEZ_VOUS - ======================= - When the number of PEs gets very large, the data server is required to have - buffer space available for all possible incoming messages which is defined - by PORTALS_MAX_DESCRIPTORS = (MAX_BUFS+MAX_SMALL_BUFS). - For each PE, the DS must have at least: - min_memory_per_pe = PORTALS_MAX_BUFS*PORTALS_BUF_SIZE + - PORTALS_MAX_SMALL_BUFS*PORTALS_SMALL_BUF_SIZE - This becomes a memory constraint at large core count. - Rendez-vous message is one mechanism to get around requiring the DS to - have buffer space for all messages. When rendez-vous (RZV) messaging is - enabled, the messages what use the large buffers no longer send the entire - buffer "eagerly". Instead, only the data request (request_header_t) gets - sent to the data server. When the data server is ready to handle the - request, it "pulls" the entire buffer over via a portals_get operation. - One can immediately see that this can lead to a slow down in performance, - since the data server is idle when it has to pull the data over. This is - the price paid when you remove the bufferign for those messsages. Ideally, - when the DS is pulling the message, it could be processing another request. - This double buffering technique needs to be programmed in. Care must be - taken to ensure proper ARMCI behavior. The next request handled can not be - from the same PE, nor can it be a FENCE operation ... all other (?) - requests/operations can be double buffered. -\* --------------------------------------------------------------------------- */ - # define PORTALS_USE_RENDEZ_VOUS - - - -/* --------------------------------------------------------------------------- *\ - PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - ===================================== - Another means to reduce the required buffer needed by the data server is - to limit the number of cores that can talk to the data server at any given - moment. When this options is turned on, only 1 request per node is allowed - to be in the buffer of any given data server. On a 10 core node, the size - of the buffer required by the data server is reduced by more than an order - of magnitude. You get more than an order of magnitude, because you don't - need to reserve space for any of the small buffers, since you can only have - one small or one large from any given node in the ds buffer at any one time. - Another major benefit is you can increase MAX_BUFS and MAX_SMALL_BUFS to - increase concurrency without affecting the DS's buffer size. - - Can be used with PORTALS_USE_RENDEZ_VOUS. - - notes: every request needs to respond with an ack, even gets. - acks actually send data when we limit remote request ... the ack - response is needed to trigger that the outstanding request has - been finished by the data server ... the ack zeros out the index - in the active_requests_by_node array. -\* --------------------------------------------------------------------------- */ - # define PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE_TURNED_OFF - - -/* --------------------------------------------------------------------------- *\ - PORTALS_AFFINITY - ================ - When initializing compute processes and data servers, the affinity passed - in by aprun/alps is ignored. - - Compute processes are bound strictly to a particular core. Cores are - evenly divided between sockets keeping the last core (mask = 1 << (ncpus-1)) - free for the data server. - - If the node is not fully subscribed, then the data server is bound to the - last core on the node (mask = 1 << (ncpus-1)); otherwise, the data server - is "free floating" (mask = (1 << ncpus)-1) on a fully subscribed node. -\* --------------------------------------------------------------------------- */ - # define PORTALS_AFFINITY - # define PORTALS_AFFINITY_NSOCKETS 2 - - -/* --------------------------------------------------------------------------- *\ - CRAY_USE_MDMD_COPY - ================== - Used MDMD copy instead of PtlGetRegion for on-node "local" transfers -\* --------------------------------------------------------------------------- */ - # define CRAY_USE_MDMD_COPY - - - -/* --------------------------------------------------------------------------- *\ - ORNL_USE_DS_FOR_REMOTE_GETS - =========================== - Vinod informed us of a modification that can be made to enable the use of - the data server for remote gets. Without this option, direct gets are - used. This can cause severe network congestion, because many armci_gets - are not stride 1. The data server packs those gets into contiguous blocks - and sends them back as a single put. However, the direct gets, require - many small messages. - - Unfortunately, there is a small bug in the DS for remote gets. This bug - may cause the program to abort or print out the following message: - %d: server wrote data at unexpected offset %d - - This is a bug actively being worked on @ CRAY and ORNL. -\* --------------------------------------------------------------------------- */ - # define ORNL_USE_DS_FOR_REMOTE_GETS - # define CRAY_USE_ARMCI_CLIENT_BUFFERS diff --git a/armci/src-portals/copy.h b/armci/src-portals/copy.h deleted file mode 100644 index 1bddd649a..000000000 --- a/armci/src-portals/copy.h +++ /dev/null @@ -1,525 +0,0 @@ -/* $Id: copy.h,v 1.86.2.6 2007-08-29 17:32:32 manoj Exp $ */ -#ifndef _COPY_H_ -#define _COPY_H_ - -#include -#include -#ifdef WIN32 -# include -#endif -#ifdef DECOSF -#include -#endif - - -#ifndef EXTERN -# define EXTERN extern -#endif - - -#if defined(SGI) || defined(FUJITSU) || defined(HPUX) || defined(SOLARIS) || defined (DECOSF) || defined(__ia64__) || defined(__crayx1) -# define PTR_ALIGN -#endif - -#if defined(NB_NONCONT) && !defined(CRAY_SHMEM) && !defined(QUADRICS) -#error NB_NONCONT is only available on CRAY_SHMEM,QUADRICS and PORTALS -#endif - -#if defined(SHMEM_HANDLE_SUPPORTED) && !defined(CRAY_SHMEM) -#error SHMEM_HANDLE_SUPPORTED should not be defined on a non CRAY_SHMEM network -#endif - -/* 08/30/06 moved up here from lines 252-397, MEM_FENCE before FENCE_NODE */ - -#if defined(NEED_MEM_SYNC) -# ifdef AIX -# define MEM_FENCE {int _dummy=1; _clear_lock((int *)&_dummy,0); } -# elif defined(__ia64) -# if defined(__GNUC__) && !defined (__INTEL_COMPILER) -# define MEM_FENCE __asm__ __volatile__ ("mf" ::: "memory"); -# else /* Intel Compiler */ - extern void _armci_ia64_mb(); -# define MEM_FENCE _armci_ia64_mb(); -# endif -# elif defined(LINUX) && defined(__GNUC__) && defined(__ppc__) -# define MEM_FENCE \ - __asm__ __volatile__ ("isync" : : : "memory"); -# endif -#endif - -#ifndef armci_copy -# define armci_copy(src,dst,n) bcopy(src,dst,n) -#endif - -/****************************** 2D Copy *******************/ - - -# define DCopy2D(rows, cols, src_ptr, src_ld, dst_ptr, dst_ld){\ - int j, nbytes = sizeof(double)* rows;\ - char *ps=src_ptr, *pd=dst_ptr;\ - for (j = 0; j < cols; j++){\ - armci_copy(ps, pd, nbytes);\ - ps += sizeof(double)* src_ld;\ - pd += sizeof(double)* dst_ld;\ - }\ - } - - -# define ByteCopy2D(bytes, count, src_ptr, src_stride, dst_ptr,dst_stride){\ - int _j;\ - char *ps=src_ptr, *pd=dst_ptr;\ - for (_j = 0; _j < count; _j++){\ - armci_copy(ps, pd, bytes);\ - ps += src_stride;\ - pd += dst_stride;\ - }\ - } - -#if defined(FUJITSU) - -# define armci_put2D(p, bytes,count,src_ptr,src_stride,dst_ptr,dst_stride)\ - CopyPatchTo(src_ptr, src_stride, dst_ptr, dst_stride, count,bytes, p) - -# define armci_get2D(p, bytes, count, src_ptr,src_stride,dst_ptr,dst_stride)\ - CopyPatchFrom(src_ptr, src_stride, dst_ptr, dst_stride,count,bytes,p) - -#elif defined(HITACHI) || defined(_ELAN_PUTGET_H) && !defined(NB_NONCONT) - -#if defined(QUADRICS) -#if 0 -# define WAIT_FOR_PUTS elan_putWaitAll(elan_base->state,200) -# define WAIT_FOR_GETS elan_getWaitAll(elan_base->state,200) -#else -# define WAIT_FOR_PUTS armcill_wait_put() -# define WAIT_FOR_GETS armcill_wait_get() - extern void armcill_wait_put(); - extern void armcill_wait_get(); -#endif -#endif - - extern void armcill_put2D(int proc, int bytes, int count, - void* src_ptr,int src_stride, void* dst_ptr,int dst_stride); - extern void armcill_get2D(int proc, int bytes, int count, - void* src_ptr,int src_stride, void* dst_ptr,int dst_stride); -# define armci_put2D armcill_put2D -# define armci_get2D armcill_get2D - -#elif defined(NB_NONCONT) - - extern void armcill_wait_put(); - extern void armcill_wait_get(); -# define WAIT_FOR_PUTS armcill_wait_put() -# define WAIT_FOR_GETS armcill_wait_get() - - extern void armcill_put2D(int proc, int bytes, int count, - void* src_ptr,int src_stride, void* dst_ptr,int dst_stride); - extern void armcill_get2D(int proc, int bytes, int count, - void* src_ptr,int src_stride, void* dst_ptr,int dst_stride); -# define armci_put2D armcill_put2D -# define armci_get2D armcill_get2D - -# if defined(QUADRICS) - -# define armcill_nb_put(_dst, _src, _sz, _proc, _hdl)\ - _hdl = elan_put(elan_base->state,_src,_dst,(size_t)_sz,_proc) -# define armcill_nb_get(_dst, _src, _sz, _proc, _hdl)\ - _hdl = elan_get(elan_base->state,_src,_dst,(size_t)_sz,_proc) -# define armcill_nb_wait(_hdl)\ - elan_wait(_hdl,100) - -# elif defined(CRAY_SHMEM) - -# define armcill_nb_wait(_hdl)\ - shmem_wait_nb(_hdl) -/*VT:this should be ifdef'ed based on if shmem_handle is defined or not*/ -# if defined (CRAY_XT) -# define armcill_nb_put(_dst, _src, _sz, _proc, _hdl)\ - shmem_putmem(_dst, _src, (size_t)_sz, _proc) -# define armcill_nb_get(_dst, _src, _sz, _proc, _hdl)\ - shmem_getmem(_dst, _src, (size_t)_sz, _proc) -# else -# define armcill_nb_put(_dst, _src, _sz, _proc, _hdl)\ - _hdl = shmem_putmem_nb(_dst, _src, (size_t)_sz, _proc, &(_hdl)) -# define armcill_nb_get(_dst, _src, _sz, _proc, _hdl)\ - _hdl = shmem_getmem_nb(_dst, _src, (size_t)_sz, _proc, &(_hdl)) -# endif -# endif - -#else -# define armci_put2D(proc,bytes,count,src_ptr,src_stride,dst_ptr,dst_stride){\ - int _j;\ - char *ps=src_ptr, *pd=dst_ptr;\ - for (_j = 0; _j < count; _j++){\ - armci_put(ps, pd, bytes, proc);\ - ps += src_stride;\ - pd += dst_stride;\ - }\ - } - - -# define armci_get2D(proc,bytes,count,src_ptr,src_stride,dst_ptr,dst_stride){\ - int _j;\ - char *ps=src_ptr, *pd=dst_ptr;\ - for (_j = 0; _j < count; _j++){\ - armci_get(ps, pd, bytes, proc);\ - ps += src_stride;\ - pd += dst_stride;\ - }\ - } -#endif - -/* macros to ensure ordering of consecutive puts or gets following puts */ -#if defined(LAPI) - -# include "lapidefs.h" - -#elif defined(_CRAYMPP) || defined(QUADRICS) || defined(__crayx1)\ - || defined(CRAY_SHMEM) -#if defined(CRAY) || defined(CRAY_XT) -# include -#else -# include -#ifndef ptrdiff_t -# include -#endif -# include -#endif -# ifdef ELAN_ACC -# define FENCE_NODE(p) {\ - if(((p)armci_clus_last))armci_elan_fence(p);} -# define UPDATE_FENCE_STATE(p, op, nissued) -# else - int cmpl_proc; -# ifdef DECOSF -# define FENCE_NODE(p) if(cmpl_proc == (p)){\ - if(((p)armci_clus_last))shmem_quiet();\ - else asm ("mb"); } -# else -# define FENCE_NODE(p) if(cmpl_proc == (p)){\ - if(((p)armci_clus_last))shmem_quiet(); } -# endif -# define UPDATE_FENCE_STATE(p, op, nissued) if((op)==PUT) cmpl_proc=(p); -# endif -#else -# if defined(GM) && defined(ACK_FENCE) - extern void armci_gm_fence(int p); -# define FENCE_NODE(p) armci_gm_fence(p) -# elif defined(BGML) -# include "bgmldefs.h" -# define FENCE_NODE(p) BGML_WaitProc(p) -# else -# define FENCE_NODE(p) -# endif -# define UPDATE_FENCE_STATE(p, op, nissued) - -#endif - - -#ifdef NEC -# define THRESH 1 -# define THRESH1D 1 -#else -# define THRESH 32 -# define THRESH1D 512 -#endif -#define ALIGN_SIZE sizeof(double) - -/********* interface to C 1D and 2D memory copy functions ***********/ -/* dcopy2d_u_ uses explicit unrolled loops to depth 4 */ -void c_dcopy2d_n_(const int* const restrict rows, - const int* const restrict cols, - const double* const restrict A, - const int* const restrict ald, - double* const restrict B, - const int* const restrict bld); -void c_dcopy2d_u_(const int* const restrict rows, - const int* const restrict cols, - const double* const restrict A, - const int* const restrict ald, - double* const restrict B, - const int* const restrict bld); -void c_dcopy1d_n_(const double* const restrict A, - double* const restrict B, - const int* const restrict n); -void c_dcopy1d_u_(const double* const restrict A, - double* const restrict B, - const int* const restrict n); -void c_dcopy21_(const int* const restrict rows, - const int* const restrict cols, - const double* const restrict A, - const int* const restrict ald, - double* const restrict buf, - int* const restrict cur); -void c_dcopy12_(const int* const restrict rows, - const int* const restrict cols, - double* const restrict A, - const int* const restrict ald, - const double* const restrict buf, - int* const restrict cur); -void c_dcopy31_(const int* const restrict rows, - const int* const restrict cols, - const int* const restrict plns, - const double* const restrict A, - const int* const restrict aldr, - const int* const restrict aldc, - double* const restrict buf, - int* const restrict cur); -void c_dcopy13_(const int* const restrict rows, - const int* const restrict cols, - const int* const restrict plns, - double* const restrict A, - const int* const restrict aldr, - const int* const restrict aldc, - const double* const restrict buf, - int* const restrict cur); - -/********* interface to fortran 1D and 2D memory copy functions ***********/ -#if ENABLE_F77 -# ifdef WIN32 -# define ATR __stdcall -# else -# define ATR -# endif -# define dcopy2d_n_ F77_FUNC_(dcopy2d_n,DCOPY2D_N) -# define dcopy2d_u_ F77_FUNC_(dcopy2d_u,DCOPY2D_U) -# define dcopy1d_n_ F77_FUNC_(dcopy1d_n,DCOPY1D_N) -# define dcopy1d_u_ F77_FUNC_(dcopy1d_u,DCOPY1D_U) -# define dcopy21_ F77_FUNC(dcopy21,DCOPY21) -# define dcopy12_ F77_FUNC(dcopy12,DCOPY12) -# define dcopy31_ F77_FUNC(dcopy31,DCOPY31) -# define dcopy13_ F77_FUNC(dcopy13,DCOPY13) -void ATR dcopy2d_n_(const int* const restrict rows, - const int* const restrict cols, - const double* const restrict A, - const int* const restrict ald, - double* const restrict B, - const int* const restrict bld); -void ATR dcopy2d_u_(const int* const restrict rows, - const int* const restrict cols, - const double* const restrict A, - const int* const restrict ald, - double* const restrict B, - const int* const restrict bld); -void ATR dcopy1d_n_(const double* const restrict A, - double* const restrict B, - const int* const restrict n); -void ATR dcopy1d_u_(const double* const restrict A, - double* const restrict B, - const int* const restrict n); -void ATR dcopy21_(const int* const restrict rows, - const int* const restrict cols, - const double* const restrict A, - const int* const restrict ald, - double* const restrict buf, - int* const restrict cur); -void ATR dcopy12_(const int* const restrict rows, - const int* const restrict cols, - double* const restrict A, - const int* const restrict ald, - const double* const restrict buf, - int* const restrict cur); -void ATR dcopy31_(const int* const restrict rows, - const int* const restrict cols, - const int* const restrict plns, - const double* const restrict A, - const int* const restrict aldr, - const int* const restrict aldc, - double* const restrict buf, - int* const restrict cur); -void ATR dcopy13_(const int* const restrict rows, - const int* const restrict cols, - const int* const restrict plns, - double* const restrict A, - const int* const restrict aldr, - const int* const restrict aldc, - const double* const restrict buf, - int* const restrict cur); -#endif - -#if NOFORT -# if defined(AIX) || defined(BGML) -# define DCOPY2D c_dcopy2d_u_ -# define DCOPY1D c_dcopy1d_u_ -# elif defined(LINUX) || defined(__crayx1) || defined(HPUX64) || defined(DECOSF) || defined(CRAY) || defined(WIN32) || defined(HITACHI) -# define DCOPY2D c_dcopy2d_n_ -# define DCOPY1D c_dcopy1d_n_ -# else -# define DCOPY2D c_dcopy2d_u_ -# define DCOPY1D c_dcopy1d_u_ -# endif -# define DCOPY21 c_dcopy21_ -# define DCOPY12 c_dcopy12_ -# define DCOPY31 c_dcopy31_ -# define DCOPY13 c_dcopy13_ -#else -# if defined(AIX) || defined(BGML) -# define DCOPY2D dcopy2d_u_ -# define DCOPY1D dcopy1d_u_ -# elif defined(LINUX) || defined(__crayx1) || defined(HPUX64) || defined(DECOSF) || defined(CRAY) || defined(WIN32) || defined(HITACHI) -# define DCOPY2D dcopy2d_n_ -# define DCOPY1D dcopy1d_n_ -# else -# define DCOPY2D dcopy2d_u_ -# define DCOPY1D dcopy1d_u_ -# endif -# define DCOPY21 dcopy21_ -# define DCOPY12 dcopy12_ -# define DCOPY31 dcopy31_ -# define DCOPY13 dcopy13_ -#endif - - -/***************************** 1-Dimensional copy ************************/ -#if defined(QUADRICS) -# include - -# if defined(_ELAN_PUTGET_H) -# define qsw_put(src,dst,n,proc) \ - elan_wait(elan_put(elan_base->state,src,dst,n,proc),elan_base->waitType) -# define qsw_get(src,dst,n,proc) \ - elan_wait(elan_get(elan_base->state,src,dst,n,proc),elan_base->waitType) -/* -# define ARMCI_NB_PUT(src,dst,n,proc,phandle)\ - *(phandle)=elan_put(elan_base->state,src,dst,n,proc) -*/ -#ifdef DOELAN4 -extern void armci_elan_put_with_tracknotify(char *src,char *dst,int n,int proc, ELAN_EVENT **phandle); -# define ARMCI_NB_PUT(src,dst,n,proc,phandle)\ - armci_elan_put_with_tracknotify(src,dst,n,proc,phandle) -#endif - -# define ARMCI_NB_GET(src,dst,n,proc,phandle)\ - *(phandle)=elan_get(elan_base->state,src,dst,n,proc) -# define ARMCI_NB_WAIT(handle) if(handle)elan_wait(handle,elan_base->waitType) -# define ARMCI_NB_TEST(handle,_succ) (*(_succ))= (handle)? !elan_poll(handle,1L): 1 -# else -# define qsw_put(src,dst,n,proc) shmem_putmem((dst),(src),(int)(n),(proc)) -# define qsw_get(src,dst,n,proc) shmem_getmem((dst),(src),(int)(n),(proc)) -# endif - -# define armci_put(src,dst,n,proc)\ - if(((proc)<=armci_clus_last) && ((proc>= armci_clus_first))){\ - armci_copy(src,dst,n);\ - } else { qsw_put(src,dst,n,proc);} -# define armci_get(src,dst,n,proc) \ - if(((proc)<=armci_clus_last) && ((proc>= armci_clus_first))){\ - armci_copy(src,dst,n);\ - } else { qsw_get((src),(dst),(int)(n),(proc));} - -#elif defined(CRAY_T3E) || defined(CRAY_SHMEM) -# define armci_copy_disabled(src,dst,n)\ - if((n)<256 || n%sizeof(long) ) memcpy((dst),(src),(n));\ - else {\ - shmem_put((long*)(dst),(long*)(src),(int)(n)/sizeof(long),armci_me);\ - shmem_quiet(); } - -# define armci_put(src,dst,n,proc) \ - shmem_put32((void *)(dst),(void *)(src),(int)(n)/4,(proc));\ - shmem_quiet() - -# define armci_get(src,dst,n,proc) \ - shmem_get32((void *)(dst),(void *)(src),(int)(n)/4,(proc));\ - shmem_quiet() - -#elif defined(HITACHI) - - extern void armcill_put(void *src, void *dst, int bytes, int proc); - extern void armcill_get(void *src, void *dst, int bytes, int proc); - -# define armci_put(src,dst,n,proc) \ - if(((proc)<=armci_clus_last) && ((proc>= armci_clus_first))){\ - armci_copy(src,dst,n);\ - } else { armcill_put((src), (dst),(n),(proc));} - -# define armci_get(src,dst,n,proc)\ - if(((proc)<=armci_clus_last) && ((proc>= armci_clus_first))){\ - armci_copy(src,dst,n);\ - } else { armcill_get((src), (dst),(n),(proc));} - -#elif defined(FUJITSU) - -# include "fujitsu-vpp.h" -# ifndef __sparc -# define armci_copy(src,dst,n) _MmCopy((char*)(dst), (char*)(src), (n)) -# endif -# define armci_put CopyTo -# define armci_get CopyFrom - -#elif defined(LAPI) - -# include - extern lapi_handle_t lapi_handle; - -# define armci_put(src,dst,n,proc)\ - if(proc==armci_me){\ - armci_copy(src,dst,n);\ - } else {\ - if(LAPI_Put(lapi_handle, (uint)proc, (uint)n, (dst), (src),\ - NULL,&(ack_cntr[ARMCI_THREAD_IDX].cntr),&cmpl_arr[proc].cntr))\ - ARMCI_Error("LAPI_put failed",0); else;} - - /**** this copy is nonblocking and requires fence to complete!!! ****/ -# define armci_get(src,dst,n,proc) \ - if(proc==armci_me){\ - armci_copy(src,dst,n);\ - } else {\ - if(LAPI_Get(lapi_handle, (uint)proc, (uint)n, (src), (dst), \ - NULL, &(get_cntr[ARMCI_THREAD_IDX].cntr)))\ - ARMCI_Error("LAPI_Get failed",0);else;} - -# define ARMCI_NB_PUT(src,dst,n,proc,cmplt)\ - {if(LAPI_Setcntr(lapi_handle, &((cmplt)->cntr), 0))\ - ARMCI_Error("LAPI_Setcntr in NB_PUT failed",0);\ - (cmplt)->val=1;\ - if(LAPI_Put(lapi_handle, (uint)proc, (uint)n, (dst), (src),\ - NULL, &((cmplt)->cntr), &cmpl_arr[proc].cntr))\ - ARMCI_Error("LAPI_put failed",0); else;} - -# define ARMCI_NB_GET(src,dst,n,proc,cmplt)\ - {if(LAPI_Setcntr(lapi_handle, &((cmplt)->cntr), 0))\ - ARMCI_Error("LAPI_Setcntr in NB_GET failed",0);\ - (cmplt)->val=1;\ - if(LAPI_Get(lapi_handle, (uint)proc, (uint)n, (src), (dst), \ - NULL, &((cmplt)->cntr)))\ - ARMCI_Error("LAPI_Get NB_GET failed",0);else;} - -# define ARMCI_NB_WAIT(cmplt) CLEAR_COUNTER((cmplt)) -# define ARMCI_NB_TEST(cmplt,_succ) TEST_COUNTER((cmplt),(_succ)) - -#elif defined(PORTALS) -# define armci_put(src,dst,n,proc) \ - if(((proc)<=armci_clus_last) && ((proc>= armci_clus_first))){\ - armci_copy(src,dst,n);\ - } else { armci_portals_put((proc),(src), (dst),(n),NULL,0);} - -# define armci_get(src,dst,n,proc)\ - if(((proc)<=armci_clus_last) && ((proc>= armci_clus_first))){\ - armci_copy(src,dst,n);\ - } else { armci_portals_get((proc),(src), (dst),(n),NULL,0);} - -# define ARMCI_NB_PUT(src,dst,n,proc,cmplt)\ - nb_handle->tag=GET_NEXT_NBTAG();armci_portals_put((proc),(src),\ - (dst),(n),cmplt,nb_handle->tag) -# define ARMCI_NB_GET(src,dst,n,proc,cmplt)\ - nb_handle->tag=GET_NEXT_NBTAG();armci_portals_get((proc),(src),\ - (dst),(n),cmplt,nb_handle->tag) - -#elif defined(BGML) -#define armci_get(src, dst, n, p) PARMCI_Get(src, dst, n, p) -#define armci_put(src, dst, n, p) PARMCI_Put(src, dst, n, p) - -#else - -# define armci_get(src,dst,n,p) armci_copy((src),(dst),(n)) -# define armci_put(src,dst,n,p) armci_copy((src),(dst),(n)) - -#endif - -#ifndef MEM_FENCE -# define MEM_FENCE -#endif -#ifndef armci_copy_fence -# define armci_copy_fence armci_copy -#endif - -#endif diff --git a/armci/src-portals/ds-shared.c b/armci/src-portals/ds-shared.c deleted file mode 100644 index 96b33ace8..000000000 --- a/armci/src-portals/ds-shared.c +++ /dev/null @@ -1,575 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include "armcip.h" -#include "request.h" -#include "message.h" -#include "memlock.h" -#include "copy.h" -#include "gpc.h" -#include -#include -#ifdef WIN32 -#include -#else -#include -#endif - -#define DEBUG_ 0 -#define DEBUG1 0 - -#ifndef SERV -# define SERV 2 -#endif - -#ifdef SOCKETS -# define EQ_TAGS(a_, b_) ((a_) == (b_)) -#else -# define EQ_TAGS(a_, b_) !memcmp(&(a_), &(b_), sizeof(a_)) -#endif - -int _armci_server_started=0; - -extern active_socks_t *_armci_active_socks; - -#ifdef ARMCI_CHECK_STATE -typedef struct sarns{ - int data; - long data1; - struct sarns *next; -} sarnode; - -sarnode **sarn_np=NULL; - -sarnode * sarlist_add(int pr, int i,long j) -{ -sarnode **p = &sarn_np[pr]; - sarnode *n = (sarnode *)malloc(sizeof(sarnode)); - assert(n != NULL); - - n->next = *p; - *p = n; - n->data = i; - n->data1 = j; - return *p; -} - -void sarlist_remove(sarnode **p) -{ - if(*p != NULL){ - sarnode *n = *p; - *p = (*p)->next; - free(n); - } -} - -sarnode **sarlist_search(sarnode **n, long i) -{ - while (*n != NULL){ - if ((*n)->data == i){ - return n; - } - n = &(*n)->next; - } - return NULL; -} - -void sarlist_print(int proc) -{ - sarnode *n =sarn_np[proc]; - if (n == NULL){ - /*printf("sarlist is empty\n");*/ - } - while (n != NULL){ - printf("(%d):%d %d next=%d\n", armci_me,n->data,n->data1,(n->next==NULL)?0:1); - n = n->next; - } -} -#endif - -/*\ client sends request to server -\*/ -void armci_send_req(int proc, request_header_t* msginfo, int len,int tag) -{ -int hdrlen = sizeof(request_header_t); -int bytes; - - ARMCI_PR_DBG("enter",0); - if(msginfo->operation == GET) { - if(msginfo->format==VECTOR && msginfo->ehlen > 0) { - printf("%s [cp] unhandled condition in send_req for VECTOR and ehlen\n",Portals_ID()); - abort(); - bytes = msginfo->dscrlen + hdrlen + msginfo->datalen; - } else { - bytes = msginfo->dscrlen + hdrlen; - } - } else bytes = msginfo->bytes + hdrlen; - - if(DEBUG_){printf("%d: sending req %d (len=%d dscr=%d data=%d) to %d \n", - armci_me, msginfo->operation, bytes,msginfo->dscrlen, - msginfo->datalen,proc); fflush(stdout); - } - if(bytes > len) armci_die2("armci_send_req:buffer overflow",bytes,len); - msginfo->tag.data_ptr = (msginfo+1); // not really data, but dscr ptr - armci_send_req_msg(proc,msginfo, bytes,tag); - ARMCI_PR_DBG("exit",0); -} - - -/*\ client sends strided data + request to server -\*/ -void armci_send_strided(int proc, request_header_t *msginfo, char *bdata, - void *ptr, int strides, int stride_arr[], int count[],int tag) -{ -int hdrlen = sizeof(request_header_t); -int dscrlen = msginfo->dscrlen; -int datalen = msginfo->datalen; -int cluster = armci_clus_id(proc); -int bytes; -int i,na; -char *a; -double *tmp; - ARMCI_PR_DBG("enter",0); - bytes = msginfo->bytes + hdrlen; - if(0){ - printf("%d:sending strided %d to(%d,%d,%d) bytes=%d dslen=%d dlen=%d,\n", - armci_me, msginfo->operation, msginfo->to, - cluster, proc, bytes, dscrlen, datalen); fflush(stdout); - } - armci_write_strided(ptr, strides, stride_arr, count, bdata); - msginfo->tag.data_ptr = (msginfo+1); -#ifdef RMO_DEBUG_ - a = (char *) (msginfo + 1); - a += msginfo->dscrlen; - tmp = (double *) a; - na = msginfo->datalen/sizeof(double); - for(i=0; idatalen; -char *buf; - ARMCI_PR_DBG("enter",0); - if(rcvlen)datalen=rcvlen; - if(DEBUG_) { - printf("%d:armci_rcv_data: bytes= %d \n", armci_me, datalen); - fflush(stdout); - } - - if(datalen == 0) armci_die("armci_rcv_data: no data to receive",datalen); - - buf = armci_ReadFromDirect(proc, msginfo, datalen); - - if(DEBUG_){ - printf("%d:armci_rcv_data: got %d bytes \n",armci_me,datalen); - fflush(stdout); - } - ARMCI_PR_DBG("exit",0); - return(buf); -} - -/*\ client receives vector data from server and unpacks to the right loc -\*/ -void armci_rcv_vector_data(int proc, request_header_t* msginfo, armci_giov_t darr[], int len) -{ - ARMCI_PR_DBG("enter",0); - char *buf = armci_rcv_data(proc, msginfo, 0); - armci_vector_from_buf(darr, len, buf); - ARMCI_PR_DBG("exit",0); -} - -/*\ client receives strided data from server -\*/ -void armci_rcv_strided_data(int proc, request_header_t* msginfo, int datalen, - void *ptr, int strides,int stride_arr[],int count[]) -{ - char *databuf; - ARMCI_PR_DBG("enter",0); - if(DEBUG_){ - printf("%d: armci_rcv_strided_data: expecting datalen %d from %d\n", - armci_me, datalen, proc); fflush(stdout); - } - databuf = armci_ReadFromDirect(proc,msginfo,0); - armci_read_strided(ptr, strides, stride_arr, count, databuf); - ARMCI_PR_DBG("exit",0); -} - -void armci_rem_state(int clus) -{ -int bufsize = sizeof(request_header_t)+sizeof(int); -int destproc = 0; -request_header_t *msginfo; -destproc = SERVER_NODE(clus); -msginfo = (request_header_t *)GET_SEND_BUFFER(bufsize,STATE,destproc); -int tag=0; - - ARMCI_PR_DBG("enter",0); - msginfo->dscrlen = 0; - msginfo->from = armci_me; - msginfo->to = SERVER_NODE(clus); - msginfo->operation = STATE; - msginfo->bytes =0; - msginfo->datalen =sizeof(int); - msginfo->tag.data_ptr = (msginfo+1); - - if(DEBUG_){ - printf("%d(c):sending ACKreq to %d clus=%d\n",armci_me,msginfo->to,clus); - fflush(stdout); - } - - armci_send_req(armci_clus_info[clus].master, msginfo, bufsize,tag); - armci_rcv_data(armci_clus_info[clus].master, msginfo,0); /* receive */ - FREE_SEND_BUFFER(msginfo); - ARMCI_PR_DBG("exit",0); -} - - -/*\ get ACK from server -\*/ -void armci_rem_ack(int clus) -{ -int bufsize = sizeof(request_header_t)+sizeof(int); -int destproc = 0; -request_header_t *msginfo; -destproc = SERVER_NODE(clus); -msginfo = (request_header_t *) GET_SEND_BUFFER(bufsize,ACK,destproc); -int tag=0; - - ARMCI_PR_DBG("enter",0); - msginfo->dscrlen = 0; - msginfo->from = armci_me; - msginfo->to = SERVER_NODE(clus); - msginfo->operation = ACK; - msginfo->bytes =0; - msginfo->datalen =sizeof(int); - msginfo->tag.data_ptr = (msginfo+1); - - if(DEBUG_){ - printf("%d(c):sending ACKreq to %d clus=%d\n",armci_me,msginfo->to,clus); - fflush(stdout); - } - - armci_send_req(armci_clus_info[clus].master, msginfo, bufsize,tag); - armci_rcv_data(armci_clus_info[clus].master, msginfo,0); /* receive ACK */ - FREE_SEND_BUFFER(msginfo); - ARMCI_PR_DBG("exit",0); -} - - -/*\ request to QUIT sent by client -\*/ -void armci_serv_quit() -{ -int bufsize = sizeof(request_header_t)+sizeof(int); -int destproc; -request_header_t *msginfo; -destproc = SERVER_NODE(armci_clus_me); -msginfo = (request_header_t*)GET_SEND_BUFFER(bufsize,QUIT,destproc); -int tag=0; - - ARMCI_PR_DBG("enter",0); - if(DEBUG_){ printf("%d master: sending quit request to server\n",armci_me); - fflush(stdout); - } - - msginfo->dscrlen = 0; - msginfo->from = armci_me; - msginfo->to = SERVER_NODE(armci_clus_me); - msginfo->operation = QUIT; - if(ACK_QUIT) - msginfo->bytes = msginfo->datalen = sizeof(int); /* ACK */ - else - msginfo->bytes = msginfo->datalen = 0; /* no ACK */ - - armci_send_req(armci_master, msginfo, bufsize,tag); - - if(ACK_QUIT){ - int stat; - stat = *(int*)armci_rcv_data(armci_master,msginfo,0); /* receive ACK */ - if(stat != QUIT) - armci_die("armci_serv_quit: wrong response from server", stat); - FREE_SEND_BUFFER(msginfo); - } - ARMCI_PR_SDBG("exit",0); -} - - -/***************************** server side *********************************/ - -static void armci_check_req(request_header_t *msginfo, int buflen) -{ - - ARMCI_PR_SDBG("enter",msginfo->operation); - if((msginfo->to != armci_me && msginfo->to < armci_master) || - msginfo->to >= armci_master + armci_clus_info[armci_clus_me].nslave) - /*armci_die("armci_check_req: invalid to", msginfo->to);*/ - printf("\n%d:got following to %d",armci_me,msginfo->to); - if(msginfo->dscrlen < 0) - armci_die("armci_check_req: dscrlen < 0", msginfo->dscrlen); - if(msginfo->datalen < 0) - armci_die("armci_check_req: datalen < 0", msginfo->datalen); - if(msginfo->dscrlen > (int)msginfo->bytes) - armci_die2("armci_check_req: dsclen > bytes", msginfo->dscrlen, - msginfo->bytes); - ARMCI_PR_SDBG("exit",0); -} - - -/*\ server response - send data to client -\*/ -void armci_send_data(request_header_t* msginfo, void *data) -{ - int to = msginfo->from; - ARMCI_PR_SDBG("enter",0); - armci_WriteToDirect(to, msginfo, data); - ARMCI_PR_SDBG("exit",0); -} - - -/*\ server sends strided data back to client -\*/ -void armci_send_strided_data(int proc, request_header_t *msginfo, - char *bdata, void *ptr, int strides, - int stride_arr[], int count[]) -{ - int i,na; - double *a = NULL; - int to = msginfo->from; - ARMCI_PR_SDBG("enter",0); - - if(DEBUG_){ printf("%d(server): sending datalen = %d to %d %p\n", - armci_me, msginfo->datalen, to,ptr); fflush(stdout); } - - /* for small contiguous blocks copy into a buffer before sending */ - armci_write_strided(ptr, strides, stride_arr, count, bdata); - -#ifdef RMO_PORTALS_DEBUG_GET - a = (double *) bdata; - na = msginfo->datalen/sizeof(double); - for(i=0; idatalen,to); - fflush(stdout); - } - ARMCI_PR_SDBG("exit",0); -} - - -/*\ server sends ACK to client -ptl_event_t *ev = (ptl_event_t *) msginfo->tag.user_ptr; - - ARMCI_PR_SDBG("enter",0); - if(DEBUG_){ - printf("%d server: terminating request by %d\n",armci_me,msginfo->from); - fflush(stdout); - } - - portals_ds_send_ack(ev->initiator,ev->hdr_data); - -\*/ -void armci_server_ack(request_header_t* msginfo) -{ -int *ack=(int*)(msginfo+1); -ptl_event_t *ev = (ptl_event_t *) msginfo->tag.user_ptr; - - ARMCI_PR_SDBG("enter",0); - if(DEBUG_){ - printf("%d server: sending ACK to %d\n",armci_me,msginfo->from); - fflush(stdout); - } - -#ifndef OLD_PORTALS_CODE - portals_ds_send_ack(ev->initiator,ev->hdr_data); -#else - *ack = ACK; - if(msginfo->datalen != sizeof(int)) - armci_die("armci_server_ack: bad datalen=",msginfo->datalen); - armci_send_data(msginfo,ack); -#endif - - ARMCI_PR_SDBG("exit",0); -} - - - -/*\ server action triggered by request to quit -\*/ -void armci_server_goodbye(request_header_t* msginfo) -{ - ptl_event_t *ev = (ptl_event_t *) msginfo->tag.user_ptr; - - ARMCI_PR_SDBG("enter",0); - if(DEBUG_){ - printf("%d server: terminating request by %d\n",armci_me,msginfo->from); - fflush(stdout); - } - - portals_ds_send_ack(ev->initiator,ev->hdr_data); - -#ifdef ARMCI_CHECK_STATE_ - for(int i=0;itag.user_ptr; - - /* check what we got */ -// armci_check_req(msginfo,buflen); - from = msginfo->from; - - if(DEBUG_){ - printf("%d(serv):got %d request from %d\n",armci_me,msginfo->operation, - from); - fflush(stdout); - } - -/*if(msginfo->operation==GET)fprintf(stderr,"GET request received with tag: %d\n",msginfo->tag);*/ - - switch(msginfo->operation){ -# ifdef ARMCI_CHECK_STATE - case STATE: - printf("[ds %d]: operation=%d not supported yet\n",armci_me,msginfo->operation); - abort(); - if(DEBUG_){printf("\n%d:state request\n",armci_me);fflush(stdout);} - sarlist_print(msginfo->from); - armci_WriteToDirect(msginfo->from, msginfo, (msginfo+1)); - break; -# endif - case QUIT: - if(DEBUG_){ - printf("%d(serv):got QUIT request from %d\n",armci_me, from); - fflush(stdout); - } - armci_server_goodbye(msginfo); - break; /*pessimism?*/ - - case ACK: - // printf("[ds %d]: operation=%d not supported yet\n",armci_me,msginfo->operation); - // abort(); - if(DEBUG_) { - fprintf(stdout, "%d(server): got ACK request from %d\n", - armci_me, msginfo->from); fflush(stdout); - } - armci_server_ack(msginfo); - break; - - case ATTACH: - printf("[ds %d]: operation=%d not supported yet\n",armci_me,msginfo->operation); - abort(); - if(DEBUG_){ - printf("%d(serv):got ATTACH request from%d\n",armci_me, from); - fflush(stdout); - } - armci_server_ipc(msginfo, descr, buffer, buflen); - break; - case ARMCI_SWAP: - case ARMCI_SWAP_LONG: - case ARMCI_FETCH_AND_ADD: - case ARMCI_FETCH_AND_ADD_LONG: - // printf("[ds %d]: operation=%d not supported yet\n",armci_me,msginfo->operation); - // abort(); - armci_server_rmw(msginfo,descr,buffer); - # ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - portals_ds_send_ack(ev->initiator,ev->hdr_data); - # endif - break; - - case LOCK: - printf("[ds %d]: operation=%d not supported yet\n",armci_me,msginfo->operation); - abort(); - armci_server_lock(msginfo); - break; - - case UNLOCK: - printf("[ds %d]: operation=%d not supported yet\n",armci_me,msginfo->operation); - abort(); - armci_server_unlock(msginfo, descr); - msginfo->tag.ack=ARMCI_STAMP; - x_net_send_ack(msginfo,msginfo->from,msginfo->tag.ack_ptr,&msginfo->tag.ack); - break; - - default: - if(msginfo->format ==VECTOR){ - // if(msginfo->operation != PUT && msginfo->operation !=GET && !ACC(msginfo->operation)) { - // printf("[ds %d]: operation=%d (format==VECTOR) not supported yet\n",armci_me,msginfo->operation); - // abort(); - // } - armci_server_vector(msginfo, descr, buffer, buflen); - # ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - portals_ds_send_ack(ev->initiator,ev->hdr_data); - # else - if(msginfo->operation==PUT || ARMCI_ACC(msginfo->operation)){ - portals_ds_send_ack(ev->initiator,ev->hdr_data); - } - # endif - } - else if(msginfo->format ==STRIDED){ - // if(msginfo->operation != PUT && msginfo->operation != GET && !ACC(msginfo->operation)) { - // printf("[ds %d]: operation=%d (format==STRIDED) not supported yet\n",armci_me,msginfo->operation); - // abort(); - // } - armci_server(msginfo, descr, buffer, buflen); - # ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - portals_ds_send_ack(ev->initiator,ev->hdr_data); - # else - if(msginfo->operation==PUT || ARMCI_ACC(msginfo->operation)){ - portals_ds_send_ack(ev->initiator,ev->hdr_data); - } - # endif - } - else - armci_die2("armci_data_serv: unknown format code", - msginfo->format, msginfo->from); - } - ARMCI_PR_SDBG("exit",0); -} - - - diff --git a/armci/src-portals/fence.c b/armci/src-portals/fence.c deleted file mode 100644 index 1199d553a..000000000 --- a/armci/src-portals/fence.c +++ /dev/null @@ -1,92 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: fence.c,v 1.25.4.6 2007-08-30 19:17:02 manoj Exp $ */ -#include "armcip.h" -#include "armci.h" -#include "copy.h" -#include -#if defined(PVM) -# include -#elif defined(TCGMSG) -# include -#elif defined(BGML) -# include "bgml.h" -#else -# include -#endif - -char *_armci_fence_arr; - -void armci_init_fence() -{ -#if defined (DATA_SERVER) - _armci_fence_arr=calloc(armci_nproc,1); - if(!_armci_fence_arr)armci_die("armci_init_fence: calloc failed",0); -#endif -} - -void ARMCI_DoFence(int proc) -{ -int i; - if(!SAMECLUSNODE(proc) && (armci_nclus >1)){ - int cluster = armci_clus_id(proc); - armci_rem_ack(cluster); - } -} - -void PARMCI_Fence(int proc) -{ -int i; - -#if defined(DATA_SERVER) && !(defined(GM) && defined(ACK_FENCE)) -// printf("%d [cp] fence_arr(%d)=%d\n",armci_me,proc,FENCE_ARR(proc)); - if(FENCE_ARR(proc) && (armci_nclus >1)){ - - int cluster = armci_clus_id(proc); - int master=armci_clus_info[cluster].master; - - armci_rem_ack(cluster); - - /* one ack per cluster node suffices */ - /* note, in multi-threaded case it will only clear for current thread */ - bzero(&FENCE_ARR(master),armci_clus_info[cluster].nslave); - } -#elif defined(BGML) - BGML_WaitProc(proc); - MEM_FENCE; -#else - FENCE_NODE(proc); - MEM_FENCE; -#endif -} - - -/* - portals developers' note: - armci fence is not guaranteed to be correct unless PUT_START events are captured - PUT_ENDs do NOT guarantee order; only PUT_STARTs -*/ -void PARMCI_AllFence() -{ -#if defined(CLUSTER) - { int p; for(p=0;p -#include "armcip.h" -#include "locks.h" -#include "gpc.h" - -#define GPC_SLOTS 32 -#define GPC_OFFSET -100 -static void *_table[GPC_SLOTS]={ -(void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, -(void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, -(void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, -(void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0, (void*)0}; - -#if (defined(LAPI) || defined(GM) || defined(VAPI) || defined(QUADRICS)) && ARMCI_ENABLE_GPC_CALLS - -/*\ callback functions must be registered -- user gets int handle back -\*/ -int ARMCI_Gpc_register( int (*func) ()) -{ - int handle =-1, candidate = 0; - - PARMCI_Barrier(); - do{ - if(!_table[candidate]){ - handle = candidate; - _table[candidate]=func; - } - candidate++; - }while(candidate < GPC_SLOTS && handle == -1); - return(GPC_OFFSET-handle); -} - -/*\ release/deassociate handle with previously registered callback function -\*/ -void ARMCI_Gpc_release(int handle) -{ - int h = -handle + GPC_OFFSET; - - PARMCI_Barrier(); - if(h<0 || h >= GPC_SLOTS) armci_die("ARMCI_Gpc_release: bad handle",h); - _table[h] = (void*)0; -} - - - -/*\ Send Request to Execute callback function in a global address space - * Arguments: - * f - handle to the callback function - * p - remote processor - * hdr - header data - used to pack extra args for callback (local buffer) - * hlen - size of header data < ARMCI_GPC_HLEN - * data - bulk data passed to callback (local buffer) - * dlen - length of bulk data - * rhdr - ptr to reply header (return args from callback) - * rhlen - length of buffer to store reply header < ARMCI_GPC_HLEN - * rdata - ptr to where reply data from callback should be stored (local buf) - * rdlen - size of the buffer to store reply data - * nbh - nonblocking handle - * -\*/ -int ARMCI_Gpc_exec(int h, int p, void *hdr, int hlen, void *data, int dlen, - void *rhdr, int rhlen, void *rdata, int rdlen, gpc_hdl_t* nbh) -{ - int hnd = -h + GPC_OFFSET; - int err = 0; - armci_hdl_t *ahdl = (nbh ? &(nbh->ahdl): NULL); - - if(hnd <0 || hnd>= GPC_SLOTS) - err += fprintf(stderr, "ARMCI_Gpc_exec: bad callback handle %d: %d\n",hnd,GPC_SLOTS); - if(!_table[hnd]) - err += fprintf(stderr, "ARMCI_Gpc_exec: NULL function %d",hnd); - - if(hlen<0 || hlen>=ARMCI_Gpc_get_hlen()) - err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid send header size %d %d\n", hlen, ARMCI_Gpc_get_hlen()); - if(rhlen<0 || rhlen>=ARMCI_Gpc_get_hlen()) - err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid recv header size %d %d\n", rhlen, ARMCI_Gpc_get_hlen()); - if(dlen<0 || dlen>=ARMCI_Gpc_get_dlen()) - err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid send data size %d %d\n", dlen, ARMCI_Gpc_get_dlen()); - if(rdlen<0 || rdlen>=ARMCI_Gpc_get_dlen()) - err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid recv data size %d %d\n", rdlen, ARMCI_Gpc_get_dlen()); - - if(hlen>0 && hdr==NULL) - err += fprintf(stderr, "ARMCI_Gpc_exec: Null send header for non-zero header size %d\n", hlen); - if(rhlen>0 && rhdr==NULL) - err += fprintf(stderr, "ARMCI_Gpc_exec: Null recv header for non-zero header size %d\n", rhlen); - if(dlen>0 && data==NULL) - err += fprintf(stderr, "ARMCI_Gpc_exec: Null send data for non-zero data size %d\n", dlen); - if(rdlen>0 && rdata==NULL) - err += fprintf(stderr, "ARMCI_Gpc_exec: Null recv data for non-zero header size %d\n", rdlen); - - if(p<0 || p >= armci_nproc) - err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid target processor id %d\n", p, armci_nproc); - - if(err) - return FAIL; - - if(rhlen + rdlen == 0) - armci_die("Zero reply header + data length not yet supported", 0); - - if(nbh) - nbh->proc = p; -#if 1 - if(SAMECLUSNODE(p) && armci_nproc==1) { - int rhsize, rdsize; - int (*func)(); - -/* fprintf(stderr, "%d:: armci gpc exec. SAMECLUSNODE\n", armci_me); */ - - func = _table[hnd]; - if(func(p, armci_me, hdr, hlen, data, dlen, rhdr, rhlen, &rhsize, - rdata, rdlen, &rdsize, GPC_INIT) != GPC_DONE) { - func(p, armci_me, hdr, hlen, data, dlen, rhdr, rhlen, &rhsize, - rdata, rdlen, &rdsize, GPC_WAIT); - } -#ifndef VAPI - PARMCI_Fence(p); -#endif - return 0; - } -#endif - -/* fprintf(stderr, "%d:: armci gpc exec. invoking armci gpc\n", armci_me); */ - return armci_gpc(h, p, hdr, hlen, data, dlen, - rhdr, rhlen, rdata, rdlen, ahdl); -} - -/* - func - handle to the function executed at each process in the chain - callba- handle to the callback to be executed when - hdr - header data used to pack extra args for callback (local buffer) - hlen - size of header data < ARMCI_GPC_HLEN - data - bulk data passed to callback (local buffer) - dlen - length of bulk data - rhdr - ptr to reply header (return args from callback) - rhlen - length of buffer to store reply header < ARMCI_GPC_HLEN - rdata - ptr to where reply data from callback should be stored (local buf) - rdlen - size of the buffer to store reply data - idlen - number of ID's - idslst- list of id's in the chained GPC - nbh - nonblocking handle which also acts as a context for each individual - GPC - Tree - the id of tree function used (default is 0=>binary, 1=>binomial, - n=> user defined) -*/ -int ARMCI_Gpc_chained_exec(int func, int callback, void *hdr, int hlen, - void *data, int dlen, void *rhdr, int rhlen, void *rdata, - int rdlen, int idlen, int *idlst, gpc_hdl_t* nbh, int TREE) -{ -#if 0 -int hnd = -func + GPC_OFFSET; -int err = 0; - armci_hdl_t *ahdl = (nbh ? &(nbh->ahdl): NULL); - - if(hnd <0 || hnd>= GPC_SLOTS) - err += fprintf(stderr, "ARMCI_Gpc_exec: bad callback handle %d: %d\n",hnd,GPC_SLOTS); - if(!_table[hnd]) - err += fprintf(stderr, "ARMCI_Gpc_exec: NULL function %d",hnd); - - if(hlen<0 || hlen>=ARMCI_Gpc_get_hlen()) - err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid send header size %d %d\n", hlen, ARMCI_Gpc_get_hlen()); - if(rhlen<0 || rhlen>=ARMCI_Gpc_get_hlen()) - err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid recv header size %d %d\n", rhlen, ARMCI_Gpc_get_hlen()); - if(dlen<0 || dlen>=ARMCI_Gpc_get_dlen()) - err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid send data size %d %d\n", dlen, ARMCI_Gpc_get_dlen()); - if(rdlen<0 || rdlen>=ARMCI_Gpc_get_dlen()) - err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid recv data size %d %d\n", rdlen, ARMCI_Gpc_get_dlen()); - - if(hlen>0 && hdr==NULL) - err += fprintf(stderr, "ARMCI_Gpc_exec: Null send header for non-zero header size %d\n", hlen); - if(rhlen>0 && rhdr==NULL) - err += fprintf(stderr, "ARMCI_Gpc_exec: Null recv header for non-zero header size %d\n", rhlen); - if(dlen>0 && data==NULL) - err += fprintf(stderr, "ARMCI_Gpc_exec: Null send data for non-zero data size %d\n", dlen); - if(rdlen>0 && rdata==NULL) - err += fprintf(stderr, "ARMCI_Gpc_exec: Null recv data for non-zero header size %d\n", rdlen); - - if(p<0 || p >= armci_nproc) - err += fprintf(stderr, "ARMCI_Gpc_exec: Invalid target processor id %d\n", p, armci_nproc); - - if(err) - return FAIL; - - if(rhlen + rdlen == 0) - armci_die("Zero reply header + data length not yet supported", 0); - - tree_id = armci_msg_generate_tree(idlst,idlen,id_tree,TREE); - if(nbh) - nbh->proc = p; - -#if 1 - if(SAMECLUSNODE(p) && armci_nproc==1) { - int rhsize, rdsize; - int (*func)(); - - /* fprintf(stderr, "%d:: armci gpc exec. SAMECLUSNODE\n", armci_me); */ - - func = _table[hnd]; - if(func(p, armci_me, hdr, hlen, data, dlen, rhdr, rhlen, &rhsize, - rdata, rdlen, &rdsize, GPC_INIT) != GPC_DONE) { - func(p, armci_me, hdr, hlen, data, dlen, rhdr, rhlen, &rhsize, - rdata, rdlen, &rdsize, GPC_WAIT); - } -#ifndef VAPI - PARMCI_Fence(p); -#endif - return 0; - } -#endif -/* fprintf(stderr, "%d:: armci gpc exec. invoking armci gpc\n", armci_me); */ - return armci_gpc(h, p, hdr, hlen, data, dlen, - rhdr, rhlen, rdata, rdlen, ahdl); -#endif -} - - - -int armci_gpc_local_exec(int h, int to, int from, void *hdr, int hlen, - void *data, int dlen, - void *rhdr, int rhlen, - void *rdata, int rdlen, int rtype) { - int rhsize, rdsize; - int (*func)(); - int hnd = -h + GPC_OFFSET; - - if(hnd <0 || hnd>= GPC_SLOTS) - armci_die2("armci_gpc_local_exec: bad callback handle",hnd,GPC_SLOTS); - if(!_table[hnd]) armci_die("armci_gpc_local_exec: NULL function",hnd); - - func = _table[hnd]; - - if(!SAMECLUSNODE(to)) - armci_die("armci_gpc_local_exec: GPC call to a different node received!", - armci_me); - -/* func(to, from, hdr, hlen, data, dlen, rhdr, rhlen, &rhsize, */ -/* rdata, rdlen, &rdsize); */ -/* return 0; */ - return func(to, from, hdr, hlen, data, dlen, rhdr, rhlen, &rhsize, - rdata, rdlen, &rdsize, rtype); -} - -/*\ - * This is a template for the callback function - * The arguments are passed as specified in ARMCI_Gpc_exec - * In addition, - * rhsize specifies the actual size of reply header data returned - * rdsize specifies the actual size of reply data returned -\*/ -int example_func(int to, int from, void *hdr, int hlen, - void *data, int dlen, - void *rhdr, int rhlen, int *rhsize, - void *rdata, int rdlen, int *rdsize, - int rtype); - - -#ifdef LAPI -void armci_gpc_set_serverpid(){ -} -#endif - - -/*\ - * Translate pointer to memory on processor "proc" - * to be used in a callback function send by processor "from" -\*/ -void * ARMCI_Gpc_translate(void *ptr, int proc, int from) -{ -return ptr; -} - - -/*\ acquire lock in a callback function executed in context of processor "proc" -\*/ -void ARMCI_Gpc_lock(int proc) -{ -#if defined(CLUSTER) && !defined(SGIALTIX) - int lock = (proc-armci_clus_info[armci_clus_id(proc)].master)%NUM_LOCKS; -#else - int lock = 0; -#endif - NATIVE_LOCK(lock,proc); -} - -/*\ try acquire lock in a callback function to be executed in context of - * processor "proc" - * return value: 1 - success - * 0 - failure (already locked by another thread) -\*/ -int ARMCI_Gpc_trylock(int proc) -{ -armci_die("ARMCI_Gpc_trylock: not yet implemented",0); -return 0; -} - -/*\ release lock in a callback function executed in context of processor "proc" -\*/ -void ARMCI_Gpc_unlock(int proc) -{ -#if defined(CLUSTER) && !defined(SGIALTIX) - int lock = (proc-armci_clus_info[armci_clus_id(proc)].master)%NUM_LOCKS; -#else - int lock = 0; -#endif - NATIVE_UNLOCK(lock,proc); -} - -void ARMCI_Gpc_init_handle(gpc_hdl_t *nbh) { - nbh->proc = armci_me; - ARMCI_INIT_HANDLE(&nbh->ahdl); -} - -void ARMCI_Gpc_wait(gpc_hdl_t *nbh) { - if(SAMECLUSNODE(nbh->proc)) - return; - PARMCI_Wait(&nbh->ahdl); -} - -void ARMCI_Gpc_test(gpc_hdl_t *nbh) { - if(SAMECLUSNODE(nbh->proc)) - return; - PARMCI_Test(&nbh->ahdl); -} - -#define ARMCI_GPC_HLEN 65536 -#define ARMCI_GPC_DLEN 65536 -int ARMCI_Gpc_get_hlen() { - return ARMCI_GPC_HLEN; -} - -int ARMCI_Gpc_get_dlen() { - return ARMCI_GPC_DLEN; -} - -#endif - diff --git a/armci/src-portals/gpc.h b/armci/src-portals/gpc.h deleted file mode 100644 index 289e87564..000000000 --- a/armci/src-portals/gpc.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef __GPCDEF -#define __GPCDEF - -#include "armci.h" - -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif - -#define GPC_INIT 1 -#define GPC_PROBE 2 -#define GPC_WAIT 3 -#define GPC_DONE 4 -#define GPC_PENDING 5 - -typedef struct { - int proc; - armci_hdl_t ahdl; -}gpc_hdl_t; - -/* #define ARMCI_GPC_HLEN 1024 */ -/* #define ARMCI_GPC_DLEN 1024*1024 */ -extern int ARMCI_Gpc_register( int (*func) ()); -extern void ARMCI_Gpc_release(int handle); -extern void * ARMCI_Gpc_translate(void *ptr, int proc, int from); -extern void ARMCI_Gpc_lock(int proc); -extern void ARMCI_Gpc_unlock(int proc); -extern int ARMCI_Gpc_trylock(int proc); -extern int ARMCI_Gpc_exec(int h,int p, void *hdr, int hlen, void *data,int dlen, - void *rhdr, int rhlen, void *rdata, int rdlen, - gpc_hdl_t* nbh); -extern int PARMCI_Get_gpc_hlen(); -extern int PARMCI_Get_gpc_dlen(); - -extern void ARMCI_Gpc_init_handle(gpc_hdl_t *nbh); -extern void ARMCI_Gpc_wait(gpc_hdl_t *nbh); - -#if defined(__cplusplus) || defined(c_plusplus) -} -#endif - -#endif diff --git a/armci/src-portals/groups.c b/armci/src-portals/groups.c deleted file mode 100644 index 8601f80a5..000000000 --- a/armci/src-portals/groups.c +++ /dev/null @@ -1,568 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: groups.c,v 1.4.6.2 2007-08-15 08:37:16 manoj Exp $ */ - - -#if HAVE_STDIO_H -# include -#endif -#if HAVE_STDLIB_H -# include -#endif -#if HAVE_STRING_H -# include -#endif -#if HAVE_ASSERT_H -# include -#endif - -#ifndef MSG_COMMS_MPI -# define MSG_COMMS_MPI -#endif -#include "armcip.h" -#include "message.h" - -#define DEBUG_ 0 - -MPI_Comm ARMCI_COMM_WORLD; /*dup of MPI_COMM_WORLD. Initialized first thing in ARMCI_Init*/ - -ARMCI_Group ARMCI_Default_Proc_Group = 0; -ARMCI_Group ARMCI_World_Proc_Group = 0; - -typedef struct group_list_struct { - ARMCI_Group group; - ARMCI_iGroup igroup; - struct group_list_struct *next; -} group_list_t; - -group_list_t *group_list = NULL; - -ARMCI_iGroup* armci_get_igroup_from_group(ARMCI_Group *group) -{ - group_list_t *current_group_list_item = group_list; - - assert(group_list != NULL); - while (current_group_list_item != NULL) { - if (current_group_list_item->group == *group) { - return ¤t_group_list_item->igroup; - } - current_group_list_item = current_group_list_item->next; - } - armci_die("ARMCI_Group lookup failed", -1); - return NULL; -} - -static void armci_create_group_and_igroup(ARMCI_Group *group, ARMCI_iGroup **igroup) -{ - group_list_t *new_group_list_item = NULL; - group_list_t *last_group_list_item = NULL; - - /* create the new group in the linked list */ - last_group_list_item = group_list; - while (last_group_list_item->next != NULL) { - last_group_list_item = last_group_list_item->next; - } - - new_group_list_item = malloc(sizeof(group_list_t)); - new_group_list_item->group = last_group_list_item->group + 1; - new_group_list_item->next = NULL; - *igroup = &new_group_list_item->igroup; - *group = new_group_list_item->group; - last_group_list_item->next = new_group_list_item; -} - -#ifdef ARMCI_GROUP -void ARMCI_Bcast_(void *buffer, int len, int root, ARMCI_Group *group) -{ - armci_msg_group_bcast_scope(SCOPE_ALL, buffer, len, - ARMCI_Absolute_id(group, root), - group); -} -#else -void ARMCI_Bcast_(void *buffer, int len, int root, ARMCI_Comm comm) -{ - int result; - MPI_Comm_compare(comm, ARMCI_COMM_WORLD, &result); - if(result == MPI_IDENT) armci_msg_brdcst(buffer, len, root); - else MPI_Bcast(buffer, len, MPI_BYTE, root, (MPI_Comm)comm); -} -#endif - -int ARMCI_Group_rank(ARMCI_Group *group, int *rank) -{ - ARMCI_iGroup *igroup = armci_get_igroup_from_group(group); -#ifdef ARMCI_GROUP - if(!igroup) return MPI_ERR_GROUP; - *rank = igroup->grp_attr.grp_me; - return MPI_SUCCESS; -#else - return MPI_Group_rank((MPI_Group)(igroup->igroup), rank); -#endif -} - -void ARMCI_Group_size(ARMCI_Group *group, int *size) -{ - ARMCI_iGroup *igroup = armci_get_igroup_from_group(group); -#ifdef ARMCI_GROUP - *size = igroup->grp_attr.nproc; -#else - MPI_Group_size((MPI_Group)(igroup->igroup), size); -#endif -} - -int ARMCI_Absolute_id(ARMCI_Group *group,int group_rank) -{ - int abs_rank,status; - ARMCI_iGroup *igroup = armci_get_igroup_from_group(group); -#ifdef ARMCI_GROUP - assert(group_rank < igroup->grp_attr.nproc); - return igroup->grp_attr.proc_list[group_rank]; -#else - MPI_Group grp; - status = MPI_Comm_group(ARMCI_COMM_WORLD,&grp); - MPI_Group_translate_ranks(igroup->igroup,1,&group_rank,grp,&abs_rank); - return(abs_rank); -#endif -} - -void ARMCI_Group_set_default(ARMCI_Group *group) -{ - ARMCI_Default_Proc_Group = *group; -} - -void ARMCI_Group_get_default(ARMCI_Group *group_out) -{ - *group_out = ARMCI_Default_Proc_Group; -} - -void ARMCI_Group_get_world(ARMCI_Group *group_out) -{ - *group_out = ARMCI_World_Proc_Group; -} - -static void get_group_clus_id(ARMCI_iGroup *igroup, int grp_nproc, - int *grp_clus_id) -{ -#ifdef ARMCI_GROUP - int i; - assert(grp_nproc<=igroup->grp_attr.nproc); - for(i=0; igrp_attr.proc_list[i]); - } -#else - int i, *ranks1, *ranks2; - MPI_Group group2; - - /* Takes the list of processes from one group and attempts to determine - * the corresponding ranks in a second group (here, ARMCI_COMM_WORLD) */ - - ranks1 = (int *)malloc(2*grp_nproc*sizeof(int)); - ranks2 = ranks1 + grp_nproc; - for(i=0; iigroup, grp_nproc, ranks1, group2, ranks2); - - /* get the clus_id of processes */ - for(i=0; iicomm; -#endif - - int grp_me, grp_nproc, grp_nclus, grp_clus_me; - armci_clus_t *grp_clus_info=NULL; -#ifdef CLUSTER - int i, len, root=0; -#endif - -#ifndef ARMCI_GROUP - if(comm==MPI_COMM_NULL || igroup->igroup==MPI_GROUP_NULL) - armci_die("group_process_list: NULL COMMUNICATOR",0); -#endif - - ARMCI_Group_rank(group, &grp_me); - ARMCI_Group_size(group, &grp_nproc); - -#ifdef CLUSTER -# ifdef ARMCI_GROUP - /*all processes construct the clus_info structure in parallel*/ - grp_clus_info = group_construct_clusinfo(&grp_nclus, group); -# else - /* process 0 gets group cluster information: grp_nclus, grp_clus_info */ - if(grp_me == 0) { - grp_clus_info = group_construct_clusinfo(&grp_nclus, group); - } - - /* process 0 broadcasts group cluster information */ - len = sizeof(int); - ARMCI_Bcast_(&grp_nclus, len, root, comm); - - if(grp_me){ - /* allocate memory */ - grp_clus_info = (armci_clus_t*)malloc(grp_nclus*sizeof(armci_clus_t)); - if(!armci_clus_info)armci_die("malloc failed for clusinfo",armci_nclus); - } - - len = sizeof(armci_clus_t)*grp_nclus; - ARMCI_Bcast_(grp_clus_info, len, root, comm); -# endif - /* determine current group cluster node id by comparing me to master */ - grp_clus_me = grp_nclus-1; - for(i =0; i< grp_nclus-1; i++) { - if(grp_me < grp_clus_info[i+1].master){ - grp_clus_me=i; - break; - } - } -#else /* !CLUSTER */ - grp_clus_me = 0; - grp_nclus = 1; - grp_clus_info = (armci_clus_t*)malloc(grp_nclus*sizeof(armci_clus_t)); - if(!grp_clus_info)armci_die("malloc failed for clusinfo",grp_nclus); - strcpy(grp_clus_info[0].hostname, armci_clus_info[0].hostname); - grp_clus_info[0].master=0; - grp_clus_info[0].nslave=grp_nproc; -#endif /* CLUSTER */ -#ifdef ARMCI_GROUP - /*Set in ARMCI_Group_create. ARMCI_Group_rank is used before - setting this field. So moving it there in the generic - implementation.*/ -#else - grp_attr->grp_me = grp_me; -#endif - grp_attr->grp_clus_info = grp_clus_info; - grp_attr->grp_nclus = grp_nclus; - grp_attr->grp_clus_me = grp_clus_me; -} - -/* attribute caching: group_cluster_information and memory_offset should - be cached in group data structure */ -static void armci_cache_attr(ARMCI_Group *group) { - armci_grp_attr_t *grp_attr; - ARMCI_iGroup *igroup = armci_get_igroup_from_group(group); - - /* allocate storage for the attribute content. Note: Attribute contents - should be stored in persistent memory */ - grp_attr = &(igroup->grp_attr); - - /* get group cluster information and grp_attr */ - group_process_list(group, grp_attr); -} - -armci_grp_attr_t *ARMCI_Group_getattr(ARMCI_Group *group) -{ - ARMCI_iGroup *igroup = armci_get_igroup_from_group(group); - return(&(igroup->grp_attr)); - -} - -static void armci_igroup_finalize(ARMCI_iGroup *igroup) { -#ifdef ARMCI_GROUP - int world_me, i; - - world_me = armci_msg_me(); - for(i=0; igrp_attr.nproc; i++) { - if(igroup->grp_attr.proc_list[i] == world_me) { - break; - } - } - if(i==igroup->grp_attr.nproc) { - return; /*not in group to be freed*/ - } - - assert(igroup); - free(igroup->grp_attr.grp_clus_info); - free(igroup->grp_attr.proc_list); - igroup->grp_attr.nproc = 0; -#else - int rv; - - assert(igroup); - /*the following was causing seg fault*/ - /*free(igroup->grp_attr.grp_clus_info);*/ - - rv=MPI_Group_free(&(igroup->igroup)); - if(rv != MPI_SUCCESS) armci_die("MPI_Group_free: Failed ",armci_me); - - if(igroup->icomm != MPI_COMM_NULL) { - rv = MPI_Comm_free( (MPI_Comm*)&(igroup->icomm) ); - if(rv != MPI_SUCCESS) armci_die("MPI_Comm_free: Failed ",armci_me); - } -#endif -} - -void ARMCI_Group_free(ARMCI_Group *group) { - group_list_t *current_group_list_item = group_list; - group_list_t *previous_group_list_item = NULL; - - /* find the group to free */ - while (current_group_list_item != NULL) { - if (current_group_list_item->group == *group) { - break; - } - previous_group_list_item = current_group_list_item; - current_group_list_item = current_group_list_item->next; - } - /* make sure we found a group */ - assert(current_group_list_item != NULL); - /* remove the group from the linked list */ - if (previous_group_list_item != NULL) { - previous_group_list_item->next = current_group_list_item->next; - } - /* free the group */ - armci_igroup_finalize(¤t_group_list_item->igroup); - free(current_group_list_item); -} - -/* - Create a child group for to the given group. - @param n IN #procs in this group (<= that in group_parent) - @param pid_list IN The list of proc ids (w.r.t. group_parent) - @param group_out OUT Handle to store the created group - @param group_parent IN Parent group - */ -void ARMCI_Group_create_child(int n, int *pid_list, ARMCI_Group *group_out, - ARMCI_Group *grp_parent) -{ - int i,grp_me; - ARMCI_iGroup *igroup = NULL; - -#ifdef ARMCI_GROUP - int world_me, parent_grp_me; - armci_grp_attr_t *grp_attr = NULL; -#else - int rv; - ARMCI_iGroup *igroup_parent = NULL; - MPI_Group *group_parent = NULL; - MPI_Comm *comm_parent = NULL; -#endif - - armci_create_group_and_igroup(group_out, &igroup); - -#ifdef ARMCI_GROUP - grp_attr = &igroup->grp_attr; - ARMCI_Group_rank(grp_parent, &parent_grp_me); - for(i=0; inproc=0; - grp_attr->proc_list = NULL; - return; /*not in group to be created*/ - } - for(i=0; i pid_list[i+1]){ - armci_die("ARMCI_Group_create: Process ids are not sorted ",armci_me); - break; - } - } - grp_attr->grp_clus_info = NULL; - grp_attr->nproc = n; - grp_attr->proc_list = (int *)malloc(n*sizeof(int)); - assert(grp_attr->proc_list!=NULL); - for(i=0; iproc_list[i] = ARMCI_Absolute_id(grp_parent,pid_list[i]); - } - world_me = armci_msg_me(); - grp_attr->grp_me = grp_me = MPI_UNDEFINED; - for(i=0; igrp_attr.proc_list[i] == world_me) { - grp_attr->grp_me = grp_me = i; - break; - } - } - if(grp_me != MPI_UNDEFINED) armci_cache_attr(group_out); - armci_msg_group_barrier(group_out); -#else - igroup_parent = armci_get_igroup_from_group(grp_parent); - /* NOTE: default group is the parent group */ - group_parent = &(igroup_parent->igroup); - comm_parent = &(igroup_parent->icomm); - - rv=MPI_Group_incl(*group_parent, n, pid_list, &(igroup->igroup)); - if(rv != MPI_SUCCESS) armci_die("MPI_Group_incl: Failed ",armci_me); - - { - MPI_Comm comm, comm1, comm2; - int lvl=1, local_ldr_pos; - MPI_Group_rank((MPI_Group)(igroup->igroup), &grp_me); - if(grp_me == MPI_UNDEFINED) { - igroup->icomm = MPI_COMM_NULL; /*FIXME: keeping the group around for now*/ - return; - } - assert(grp_me>=0); /*SK: sanity check for the following bitwise operations*/ - MPI_Comm_dup(MPI_COMM_SELF, &comm); /*FIXME: can be optimized away*/ - local_ldr_pos = grp_me; - while(n> lvl) { - int tag=0; - int remote_ldr_pos = local_ldr_pos^lvl; - if(remote_ldr_pos < n) { - int remote_leader = pid_list[remote_ldr_pos]; - MPI_Comm peer_comm = *comm_parent; - int high = (local_ldr_posicomm = comm; - MPI_Group_free(&igroup->igroup); /*cleanup temporary group*/ - MPI_Comm_group(igroup->icomm, &igroup->igroup); /*the group associated with comm*/ - igroup->grp_attr.grp_clus_info=NULL; - /* processes belong to this group should cache attributes */ - armci_cache_attr(group_out); - } - -#endif -} - -void ARMCI_Group_create(int n, int *pid_list, ARMCI_Group *group_out) { - ARMCI_Group_create_child(n, pid_list, group_out, (ARMCI_Group *)&ARMCI_Default_Proc_Group); -} - -void armci_group_init() -{ -#ifdef ARMCI_GROUP - int i; -#else - int grp_me; -#endif - ARMCI_iGroup *igroup; - - /* Initially, World group is the default group */ - ARMCI_World_Proc_Group = 0; - ARMCI_Default_Proc_Group = 0; - - /* create the head of the group linked list */ - assert(group_list == NULL); - group_list = malloc(sizeof(group_list_t)); - group_list->group = ARMCI_World_Proc_Group; - group_list->next = NULL; - igroup = &group_list->igroup; - -#ifdef ARMCI_GROUP - /*setup the world proc group*/ - igroup->grp_attr.nproc = armci_msg_nproc(); - igroup->grp_attr.grp_me = armci_msg_me(); - igroup->grp_attr.proc_list = (int *)malloc(igroup->grp_attr.nproc*sizeof(int)); - assert(igroup->grp_attr.proc_list != NULL); - for(i=0; igrp_attr.nproc; i++) { - igroup->grp_attr.proc_list[i] = i; - } - igroup->grp_attr.grp_clus_info = NULL; - armci_cache_attr(&ARMCI_World_Proc_Group); -#else - /* save MPI world group and communicatior in ARMCI_World_Proc_Group */ - igroup->icomm = ARMCI_COMM_WORLD; - MPI_Comm_group(ARMCI_COMM_WORLD, &(igroup->igroup)); - - /* processes belong to this group should cache attributes */ - MPI_Group_rank((MPI_Group)(igroup->igroup), &grp_me); - if(grp_me != MPI_UNDEFINED) { - armci_cache_attr(&ARMCI_World_Proc_Group); - } -#endif -} - -void armci_group_finalize() { - group_list_t *current_group_list_item = group_list; - group_list_t *previous_group_list_item = NULL; - - /* don't free the world group (the list head) */ - current_group_list_item = current_group_list_item->next; - - while (current_group_list_item != NULL) { - previous_group_list_item = current_group_list_item; - current_group_list_item = current_group_list_item->next; - armci_igroup_finalize(&previous_group_list_item->igroup); - free(previous_group_list_item); - } -} - -/* - ISSUES: - 1. Make sure ARMCI_Group_free frees the attribute data structures - 2. replace malloc with, kr_malloc using local_context. -*/ diff --git a/armci/src-portals/kr_malloc.c b/armci/src-portals/kr_malloc.c deleted file mode 100644 index fea4b1628..000000000 --- a/armci/src-portals/kr_malloc.c +++ /dev/null @@ -1,606 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: kr_malloc.c,v 1.24 2006-09-12 23:21:21 andriy Exp $ */ -#include -#include "kr_malloc.h" -#include "armcip.h" /* for DEBUG purpose only. remove later */ -#include "locks.h" - -#define DEBUG 0 - -/* Storage allocator basically copied from ANSI K&R and corrupted */ - -extern char *armci_allocate(); /* Used to get memory from the system */ -extern void armci_die(); -static char *kr_malloc_shmem(size_t nbytes, context_t *ctx); -static void kr_free_shmem(char *ap, context_t *ctx); - -/** - * DEFAULT_NALLOC: No. of units of length ALIGNMENT to get in every - * request to the system for memory (8MB/64 => 128*1024units) - * DEFAULT_MAX_NALLOC: Maximum number of units that can get i.e.1GB - * (if unit size=64bytes, then max units=1024MB/64 = 16*1024*1024) - */ -#define DEFAULT_NALLOC (128*1024) -#define DEFAULT_NALLOC_ALIGN 1024 -#define DEFAULT_MAX_NALLOC (1024*1024*16) - -/* mutual exclusion defs go here */ -#define LOCKED 100 -#define UNLOCKED 101 -static int lock_mode=UNLOCKED; - -/* enable locking only after armci is initailized as locks (and lock - data structures) are initialized in PARMCI_Init */ -#define LOCKIT(p) \ - if(_armci_initialized && lock_mode==UNLOCKED) { \ - NAT_LOCK(0,p); lock_mode=LOCKED; \ - } -#define UNLOCKIT(p) \ - if(_armci_initialized && lock_mode==LOCKED) { \ - NAT_UNLOCK(0,p); lock_mode=UNLOCKED; \ - } - -static int do_verify = 0; /* Flag for automatic heap verification */ - -#define VALID1 0xaaaaaaaa /* For validity check on headers */ -#define VALID2 0x55555555 - -#define USEDP 0 /* CHECK. By default anable this. */ - -static void kr_error(char *s, unsigned long i, context_t *ctx) { -char string[256]; - sprintf(string,"kr_malloc: %s %ld(0x%lx)\n", s, i, i); -#if 0 - kr_malloc_print_stats(ctx); -#endif - armci_die(string, i); -} - -static Header *morecore(size_t nu, context_t *ctx, size_t *last_size, char **last_ptr) { - char *cp; - Header *up; - -#if DEBUG - (void) printf("%d: morecore 1: Getting %ld more units of length %d nalloc=%d\n", armci_me, (long)nu, sizeof(Header),ctx->nalloc); - (void) fflush(stdout); -#endif - - if (ctx->total >= ctx->max_nalloc) { -# if DEBUG - armci_die("kr_malloc: morecore: maximum allocation reached",armci_me); -# endif - return (Header *) NULL; /* Enforce upper limit on core usage */ - } - -#if 1 - /* 07/03 ctx->nalloc is now the minimum # units we ask from OS */ - nu = DEFAULT_NALLOC_ALIGN*((nu-1)/DEFAULT_NALLOC_ALIGN+1); - if(nu < ctx->nalloc) nu = ctx->nalloc; -#else - nu = ctx->nalloc*((nu-1)/ctx->nalloc+1); /* nu must by a multiplicity of nalloc */ -#endif - -#if DEBUG - (void) printf("%d: morecore: Getting %ld more units of length %d\n", - armci_me, (long)nu, sizeof(Header)); - (void) fflush(stdout); -#endif - - if ((cp =(char *)(*ctx->alloc_fptr)((size_t)nu * sizeof(Header))) == (char *)NULL) - return (Header *) NULL; - if(last_size!=NULL && last_ptr!=NULL){ - *last_size = ((size_t)nu * sizeof(Header)); - *last_ptr = cp; - /*printf("\n%d:%s:got %p %d",armci_me,FUNCTION_NAME,*last_ptr,*last_size);*/ - } -/* if(armci_nclus==armci_nproc && armci_nclus!=1) - armci_register_shmem(cp,((size_t)nu * sizeof(Header)),NULL,0,cp); - */ - - ctx->total += nu; /* Have just got nu more units */ - ctx->nchunk++; /* One more chunk */ - ctx->nfrags++; /* Currently one more frag */ - ctx->inuse += nu; /* Inuse will be decremented by kr_free */ - - up = (Header *) cp; - up->s.size = nu; - up->s.valid1 = VALID1; - up->s.valid2 = VALID2; - - /* Insert into linked list of blocks in use so that kr_free works - ... for debug only */ - up->s.ptr = ctx->usedp; - ctx->usedp = up; - - kr_free((char *)(up+1), ctx); /* Try to join into the free list */ - return ctx->freep; -} - -void kr_malloc_init(size_t usize, /* unit size in bytes */ - size_t nalloc, - size_t max_nalloc, - void * (*alloc_fptr)(), /* memory alloc routine */ - int debug, - context_t *ctx) { - int scale; - - if(usize <= 0) usize = sizeof(Header); - - scale = usize>>LOG_ALIGN; - if(scale<1)fprintf(stderr,"Error: kr_malloc_init !!!\n"); - - if(nalloc==0) nalloc = DEFAULT_NALLOC; - if(max_nalloc==0) max_nalloc = DEFAULT_MAX_NALLOC; - - ctx->usize = sizeof(Header); - ctx->nalloc = nalloc * scale; - ctx->max_nalloc = max_nalloc * scale; - ctx->alloc_fptr = alloc_fptr; - ctx->freep = NULL; - ctx->usedp = NULL; - ctx->shmid = -1; - ctx->shmoffset = 0; - ctx->shmsize = 0; - ctx->ctx_type = -1; - do_verify = debug; -} - -char *_kr_last_ptr; -size_t _kr_last_size; - -char *kr_malloc(size_t nbytes, context_t *ctx, int record_allocation, char **new_base, size_t *new_size) { - Header *p, *prevp; - size_t nunits; - char *return_ptr; - -#if !((defined(SUN) || defined(SOLARIS)) && !defined(SHMMAX_SEARCH_NO_FORK)) - if(ctx->ctx_type == KR_CTX_SHMEM) return kr_malloc_shmem(nbytes,ctx); -#endif - - /* If first time in need to initialize the free list */ - if ((prevp = ctx->freep) == NULL) { - - if (sizeof(Header) != ALIGNMENT) - kr_error("Alignment is not valid", (unsigned long) ALIGNMENT, ctx); - - ctx->total = 0; /* Initialize statistics */ - ctx->nchunk = 0; - ctx->inuse = 0; - ctx->nfrags = 0; - ctx->maxuse = 0; - ctx->nmcalls= 0; - ctx->nfcalls= 0; - /* Initialize linked list */ - ctx->base.s.ptr = ctx->freep = prevp = &(ctx->base); - ctx->base.s.size = 0; - ctx->base.s.valid1 = VALID1; - ctx->base.s.valid2 = VALID2; - } - - ctx->nmcalls++; - - if (do_verify) - kr_malloc_verify(ctx); - - /* Rather than divide make the alignment a known power of 2 */ - - nunits = ((nbytes + sizeof(Header) - 1)>>LOG_ALIGN) + 1; - - for (p=prevp->s.ptr; ; prevp = p, p = p->s.ptr) { - if (p->s.size >= nunits) { /* Big enuf */ - if (p->s.size == nunits) /* exact fit */ - prevp->s.ptr = p->s.ptr; - else { /* allocate tail end */ - p->s.size -= nunits; - p += p->s.size; - p->s.size = nunits; - p->s.valid1 = VALID1; - p->s.valid2 = VALID2; - ctx->nfrags++; /* Have just increased the fragmentation */ - } - - /* Insert into linked list of blocks in use ... for debug only */ - p->s.ptr = ctx->usedp; - ctx->usedp = p; - - ctx->inuse += nunits; /* Record usage */ - if (ctx->inuse > ctx->maxuse) - ctx->maxuse = ctx->inuse; - ctx->freep = prevp; - return_ptr = (char *) (p+1); - break; - } - - if (p == ctx->freep) { /* wrapped around the free list */ - if ((p = morecore(nunits, ctx, &_kr_last_size,&_kr_last_ptr)) == (Header *) NULL) { - return_ptr = (char *) NULL; - break; - } - } - } - if(record_allocation){ - *((char **)new_base)=_kr_last_ptr; - *new_size=_kr_last_size; - } - return return_ptr; -} - - -void kr_free(char *ap, context_t *ctx) { - Header *bp, *p, **up; - -#if !((defined(SUN) || defined(SOLARIS)) && !defined(SHMMAX_SEARCH_NO_FORK)) - if(ctx->ctx_type == KR_CTX_SHMEM) { kr_free_shmem(ap,ctx); return; } -#endif - - ctx->nfcalls++; - - - if (do_verify) - kr_malloc_verify(ctx); - - /* only do something if pointer is not NULL */ - - if ( ap ) { - - bp = (Header *) ap - 1; /* Point to block header */ - - if (bp->s.valid1 != VALID1 || bp->s.valid2 != VALID2) - kr_error("kr_free: pointer not from kr_malloc", - (unsigned long) ap, ctx); - - ctx->inuse -= bp->s.size; /* Decrement memory ctx->usage */ - - /* Extract the block from the used linked list - ... for debug only */ - - for (up=&(ctx->usedp); ; up = &((*up)->s.ptr)) { - if (!*up) - kr_error("kr_free: block not found in used list\n", - (unsigned long) ap, ctx); - if (*up == bp) { - *up = bp->s.ptr; - break; - } - } - - /* Join the memory back into the free linked list */ - - for (p=ctx->freep; !(bp > p && bp < p->s.ptr); p = p->s.ptr) - if (p >= p->s.ptr && (bp > p || bp < p->s.ptr)) - break; /* Freed block at start or end of arena */ - - if (bp + bp->s.size == p->s.ptr) {/* join to upper neighbour */ - bp->s.size += p->s.ptr->s.size; - bp->s.ptr = p->s.ptr->s.ptr; - ctx->nfrags--; /* Lost a fragment */ - } else - bp->s.ptr = p->s.ptr; - - if (p + p->s.size == bp) { /* Join to lower neighbour */ - p->s.size += bp->s.size; - p->s.ptr = bp->s.ptr; - ctx->nfrags--; /* Lost a fragment */ - } else - p->s.ptr = bp; - ctx->freep = p; - - } /* end if on ap */ -} - -/* - Print to standard output the usage statistics. -*/ -void kr_malloc_print_stats(context_t *ctx) { - fflush(stderr); - printf("\nkr_malloc statistics\n-------------------\n\n"); - - printf("Total memory from system ... %ld bytes\n", - (long)(ctx->total*ctx->usize)); - printf("Current memory usage ....... %ld bytes\n", - (long)(ctx->inuse*ctx->usize)); - printf("Maximum memory usage ....... %ld bytes\n", - (long)(ctx->maxuse*ctx->usize)); - printf("No. chunks from system ..... %ld\n", ctx->nchunk); - printf("No. of fragments ........... %ld\n", ctx->nfrags); - printf("No. of calls to kr_malloc ... %ld\n", ctx->nmcalls); - printf("No. of calls to kr_free ..... %ld\n", ctx->nfcalls); - printf("\n"); - - fflush(stdout); -} - -/* - Currently assumes that are working in a single region. -*/ -void kr_malloc_verify(context_t *ctx) { - Header *p; - - if(_armci_initialized && lock_mode==UNLOCKED) { - LOCKIT(armci_master); lock_mode=LOCKED; - } - - if ( ctx->freep ) { - - /* Check the used list */ - - for (p=ctx->usedp; p; p=p->s.ptr) { - if (p->s.valid1 != VALID1 || p->s.valid2 != VALID2) - kr_error("invalid header on usedlist", - (unsigned long) p->s.valid1, ctx); - - if (p->s.size > ctx->total) - kr_error("invalid size in header on usedlist", - (unsigned long) p->s.size, ctx); - } - - /* Check the free list */ - - p = ctx->base.s.ptr; - while (p != &(ctx->base)) { - if (p->s.valid1 != VALID1 || p->s.valid2 != VALID2) - kr_error("invalid header on freelist", - (unsigned long) p->s.valid1, ctx); - - if (p->s.size > ctx->total) - kr_error("invalid size in header on freelist", - (unsigned long) p->s.size, ctx); - - p = p->s.ptr; - } - } /* end if */ - - if(_armci_initialized && lock_mode==LOCKED) { - UNLOCKIT(armci_master); lock_mode=UNLOCKED; - } -} - -/********************** BEGIN: kr_malloc for ctx_shmem *********************/ -#if defined(SYSV) || defined(MMAP) - -#include "armci_shmem.h" - -extern int armci_get_shmem_info(char *addrp, int* shmid, long *shmoffset, - size_t *shmsize); -extern Header *armci_get_shmem_ptr(int shmid, long shmoffset, size_t shmsize); - -/* returns, address of the shared memory region based on shmid, offset. - * (i.e. return_addr = stating address of shmid + offset) */ -#define SHM_PTR(hdr) armci_get_shmem_ptr((hdr)->s.shmid, (hdr)->s.shmoffset, (hdr)->s.shmsize) - -/* - * kr_malloc_shmem: memory allocator for shmem context (i.e ctx_shmem) - */ -static char *kr_malloc_shmem(size_t nbytes, context_t *ctx) { - Header *p, *prevp; - size_t nunits, prev_shmsize=0; - char *return_ptr; - int prev_shmid=-1; - long prev_shmoffset=0; - - LOCKIT(armci_master); - - /* Rather than divide make the alignment a known power of 2 */ - nunits = ((nbytes + sizeof(Header) - 1)>>LOG_ALIGN) + 1; - - /* If first time in need to initialize the free list */ - if ((prevp = ctx->freep) == NULL) { - - if (sizeof(Header) != ALIGNMENT) - kr_error("kr_malloc_shmem: Alignment is not valid", - (unsigned long) ALIGNMENT, ctx); - - ctx->total = 0; /* Initialize statistics */ - ctx->nchunk = ctx->inuse = ctx->maxuse = 0; - ctx->nfrags = ctx->nmcalls = ctx->nfcalls = 0; - - /* Initialize linked list */ - ctx->base.s.size = 0; - ctx->base.s.shmid = -1; - ctx->base.s.shmoffset = 0; - ctx->base.s.shmsize = 0; - ctx->base.s.valid1 = VALID1; - ctx->base.s.valid2 = VALID2; - if ((p = morecore(nunits, ctx,NULL,NULL)) == (Header *) NULL) return NULL; - ctx->base.s.ptr = prevp = ctx->freep; /* CHECK */ - } - - prev_shmid = ctx->shmid; - prev_shmoffset = ctx->shmoffset; - prev_shmsize = ctx->shmsize; - prevp = ctx->freep = armci_get_shmem_ptr(ctx->shmid, ctx->shmoffset, - ctx->shmsize); - - ctx->nmcalls++; - - if (do_verify) kr_malloc_verify(ctx); - - for (p=SHM_PTR(prevp); ; prevp = p, p = SHM_PTR(p)) { - - if (p->s.size >= nunits) { /* Big enuf */ - if (p->s.size == nunits) { /* exact fit */ - prevp->s.ptr = p->s.ptr; - prevp->s.shmid = p->s.shmid; - prevp->s.shmoffset = p->s.shmoffset; - prevp->s.shmsize = p->s.shmsize; - } - else { /* allocate tail end */ - p->s.size -= nunits; - p += p->s.size; - p->s.size = nunits; - p->s.valid1 = VALID1; - p->s.valid2 = VALID2; - ctx->nfrags++; /* Have just increased the fragmentation */ - } -#if USEDP - /* Insert into linked list of blocks in use ... for debug only */ - p->s.ptr = ctx->usedp; - ctx->usedp = p; -#endif - - ctx->inuse += nunits; /* Record usage */ - if (ctx->inuse > ctx->maxuse) - ctx->maxuse = ctx->inuse; - ctx->freep = prevp; - ctx->shmid = prev_shmid; - ctx->shmoffset = prev_shmoffset; - ctx->shmsize = prev_shmsize; - return_ptr = (char *) (p+1); - break; - } - - prev_shmid = prevp->s.shmid; - prev_shmoffset = prevp->s.shmoffset; - prev_shmsize = prevp->s.shmsize; - - if (p == ctx->freep) { /* wrapped around the free list */ - if ((p = morecore(nunits, ctx,NULL,NULL)) == (Header *) NULL) { - return_ptr = (char *) NULL; - break; - } - prev_shmid = ctx->shmid; - prev_shmoffset = ctx->shmoffset; - prev_shmsize = ctx->shmsize; - } - } - - UNLOCKIT(armci_master); - return return_ptr; -} - - -static void kr_free_shmem(char *ap, context_t *ctx) { - Header *bp, *p, **up, *nextp; - int shmid=-1; - long shmoffset=0; - size_t shmsize=0; - - LOCKIT(armci_master); - - ctx->nfcalls++; - - if (do_verify) - kr_malloc_verify(ctx); - - /* only do something if pointer is not NULL */ - - if ( ap ) { - - bp = (Header *) ap - 1; /* Point to block header */ - - if (bp->s.valid1 != VALID1 || bp->s.valid2 != VALID2) - kr_error("kr_free_shmem: pointer not from kr_malloc", - (unsigned long) ap, ctx); - - ctx->inuse -= bp->s.size; /* Decrement memory ctx->usage */ - -#if USEDP - /* Extract the block from the used linked list - ... for debug only */ - - for (up=&(ctx->usedp); ; up = &((*up)->s.ptr)) { - if (!*up) - kr_error("kr_free_shmem: block not found in used list\n", - (unsigned long) ap, ctx); - if (*up == bp) { - *up = bp->s.ptr; - break; - } - } -#endif - - if(ctx->shmid==-1) { - armci_get_shmem_info((char*)bp, &ctx->shmid, &ctx->shmoffset, - &ctx->shmsize); - - ctx->base.s.shmid = ctx->shmid; - ctx->base.s.shmsize = ctx->shmsize; - ctx->base.s.shmoffset = ctx->shmoffset; - - p = ctx->freep = bp; - p->s.ptr = bp; - p->s.size-=SHMEM_CTX_BYTES; /*memory to store shmem info in context*/ - p->s.shmid = ctx->shmid; - p->s.shmsize = ctx->shmsize; - p->s.shmoffset = ctx->shmoffset; - - UNLOCKIT(armci_master); - return; - } - - ctx->freep = armci_get_shmem_ptr(ctx->shmid, ctx->shmoffset, - ctx->shmsize); - - shmid = ctx->shmid; - shmoffset = ctx->shmoffset; - shmsize = ctx->shmsize; - - /* Join the memory back into the free linked list */ - p = ctx->freep; - nextp = SHM_PTR(p); - - for ( ; !(bp > p && bp < nextp); p=nextp, nextp=SHM_PTR(p)) { - if (p >= nextp && (bp > p || bp < nextp)) - break; /* Freed block at start or end of arena */ - nextp = SHM_PTR(p); - shmid = p->s.shmid; - shmoffset = p->s.shmoffset; - shmsize = p->s.shmsize; - } - - if (bp + bp->s.size == nextp) {/* join to upper neighbour */ - bp->s.size += nextp->s.size; - bp->s.ptr = nextp->s.ptr; - ctx->nfrags--; /* Lost a fragment */ - bp->s.shmid = nextp->s.shmid; - bp->s.shmoffset = nextp->s.shmoffset; - bp->s.shmsize = nextp->s.shmsize; - } else { - bp->s.ptr = nextp; - bp->s.shmid = p->s.shmid; - bp->s.shmoffset = p->s.shmoffset; - bp->s.shmsize = p->s.shmsize; - } - - if (p + p->s.size == bp) { /* Join to lower neighbour */ - p->s.size += bp->s.size; - p->s.ptr = bp->s.ptr; - ctx->nfrags--; /* Lost a fragment */ - p->s.shmid = bp->s.shmid; - p->s.shmoffset = bp->s.shmoffset; - p->s.shmsize = bp->s.shmsize; - } else { - p->s.ptr = bp; - armci_get_shmem_info((char*)bp, &p->s.shmid, &p->s.shmoffset, - &p->s.shmsize); - } - - ctx->freep = p; - ctx->shmid = shmid; - ctx->shmoffset = shmoffset; - ctx->shmsize = shmsize; - } /* end if on ap */ - - UNLOCKIT(armci_master); -} -#else /* #ifdef SYSV */ -/* What are doing here */ -static char *kr_malloc_shmem(size_t nbytes, context_t *ctx) -{ - armci_die("kr_malloc_shmem(): Invalid Function Call", 0L); -} -static void kr_free_shmem(char *ap, context_t *ctx) -{ - armci_die("kr_free_shmem(): Invalid Function Call", 0L); -} -#endif /* #ifdef SYSV */ -/********************** END: kr_malloc for ctx_shmem *********************/ - - -/** -issues: -1. do usage statistics only if debug/DEBUG is enabled -*/ diff --git a/armci/src-portals/kr_malloc.h b/armci/src-portals/kr_malloc.h deleted file mode 100644 index aa7886bff..000000000 --- a/armci/src-portals/kr_malloc.h +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef KR_MALLOC_H /* K&R malloc */ -#define KR_MALLOC_H - -#ifdef CRAY -#define LOG_ALIGN 6 -#elif defined(KSR) -#define LOG_ALIGN 7 -#else -#define LOG_ALIGN 6 -#endif - -#define ALIGNMENT (1 << LOG_ALIGN) - -#define KR_CTX_SHMEM 101 -#define KR_CTX_LOCALMEM 102 - -union header{ - struct { - unsigned valid1; /* Token to check if is not overwritten */ - union header *ptr; /* next block if on free list */ - int shmid; /* next block's shared memory id */ - long shmoffset; /* next block's shmem offset */ - size_t shmsize; /* next block's shared memory segment size */ - size_t size; /* size of this block*/ - unsigned valid2; /* Another token acting as a guard */ - } s; - char align[ALIGNMENT]; /* Align to ALIGNMENT byte boundary */ -}; - -typedef union header Header; - -typedef struct malloc_context { - size_t usize; /* unit size in bytes */ - size_t nalloc; /* No. of units of length ALIGNMENT */ - size_t max_nalloc; /* Maximum no. of units that can get */ - void * (*alloc_fptr)(); /* function pointer to memory alloc routine */ - size_t total; /* Amount request from system in units */ - long nchunk; /* No. of chunks of system memory */ - long inuse; /* Amount in use in units */ - long maxuse; /* Maximum value of inuse */ - long nfrags; /* No. of fragments divided into */ - long nmcalls; /* No. of calls to _armci_alloc() */ - long nfcalls; /* No. of calls to memfree */ - int ctx_type; /* context id. - -1 represents ctx_local context. - otherwise, it is ctx_shmem context. */ - int shmid; /* first free block's (i.e.freep) shmem id */ - long shmoffset; /* first free block's shmem offset */ - size_t shmsize; /* first free block's shmem total size */ - Header base; /* empty list to get started */ - Header *freep; /* start of free list */ - Header *usedp; /* start of used list */ -} context_t; - -/* Memory required to store the shmem context in shared memory. This shmem - context shuld be stored in shared memory and is stored in the first shared - memory segment created (i.e.armci_krmalloc_init_ctxshmem) */ -#define SHMEM_CTX_MEM (sizeof(context_t)+sizeof(void*)) -#define SHMEM_CTX_BYTES ((SHMEM_CTX_MEM + sizeof(Header) - 1)>>LOG_ALIGN) + 1; - -extern void kr_malloc_init(size_t usize, /* unit size in bytes */ - size_t nalloc, - size_t max_nalloc, - void * (*alloc_fptr)(), /* memory alloc routine */ - int debug, - context_t *ctx); - -/* - Returns data aligned on a quad boundary. Even if the request - size is zero it returns a non-zero pointer. -*/ -extern char *kr_malloc(size_t size, context_t *ctx, int new_allocation, char **new_base, size_t *new_size); - -/* - Frees memory allocated by kr_malloc(). Ignores NULL pointers - but must not be called twice for the same pointer or called - with non-memalloc'ed pointers -*/ -extern void kr_free(char *ptr, context_t *ctx); - -/* - Print to standard output the usage statistics ... a wrapper - for kr_malloc_stats(); -*/ -extern void kr_malloc_print_stats(context_t *ctx); - -extern void kr_malloc_verify(context_t *ctx); - -#endif diff --git a/armci/src-portals/locks.c b/armci/src-portals/locks.c deleted file mode 100644 index 8bc95b21c..000000000 --- a/armci/src-portals/locks.c +++ /dev/null @@ -1,90 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: locks.c,v 1.15.6.1 2006-12-14 13:24:36 manoj Exp $ */ -#define _LOCKS_C_ -#include "armcip.h" -#include "locks.h" -#ifndef WIN32 -# include -#endif -#include - - -extern void armci_die(char*,int); - -#if defined(SPINLOCK) || defined(PMUTEXES) - -void **ptr_arr; - -void CreateInitLocks(int num_locks, lockset_t *plockid) -{ -int locks_per_proc, size; -extern void armci_set_serv_mutex_arr(void *); - ARMCI_PR_DBG("enter",0); - ptr_arr = (void**)malloc(armci_nproc*sizeof(void*)); - locks_per_proc = (num_locks*armci_nclus)/armci_nproc + 1; - size=locks_per_proc*sizeof(PAD_LOCK_T); - PARMCI_Malloc(ptr_arr, size); - _armci_int_mutexes = (PAD_LOCK_T*) ptr_arr[armci_master]; -# ifdef PORTALS_SPECIFIC_QUESTION - if(armci_me==armci_master)armci_set_serv_mutex_arr(_armci_int_mutexes); -# endif - - if(!_armci_int_mutexes) armci_die("Failed to create spinlocks",size); - -#ifdef PMUTEXES - if(armci_me == armci_master) { - int i; - pthread_mutexattr_t pshared; - if(pthread_mutexattr_init(&pshared)) - armci_die("armci_allocate_locks: could not init mutex attr",0); -# ifndef LINUX - if(pthread_mutexattr_setpshared(&pshared,PTHREAD_PROCESS_SHARED)) - armci_die("armci_allocate_locks: could not set PROCESS_SHARED",0); -# endif - - for(i=0; i< locks_per_proc*armci_clus_info[armci_clus_me].nslave; i++){ - if(pthread_mutex_init(_armci_int_mutexes+i,&pshared)) - armci_die("armci_allocate_locks: could not init mutex",i); - } - } -#else - - bzero((char*)ptr_arr[armci_me],size); - ARMCI_PR_DBG("exit",0); -#endif -} - -void InitLocks(int num_locks, lockset_t lockid) -{ - /* what are you doing here ? - All processes should've called CreateInitLocks(). - Check preprocessor directtives and see lock allocation in armci_init */ - armci_die("InitLocks(): what are you doing here ?",armci_me); -} - - -void DeleteLocks(lockset_t lockid) -{ - _armci_int_mutexes = (PAD_LOCK_T*)0; -} - -#else -/*********************** every thing else *************************/ - -void CreateInitLocks(int num_locks, lockset_t *lockid) -{} - -void InitLocks(int num_locks, lockset_t lockid) -{ -} - - -void DeleteLocks(lockset_t lockid) -{ -} - -#endif - diff --git a/armci/src-portals/locks.h b/armci/src-portals/locks.h deleted file mode 100644 index 2504b6176..000000000 --- a/armci/src-portals/locks.h +++ /dev/null @@ -1,174 +0,0 @@ -#ifndef _ARMCI_LOCKS_H_ -#define _ARMCI_LOCKS_H_ - -#if HAVE_SYS_TYPES_H -# include -#endif - -#define MAX_LOCKS 1024 -#define NUM_LOCKS MAX_LOCKS - -#if !(defined(PMUTEX) || defined(PSPIN) || defined(CYGNUS) || defined(CRAY_XT)) -# include "spinlock.h" -#endif - -#if !(defined(PMUTEX) || defined(PSPIN) || defined(SPINLOCK)) -# error cannot run -#endif - -#if (defined(SPINLOCK) || defined(PMUTEX) || defined(PSPIN) || defined(HITACHI)) && !(defined(BGML) || defined(DCMF)) -# include "armci_shmem.h" -typedef struct { - long off; - long idlist[SHMIDLEN]; -} lockset_t; -extern lockset_t lockid; -#elif defined(BGML) || defined(DCMF) -typedef int lockset_t; -#endif - -#if defined(PMUTEX) -# warning SPINLOCK: pthread_mutex_lock -# include -# include -# define NAT_LOCK(x,p) pthread_mutex_lock(_armci_int_mutexes +x) -# define NAT_UNLOCK(x,p) pthread_mutex_unlock(_armci_int_mutexes +x) -# define LOCK_T pthread_mutex_t -# define PAD_LOCK_T LOCK_T -PAD_LOCK_T *_armci_int_mutexes; - -#elif defined(PSPIN) -# warning SPINLOCK: pthread_spin_lock -# include -# include -# define NAT_LOCK(x,p) pthread_spin_lock(_armci_int_mutexes +x) -# define NAT_UNLOCK(x,p) pthread_spin_unlock(_armci_int_mutexes +x) -# define LOCK_T pthread_spinlock_t -# define PAD_LOCK_T LOCK_T -PAD_LOCK_T *_armci_int_mutexes; - -#elif defined(SPINLOCK) && defined(SGIALTIX) -# define NAT_LOCK(x,p) armci_acquire_spinlock((LOCK_T*)( ((PAD_LOCK_T*)(((void**)_armci_int_mutexes)[p]))+x )) -# define NAT_UNLOCK(x,p) armci_release_spinlock((LOCK_T*)( ((PAD_LOCK_T*)(((void**)_armci_int_mutexes)[p]))+x )) -PAD_LOCK_T *_armci_int_mutexes; - -#elif defined(SPINLOCK) -# define NAT_LOCK(x,p) armci_acquire_spinlock((LOCK_T*)(_armci_int_mutexes+(x))) -# define NAT_UNLOCK(x,p) armci_release_spinlock((LOCK_T*)(_armci_int_mutexes+(x))) -PAD_LOCK_T *_armci_int_mutexes; - -#elif defined(HITACHI) -extern void armcill_lock(int mutex, int proc); -extern void armcill_unlock(int mutex, int proc); -# define LOCK_T int -# define PAD_LOCK_T LOCK_T -# define NAT_LOCK(x,p) armcill_lock((x),(p)) -# define NAT_UNLOCK(x,p) armcill_unlock((x),(p)) -PAD_LOCK_T *_armci_int_mutexes; - -#elif defined(SGI) -# define SGI_SPINS 100 -# include -typedef struct { - int id; - ulock_t * lock_array[NUM_LOCKS]; -}lockset_t; -extern lockset_t lockset; -# define NAT_LOCK(x,p) (void) uswsetlock(lockset.lock_array[(x)],SGI_SPINS) -# define NAT_UNLOCK(x,p) (void) usunsetlock(lockset.lock_array[(x)]) - -#elif defined(CONVEX) -# include -typedef struct{ - unsigned state; - unsigned pad[15]; -} lock_t; -typedef int lockset_t; -extern lock_t *lock_array; -extern void setlock(unsigned * volatile lp); -extern void unsetlock(unsigned * volatile lp); -# define NAT_LOCK(x,p) (void) setlock(&lock_array[x].state) -# define NAT_UNLOCK(x,p) (void) unsetlock(&lock_array[(x)].state) - -#elif defined(WIN32) -typedef int lockset_t; -extern void setlock(int); -extern void unsetlock(int); -# define NAT_LOCK(x,p) setlock(x) -# define NAT_UNLOCK(x,p) unsetlock(x) - -#elif defined(CRAY_YMP) && !defined(__crayx1) -# include -typedef int lockset_t; -extern lock_t cri_l[NUM_LOCKS]; -# pragma _CRI common cri_l -# define NAT_LOCK(x,p) t_lock(cri_l+(x)) -# define NAT_UNLOCK(x,p) t_unlock(cri_l+(x)) - -#elif defined(CRAY_T3E) || defined(__crayx1) || defined(CATAMOUNT) || defined(CRAY_SHMEM) || defined(PORTALS) -# include -# if defined(CRAY) || defined(CRAY_XT) -# include -# endif -# if defined(DECOSF) || defined(LINUX64) || defined(__crayx1) || defined(CATAMOUNT) -# define _INT_MIN_64 (LONG_MAX-1) -# endif -# undef NUM_LOCKS -# define NUM_LOCKS 4 -static long armci_lock_var[4]={0,0,0,0}; -typedef int lockset_t; -# define INVALID (long)(_INT_MIN_64 +1) -# define NAT_LOCK(x,p) while( shmem_swap(armci_lock_var+(x),INVALID,(p)) ) -# define NAT_UNLOCK(x,p) shmem_swap(armci_lock_var+(x), 0, (p)) - -#elif defined(SYSV) && defined(LAPI) && defined(AIX) -int **_armci_int_mutexes; -# define NAT_LOCK(x,p) armci_lapi_lock(_armci_int_mutexes[armci_master]+(x)) -# define NAT_UNLOCK(x,p) armci_lapi_unlock(_armci_int_mutexes[armci_master]+(x)) -typedef int lockset_t; - -#elif defined(CYGNUS) -typedef int lockset_t; -# define NAT_LOCK(x,p) armci_die("does not run in parallel",0) -# define NAT_UNLOCK(x,p) armci_die("does not run in parallel",0) - -#elif defined(LAPI) && !defined (LINUX) -# include -typedef int lockset_t; -extern pthread_mutex_t _armci_mutex_thread; -# define NAT_LOCK(x,p) pthread_mutex_lock(&_armci_mutex_thread) -# define NAT_UNLOCK(x,p) pthread_mutex_unlock(&_armci_mutex_thread) - -#elif defined(FUJITSU) -typedef int lockset_t; -# include "fujitsu-vpp.h" - -#elif defined(SYSV) || defined(MACX) -# include "semaphores.h" -# undef NUM_LOCKS -# define NUM_LOCKS ((MAX_LOCKS< SEMMSL) ? MAX_LOCKS:SEMMSL) -# define NAT_LOCK(x,p) P_semaphore(x) -# define NAT_UNLOCK(x,p) V_semaphore(x) -# ifndef _LOCKS_C_ -# define CreateInitLocks Sem_CreateInitLocks -# define InitLocks Sem_InitLocks -# define DeleteLocks Sem_DeleteLocks -# endif - -#else -# error -#endif - -extern void CreateInitLocks(int num, lockset_t *id); -extern void InitLocks(int num , lockset_t id); -extern void DeleteLocks(lockset_t id); - -#ifdef FUJITSU -# define NATIVE_LOCK(x,p) if(armci_nproc>1) { NAT_LOCK(p); } -# define NATIVE_UNLOCK(x,p) if(armci_nproc>1) { NAT_UNLOCK(p); } -#else -# define NATIVE_LOCK(x,p) if(armci_nproc>1) { NAT_LOCK(x,p); } -# define NATIVE_UNLOCK(x,p) if(armci_nproc>1) { NAT_UNLOCK(x,p); } -#endif - -#endif /* _ARMCI_LOCKS_H_ */ diff --git a/armci/src-portals/memlock.c b/armci/src-portals/memlock.c deleted file mode 100644 index dfae952c7..000000000 --- a/armci/src-portals/memlock.c +++ /dev/null @@ -1,269 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: memlock.c,v 1.24.2.3 2007-08-29 17:32:32 manoj Exp $ */ -#include "armcip.h" -#include "locks.h" -#include "copy.h" -#include "memlock.h" -#include - -#define DEBUG_ 0 -#define INVALID_VAL -9999999 - -#ifdef DATA_SERVER -# define CORRECT_PTR -#endif -size_t armci_mem_offset=0; - -/* We start by using table: assign address of local variable set to 1 - * On shmem systems, this addres is overwritten by a shared memory location - * when memlock array is allocated in armci_init - * Therefore, any process within shmem node can reset armci_use_memlock_table - * to "not used" when offset changes. Since the variable is in shmem, everybody - * on that SMP node will see the change and use the same locking functions - */ -int init_use_memlock_table=1; -int *armci_use_memlock_table=&init_use_memlock_table; - -static int locked_slot=INVALID_VAL; - -volatile double armci_dummy_work=0.; -void **memlock_table_array; - -/* constants for cache line alignment */ -# define CALGN 64 -# define LOG_CALGN 6 - -#define ALIGN_ADDRESS(x) (char*)((((unsigned long)x) >> LOG_CALGN) << LOG_CALGN) -static memlock_t table[MAX_SLOTS]; - - -/*\ simple locking scheme that ignores addresses -\*/ -void armci_lockmem_(void *pstart, void *pend, int proc) -{ - -#if defined(CLUSTER) && !defined(SGIALTIX) - int lock = (proc-armci_clus_info[armci_clus_id(proc)].master)%NUM_LOCKS; -#else - int lock = 0; -#endif - - if(DEBUG_){ - printf("%d: armci_lockmem_ proc=%d lock=%d\n",armci_me,proc,lock); - fflush(stdout); - } - - NATIVE_LOCK(lock,proc); - if(DEBUG_){ - printf("%d: armci_lockmem_ done\n",armci_me); - fflush(stdout); - } -} - -void armci_unlockmem_(int proc) -{ - -#if defined(CLUSTER) && !defined(SGIALTIX) - int lock = (proc-armci_clus_info[armci_clus_id(proc)].master)%NUM_LOCKS; -#else - int lock = 0; -#endif - if(DEBUG_){ - printf("%d: armci_unlockmem_ proc=%d lock=%d\n",armci_me,proc,lock); - fflush(stdout); - } - NATIVE_UNLOCK(lock,proc); -} - - - -/*\ idle for a time proportional to factor -\*/ -void armci_waitsome(int factor) -{ -int i=factor*100000; - - if(factor <= 1) armci_dummy_work =0.; - if(factor < 1) return; - while(--i){ - armci_dummy_work = armci_dummy_work + 1./(double)i; - } -} - -/*\ acquire exclusive LOCK to MEMORY area owned by process "proc" - * . only one area can be locked at a time by the calling process - * . must unlock it with armci_unlockmem -\*/ -void armci_lockmem(void *start, void *end, int proc) -{ - register void* pstart, *pend; - register int slot, avail=0; - int turn=0, conflict=0; - memlock_t *memlock_table; -#if defined(CLUSTER) && !defined(SGIALTIX) - int lock = (proc-armci_clus_info[armci_clus_id(proc)].master)%NUM_LOCKS; -#else - int lock = 0; -#endif - -#ifdef CORRECT_PTR - if(! *armci_use_memlock_table){ - /* if offset invalid, use dumb locking scheme ignoring addresses */ - armci_lockmem_(start, end, proc); - return; - } - -# ifndef SGIALTIX - /* when processes are attached to a shmem region at different addresses, - * addresses written to memlock table must be adjusted to the node master - */ - if(armci_mem_offset){ - start = armci_mem_offset + (char*)start; - end = armci_mem_offset + (char*)end; - } -# endif -#endif - - if(DEBUG_){ - printf("%d: calling armci_lockmem for %d range %p -%p\n", - armci_me, proc, start,end); - fflush(stdout); - } - memlock_table = (memlock_t*)memlock_table_array[proc]; - - -#ifdef ALIGN_ADDRESS - /* align address range on cache line boundary to avoid false sharing */ - pstart = ALIGN_ADDRESS(start); - pend = CALGN -1 + ALIGN_ADDRESS(end); -#else - pstart=start; - pend =end; -#endif - -#ifdef CRAY_SHMEM - { /* adjust according the remote process raw address */ - long bytes = (long) ((char*)pend-(char*)pstart); - extern void* armci_shmalloc_remote_addr(void *ptr, int proc); - pstart = armci_shmalloc_remote_addr(pstart, proc); - pend = (char*)pstart + bytes; - } -#endif - while(1){ - NATIVE_LOCK(lock,proc); - - armci_get(memlock_table, table, sizeof(table), proc); -/* armci_copy(memlock_table, table, sizeof(table));*/ - - /* inspect the table */ - conflict = 0; avail =-1; - for(slot = 0; slot < MAX_SLOTS; slot ++){ - - /* nonzero starting address means the slot is occupied */ - if(table[slot].start == NULL){ - - /* remember a free slot to store address range */ - avail = slot; - - }else{ - /*check for conflict: overlap between stored and current range*/ - if( (pstart >= table[slot].start && pstart <= table[slot].end) - || (pend >= table[slot].start && pend <= table[slot].end) ){ - - conflict = 1; - break; - - } - /* - printf("%d: locking %ld-%ld (%d) conflict\n", - armci_me, */ - } - } - - if(avail != -1 && !conflict) break; - - NATIVE_UNLOCK(lock,proc); - armci_waitsome( ++turn ); - - } - - /* we got the memory lock: enter address into the table */ - table[avail].start = pstart; - table[avail].end = pend; - armci_put(table+avail,memlock_table+avail,sizeof(memlock_t),proc); - - FENCE_NODE(proc); - - NATIVE_UNLOCK(lock,proc); - locked_slot = avail; - -} - - -/*\ release lock to the memory area locked by previous call to armci_lockemem -\*/ -void armci_unlockmem(int proc) -{ - void *null[2] = {NULL,NULL}; - memlock_t *memlock_table; - -#ifdef CORRECT_PTR - if(! *armci_use_memlock_table){ - /* if offset invalid, use dumb locking scheme ignoring addresses */ - armci_unlockmem_(proc); - return; - } -#endif - -#ifdef DEBUG - if(locked_slot == INVALID_VAL) armci_die("armci_unlock: empty",0); - if(locked_slot >= MAX_SLOTS || locked_slot <0) - armci_die("armci_unlock: corrupted slot?",locked_slot); -#endif - - memlock_table = (memlock_t*)memlock_table_array[proc]; - armci_put(null,&memlock_table[locked_slot].start,2*sizeof(void*),proc); - -} - - - -/*\ based on address for set by master, determine correction for - * memory addresses set in memlock table - * if the correction/offset ever changes stop using memlock table locking -\*/ -void armci_set_mem_offset(void *ptr) -{ - extern size_t armci_mem_offset; - size_t off; - static int first_time=1; - volatile void *ref_ptr; - - ARMCI_PR_DBG("enter",0); - /* do not care if memlock not used */ - if(! *armci_use_memlock_table) return; - - if(!ptr) armci_die("armci_set_mem_offset : null ptr",0); - ref_ptr = *(void**)ptr; - off = (size_t)((char*)ref_ptr - (char*)ptr); - - if(first_time){ - - armci_mem_offset =off; - first_time =0; - if(DEBUG_){ - printf("%d memlock offset=%ld ref=%p ptr=%p\n",armci_me, - (long)armci_mem_offset, ref_ptr, ptr); fflush(stdout); - } - - }else{ - if(armci_mem_offset != off){ - *armci_use_memlock_table =0; - fprintf(stderr, "%d: WARNING:armci_set_mem_offset: offset changed %ld to %ld\n", - armci_me, (long)armci_mem_offset, (long)off); fflush(stdout); - } - } -} diff --git a/armci/src-portals/memlock.h b/armci/src-portals/memlock.h deleted file mode 100644 index ce4c868aa..000000000 --- a/armci/src-portals/memlock.h +++ /dev/null @@ -1,37 +0,0 @@ -/* $Id: memlock.h,v 1.18 2004-09-21 17:26:23 manoj Exp $ */ -#ifndef _MEMLOCK_H_ -#define _MEMLOCK_H_ - - -/* data structure for locking memory areas */ -#define MAX_SLOTS 8 -typedef struct{ - void *start; - void *end; -} memlock_t; - -/* SGI Altix Stuff */ -typedef struct { - void *seg_addr; /* master's starting address of the segment */ - size_t seg_size; - size_t tile_size; - size_t mem_offset; -}armci_memoffset_t; - -extern void** memlock_table_array; -extern int *armci_use_memlock_table; - -#if defined(LAPI) || defined(FUJITSU) || defined(PTHREADS) || defined(QUADRICS)\ - || defined(PORTALS) || defined(HITACHI) || (defined(LINUX64)&&defined(__GNUC__)&&defined(__alpha__))\ - || defined(CYGWIN) || defined(__crayx1) || defined(NEC) -# define ARMCI_LOCKMEM armci_lockmem_ -# define ARMCI_UNLOCKMEM armci_unlockmem_ -#else -# define ARMCI_LOCKMEM armci_lockmem -# define ARMCI_UNLOCKMEM armci_unlockmem -#endif - -extern void ARMCI_LOCKMEM(void *pstart, void *pend, int proc); -extern void ARMCI_UNLOCKMEM(int proc); -#define MEMLOCK_SHMEM_FLAG -#endif diff --git a/armci/src-portals/memory.c b/armci/src-portals/memory.c deleted file mode 100644 index 3137d601b..000000000 --- a/armci/src-portals/memory.c +++ /dev/null @@ -1,953 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: memory.c,v 1.56.2.3 2007-04-25 23:49:55 d3p687 Exp $ */ -#ifndef NEW_MALLOC -#include -#include -#include "armcip.h" -#include "message.h" -#include "kr_malloc.h" - -#define DEBUG_ 0 -#define USE_MALLOC -#define USE_SHMEM_ -#define SHM_UNIT 1024 - -static context_t ctx_localmem; -/* -static context_t ctx_mlocalmem; -*/ -#if defined(SYSV) || defined(WIN32) || defined(MMAP) || defined(HITACHI) -#include "armci_shmem.h" - -#if !defined(USE_SHMEM) && (defined(HITACHI) || defined(MULTI_CTX)) -# define USE_SHMEM -#endif - -#if !(defined(LAPI)||defined(QUADRICS)||defined(SERVER_THREAD)) ||\ - defined(USE_SHMEM) -#define RMA_NEEDS_SHMEM -#endif - -void kr_check_local() -{ -#if 0 -kr_malloc_print_stats(&ctx_localmem); -#endif -kr_malloc_verify(&ctx_localmem); -} - -void armci_print_ptr(void **ptr_arr, int bytes, int size, void* myptr, int off) -{ -int i; -int nproc = armci_clus_info[armci_clus_me].nslave; - - ARMCI_PR_DBG("enter",0); - for(i=0; i< armci_nproc; i++){ - int j; - if(armci_me ==i){ - printf("%d master =%d nproc=%d off=%d\n",armci_me, - armci_master,nproc, off); - printf("%d:bytes=%d mptr=%p s=%d ",armci_me, bytes, myptr,size); - for(j = 0; j< armci_nproc; j++)printf(" %p",ptr_arr[j]); - printf("\n"); fflush(stdout); - } - armci_msg_barrier(); - } - ARMCI_PR_DBG("exit",0); -} - - -/*\ master exports its address of shmem region at the beggining of that region -\*/ -static void armci_master_exp_attached_ptr(void* ptr) -{ - ARMCI_PR_DBG("enter",0); - if(!ptr) armci_die("armci_master_exp_att_ptr: null ptr",0); - *(volatile void**)ptr = ptr; - ARMCI_PR_DBG("exit",0); -} - - -/*\ Collective Memory Allocation on shared memory systems -\*/ -void armci_shmem_malloc(void *ptr_arr[], armci_size_t bytes) -{ - void *myptr=NULL, *ptr=NULL; - long idlist[SHMIDLEN]; - long size=0, offset=0; - long *size_arr; - void **ptr_ref_arr; - int i,cn, len; - int nproc = armci_clus_info[armci_clus_me].nslave; - ARMCI_PR_DBG("enter",0); - bzero((char*)ptr_arr,armci_nproc*sizeof(void*)); - - /* allocate work arrays */ - size_arr = (long*)calloc(armci_nproc,sizeof(long)); - if(!size_arr)armci_die("armci_malloc:calloc failed",armci_nproc); - /* allocate arrays for cluster address translations */ - - ptr_ref_arr = calloc(armci_nclus,sizeof(void*)); /* must be zero */ - if(!ptr_ref_arr)armci_die("armci_malloc:calloc 2 failed",armci_nclus); - - /* combine all memory requests into size_arr */ - size_arr[armci_me] = bytes; - armci_msg_lgop(size_arr, armci_nproc, "+"); - - /* determine aggregate request size on the cluster node */ - for(i=0, size=0; i< nproc; i++) size += size_arr[i+armci_master]; - - /* master process creates shmem region and then others attach to it */ - if(armci_me == armci_master ){ - /* can malloc if there is no data server process and has 1 process/node*/ -# ifndef RMA_NEEDS_SHMEM - if(nproc == 1) - myptr = kr_malloc(size, &ctx_localmem, 0, NULL, NULL); - else -# endif - myptr = Create_Shared_Region(idlist+1,size,idlist); - if(!myptr && size>0 )armci_die("armci_malloc: could not create", (int)(size>>10)); - - /* place its address at begining of attached region for others to see */ - if(size)armci_master_exp_attached_ptr(myptr); - - if(DEBUG_){ - printf("%d:armci_malloc addr mptr=%p size=%ld\n",armci_me,myptr,size); - fflush(stdout); - } - } - - /* broadcast shmem id to other processes on the same cluster node */ - armci_msg_clus_brdcst(idlist, SHMIDLEN*sizeof(long)); - - if(armci_me != armci_master){ - myptr=(double*)Attach_Shared_Region(idlist+1,size,idlist[0]); - if(!myptr)armci_die("armci_malloc: could not attach", (int)(size>>10)); - - /* now every process in a SMP node needs to find out its offset - * w.r.t. master - this offset is necessary to use memlock table - */ - if(size) armci_set_mem_offset(myptr); - if(DEBUG_){ - printf("%d:armci_malloc attached addr mptr=%p ref=%p size=%ld\n", - armci_me,myptr, *(void**)myptr,size); fflush(stdout); - } - } -# if defined(DATA_SERVER) - /* get server reference address for every cluster node to perform - * remote address translation for global address space */ - if(armci_nclus>1){ - if(armci_me == armci_master){ -# ifdef SERVER_THREAD - ptr_ref_arr[armci_clus_me]=myptr; -# else - { - extern int _armci_server_started; - if(_armci_server_started) { - armci_serv_attach_req(idlist, SHMIDLEN*sizeof(long), size, - &ptr, sizeof(void*)); - ptr_ref_arr[armci_clus_me]= myptr; /* from server*/ - } - else /* server not yet started */ - ptr_ref_arr[armci_clus_me]=myptr; - } - - if(DEBUG_){ - printf("%d:addresses server=%p myptr=%p\n",armci_me,ptr,myptr); - fflush(stdout); - } -# endif - } - /* exchange ref addr of shared memory region on every cluster node*/ - armci_exchange_address(ptr_ref_arr, armci_nclus); - # ifdef ARMCI_REGISTER_SHMEM - armci_register_shmem(myptr,size,idlist+1,idlist[0],ptr_ref_arr[armci_clus_me]); - # endif - }else { - ptr_ref_arr[armci_master] = myptr; - } - /* translate addresses for all cluster nodes */ - for(cn = 0; cn < armci_nclus; cn++){ - int master = armci_clus_info[cn].master; - offset = 0; - /* on local cluster node use myptr directly */ - ptr = (armci_clus_me == cn) ? myptr: ptr_ref_arr[cn]; - /* compute addresses pointing to the memory regions on cluster node*/ - for(i=0; i< armci_clus_info[cn].nslave; i++){ - /* NULL if request size is 0*/ - ptr_arr[i+master] = (size_arr[i+master])? ((char*)ptr)+offset : NULL; - offset += size_arr[i+master]; - } - } -# else - /* compute addresses for local cluster node */ - offset =0; - for(i=0; i< nproc; i++) { - ptr_ref_arr[i] = (size_arr[i+armci_master])? ((char*)myptr)+offset : 0L; - offset += size_arr[i+armci_master]; - } - /* exchange addreses with all other processes */ - ptr_arr[armci_me] = (char*)ptr_ref_arr[armci_me-armci_master]; - armci_exchange_address(ptr_arr, armci_nproc); - /* overwrite entries for local cluster node with ptr_ref_arr */ - bcopy((char*)ptr_ref_arr, (char*)(ptr_arr+armci_master), nproc*sizeof(void*)); - /* armci_print_ptr(ptr_arr, bytes, size, myptr, offset);*/ -# endif - - armci_msg_barrier(); - - /* free work arrays */ - free(ptr_ref_arr); - free(size_arr); - ARMCI_PR_DBG("exit",0); - -} - -/******************************************************************** - * Non-collective Memory Allocation on shared memory systems -\*/ -void armci_shmem_memget(armci_meminfo_t *meminfo, size_t size) { - void *myptr=NULL; - void *armci_ptr=NULL; /* legal ARCMIptr used in ARMCI data xfer ops */ - long idlist[SHMIDLEN]; - - /* can malloc if there is no data server process & has 1 process/node*/ -#ifndef RMA_NEEDS_SHMEM - if( armci_clus_info[armci_clus_me].nslave == 1) - myptr = kr_malloc(size, &ctx_localmem, 0, NULL, NULL); - else -#endif - myptr = Create_Shared_Region(idlist+1,size,idlist); - - if(!myptr && size>0 ) - armci_die("armci_shmem_memget: create failed", (int)(size>>10)); - - if(DEBUG_) - { - printf("%d: armci_shmem_memget: addr=%p size=%ld %ld %ld \n", armci_me, - myptr, size, idlist[0], idlist[1]); - fflush(stdout); - } - - armci_ptr = myptr; - -#if defined(DATA_SERVER) - - /* get server reference address to perform - * remote address translation for global address space */ - if(armci_nclus>1) - { -# ifdef SERVER_THREAD - - /* data server thread runs on master process */ - if(armci_me != armci_master) { - armci_serv_attach_req(idlist, SHMIDLEN*sizeof(long), size, - &armci_ptr, sizeof(void*)); - } - -# else - /* ask dataserver process to attach to region and get ptr*/ - { - extern int _armci_server_started; - if(_armci_server_started) { - armci_serv_attach_req(idlist, SHMIDLEN*sizeof(long), size, - &armci_ptr, sizeof(void*)); - } - } -# endif - } -#endif - - /* fill the meminfo structure */ - meminfo->armci_addr = armci_ptr; - meminfo->addr = myptr; - meminfo->size = size; - meminfo->cpid = armci_me; - bcopy(idlist, meminfo->idlist, SHMIDLEN*sizeof(long)); - -} - -void* armci_shmem_memat(armci_meminfo_t *meminfo) { - void *ptr=NULL; - long size = (long) meminfo->size; - long *idlist = (long*) meminfo->idlist; - - if(SAMECLUSNODE(meminfo->cpid)) - { - /* Attach to the shared memory segment */ - ptr=(double*)Attach_Shared_Region(idlist+1,size,idlist[0]); - if(!ptr)armci_die("ARMCi_Memat: could not attach", (int)(size>>10)); - - /* CHECK: now every process in a SMP node needs to find out its offset - * w.r.t. master - this offset is necessary to use memlock table - */ - if(size) armci_set_mem_offset(ptr); - } - else - { - ptr = meminfo->armci_addr; /* remote address */ - } - - return ptr; -} - -void armci_shmem_memctl(armci_meminfo_t *meminfo) { - - /* only the creator can delete the segment */ - if(meminfo->cpid == armci_me) { - void *ptr = meminfo->addr; - -#ifdef RMA_NEEDS_SHMEM - Free_Shmem_Ptr(0,0,ptr); -#else - if(armci_clus_info[armci_clus_me].nslave>1) - Free_Shmem_Ptr(0,0,ptr); - else kr_free(ptr, &ctx_localmem); -#endif - } -} - -/****** End: Non-collective memory allocation on shared memory systems *****/ - -#ifdef MSG_COMMS_MPI -/******************************************************************** - * Group Memory Allocation on shared memory systems for ARMCI Groups -\*/ -void armci_shmem_malloc_group(void *ptr_arr[], armci_size_t bytes, - ARMCI_Group *group) -{ - void *myptr=NULL, *ptr=NULL; - long idlist[SHMIDLEN]; - long size=0, offset=0; - long *size_arr; - void **ptr_ref_arr; - int i,cn, len; - /* int nproc = armci_clus_info[armci_clus_me].nslave; ? change ? */ - int grp_me, grp_nproc, grp_nclus, grp_master, grp_clus_nproc, grp_clus_me; - armci_grp_attr_t *grp_attr=ARMCI_Group_getattr(group); - ARMCI_PR_DBG("enter",0); - - /* Get the group info: group size & group rank */ - ARMCI_Group_size(group, &grp_nproc); - ARMCI_Group_rank(group, &grp_me); - if(grp_me == MPI_UNDEFINED) { /* check if the process is in this group */ - armci_die("armci_malloc_group: process is not a member in this group", - armci_me); - } - - grp_nclus = grp_attr->grp_nclus; - grp_clus_me = grp_attr->grp_clus_me; - grp_master = grp_attr->grp_clus_info[grp_clus_me].master; - grp_clus_nproc = grp_attr->grp_clus_info[grp_clus_me].nslave; - - bzero((char*)ptr_arr,grp_nproc*sizeof(void*)); - - /* allocate work arrays */ - size_arr = (long*)calloc(grp_nproc,sizeof(long)); - if(!size_arr)armci_die("armci_malloc_group:calloc failed",grp_nproc); - - /* allocate arrays for cluster address translations */ -# if defined(DATA_SERVER) - len = grp_nclus; -# else - len = grp_clus_nproc; -# endif - - ptr_ref_arr = calloc(len,sizeof(void*)); /* must be zero */ - if(!ptr_ref_arr)armci_die("armci_malloc_group:calloc 2 failed",len); - - /* combine all memory requests into size_arr */ - size_arr[grp_me] = bytes; - armci_msg_group_gop_scope(SCOPE_ALL, size_arr, grp_nproc, "+", ARMCI_LONG, - group); - - /* determine aggregate request size on the cluster node */ - for(i=0, size=0; i< grp_clus_nproc; i++) size += size_arr[i+grp_master]; - - /* master process creates shmem region and then others attach to it */ - if(grp_me == grp_master ){ - - - /* can malloc if there is no data server process and has 1 process/node*/ -# ifndef RMA_NEEDS_SHMEM - if( armci_clus_info[armci_clus_me].nslave == 1) - myptr = kr_malloc(size, &ctx_localmem, 0, NULL, NULL); - else -# endif - myptr = Create_Shared_Region(idlist+1,size,idlist); - if(!myptr && size>0 ) - armci_die("armci_malloc_group: could not create", (int)(size>>10)); - - /* place its address at begining of attached region for others to see */ - if(size)armci_master_exp_attached_ptr(myptr); - - if(DEBUG_){ - printf("%d:armci_malloc_group addr mptr=%p ref=%p size=%ld %ld %ld \n",armci_me,myptr,*(void**)myptr, size,idlist[0],idlist[1]); - fflush(stdout); - } - } - - /* broadcast shmem id to other processes (in the same group) on the - same cluster node */ - armci_grp_clus_brdcst(idlist, SHMIDLEN*sizeof(long), grp_master, - grp_clus_nproc, group); - - if(grp_me != grp_master){ - myptr=(double*)Attach_Shared_Region(idlist+1,size,idlist[0]); - if(!myptr)armci_die("armci_malloc_group: could not attach", (int)(size>>10)); - - /* now every process in a SMP node needs to find out its offset - * w.r.t. master - this offset is necessary to use memlock table - */ - if(size) armci_set_mem_offset(myptr); - if(DEBUG_){ - printf("%d:armci_malloc_group attached addr mptr=%p ref=%p size=%ld\n", - armci_me,myptr, *(void**)myptr,size); fflush(stdout); - } - } - -# if defined(DATA_SERVER) - - /* get server reference address for every cluster node in the group - * to perform remote address translation for global address space */ - if(grp_nclus>1){ - if(grp_me == grp_master){ - -# ifdef SERVER_THREAD - - /* data server thread runs on master process */ - if(ARMCI_Absolute_id(group,grp_master)!=armci_master){ - /*printf("\n%d: grp_master=%d %ld %ld \n",armci_me,ARMCI_Absolute_id(group,grp_master),idlist[0],idlist[1]);*/ - armci_serv_attach_req(idlist, SHMIDLEN*sizeof(long), size, - &ptr, sizeof(void*)); - ptr_ref_arr[grp_clus_me]= ptr; /* from server*/ - } - else - ptr_ref_arr[grp_clus_me]=myptr; - -# else - /* ask data server process to attach to the region and get ptr */ - { - extern int _armci_server_started; - if(_armci_server_started) { - armci_serv_attach_req(idlist, SHMIDLEN*sizeof(long), size, - &ptr, sizeof(void*)); - ptr_ref_arr[grp_clus_me]= ptr; /* from server*/ - } - else /* server not yet started */ - ptr_ref_arr[grp_clus_me]=myptr; - } - - if(DEBUG_){ - printf("%d:addresses server=%p myptr=%p\n",grp_me,ptr,myptr); - fflush(stdout); - } -# endif - } - /* exchange ref addr of shared memory region on every cluster node*/ - { - int ratio = sizeof(void*)/sizeof(int); - if(DEBUG_)printf("%d: exchanging %ld ratio=%d\n",armci_me, - (long)ptr_arr[grp_me], ratio); - armci_msg_group_gop_scope(SCOPE_ALL, ptr_ref_arr, grp_nclus*ratio, - "+", ARMCI_INT, group); - # ifdef ARMCI_REGISTER_SHMEM - armci_register_shmem_grp(myptr,size,idlist+1,idlist[0],ptr_ref_arr[armci_clus_me],group); - # endif - } - }else { - - ptr_ref_arr[grp_master] = myptr; - - } - - /* translate addresses for all cluster nodes */ - for(cn = 0; cn < grp_nclus; cn++){ - - int master = grp_attr->grp_clus_info[cn].master; - offset = 0; - - /* on local cluster node use myptr directly */ - ptr = (grp_clus_me == cn) ? myptr: ptr_ref_arr[cn]; - - /* compute addresses pointing to the memory regions on cluster node*/ - for(i=0; i< grp_attr->grp_clus_info[cn].nslave; i++){ - - /* NULL if request size is 0*/ - ptr_arr[i+master] =(size_arr[i+master])? ((char*)ptr)+offset: NULL; - offset += size_arr[i+master]; - } - } - -# else - - /* compute addresses for local cluster node */ - offset =0; - for(i=0; i< grp_clus_nproc; i++) { - - ptr_ref_arr[i] = (size_arr[i+grp_master])? ((char*)myptr)+offset : 0L; - offset += size_arr[i+grp_master]; - - } - - /* exchange addreses with all other processes */ - ptr_arr[grp_me] = (char*)ptr_ref_arr[grp_me-grp_master]; - armci_exchange_address_grp(ptr_arr, grp_nproc, group); - - /* overwrite entries for local cluster node with ptr_ref_arr */ - bcopy((char*)ptr_ref_arr, (char*)(ptr_arr+grp_master), grp_clus_nproc*sizeof(void*)); - -# endif - - /* armci_print_ptr(ptr_arr, bytes, size, myptr, offset);*/ - - armci_msg_group_barrier(group); - - /* free work arrays */ - free(ptr_ref_arr); - free(size_arr); - ARMCI_PR_DBG("exit",0); -} -#endif /* ifdef MSG_COMMS_MPI */ - -#else - -void armci_shmem_malloc(void* ptr_arr[], int bytes) -{ - armci_die("armci_shmem_malloc should never be called on this system",0); -} -void armci_shmem_memget(armci_meminfo_t *meminfo, size_t size) { - armci_die("armci_shmem_memget should never be called on this system",0); -} -void* armci_shmem_memat(armci_meminfo_t *meminfo) { - armci_die("armci_shmem_memat should never be called on this system",0); -} -void armci_shmem_memctl(armci_meminfo_t *meminfo) { - armci_die("armci_shmem_memctl should never be called on this system",0); -} -# ifdef MSG_COMMS_MPI - void armci_shmem_malloc_group(void *ptr_arr[], armci_size_t bytes, - ARMCI_Group *group) { - armci_die("armci_shmem_malloc_group should never be called on this system",0); - } -# endif - -#endif - - -/* public constructor to initialize the kr_malloc context */ -void armci_krmalloc_init_localmem() { -#if defined(ALLOW_PIN) - kr_malloc_init(0, 0, 0, reg_malloc, 0, &ctx_localmem); - kr_malloc_init(0, 0, 0, malloc, 0, &ctx_mlocalmem); - ctx_mlocalmem.ctx_type = KR_CTX_LOCALMEM; -#elif defined(CRAY_SHMEM) && defined(CRAY_XT) -# ifdef CATAMOUNT - int units_avail = (cnos_shmem_size() - 1024 * 1024) / SHM_UNIT; -# else - extern size_t get_xt_heapsize(); - int units_avail = (get_xt_heapsize() - 1024 * 1024) / SHM_UNIT; -# endif - - if(DEBUG_) - { - fprintf(stderr,"%d:krmalloc_init_localmem: symheap=%llu,units(%d)=%d\n", - armci_me, SHM_UNIT*units_avail, SHM_UNIT, units_avail); - } - kr_malloc_init(SHM_UNIT, units_avail, units_avail, shmalloc, 0, - &ctx_localmem); - armci_shmalloc_exchange_offsets(&ctx_localmem); -#else - - kr_malloc_init(0, 0, 0, malloc, 0, &ctx_localmem); - -#endif - - ctx_localmem.ctx_type = KR_CTX_LOCALMEM; -} - -/** - * Local Memory Allocation and Free - */ -void *PARMCI_Malloc_local(armci_size_t bytes) { - void *rptr; - ARMCI_PR_DBG("enter",0); - ARMCI_PR_DBG("exit",0); - rptr = (void *)kr_malloc((size_t)bytes, &ctx_localmem, 0, NULL, NULL); - //printf("\n%d:%s:%d:%p\n",armci_me,FUNCTION_NAME,bytes,rptr); - return rptr; -} - -int PARMCI_Free_local(void *ptr) { - ARMCI_PR_DBG("enter",0); - kr_free((char *)ptr, &ctx_localmem); - ARMCI_PR_DBG("exit",0); - return 0; -} - - -/*\ Collective Memory Allocation - * returns array of pointers to blocks of memory allocated by everybody - * Note: as the same shared memory region can be mapped at different locations - * in each process address space, the array might hold different values - * on every process. However, the addresses are legitimate - * and can be used in the ARMCI data transfer operations. - * ptr_arr[nproc] -\*/ -int PARMCI_Malloc(void *ptr_arr[], armci_size_t bytes) -{ - void *ptr; - char *new_base; - size_t new_size=0; - ARMCI_PR_DBG("enter",0); - if(DEBUG_){ - fprintf(stderr,"%d bytes in armci_malloc %d\n",armci_me, (int)bytes); - fflush(stderr); - armci_msg_barrier(); - } - -# ifdef USE_MALLOC - if(armci_nproc == 1) { - ptr = kr_malloc((size_t) bytes, &ctx_localmem, 0, NULL, NULL); - if(bytes) if(!ptr) armci_die("armci_malloc:malloc 1 failed",(int)bytes); - ptr_arr[armci_me] = ptr; - ARMCI_PR_DBG("exit",0); - return (0); - } -# endif - - // static int one_time = 0; - - if( ARMCI_Uses_shm() ) { - // if(one_time++ == 0 && armci_me==0) printf("%d: ARMCI_Uses_shm = true\n",armci_me); - armci_shmem_malloc(ptr_arr,bytes); - } else { - /* on distributed-memory systems just malloc & collect all addresses */ - ptr = kr_malloc(bytes, &ctx_localmem, 1, &new_base, &new_size); - if(bytes) if(!ptr) armci_die("armci_malloc:malloc 2 failed",bytes); - - bzero((char*)ptr_arr,armci_nproc*sizeof(void*)); - ptr_arr[armci_me] = ptr; - - /* now combine individual addresses into a single array */ - armci_exchange_address(ptr_arr, armci_nproc); - # ifdef ARMCI_REGISTER_SHMEM - if(new_size) - armci_register_shmem(new_base,new_size,NULL,0,new_base); - else - armci_register_shmem(ptr,bytes,NULL,0,ptr); - # endif - } - ARMCI_PR_DBG("exit",0); - //printf("\n%d:%s:%d:%p\n",armci_me,FUNCTION_NAME,bytes,ptr_arr[armci_me]); - return(0); -} - -/*\ - * Wrapper on PARMCI_Malloc so that old code still works -\*/ -int PARMCI_Malloc_memdev(void *ptr_arr[], armci_size_t bytes, const char *device) -{ - return PARMCI_Malloc(ptr_arr,bytes); -} - - -/*\ shared memory is released to kr_malloc only on process 0 - * with data server malloc cannot be used -\*/ -int PARMCI_Free(void *ptr) -{ - ARMCI_PR_DBG("enter",0); - if(!ptr)return 1; - -# if (defined(SYSV) || defined(WIN32) || defined(MMAP)) && !defined(NO_SHM) -# ifdef USE_MALLOC - if(armci_nproc > 1) -# endif - if(ARMCI_Uses_shm()){ - if(armci_me==armci_master){ -# ifdef RMA_NEEDS_SHMEM - Free_Shmem_Ptr(0,0,ptr); -# else - if(armci_clus_info[armci_clus_me].nslave>1) - Free_Shmem_Ptr(0,0,ptr); - else kr_free(ptr, &ctx_localmem); -# endif - } - ptr = NULL; - return 0; - } -# endif - kr_free(ptr, &ctx_localmem); - //armci_unregister_shmem(ptr,0); - ptr = NULL; - ARMCI_PR_DBG("exit",0); - return 0; -} - -/*\ - * Wrapper on PARMCI_Free_memdev so that old code still works -\*/ -int PARMCI_Free_memdev(void *ptr) -{ - PARMCI_Free(void *ptr); -} - - -int ARMCI_Uses_shm() -{ - int uses=0; - -#if (defined(SYSV) || defined(WIN32) || defined(MMAP) ||defined(HITACHI)) \ - && !defined(NO_SHM) -# ifdef RMA_NEEDS_SHMEM - if(armci_nproc >1) uses= 1; /* always unless serial mode */ -# else - if(armci_nproc != armci_nclus)uses= 1; /* only when > 1 node used */ -# endif -#endif - if(DEBUG_) fprintf(stderr,"%d:uses shmem %d\n",armci_me, uses); - return uses; -} -#ifdef MSG_COMMS_MPI - -int ARMCI_Uses_shm_grp(ARMCI_Group *group) -{ - int uses=0, grp_me, grp_nproc, grp_nclus; - ARMCI_PR_DBG("enter",0); - armci_grp_attr_t *grp_attr=ARMCI_Group_getattr(group); - - ARMCI_Group_size(group, &grp_nproc); - ARMCI_Group_rank(group, &grp_me); - grp_nclus = grp_attr->grp_nclus; - -#if (defined(SYSV) || defined(WIN32) || defined(MMAP) ||defined(HITACHI)) \ - && !defined(NO_SHM) -# ifdef RMA_NEEDS_SHMEM - if(grp_nproc >1) uses= 1; /* always unless serial mode */ -# else - if(grp_nproc != grp_nclus)uses= 1; /* only when > 1 node used */ -# endif -#endif - if(DEBUG_) fprintf(stderr,"%d (grp_id=%d):uses shmem %d\n",armci_me, grp_me, uses); - ARMCI_PR_DBG("exit",0); - return uses; -} - -/*\ ************** Begin Group Collective Memory Allocation ****************** - * returns array of pointers to blocks of memory allocated by everybody - * Note: as the same shared memory region can be mapped at different locations - * in each process address space, the array might hold different values - * on every process. However, the addresses are legitimate - * and can be used in the ARMCI data transfer operations. - * ptr_arr[nproc] -\*/ -int ARMCI_Malloc_group(void *ptr_arr[], armci_size_t bytes, - ARMCI_Group *group) -{ - void *ptr; - int grp_me, grp_nproc; - ARMCI_PR_DBG("enter",0); - ARMCI_Group_size(group, &grp_nproc); - ARMCI_Group_rank(group, &grp_me); - if(DEBUG_)fprintf(stderr,"%d (grp_id=%d) bytes in armci_malloc_group %d\n", - armci_me, grp_me, (int)bytes); -#ifdef USE_MALLOC - if(grp_nproc == 1) { - ptr = kr_malloc((size_t) bytes, &ctx_localmem, 0, NULL, NULL); - if(bytes) if(!ptr) armci_die("armci_malloc_group:malloc 1 failed",(int)bytes); - ptr_arr[grp_me] = ptr; - ARMCI_PR_DBG("exit",0); - return (0); - } -#endif - - if( ARMCI_Uses_shm_grp(group) ) { -# ifdef SGIALTIX - armci_altix_shm_malloc_group(ptr_arr,bytes,group); -# else - armci_shmem_malloc_group(ptr_arr,bytes,group); -# endif - } - else { - void *new_base=NULL; - size_t new_size=NULL; - ptr = kr_malloc(bytes, &ctx_localmem, 1, &new_base, &new_size); - if(bytes) if(!ptr) armci_die("armci_malloc:malloc 2 failed",bytes); - - bzero((char*)ptr_arr,grp_nproc*sizeof(void*)); - ptr_arr[grp_me] = ptr; - - /* now combine individual addresses into a single array */ - armci_exchange_address_grp(ptr_arr, grp_nproc, group); - - } - ARMCI_PR_DBG("exit",0); - return(0); -} - -/*\ - * Wrapper on PARMCI_Malloc_group so that old code still works -\*/ -int ARMCI_Malloc_group_memdev(void *ptr_arr[], armci_size_t bytes, - ARMCI_Group *group, const char *device) -{ - return ARMCI_Malloc_group(ptr_arr,bytes,group); -} - -/*\ shared memory is released to kr_malloc only on process 0 - * with data server malloc cannot be used - \*/ -int ARMCI_Free_group(void *ptr, ARMCI_Group *group) -{ - int grp_me, grp_nproc, grp_master, grp_clus_me; - armci_grp_attr_t *grp_attr=ARMCI_Group_getattr(group); - ARMCI_PR_DBG("enter",0); - - if(!ptr)return 1; - - ARMCI_Group_size(group, &grp_nproc); - ARMCI_Group_rank(group, &grp_me); - if(grp_me == MPI_UNDEFINED) { /* check if the process is in this group */ - armci_die("armci_malloc_group: process is not a member in this group", - armci_me); - } - /* get the group cluster info */ - grp_clus_me = grp_attr->grp_clus_me; - grp_master = grp_attr->grp_clus_info[grp_clus_me].master; - -# if (defined(SYSV) || defined(WIN32) || defined(MMAP)) && !defined(NO_SHM) -# ifdef USE_MALLOC - if(grp_nproc > 1) -# endif - if(ARMCI_Uses_shm_grp(group)){ - if(grp_me == grp_master) { -# ifdef RMA_NEEDS_SHMEM - Free_Shmem_Ptr(0,0,ptr); -# else - if(armci_clus_info[armci_clus_me].nslave>1) Free_Shmem_Ptr(0,0,ptr); - else kr_free(ptr, &ctx_localmem); -# endif - } - ptr = NULL; - ARMCI_PR_DBG("exit",0); - return 0; - } -# endif - kr_free(ptr, &ctx_localmem); - - ptr = NULL; - ARMCI_PR_DBG("exit",0); - return 0; -} -/* ***************** End Group Collective Memory Allocation ******************/ - -/* ************** Begin Non-Collective Memory Allocation ****************** - * Prototype similar to SysV shared memory. - */ - -/** - * CHECK: On Altix we are forced to use SysV as shmalloc is collective. We - * may use a preallocated shmalloc memory, however, it may NOT still solve - * our problem... - * NOTE: "int memflg" option for future optimiztions. - */ -void PARMCI_Memget(size_t bytes, armci_meminfo_t *meminfo, int memflg) { - - void *myptr=NULL; - void *armci_ptr=NULL; /* legal ARCMI ptr used in ARMCI data xfer ops*/ - size_t size = bytes; - - if(size<=0) armci_die("ARMCI_Memget: size must be > 0", (int)size); - if(meminfo==NULL) armci_die("ARMCI_Memget: Invalid arg #2 (NULL ptr)",0); - if(memflg!=0) armci_die("ARMCI_Memget: Invalid memflg", memflg); - - if( !ARMCI_Uses_shm() ) - { - armci_ptr = myptr = kr_malloc(size, &ctx_localmem, 0, NULL, NULL); - if(size) if(!myptr) armci_die("ARMCI_Memget failed", (int)size); - - /* fill the meminfo structure */ - meminfo->armci_addr = armci_ptr; - meminfo->addr = myptr; - meminfo->size = size; - meminfo->cpid = armci_me; - /* meminfo->attr = NULL; */ - } - else - { - armci_shmem_memget(meminfo, size); - } - - if(DEBUG_){ - printf("%d: ARMCI_Memget: addresses server=%p myptr=%p bytes=%ld\n", - armci_me, meminfo->armci_addr, meminfo->addr, bytes); - fflush(stdout); - } -} - -void* PARMCI_Memat(armci_meminfo_t *meminfo, long offset) { - void *ptr=NULL; - - if(meminfo==NULL) armci_die("ARMCI_Memget: Invalid arg #2 (NULL ptr)",0); - if(offset!=0) armci_die("ARMCI_Memget: Invalid offset", offset); - - if(meminfo->cpid==armci_me) { ptr = meminfo->addr; return ptr; } - - if( !ARMCI_Uses_shm()) - { - ptr = meminfo->addr; - } - else - { - ptr = armci_shmem_memat(meminfo); - } - - if(DEBUG_) - { - printf("%d:ARMCI_Memat: attached addr mptr=%p size=%ld\n", - armci_me, ptr, meminfo->size); fflush(stdout); - } - - return ptr; -} - -void ARMCI_Memdt(armci_meminfo_t *meminfo, long offset) { - /** - * Do nothing. May be we need to have reference counting in future. This - * is to avoid the case of dangling pointers when the creator of shm - * segment calls Memctl and other processes are still attached to this - * segment - */ -} - -void ARMCI_Memctl(armci_meminfo_t *meminfo) { - - if(meminfo==NULL) armci_die("ARMCI_Memget: Invalid arg #2 (NULL ptr)",0); - - /* only the creator can delete the segment */ - if(meminfo->cpid == armci_me) - { - if( !ARMCI_Uses_shm() ) - { - void *ptr = meminfo->addr; - kr_free(ptr, &ctx_localmem); - } - else - { - armci_shmem_memctl(meminfo); - } - } - - meminfo->addr = NULL; - meminfo->armci_addr = NULL; - /* if(meminfo->attr!=NULL) free(meminfo->attr); */ -} - -/* ***************** End Non-Collective Memory Allocation ******************/ - -#endif -#endif diff --git a/armci/src-portals/message.c b/armci/src-portals/message.c deleted file mode 100644 index 2df951907..000000000 --- a/armci/src-portals/message.c +++ /dev/null @@ -1,2174 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: message.c,v 1.58.6.4 2007-04-24 10:08:26 vinod Exp $ */ -#if defined(BGML) -# include "bgml.h" -#elif defined(PVM) -# include -#elif defined(TCGMSG) -# include -#else -# ifndef MSG_COMMS_MPI -# define MSG_COMMS_MPI -# endif -# include -#endif -#include "message.h" -#include "armcip.h" -#include "copy.h" -#if HAVE_STDIO_H -# include -#endif -#if HAVE_ASSERT_H -# include -#endif -#ifdef _POSIX_PRIORITY_SCHEDULING -#ifndef HITACHI -# include -#endif -#endif -#include "armci.h" -#include "acc.h" - -#define DEBUG_ 0 -#if defined(SYSV) || defined(MMAP) ||defined (WIN32) -# include "armci_shmem.h" -#endif - -/* global operations are use buffer size of BUF_SIZE doubles */ -#define BUF_SIZE (4*2048) -#define INFO_BUF_SIZE (BUF_SIZE*sizeof(BUF_SIZE) - sizeof(double)) -#undef EMPTY -#define EMPTY 0 -#define FULL 1 - -static double *work=NULL; -static long *lwork = NULL; -static long long *llwork = NULL; -static int *iwork = NULL; -static float *fwork = NULL; -static int _armci_gop_init=0; /* tells us if we have a buffers allocated */ -static int _armci_gop_shmem =0; /* tells us to use shared memory for gops */ -extern void armci_util_wait_int(volatile int *, int , int ); -static int empty=EMPTY,full=FULL; -#if !defined(SGIALTIX) && defined(SYSV) || defined(MMAP) || defined(WIN32) -static void **ptr_arr=NULL; -#endif - -typedef struct { - union { - volatile int flag; - double dummy[16]; - }a; - union { - volatile int flag; - double dummy[16]; - }b; - double array[BUF_SIZE]; -} bufstruct; - -static bufstruct *_gop_buffer; - -#define GOP_BUF(p) (_gop_buffer+((p)-armci_master)) - -/*\ macro to set a flag includes mem barrier to assure that flag is not set - * before any outstanding writes complete -\*/ -#ifdef NEED_MEM_SYNC -# ifdef AIX -# define SET_SHM_FLAG(_flg,_val) _clear_lock((int *)(_flg),_val); -# elif defined(NEC) -# define SET_SHM_FLAG(_flg,_val) MEM_FENCE; *(_flg)=(_val) -# elif defined(__ia64) -# if defined(__GNUC__) && !defined (__INTEL_COMPILER) -# define SET_SHM_FLAG(_flg,_val)\ - __asm__ __volatile__ ("mf" ::: "memory"); *(_flg)=(_val) -# else /* Intel Compiler */ - extern void _armci_ia64_mb(); -# define SET_SHM_FLAG(_flg,_val)\ - _armci_ia64_mb(); *(_flg)=(_val); -# endif -# elif defined(MACX) -# if defined(__GNUC__) -# define SET_SHM_FLAG(_flg,_val)\ - *(_flg)=(_val);__asm__ __volatile__ ("isync" ::: "memory") -# endif -# endif -#endif - -#ifndef SET_SHM_FLAG -# define SET_SHM_FLAG(_flg,_val) *(_flg)=_val; -#endif - - - -/*\ - * Variables/structures for use in Barrier and for Binomial tree -\*/ -#if HAVE_MATH_H -# include -#endif -int barr_switch; -static int LnB=0,powof2nodes,Lp2; -typedef struct { - volatile int flag1; - double dum[16]; - volatile int flag2; -} barrier_struct; -barrier_struct *_bar_buff; -#define BAR_BUF(p) (_bar_buff+((p))) -void **barr_snd_ptr,**barr_rcv_ptr; -int _armci_barrier_init=0; -int _armci_barrier_shmem=0; - - -/*\ - * Tree generation code -\*/ -static void _dfs_bintree_parse(int *idlist, int index, int max, int *result) -{ -int left = (int)2*index+1; -int right = (int) 2*index+2; -static int pos=0; -int r_end,l_end; - l_end=pos++; - result[pos++]=idlist[index]; - if(leftarray; /* each process finds its place */ - GOP_BUF(armci_me)->a.flag=EMPTY; /* initially buffer is empty */ - GOP_BUF(armci_me)->b.flag=EMPTY; /* initially buffer is empty */ - if(armci_me == armci_master ){ - GOP_BUF(armci_clus_last+1)->a.flag=EMPTY;/*initially buffer is empty*/ - GOP_BUF(armci_clus_last+2)->a.flag=EMPTY;/*initially buffer is empty*/ - GOP_BUF(armci_clus_last+1)->b.flag=EMPTY;/*initially buffer is empty*/ - GOP_BUF(armci_clus_last+2)->b.flag=EMPTY;/*initially buffer is empty*/ - } - _armci_gop_shmem = 1; - } -#endif - /*stuff needed for barrier and binomial bcast/reduce*/ -#ifdef LAPI - if(!_armci_barrier_shmem){ - int size = 2*sizeof(int); - /*allocate memory to send/rcv data*/ - barr_snd_ptr = (void **)malloc(sizeof(void *)*armci_nproc); - barr_rcv_ptr = (void **)malloc(sizeof(void *)*armci_nproc); - - if(PARMCI_Malloc(barr_snd_ptr,size))armci_die("malloc barrinit failed",0); - if(PARMCI_Malloc(barr_rcv_ptr,size))armci_die("malloc barrinit failed",0); - if(barr_rcv_ptr[armci_me]==NULL || barr_snd_ptr[armci_me]==NULL) - armci_die("problems in malloc barr_init",0); - powof2nodes=1; - LnB = floor(log(armci_nclus)/log(2))+1; - if(pow(2,LnB-1)-1) _armci_dummy_work *=DUMMY_INIT; - if(_armci_dummy_work>(double)armci_msg_nproc())_armci_dummy_work=DUMMY_INIT; -} - - -/***************************Barrier Code*************************************/ - -void armci_msg_barr_init(){ -#if defined(SYSV) || defined(MMAP) || defined(WIN32) - int size=sizeof(barrier_struct)*armci_clus_info[armci_clus_me].nslave; - char *tmp; - void **ptr_arr; - barr_switch=0; - /*First allocate space for flags*/ - - ptr_arr = (void**)malloc(armci_nproc*sizeof(void*)); - if(armci_me==armci_master) size = size+128; - else size=0; - PARMCI_Malloc(ptr_arr, size); - tmp = (char*)ptr_arr[armci_master]; - size=2*sizeof(int); - - if(!tmp)armci_die("allocate barr shm failed",0); - _bar_buff=(barrier_struct *)tmp; - - SET_SHM_FLAG(&(BAR_BUF(armci_me-armci_master)->flag1),empty); - SET_SHM_FLAG(&(BAR_BUF(armci_me-armci_master)->flag2),empty); - - /*allocate memory to send/rcv data*/ - barr_snd_ptr = (void **)malloc(sizeof(void *)*armci_nproc); - barr_rcv_ptr = (void **)malloc(sizeof(void *)*armci_nproc); - - if(PARMCI_Malloc(barr_snd_ptr,size))armci_die("malloc barr_init failed",0); - if(PARMCI_Malloc(barr_rcv_ptr,size))armci_die("malloc barr_init failed",0); - if(barr_rcv_ptr[armci_me]==NULL || barr_snd_ptr[armci_me]==NULL) - armci_die("problems in malloc barr_init",0); - - /*we have to figure if we have power of ,two nodes*/ - powof2nodes=1; - LnB = (int)floor(log(armci_nclus)/log(2))+1; - if(pow(2,LnB-1)flag1,FULL,100000); - SET_SHM_FLAG(&(BAR_BUF(i)->flag1),empty); - } - if(armci_nclus>1){ - last = ((int)pow(2,(LnB-1)))^armci_clus_me; - if(last>=0 && lastarmci_clus_me){ /*the pow2 set of procs*/ - if(last=0 && next armci_me){ - armci_msg_snd(ARMCI_TAG, srcp,4,next_node); - armci_msg_rcv(ARMCI_TAG, dstn,4,NULL,next_node); - } - else{ - /*would we gain anything by doing a snd,rcv instead of rcv,snd*/ - armci_msg_rcv(ARMCI_TAG, dstn,4,NULL,next_node); - armci_msg_snd(ARMCI_TAG, srcp,4,next_node); - } - armci_util_wait_int((volatile int *)dstn,barr_count,100000); - } - } - if(last1*/ - for(i=1;iflag2),full); - } - else { /*if not master, partake in the smp barrier,only*/ - i=armci_me-armci_master; - SET_SHM_FLAG(&(BAR_BUF(i)->flag1),full); - armci_util_wait_int(&BAR_BUF(i)->flag2,FULL,100000); - SET_SHM_FLAG(&(BAR_BUF(i)->flag2),empty); - } -} - -#endif /*barrier enabled only for lapi*/ -void parmci_msg_barrier() -{ -#ifdef BGML - bgml_barrier (3); /* this is always faster than MPI_Barrier() */ -#elif defined(MSG_COMMS_MPI) - MPI_Barrier(ARMCI_COMM_WORLD); -# elif defined(PVM) - pvm_barrier(mp_group_name, armci_nproc); -# elif defined(LAPI) -#if !defined(NEED_MEM_SYNC) - if(_armci_barrier_init) - _armci_msg_barrier(); - else -#endif - { - tcg_synch(ARMCI_TAG); - } -# else - { - tcg_synch(ARMCI_TAG); - } -# endif -} -/***********************End Barrier Code*************************************/ - - -void armci_msg_init(int *argc, char ***argv) -{ -#if defined(TCGMSG) - if (!tcg_ready()) { - tcg_pbegin(argc,argv); - } -#elif defined(BGML) - /* empty */ -#elif defined(MSG_COMMS_MPI) - int flag=0; - MPI_Initialized(&flag); - if (!flag) { -# if defined(DCMF) || defined(MPI_MT) - int provided; - MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, &provided); -# else - MPI_Init(argc, argv); -# endif - } - if (!PARMCI_Initialized()) { - MPI_Comm_dup(MPI_COMM_WORLD, &ARMCI_COMM_WORLD); - } -#endif -} - -#ifdef MSG_COMMS_MPI -void armci_msg_init_comm(MPI_Comm comm) -{ - if (!PARMCI_Initialized()) { - MPI_Comm_dup(comm, &ARMCI_COMM_WORLD); - } -} -#endif - - -int armci_msg_me() -{ -#ifdef BGML - return BGML_Messager_rank(); -#elif defined(DCMF) - return DCMF_Messager_rank(); -#elif defined(MSG_COMMS_MPI) - static int counter = 0; - if (counter == 0) { - int me; - MPI_Comm_rank(ARMCI_COMM_WORLD, &me); - armci_me = me; - counter = 1; - } - return armci_me; - -#elif defined(PVM) - return(pvm_getinst(mp_group_name,pvm_mytid())); -#else - return (int)tcg_nodeid(); -#endif -} - - -int armci_msg_nproc() -{ -#ifdef BGML - return BGML_Messager_size(); -#elif defined(DCMF) - return DCMF_Messager_size(); -#elif defined(MSG_COMMS_MPI) - static int counter = 0; - if (counter == 0) { - int nproc; - MPI_Comm_size(ARMCI_COMM_WORLD, &nproc); - armci_nproc = nproc; - counter = 1; - } - return armci_nproc; -#elif defined(PVM) - return(pvm_gsize(mp_group_name)); -#else - return (int)tcg_nnodes(); -#endif -} - -#ifdef CRAY_YMP -#define BROKEN_MPI_ABORT -#endif - -#ifndef PVM -double armci_timer() -{ -#ifdef BGML - return BGML_Timer(); -#elif defined(DCMF) - return DCMF_Timer(); -#elif defined(MSG_COMMS_MPI) - - return MPI_Wtime(); -#else - return tcg_time(); -#endif -} -#endif - - -void armci_msg_abort(int code) -{ -#ifdef BGML - fprintf(stderr,"ARMCI aborting [%d]\n", code); -#elif defined(DCMF) - fprintf(stderr,"ARMCI aborting [%d]\n", code); -#elif defined(MSG_COMMS_MPI) -# ifndef BROKEN_MPI_ABORT - MPI_Abort(ARMCI_COMM_WORLD,code); -# endif -#elif defined(PVM) - char error_msg[25]; - sprintf(error_msg, "ARMCI aborting [%d]", code); - pvm_halt(); -#else - tcg_error("ARMCI aborting",(long)code); -#endif - fprintf(stderr,"%d:aborting\n",armci_me); - /* trap for broken abort in message passing libs */ - _exit(1); -} - -void armci_msg_finalize() -{ -#if defined(TCGMSG) - tcg_pend(); -#elif defined(MSG_COMMS_MPI) - MPI_Finalize(); -#endif -} - -void armci_msg_bintree(int scope, int* Root, int *Up, int *Left, int *Right) -{ -int root, up, left, right, index, nproc; - if(scope == SCOPE_NODE){ - root = armci_clus_info[armci_clus_me].master; - nproc = armci_clus_info[armci_clus_me].nslave; - index = armci_me - root; - up = (index-1)/2 + root; if( up < root) up = -1; - left = 2*index + 1 + root; if(left >= root+nproc) left = -1; - right = 2*index + 2 + root; if(right >= root+nproc)right = -1; - }else if(scope ==SCOPE_MASTERS){ - root = armci_clus_info[0].master; - nproc = armci_nclus; - if(armci_me != armci_master){up = -1; left = -1; right = -1; } - else{ - index = armci_clus_me - root; - up = (index-1)/2 + root; - up = ( up < root)? -1: armci_clus_info[up].master; - left = 2*index + 1 + root; - left = ( left >= root+nproc)? -1: armci_clus_info[left].master; - right = 2*index + 2 + root; - right =( right >= root+nproc)? -1: armci_clus_info[right].master; - } - }else{ - root = 0; - nproc = armci_nproc; - index = armci_me - root; - up = (index-1)/2 + root; if( up < root) up = -1; - left = 2*index + 1 + root; if(left >= root+nproc) left = -1; - right = 2*index + 2 + root; if(right >= root+nproc)right = -1; - } - - *Up = up; - *Left = left; - *Right = right; - *Root = root; -} - -/*\ root broadcasts to everyone else -\*/ -void armci_msg_bcast_scope(int scope, void *buf, int len, int root) -{ - int up, left, right, Root; - - if(!buf)armci_die("armci_msg_bcast: NULL pointer", len); -#ifdef BGML - BGTr_Bcast(root, buf, len, 3); -#else - armci_msg_bintree(scope, &Root, &up, &left, &right); - - if(root !=Root){ - if(armci_me == root) armci_msg_snd(ARMCI_TAG, buf,len, Root); - if(armci_me ==Root) armci_msg_rcv(ARMCI_TAG, buf, len, NULL, root); - } - - /* printf("%d: scope=%d left=%d right=%d up=%d\n",armci_me, scope, - left, right, up);*/ - - if(armci_me != Root && up!=-1) armci_msg_rcv(ARMCI_TAG, buf, len, NULL, up); - if (left > -1) armci_msg_snd(ARMCI_TAG, buf, len, left); - if (right > -1) armci_msg_snd(ARMCI_TAG, buf, len, right); -#endif -} - - - - -/*\ shared memory based broadcast for a single SMP node -\*/ -void armci_smp_bcast(void *x, int n , int root) -{ -int ndo, len,i, bufsize = BUF_SIZE*sizeof(double); -static int bufid=1; - - if(armci_clus_info[armci_clus_me].nslave<2) return; /* nothing to do */ - - if(!x)armci_die("armci_msg_bcast: NULL pointer", n); - - /* enable or balance pipeline for messages comparable to bufsize */ - if((n>bufsize/2) && (n <(2*bufsize-64))){ - bufsize = n/2; bufsize>>=3; bufsize<<=3; - } - - while ((ndo = (n<=bufsize) ? n : bufsize)) { - len = ndo; - - if(armci_me==root){ - - /* wait for the flag protecting the buffer to clear */ - armci_util_wait_int(&(GOP_BUF(armci_clus_last+bufid)->a.flag),EMPTY,100); - SET_SHM_FLAG(&(GOP_BUF(armci_clus_last+bufid)->a.flag),full); -#if 0 - for(i=armci_clus_first; i <= armci_clus_last; i++) - if(i!=root)armci_util_wait_int(&GOP_BUF(i)->b.flag, EMPTY, 100); - armci_copy(x,GOP_BUF(armci_clus_last+bufid+1)->array,len); - for(i=armci_clus_first; i <= armci_clus_last; i++) - if(i!=root) GOP_BUF(i)->b.flag=FULL; -#else - armci_copy(x,GOP_BUF(armci_clus_last+bufid)->array,len); - for(i=armci_clus_first; i <= armci_clus_last; i++) - if(i!=root){ - armci_util_wait_int(&GOP_BUF(i)->b.flag, EMPTY, 100); - SET_SHM_FLAG(&(GOP_BUF(i)->b.flag),full); - } -#endif - }else{ - armci_util_wait_int(&GOP_BUF(armci_me)->b.flag, FULL, 100); - armci_copy(GOP_BUF(armci_clus_last+bufid)->array,x,len); - SET_SHM_FLAG(&(GOP_BUF(armci_me)->b.flag),empty); - } - - n -=ndo; - x = len + (char*)x; - - bufid = (bufid)%2 +1; - - /* since root waited for everybody to check in the previous buffer is free*/ - if(armci_me==root){ - SET_SHM_FLAG(&(GOP_BUF(armci_clus_last+bufid)->a.flag),empty); - } - } -} - - - -/*\ shared memory based broadcast for a single SMP node out of shmem buffer -\*/ -void armci_smp_buf_bcast(void *x, int n, int root, void *shmbuf ) -{ -int i, nslave = armci_clus_info[armci_clus_me].nslave; - - if(nslave<2){ - armci_copy(shmbuf,x,n); - return; /* nothing to do */ - } - if(!x)armci_die("armci_msg_bcast: NULL pointer", n); - if(!shmbuf)armci_die("armci_msg_bcast: NULL pointer", n); - - if(armci_me==root){ - /* notify others that the data in buffer is ready */ - for(i=armci_clus_first; i <= armci_clus_last; i++) - if(i!=root){ - armci_util_wait_int(&GOP_BUF(i)->b.flag, EMPTY, 100); - GOP_BUF(i)->b.flag=FULL; - } - /* root also needs to copy */ - armci_copy(shmbuf,x,n); - /* wait until everybody is finished -- can reclaim buffer */ - for(i=armci_clus_first; i <= armci_clus_last; i++) - if(i!=root)armci_util_wait_int(&GOP_BUF(i)->b.flag, EMPTY,100000); - - }else{ - /* spin until data in buffer is ready */ - armci_util_wait_int(&GOP_BUF(armci_me)->b.flag , FULL, 100000); - armci_copy(shmbuf,x,n); /* copy data */ - GOP_BUF(armci_me)->b.flag = EMPTY; /* indicate we are done */ - } -} - -void _armci_msg_binomial_bcast(void *buf, int len, int root){ - int Root = armci_master; - int nslave = armci_clus_info[armci_clus_me].nslave; - int i,next_node,next; -/* int my_rank,root_rank,next_rank; */ - /* inter-node operation between masters */ - if(root !=armci_clus_info[0].master){ - Root = armci_clus_info[0].master; - if(armci_me == root) armci_msg_snd(ARMCI_TAG, buf,len, Root); - if(armci_me ==Root) armci_msg_rcv(ARMCI_TAG, buf, len, NULL, root); - root = Root; - Root = armci_master; - } - if(armci_nclus>1 &&armci_me==armci_master){/*the internode bcast, first*/ - /*first do the recv*/ - int rcv_proc=armci_clus_me,flag=1,diff=1; - if(armci_me!=root){ - while(!(rcv_proc & flag)){ - diff=diff<<1; - flag=flag<<1; - } - rcv_proc = armci_clus_info[armci_clus_me-diff].master; - armci_msg_rcv(ARMCI_TAG, buf,len,NULL,rcv_proc); - /*printf("\n%d: recv from %d \n",armci_me,rcv_proc);fflush(stdout);*/ - } - else - diff = Lp2; - - /*printf("\n%d: %d diff>>1 = %d\n",armci_me,Lp2,diff>>1);*/ - for(i=diff>>1;i>=1;i=i>>1){ - next=i^armci_clus_me; - if(next>=0 && next1)armci_msg_bcast_scope(SCOPE_MASTERS, buf, len, root); - else Root = root; - - /* intra-node operation */ -#if 1 - if(_armci_gop_shmem && nslave<33) - armci_smp_bcast(buf, len, Root); - else -#endif - armci_msg_bcast_scope(SCOPE_NODE, buf, len, Root); -} -#endif - - - -void armci_msg_brdcst(void* buffer, int len, int root) -{ - if(!buffer)armci_die("armci_msg_brdcast: NULL pointer", len); - -#ifdef BGML - BGTr_Bcast(root, buffer, len, PCLASS); -# elif defined(MSG_COMMS_MPI) - MPI_Bcast(buffer, len, MPI_CHAR, root, ARMCI_COMM_WORLD); -# elif defined(PVM) - armci_msg_bcast(buffer, len, root); -# else - { - long ttag=ARMCI_TAG, llen=len, rroot=root; - tcg_brdcst(ttag, buffer, llen, rroot); - } -# endif -} - - -void armci_msg_snd(int tag, void* buffer, int len, int to) -{ -# ifdef MSG_COMMS_MPI - MPI_Send(buffer, len, MPI_CHAR, to, tag, ARMCI_COMM_WORLD); -# elif defined(PVM) - pvm_psend(pvm_gettid(mp_group_name, to), tag, buffer, len, PVM_BYTE); -# elif defined(BGML) - /* We don't actually used armci_msg_snd in ARMCI. we use optimized - * collectives where - * armci_msg_snd is used. If you build Global Arrays, the MSG_COMMS_MPI flag is - * set, so that - * will work fine - */ - armci_die("bgl shouldn't use armci_msg_snd", armci_me); -# else - long ttag=tag, llen=len, tto=to, block=1; - tcg_snd(ttag, buffer, llen, tto, block); -# endif -} - - -/*\ receive message of specified tag from proc and get its len if msglen!=NULL -\*/ -void armci_msg_rcv(int tag, void* buffer, int buflen, int *msglen, int from) -{ -# ifdef MSG_COMMS_MPI - MPI_Status status; - MPI_Recv(buffer, buflen, MPI_CHAR, from, tag, ARMCI_COMM_WORLD, &status); - if(msglen) MPI_Get_count(&status, MPI_CHAR, msglen); -# elif defined(PVM) - int src, rtag,mlen; - pvm_precv(pvm_gettid(mp_group_name, from), tag, buffer, buflen, PVM_BYTE, - &src, &rtag, &mlen); - if(msglen)*msglen=mlen; -#elif defined(BGML) - armci_die("bgl shouldn't use armci_msg_rcv", armci_me); -# else - long ttag=tag, llen=buflen, mlen, ffrom=from, sender, block=1; - tcg_rcv(ttag, buffer, llen, &mlen, ffrom, &sender, block); - if(msglen)*msglen = (int)mlen; -# endif -} - - -int armci_msg_rcvany(int tag, void* buffer, int buflen, int *msglen) -{ -#if defined(MSG_COMMS_MPI) - int ierr; - MPI_Status status; - - ierr = MPI_Recv(buffer, buflen, MPI_CHAR, MPI_ANY_SOURCE, tag, - ARMCI_COMM_WORLD, &status); - if(ierr != MPI_SUCCESS) armci_die("armci_msg_rcvany: Recv failed ", tag); - - if(msglen)if(MPI_SUCCESS!=MPI_Get_count(&status, MPI_CHAR, msglen)) - armci_die("armci_msg_rcvany: count failed ", tag); - return (int)status.MPI_SOURCE; -# elif defined(PVM) - int src, rtag,mlen; - pvm_precv(-1, tag, buffer, buflen, PVM_BYTE, &src, &rtag, &mlen); - if(msglen)*msglen=mlen; - return(pvm_getinst(mp_group_name,src)); -# elif defined (BGML) - armci_die("bgl shouldn't use armci_msg_rcvany", armci_me); -# else - long ttag=tag, llen=buflen, mlen, ffrom=-1, sender, block=1; - tcg_rcv(ttag, buffer, llen, &mlen, ffrom, &sender, block); - if(msglen)*msglen = (int)mlen; - return (int)sender; -# endif -} - - -/*\ cluster master broadcasts to everyone else in the same cluster -\*/ -void armci_msg_clus_brdcst(void *buf, int len) -{ -int root, up, left, right; -int tag=ARMCI_TAG, lenmes; - - armci_msg_bintree(SCOPE_NODE, &root, &up, &left, &right); - if(armci_me != root) armci_msg_rcv(tag, buf, len, &lenmes, up); - if (left > -1) armci_msg_snd(tag, buf, len, left); - if (right > -1) armci_msg_snd(tag, buf, len, right); -} - - -/*\ reduce operation for long -\*/ -static void ldoop(int n, char *op, long *x, long* work) -{ - if (strncmp(op,"+",1) == 0) - while(n--) - *x++ += *work++; - else if (strncmp(op,"*",1) == 0) - while(n--) - *x++ *= *work++; - else if (strncmp(op,"max",3) == 0) - while(n--) { - *x = ARMCI_MAX(*x, *work); - x++; work++; - } - else if (strncmp(op,"min",3) == 0) - while(n--) { - *x = ARMCI_MIN(*x, *work); - x++; work++; - } - else if (strncmp(op,"absmax",6) == 0) - while(n--) { - register long x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work); - *x = ARMCI_MAX(x1, x2); - x++; work++; - } - else if (strncmp(op,"absmin",6) == 0) - while(n--) { - register long x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work); - *x = ARMCI_MIN(x1, x2); - x++; work++; - } - else if (strncmp(op,"or",2) == 0) - while(n--) { - *x |= *work; - x++; work++; - } - /* these are new */ - else if ((strncmp(op, "&&", 2) == 0) || (strncmp(op, "land", 4) == 0)) { - while(n--) { - *x = *x && *work; - x++; work++; - } - } - else if ((strncmp(op, "||", 2) == 0) || (strncmp(op, "lor", 3) == 0)) { - while(n--) { - *x = *x || *work; - x++; work++; - } - } - else if ((strncmp(op, "&", 1) == 0) || (strncmp(op, "band", 4) == 0)) { - while(n--) { - *x &= *work; - x++; work++; - } - } - else if ((strncmp(op, "|", 1) == 0) || (strncmp(op, "bor", 3) == 0)) { - while(n--) { - *x |= *work; - x++; work++; - } - } - else - armci_die("ldoop: unknown operation requested", n); -} - -/*\ reduce operation for long x= op(work,work2) -\*/ -static void ldoop2(int n, char *op, long *x, long* work, long* work2) -{ - if (strncmp(op,"+",1) == 0) - while(n--) - *x++ = *work++ + *work2++; - else if (strncmp(op,"*",1) == 0) - while(n--) - *x++ = *work++ * *work2++; - else if (strncmp(op,"max",3) == 0) - while(n--) { - *x = ARMCI_MAX(*work2, *work); - x++; work++; work2++; - } - else if (strncmp(op,"min",3) == 0) - while(n--) { - *x = ARMCI_MIN(*work2, *work); - x++; work++; work2++; - } - else if (strncmp(op,"absmax",6) == 0) - while(n--) { - register long x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2); - *x = ARMCI_MAX(x1, x2); - x++; work++; work2++; - } - else if (strncmp(op,"absmin",6) == 0) - while(n--) { - register long x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2); - *x = ARMCI_MIN(x1, x2); - x++; work++; work2++; - } - else if (strncmp(op,"or",2) == 0) - while(n--) { - *x = *work | *work2; - x++; work++; work2++; - } - else - armci_die("ldoop2: unknown operation requested", n); -} - -/*\ reduce operation for long long -\*/ -static void lldoop(int n, char *op, long long *x, long long* work) -{ - if (strncmp(op,"+",1) == 0) - while(n--) - *x++ += *work++; - else if (strncmp(op,"*",1) == 0) - while(n--) - *x++ *= *work++; - else if (strncmp(op,"max",3) == 0) - while(n--) { - *x = ARMCI_MAX(*x, *work); - x++; work++; - } - else if (strncmp(op,"min",3) == 0) - while(n--) { - *x = ARMCI_MIN(*x, *work); - x++; work++; - } - else if (strncmp(op,"absmax",6) == 0) - while(n--) { - register long long x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work); - *x = ARMCI_MAX(x1, x2); - x++; work++; - } - else if (strncmp(op,"absmin",6) == 0) - while(n--) { - register long long x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work); - *x = ARMCI_MIN(x1, x2); - x++; work++; - } - else if (strncmp(op,"or",2) == 0) - while(n--) { - *x |= *work; - x++; work++; - } - else - armci_die("lldoop: unknown operation requested", n); -} - -/*\ reduce operation for long long x= op(work,work2) -\*/ -static void lldoop2(int n, char *op, long long *x, long long* work, - long long* work2) -{ - if (strncmp(op,"+",1) == 0) - while(n--) - *x++ = *work++ + *work2++; - else if (strncmp(op,"*",1) == 0) - while(n--) - *x++ = *work++ * *work2++; - else if (strncmp(op,"max",3) == 0) - while(n--) { - *x = ARMCI_MAX(*work2, *work); - x++; work++; work2++; - } - else if (strncmp(op,"min",3) == 0) - while(n--) { - *x = ARMCI_MIN(*work2, *work); - x++; work++; work2++; - } - else if (strncmp(op,"absmax",6) == 0) - while(n--) { - register long long x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2); - *x = ARMCI_MAX(x1, x2); - x++; work++; work2++; - } - else if (strncmp(op,"absmin",6) == 0) - while(n--) { - register long long x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2); - *x = ARMCI_MIN(x1, x2); - x++; work++; work2++; - } - else if (strncmp(op,"or",2) == 0) - while(n--) { - *x = *work | *work2; - x++; work++; work2++; - } - else - armci_die("ldoop2: unknown operation requested", n); -} - -/*\ reduce operation for int -\*/ -static void idoop(int n, char *op, int *x, int* work) -{ - if (strncmp(op,"+",1) == 0) - while(n--) - *x++ += *work++; - else if (strncmp(op,"*",1) == 0) - while(n--) - *x++ *= *work++; - else if (strncmp(op,"max",3) == 0) - while(n--) { - *x = ARMCI_MAX(*x, *work); - x++; work++; - } - else if (strncmp(op,"min",3) == 0) - while(n--) { - *x = ARMCI_MIN(*x, *work); - x++; work++; - } - else if (strncmp(op,"absmax",6) == 0) - while(n--) { - register int x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work); - *x = ARMCI_MAX(x1, x2); - x++; work++; - } - else if (strncmp(op,"absmin",6) == 0) - while(n--) { - register int x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work); - *x = ARMCI_MIN(x1, x2); - x++; work++; - } - else if (strncmp(op,"or",2) == 0) - while(n--) { - *x |= *work; - x++; work++; - } - else - armci_die("idoop: unknown operation requested", n); -} - -/*\ reduce operation for int x= op(work,work2) -\*/ -static void idoop2(int n, char *op, int *x, int* work, int* work2) -{ - if (strncmp(op,"+",1) == 0) - while(n--) - *x++ = *work++ + *work2++; - else if (strncmp(op,"*",1) == 0) - while(n--) - *x++ = *work++ * *work2++; - else if (strncmp(op,"max",3) == 0) - while(n--) { - *x = ARMCI_MAX(*work2, *work); - x++; work++; work2++; - } - else if (strncmp(op,"min",3) == 0) - while(n--) { - *x = ARMCI_MIN(*work2, *work); - x++; work++; work2++; - } - else if (strncmp(op,"absmax",6) == 0) - while(n--) { - register int x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2); - *x = ARMCI_MAX(x1, x2); - x++; work++; work2++; - } - else if (strncmp(op,"absmin",6) == 0) - while(n--) { - register int x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2); - *x = ARMCI_MIN(x1, x2); - x++; work++; work2++; - } - else if (strncmp(op,"or",2) == 0) - while(n--) { - *x = *work | *work2; - x++; work++; work2++; - } - else - armci_die("idoop2: unknown operation requested", n); -} - -/*\ reduce operation for double -\*/ -static void ddoop(int n, char* op, double* x, double* work) -{ - if (strncmp(op,"+",1) == 0){ - if(n>63) FORT_DADD(&n,x,work); - else while(n--) *x++ += *work++; - }else if (strncmp(op,"*",1) == 0){ - if(n>63) FORT_DMULT(&n,x,work); - else while(n--) *x++ *= *work++; - }else if (strncmp(op,"max",3) == 0) - while(n--) { - *x = ARMCI_MAX(*x, *work); - x++; work++; - } - else if (strncmp(op,"min",3) == 0) - while(n--) { - *x = ARMCI_MIN(*x, *work); - x++; work++; - } - else if (strncmp(op,"absmax",6) == 0) - while(n--) { - register double x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work); - *x = ARMCI_MAX(x1, x2); - x++; work++; - } - else if (strncmp(op,"absmin",6) == 0) - while(n--) { - register double x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work); - *x = ARMCI_MIN(x1, x2); - x++; work++; - } - else - armci_die("ddoop: unknown operation requested", n); -} - -/*\ reduce operation for double x= op(work,work2) -\*/ -static void ddoop2(int n, char *op, double *x, double* work, double* work2) -{ - if (strncmp(op,"+",1) == 0){ - if(n>63) FORT_DADD2(&n,x,work,work2); - else while(n--) *x++ = *work++ + *work2++; - }else if (strncmp(op,"*",1) == 0){ - if(n>63) FORT_DMULT2(&n,x,work,work2); - while(n--) *x++ = *work++ * *work2++; - }else if (strncmp(op,"max",3) == 0) - while(n--) { - *x = ARMCI_MAX(*work2, *work); - x++; work++; work2++; - } - else if (strncmp(op,"min",3) == 0) - while(n--) { - *x = ARMCI_MIN(*work2, *work); - x++; work++; work2++; - } - else if (strncmp(op,"absmax",6) == 0) - while(n--) { - register double x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2); - *x = ARMCI_MAX(x1, x2); - x++; work++; work2++; - } - else if (strncmp(op,"absmin",6) == 0) - while(n--) { - register double x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2); - *x = ARMCI_MIN(x1, x2); - x++; work++; work2++; - } - else - armci_die("ddoop2: unknown operation requested", n); -} - - -/*\ reduce operation for float -\*/ -static void fdoop(int n, char* op, float* x, float* work) -{ - if (strncmp(op,"+",1) == 0) - while(n--) - *x++ += *work++; - else if (strncmp(op,"*",1) == 0) - while(n--) - *x++ *= *work++; - else if (strncmp(op,"max",3) == 0) - while(n--) { - *x = ARMCI_MAX(*x, *work); - x++; work++; - } - else if (strncmp(op,"min",3) == 0) - while(n--) { - *x = ARMCI_MIN(*x, *work); - x++; work++; - } - else if (strncmp(op,"absmax",6) == 0) - while(n--) { - register float x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work); - *x = ARMCI_MAX(x1, x2); - x++; work++; - } - else if (strncmp(op,"absmin",6) == 0) - while(n--) { - register float x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work); - *x = ARMCI_MIN(x1, x2); - x++; work++; - } - else - armci_die("fdoop: unknown operation requested", n); -} - -/*\ reduce operation for float x= op(work,work2) -\*/ -static void fdoop2(int n, char *op, float *x, float* work, float* work2) -{ - if (strncmp(op,"+",1) == 0) - while(n--) - *x++ = *work++ + *work2++; - else if (strncmp(op,"*",1) == 0) - while(n--) - *x++ = *work++ * *work2++; - else if (strncmp(op,"max",3) == 0) - while(n--) { - *x = ARMCI_MAX(*work2, *work); - x++; work++; work2++; - } - else if (strncmp(op,"min",3) == 0) - while(n--) { - *x = ARMCI_MIN(*work2, *work); - x++; work++; work2++; - } - else if (strncmp(op,"absmax",6) == 0) - while(n--) { - register float x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2); - *x = ARMCI_MAX(x1, x2); - x++; work++; work2++; - } - else if (strncmp(op,"absmin",6) == 0) - while(n--) { - register float x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2); - *x = ARMCI_MIN(x1, x2); - x++; work++; work2++; - } - else - armci_die("fdoop2: unknown operation requested", n); -} - -/*\ combine array of longs/ints accross all processes -\*/ -void armci_msg_gop_scope(int scope, void *x, int n, char* op, int type) -{ -int root, up, left, right, size; -int tag=ARMCI_TAG; -int ndo, len, lenmes, orign =n, ratio; -void *origx =x; - if(!x)armci_die("armci_msg_gop: NULL pointer", n); - if(work==NULL)_allocate_mem_for_work(); -#ifdef BGML - BGML_Dt dt; - BGML_Op theop; - - if(n > 0 && (strncmp(op, "+", 1) == 0) && (type==ARMCI_INT || type==ARMCI_DOUBLE)) - { - theop=BGML_SUM; - if(type==ARMCI_INT) - dt=BGML_SIGNED_INT; - else if(type==ARMCI_DOUBLE) - dt=BGML_DOUBLE; - BGTr_Allreduce(origx, x, n, dt, theop, -1, PCLASS); - } - else if(n > 0 && (strncmp(op, "max", 3) == 0) && (type==ARMCI_INT || type==ARMCI_DOUBLE)) - { - theop=BGML_MAX; - if(type==ARMCI_INT) - dt=BGML_SIGNED_INT; - else if(type==ARMCI_DOUBLE) - dt=BGML_DOUBLE; - BGTr_Allreduce(origx, x, n, dt, theop, -1, PCLASS); - } - else if(n > 0 && (strncmp(op, "min", 3) == 0) && (type==ARMCI_INT || type==ARMCI_DOUBLE)) - { - theop=BGML_MIN; - if(type==ARMCI_INT) - dt=BGML_SIGNED_INT; - else if(type==ARMCI_DOUBLE) - dt=BGML_DOUBLE; - BGTr_Allreduce(origx, x, n, dt, theop, -1, PCLASS); - } - else -#endif - { - armci_msg_bintree(scope, &root, &up, &left, &right); - - if(type==ARMCI_INT) size = sizeof(int); - else if(type==ARMCI_LONG) size = sizeof(long); - else if(type==ARMCI_LONG_LONG) size = sizeof(long long); - else if(type==ARMCI_FLOAT) size = sizeof(float); - else size = sizeof(double); - - ratio = sizeof(double)/size; - - while ((ndo = (n<=BUF_SIZE*ratio) ? n : BUF_SIZE*ratio)) { - len = lenmes = ndo*size; - - if (left > -1) { - armci_msg_rcv(tag, lwork, len, &lenmes, left); - if(type==ARMCI_INT) idoop(ndo, op, (int*)x, iwork); - else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, lwork); - else if(type==ARMCI_LONG_LONG) lldoop(ndo, op,(long long*)x,llwork); - else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, fwork); - else ddoop(ndo, op, (double*)x, work); - } - - if (right > -1) { - armci_msg_rcv(tag, lwork, len, &lenmes, right); - if(type==ARMCI_INT) idoop(ndo, op, (int*)x, iwork); - else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, lwork); - else if(type==ARMCI_LONG_LONG) lldoop(ndo, op,(long long*)x,llwork); - else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, fwork); - else ddoop(ndo, op, (double*)x, work); - } - if (armci_me != root && up!=-1) armci_msg_snd(tag, x, len, up); - - n -=ndo; - x = len + (char*)x; - } - - /* Now, root broadcasts the result down the binary tree */ - len = orign*size; - armci_msg_bcast_scope(scope, origx, len, root); - } -} - - -void armci_msg_reduce_scope(int scope, void *x, int n, char* op, int type) -{ -int root, up, left, right, size; -int tag=ARMCI_TAG; -int ndo, len, lenmes, ratio; - if(!x)armci_die("armci_msg_gop: NULL pointer", n); - if(work==NULL)_allocate_mem_for_work(); - - armci_msg_bintree(scope, &root, &up, &left, &right); - - if(type==ARMCI_INT) size = sizeof(int); - else if(type==ARMCI_LONG) size = sizeof(long); - else if(type==ARMCI_LONG_LONG) size = sizeof(long long); - else if(type==ARMCI_FLOAT) size = sizeof(float); - else size = sizeof(double); - - ratio = sizeof(double)/size; - - while ((ndo = (n<=BUF_SIZE*ratio) ? n : BUF_SIZE*ratio)) { - len = lenmes = ndo*size; - - if (left > -1) { - armci_msg_rcv(tag, lwork, len, &lenmes, left); - if(type==ARMCI_INT) idoop(ndo, op, (int*)x, iwork); - else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, lwork); - else if(type==ARMCI_LONG_LONG) lldoop(ndo, op,(long long*)x,llwork); - else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, fwork); - else ddoop(ndo, op, (double*)x, work); - } - - if (right > -1) { - armci_msg_rcv(tag, lwork, len, &lenmes, right); - if(type==ARMCI_INT) idoop(ndo, op, (int*)x, iwork); - else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, lwork); - else if(type==ARMCI_LONG_LONG) lldoop(ndo, op,(long long*)x,llwork); - else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, fwork); - else ddoop(ndo, op, (double*)x, work); - } - if (armci_me != root && up!=-1) armci_msg_snd(tag, x, len, up); - - n -=ndo; - x = len + (char*)x; - } -} - -static void gop(int type, int ndo, char* op, void *x, void *work) -{ - if(type==ARMCI_INT) idoop(ndo, op, (int*)x, (int*)work); - else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, (long*)work); - else if(type==ARMCI_LONG_LONG) lldoop(ndo, op, (long long*)x, (long long*)work); - else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, (float*)work); - else ddoop(ndo, op, (double*)x, (double*)work); -} - - -static void gop2(int type, int ndo, char* op, void *x, void *work, void *work2) -{ -#if 0 - int size; - if(type==ARMCI_INT) size = sizeof(int); - else if(type==ARMCI_LONG) size = sizeof(long); - else if(type==ARMCI_LONG_LONG) size = sizeof(long long); - else if(type==ARMCI_FLOAT) size = sizeof(float); - else size = sizeof(double); - - armci_copy(work2,x,ndo*size); - - if(type==ARMCI_INT) idoop(ndo, op, (int*)x, (int*)work); - else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, (long*)work); - else if(type==ARMCI_LONG_LONG) lldoop(ndo, op, (long long*)x, (long long*)work); - else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, (float*)work); - else ddoop(ndo, op, (double*)x, (double*)work); -#else - if(type==ARMCI_INT) idoop2(ndo, op, (int*)x, (int*)work, (int*)work2); - else if(type==ARMCI_LONG)ldoop2(ndo,op,(long*)x,(long*)work,(long*)work2); - else if(type==ARMCI_LONG_LONG) lldoop2(ndo,op,(long long*)x,(long long*)work,(long long*)work2); - else if(type==ARMCI_FLOAT)fdoop2(ndo,op,(float*)x,(float*)work,(float*)work2); - else ddoop2(ndo, op, (double*)x, (double*)work,(double*)work2); -#endif -} - - - - -/*\ shared memory based reduction for a single SMP node -\*/ -static void armci_smp_reduce(void *x, int n, char* op, int type) -{ -int root, up, left, right, size; -int ndo, len, lenmes, ratio; -int nslave = armci_clus_info[armci_clus_me].nslave; - - if(nslave<2) return; /* nothing to do */ - - if(!x)armci_die("armci_msg_gop: NULL pointer", n); - - armci_msg_bintree(SCOPE_NODE, &root, &up, &left, &right); - - if(type==ARMCI_INT) size = sizeof(int); - else if(type==ARMCI_LONG) size = sizeof(long); - else if(type==ARMCI_LONG_LONG) size = sizeof(long long); - else if(type==ARMCI_FLOAT) size = sizeof(float); - else size = sizeof(double); - ratio = sizeof(double)/size; - - while ((ndo = (n<=BUF_SIZE*ratio) ? n : BUF_SIZE*ratio)) { - len = lenmes = ndo*size; - - armci_util_wait_int(&GOP_BUF(armci_me)->a.flag, EMPTY, 100); - -#if 1 - if(left<0 && right<0) armci_copy(x,GOP_BUF(armci_me)->array,len); - - /* version oblivious to the order of data arrival */ - { - int need_left = left >-1; - int need_right = right >-1; - int from, first =1, maxspin=100, count=0; - bufstruct *b; - - while(need_left || need_right){ - from =-1; - if(need_left && GOP_BUF(left)->a.flag == FULL){ - from =left; - need_left =0; - }else if(need_right && GOP_BUF(right)->a.flag == FULL) { - from =right; - need_right =0; - } - if(from != -1){ - b = GOP_BUF(from); -#if 1 - if(armci_me == root) gop(type, ndo, op, x, b->array); - else { - if(first) - gop2(type, ndo, op, GOP_BUF(armci_me)->array, b->array,x); - else - gop(type, ndo, op, GOP_BUF(armci_me)->array, b->array); - } - first =0; -#else - gop(type, ndo, op, GOP_BUF(armci_me)->array, b->array); -#endif - SET_SHM_FLAG(&( b->a.flag),empty); - }else if((++count)array,len); - - /* this version requires a specific order of data arrival */ - if (left >-1) { - while(GOP_BUF(left)->a.flag != FULL) cpu_yield(); - gop(type, ndo, op, GOP_BUF(armci_me)->array, GOP_BUF(left)->array); - SET_SHM_FLAG(&( GOP_BUF(left)->a.flag),empty); - } - if (right >-1 ) { - while(GOP_BUF(right)->a.flag != FULL) cpu_yield(); - gop(type, ndo, op, GOP_BUF(armci_me)->array, GOP_BUF(right)->array); - GOP_BUF(right)->a.flag = EMPTY; - } -#endif - - if (armci_me != root ) { - SET_SHM_FLAG(&(GOP_BUF(armci_me)->a.flag),full); - } -#if 0 - else - /* NOTE: this copy can be eliminated in a cluster configuration */ - armci_copy(GOP_BUF(armci_me)->array,x,len); -#endif - - n -=ndo; - x = len + (char*)x; - } -} - -void _armci_msg_binomial_reduce(void *x, int n, char* op, int type){ - int root = armci_clus_info[0].master; - int i,next_node,next; - int size, ratio, ndo, lenmes,len; -/* int my_rank,root_rank,next_rank; */ - if(work==NULL)_allocate_mem_for_work(); - if(armci_me!=armci_master)return; - if(type==ARMCI_INT) size = sizeof(int); - else if(type==ARMCI_LONG) size = sizeof(long); - else if(type==ARMCI_LONG_LONG) size = sizeof(long long); - else if(type==ARMCI_FLOAT) size = sizeof(float); - else size = sizeof(double); - ratio = sizeof(double)/size; - - while ((ndo = (n<=BUF_SIZE*ratio) ? n : BUF_SIZE*ratio)) { - int snd_proc=armci_clus_me,flag=1,diff=1; - - len = lenmes = ndo*size; - if(armci_me!=root){ - while(!(snd_proc & flag)){ - diff=diff<<1; - flag=flag<<1; - } - snd_proc = armci_clus_info[armci_clus_me-diff].master; - } - else - diff = Lp2; - - /*printf("\n%d: %d diff>>1 = %d\n",armci_me,Lp2,diff>>1);*/ - for(i=diff>>1;i>=1;i=i>>1){ - next=i^armci_clus_me; - if(next>=0 && next1){ -#ifdef LAPI - if(_armci_gop_init) - _armci_msg_binomial_reduce(x,n,op,type); - else -#endif - armci_msg_reduce_scope(SCOPE_MASTERS, x, n, op, type); - } -} - - -static void armci_msg_gop2(void *x, int n, char* op, int type) -{ -int size, root=0; - if(work==NULL)_allocate_mem_for_work(); - if(type==ARMCI_INT) size = sizeof(int); - else if(type==ARMCI_LONG) size = sizeof(long); - else if(type==ARMCI_LONG_LONG) size = sizeof(long long); - else if(type==ARMCI_FLOAT) size = sizeof(float); - else size = sizeof(double); -#ifdef BGML /*optimize what we can at the message layer */ - void *origx=x; - BGML_Dt dt; - BGML_Op rop; - - if(n>0 && (strncmp(op, "+", 1) == 0)) - { - rop=BGML_SUM; - if(type == ARMCI_INT) - { - dt=BGML_SIGNED_INT; - BGTr_Allreduce(origx, x, n, dt, rop, -1, 3); - } - else if(type == ARMCI_LONG || type == ARMCI_LONG_LONG) - { - armci_msg_reduce(x, n, op, type); - armci_msg_bcast(x, size*n, root); -/* dt=BGML_UNSIGNED_LONG; */ -/* BGTr_Allreduce(origx, x, n, dt, rop, -1, 3);*/ - } - else if(type == ARMCI_DOUBLE) - { - dt=BGML_DOUBLE; - BGTr_Allreduce(origx, x, n, dt, rop, -1, 3); - } - else if(type == ARMCI_FLOAT) - { - armci_msg_reduce(x, n, op, type); - armci_msg_bcast(x, size*n, root); - } - else - { - fprintf(stderr,"Unknown data type\n"); - exit(1); - } - } - - else if(n>0 && ((strncmp(op, "max", 3) == 0) || (strncmp(op, "min", 3) ==0 ))) - { - if(strncmp(op, "max", 3) == 0) - rop=BGML_MAX; - else - rop=BGML_MIN; - - if(type == ARMCI_INT) - dt=BGML_SIGNED_INT; - else if(type == ARMCI_DOUBLE) - dt=BGML_DOUBLE; - else if(type == ARMCI_FLOAT) - dt=BGML_FLOAT; - else if(type == ARMCI_LONG) - dt=BGML_SIGNED_LONG; - else if(type == ARMCI_LONG_LONG) - { - armci_msg_reduce(x, n, op, type); - armci_msg_bcast(x, size*n, root); - } - else - { - fprintf(stderr,"Unknown data type\n"); - exit(1); - } - if(type != ARMCI_LONG_LONG) - BGTr_Allreduce(origx, x, n, dt, rop, -1, 3); - } - - else -#endif - { /* brackets needed for final gelse clause of bgml */ - - armci_msg_reduce(x, n, op, type); - armci_msg_bcast(x, size*n, root); - } -} - - -static void armci_sel(int type, char *op, void *x, void* work, int n) -{ -int selected=0; - switch (type) { - case ARMCI_INT: - if(strncmp(op,"min",3) == 0){ - if(*(int*)x > *(int*)work) selected=1; - }else - if(*(int*)x < *(int*)work) selected=1; - break; - case ARMCI_LONG: - if(strncmp(op,"min",3) == 0){ - if(*(long*)x > *(long*)work) selected=1; - }else - if(*(long*)x < *(long*)work) selected=1; - break; - case ARMCI_LONG_LONG: - if(strncmp(op,"min",3) == 0){ - if(*(long long*)x > *(long long*)work) selected=1; - }else - if(*(long long*)x < *(long long*)work) selected=1; - break; - case ARMCI_FLOAT: - if(strncmp(op,"min",3) == 0){ - if(*(float*)x > *(float*)work) selected=1; - }else - if(*(float*)x < *(float*)work) selected=1; - break; - default: - if(strncmp(op,"min",3) == 0){ - if(*(double*)x > *(double*)work) selected=1; - }else - if(*(double*)x < *(double*)work) selected=1; - } - if(selected)armci_copy(work,x, n); -} - - - -/*\ global for op with extra info -\*/ -void armci_msg_sel_scope(int scope, void *x, int n, char* op, int type, int contribute) -{ -int root, up, left, right; -int tag=ARMCI_TAG; -int len, lenmes, min; - - min = (strncmp(op,"min",3) == 0); - if(!min && (strncmp(op,"max",3) != 0)) - armci_die("armci_msg_gop_info: operation not supported ", 0); - - if(!x)armci_die("armci_msg_gop_info: NULL pointer", n); - - if(n>((int)INFO_BUF_SIZE))armci_die("armci_msg_gop_info: info too large",n); - - len = lenmes = n; - - armci_msg_bintree(scope, &root, &up, &left, &right); - - if (left > -1) { - - /* receive into work if contributing otherwise into x */ - if(contribute)armci_msg_rcv(tag, work, len, &lenmes, left); - else armci_msg_rcv(tag, x, len, &lenmes, left); - - if(lenmes){ - if(contribute) armci_sel(type, op, x, work, n); - else contribute =1; /* now we got data to pass */ - } - } - - if (right > -1) { - /* receive into work if contributing otherwise into x */ - if(contribute) armci_msg_rcv(tag, work, len, &lenmes, right); - else armci_msg_rcv(tag, x, len, &lenmes, right); - - if(lenmes){ - if(contribute) armci_sel(type, op, x, work, n); - else contribute =1; /* now we got data to pass */ - } - } - - if (armci_me != root){ - if(contribute) armci_msg_snd(tag, x, len, up); - else armci_msg_snd(tag, x, 0, up); /* send 0 bytes */ - } - - /* Now, root broadcasts the result down the binary tree */ - armci_msg_bcast_scope(scope, x, n, root); -} - - -/*\ combine array of longs/ints/doubles accross all processes -\*/ - -#if defined(NEC) - -void armci_msg_igop(int *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_ALL,x, n, op, ARMCI_INT); } - -void armci_msg_lgop(long *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_ALL,x, n, op, ARMCI_LONG); } - -void armci_msg_llgop(long long *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_ALL,x, n, op, ARMCI_LONG_LONG); } - -void armci_msg_dgop(double *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_ALL,x, n, op, ARMCI_DOUBLE); } - -void armci_msg_fgop (float *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_ALL,x, n, op, ARMCI_FLOAT);} - -#else -void armci_msg_igop(int *x, int n, char* op) { armci_msg_gop2(x, n, op, ARMCI_INT); } -void armci_msg_lgop(long *x, int n, char* op) { armci_msg_gop2(x, n, op, ARMCI_LONG); } -void armci_msg_llgop(long long *x, int n, char* op) { armci_msg_gop2(x, n, op, ARMCI_LONG_LONG); } -void armci_msg_fgop(float *x, int n, char* op) { armci_msg_gop2(x, n, op, ARMCI_FLOAT); } -void armci_msg_dgop(double *x, int n, char* op) { armci_msg_gop2(x, n, op, ARMCI_DOUBLE); } -#endif - - -/*\ add array of longs/ints within the same cluster node -\*/ -void armci_msg_clus_igop(int *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_NODE,x, n, op, ARMCI_INT); } - -void armci_msg_clus_lgop(long *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_NODE,x, n, op, ARMCI_LONG); } - -void armci_msg_clus_llgop(long long *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_NODE,x, n, op, ARMCI_LONG_LONG); } - -void armci_msg_clus_fgop(float *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_NODE,x, n, op, ARMCI_FLOAT); } - -void armci_msg_clus_dgop_scope(double *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_NODE,x, n, op, ARMCI_DOUBLE); } - - - -void armci_exchange_address(void *ptr_ar[], int n) -{ - int ratio = sizeof(void*)/sizeof(int); -/* - armci_msg_lgop((long*)ptr_ar, n, "+"); -*/ - if(DEBUG_)printf("%d: exchanging %ld ratio=%d\n",armci_me,(long)ptr_ar[armci_me],ratio); - - armci_msg_gop2(ptr_ar, n*ratio, "+",ARMCI_INT); -} - -/** - * ********************* Begin ARMCI Groups Code **************************** - * NOTE: This part is MPI dependent (i.e. ifdef MSG_COMMS_MPI) - */ -#ifdef MSG_COMMS_MPI -MPI_Comm armci_group_comm(ARMCI_Group *group) -{ -#ifdef ARMCI_GROUP - return MPI_COMM_NULL; -#else - ARMCI_iGroup *igroup = armci_get_igroup_from_group(group); - return (MPI_Comm)(igroup->icomm); -#endif -} - -void parmci_msg_group_barrier(ARMCI_Group *group) -{ - ARMCI_iGroup *igroup = armci_get_igroup_from_group(group); - -#ifdef ARMCI_GROUP - { - int val=0; - armci_msg_group_igop(&val, 1, "+", group); - } -#else - MPI_Barrier((MPI_Comm)(igroup->icomm)); -#endif -} - -#ifdef ARMCI_GROUP -extern void ARMCI_Bcast_(void *buffer, int len, int root, ARMCI_Group *group); -#else -extern void ARMCI_Bcast_(void *buffer, int len, int root, ARMCI_Comm comm); -#endif -void armci_grp_clus_brdcst(void *buf, int len, int grp_master, - int grp_clus_nproc, ARMCI_Group *mastergroup) { - ARMCI_iGroup *igroup = armci_get_igroup_from_group(mastergroup); - int i, *pid_list, root=0; -#ifdef ARMCI_GROUP - ARMCI_Group group; -#else - MPI_Group group_world; - MPI_Group group; - MPI_Comm comm; -#endif - - /* create a communicator for the processes with in a node */ - pid_list = (int *)malloc(grp_clus_nproc*sizeof(int)); - for(i=0; iicomm), &group_world); - MPI_Group_incl(group_world, grp_clus_nproc, pid_list, &group); - - MPI_Comm_create((MPI_Comm)(igroup->icomm), (MPI_Group)group, - (MPI_Comm*)&comm); - - /* Broadcast within the node (for this sub group of processes) */ - ARMCI_Bcast_(buf, len, root, comm); - - free(pid_list); - MPI_Comm_free(&comm); /* free the temporary communicator */ - MPI_Group_free(&group); -#endif -} - - -/* to avoid warning */ -extern int ARMCI_Absolute_id(ARMCI_Group *group,int group_rank); - -void armci_msg_group_bintree(int scope, int* Root, int *Up, int *Left, int *Right, - ARMCI_Group *group) -{ - int root, up, left, right, index, nproc,grp_clus_me,grp_me,grp_master,grp_nproc; - armci_grp_attr_t *grp_attr=ARMCI_Group_getattr(group); - grp_me = grp_attr->grp_me; - grp_clus_me = grp_attr->grp_clus_me; - grp_master = grp_attr->grp_clus_info[grp_clus_me].master; - ARMCI_Group_size(group, &grp_nproc); - if(scope == SCOPE_NODE){ - root = grp_attr->grp_clus_info[grp_clus_me].master; - nproc = grp_attr->grp_clus_info[grp_clus_me].nslave; - index = grp_me - root; - up = (index-1)/2 + root; if( up < root) up = -1; - left = 2*index + 1 + root; if(left >= root+nproc) left = -1; - right = 2*index + 2 + root; if(right >= root+nproc)right = -1; - }else if(scope ==SCOPE_MASTERS){ - root = grp_attr->grp_clus_info[0].master; - nproc = grp_attr->grp_nclus; - if(grp_me != grp_master){up = -1; left = -1; right = -1; } - else{ - index = grp_clus_me - root; - up = (index-1)/2 + root; - up = ( up < root)? -1: grp_attr->grp_clus_info[up].master; - left = 2*index + 1 + root; - left =( left >= root+nproc)?-1:grp_attr->grp_clus_info[left].master; - right= 2*index + 2 + root; - right=( right>=root+nproc)?-1:grp_attr->grp_clus_info[right].master; - } - }else{ - root = 0; - nproc = grp_nproc; - index = grp_me - root; - up = (index-1)/2 + root; if( up < root) up = -1; - left = 2*index + 1 + root; if(left >= root+nproc) left = -1; - right = 2*index + 2 + root; if(right >= root+nproc)right = -1; - } - - *Up = (up==-1)?up:ARMCI_Absolute_id(group,up); - *Left = (left==-1)?left:ARMCI_Absolute_id(group,left); - *Right = (right==-1)?right:ARMCI_Absolute_id(group,right); - *Root = (root==-1)?root:ARMCI_Absolute_id(group,root); -} - -void armci_msg_group_bcast_scope(int scope, void *buf, int len, int root, - ARMCI_Group *group) -{ - int up, left, right, Root; - int grp_me; - ARMCI_iGroup *igroup = armci_get_igroup_from_group(group); - - if(!buf)armci_die("armci_msg_bcast: NULL pointer", len); - - if(!group)armci_msg_bcast_scope(scope,buf,len,root); - else grp_me = igroup->grp_attr.grp_me; - armci_msg_group_bintree(scope, &Root, &up, &left, &right,group); - - if(root !=Root){ - if(armci_me == root) armci_msg_snd(ARMCI_TAG, buf,len, Root); - if(armci_me ==Root) armci_msg_rcv(ARMCI_TAG, buf, len, NULL, root); - } - - /* printf("%d: scope=%d left=%d right=%d up=%d\n",armci_me, scope, - left, right, up);*/ - - if(armci_me != Root && up!=-1) armci_msg_rcv(ARMCI_TAG, buf, len, NULL, up); - if (left > -1) armci_msg_snd(ARMCI_TAG, buf, len, left); - if (right > -1) armci_msg_snd(ARMCI_TAG, buf, len, right); -} - -void -armci_msg_group_gop_scope(int scope, void *x, int n, char* op, int type, - ARMCI_Group *group) -{ - int root, up, left, right, size; - int tag=ARMCI_TAG,grp_me; - int ndo, len, lenmes, orign =n, ratio; - void *origx =x; - ARMCI_iGroup *igroup = armci_get_igroup_from_group(group); - - if(!group)armci_msg_gop_scope(scope,x,n,op,type); - else grp_me = igroup->grp_attr.grp_me; - if(!x)armci_die("armci_msg_gop: NULL pointer", n); - if(work==NULL)_allocate_mem_for_work(); - - armci_msg_group_bintree(scope, &root, &up, &left, &right,group); - - if(type==ARMCI_INT) size = sizeof(int); - else if(type==ARMCI_LONG) size = sizeof(long); - else if(type==ARMCI_LONG_LONG) size = sizeof(long long); - else if(type==ARMCI_FLOAT) size = sizeof(float); - else size = sizeof(double); - - ratio = sizeof(double)/size; - - while ((ndo = (n<=BUF_SIZE*ratio) ? n : BUF_SIZE*ratio)) { - len = lenmes = ndo*size; - - if (left > -1) { - armci_msg_rcv(tag, lwork, len, &lenmes, left); - if(type==ARMCI_INT) idoop(ndo, op, (int*)x, iwork); - else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, lwork); - else if(type==ARMCI_LONG_LONG) lldoop(ndo, op, (long long*)x,llwork); - else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, fwork); - else ddoop(ndo, op, (double*)x, work); - } - - if (right > -1) { - armci_msg_rcv(tag, lwork, len, &lenmes, right); - if(type==ARMCI_INT) idoop(ndo, op, (int*)x, iwork); - else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, lwork); - else if(type==ARMCI_LONG_LONG) lldoop(ndo, op,(long long*)x, llwork); - else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, fwork); - else ddoop(ndo, op, (double*)x, work); - } - if (armci_me != root && up!=-1) armci_msg_snd(tag, x, len, up); - - n -=ndo; - x = len + (char*)x; - } - - /* Now, root broadcasts the result down the binary tree */ - len = orign*size; - armci_msg_group_bcast_scope(scope, origx, len, root,group); -} - -void armci_exchange_address_grp(void *ptr_arr[], int n, ARMCI_Group *group) -{ - int ratio = sizeof(void*)/sizeof(int); - ARMCI_iGroup *igroup = armci_get_igroup_from_group(group); - int grp_me = igroup->grp_attr.grp_me; - if(DEBUG_){ - printf("%d: exchanging %ld ratio=%d\n",armci_me, - (long)ptr_arr[grp_me], ratio); - } - armci_msg_group_gop_scope(SCOPE_ALL, ptr_arr, n*ratio, - "+", ARMCI_INT, group); -} - -/*\ combine array of longs/ints/doubles accross all processes -\*/ -void armci_msg_group_igop(int *x, int n, char* op, ARMCI_Group *group) -{ armci_msg_group_gop_scope(SCOPE_ALL,x, n, op, ARMCI_INT,group); } - -void armci_msg_group_lgop(long *x, int n, char* op,ARMCI_Group *group) -{ armci_msg_group_gop_scope(SCOPE_ALL,x, n, op, ARMCI_LONG,group); } - -void armci_msg_group_llgop(long long *x, int n, char* op,ARMCI_Group *group) -{ armci_msg_group_gop_scope(SCOPE_ALL,x, n, op, ARMCI_LONG_LONG,group); } - -void armci_msg_group_fgop(float *x, int n, char* op,ARMCI_Group *group) -{ armci_msg_group_gop_scope(SCOPE_ALL,x, n, op, ARMCI_FLOAT,group); } - -void armci_msg_group_dgop(double *x, int n, char* op,ARMCI_Group *group) -{ armci_msg_group_gop_scope(SCOPE_ALL,x, n, op, ARMCI_DOUBLE,group); } - -# endif /* ifdef MSG_COMMS_MPI */ -/*********************** End ARMCI Groups Code ****************************/ - - -#ifdef PVM -/* set the group name if using PVM */ -void ARMCI_PVM_Init(char *mpgroup) -{ -#ifdef CRAY - mp_group_name = (char *)NULL; -#else - if(mpgroup != NULL) { -/* free(mp_group_name); */ - mp_group_name = (char *)malloc(25 * sizeof(char)); - strcpy(mp_group_name, mpgroup); - } -#endif -} -#endif diff --git a/armci/src-portals/message.h b/armci/src-portals/message.h deleted file mode 100644 index aa30cda89..000000000 --- a/armci/src-portals/message.h +++ /dev/null @@ -1,80 +0,0 @@ -#ifndef _MESSAGE_H_ -#define _MESSAGE_H_ - -#include "armci.h" - -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif - -#define ARMCI_INT -99 -#define ARMCI_LONG -101 -#define ARMCI_LONG_LONG -102 -#define ARMCI_FLOAT -306 -#define ARMCI_DOUBLE -307 - -#define SCOPE_ALL 333 -#define SCOPE_NODE 337 -#define SCOPE_MASTERS 339 - -#define armci_msg_sel(x,n,op,type,contribute)\ - armci_msg_sel_scope(SCOPE_ALL,(x),(n),(op),(type),(contribute)) -#if 0 -#define armci_msg_bcast(buffer, len, root)\ - armci_msg_bcast_scope(SCOPE_ALL, (buffer), (len), (root)) -#else -extern void armci_msg_bcast(void *buffer, int len, int root); -#endif - -extern void armci_msg_sel_scope(int scope, void *x, int n, char* op, - int type, int contribute); -extern void armci_msg_bcast_scope(int scope, void* buffer, int len, int root); -extern void armci_msg_brdcst(void* buffer, int len, int root); -extern void armci_msg_snd(int tag, void* buffer, int len, int to); -extern void armci_msg_rcv(int tag, void* buffer, int buflen, int *msglen, int from); -extern int armci_msg_rcvany(int tag, void* buffer, int buflen, int *msglen); -extern void armci_msg_reduce(void *x, int n, char *op, int type); -extern void armci_msg_reduce_scope(int scope, void *x, int n, char *op, int type); - -extern void armci_msg_gop_scope(int scope, void *x, int n, char* op, int type); -extern void armci_msg_igop(int *x, int n, char* op); -extern void armci_msg_lgop(long *x, int n, char* op); -extern void armci_msg_llgop(long long *x, int n, char* op); -extern void armci_msg_fgop(float *x, int n, char* op); -extern void armci_msg_dgop(double *x, int n, char* op); -extern void armci_exchange_address(void *ptr_ar[], int n); -extern void armci_msg_barrier(); -extern void armci_msg_bintree(int scope, int* Root, int *Up, int *Left, int *Right); - -extern int armci_msg_me(); -extern int armci_msg_nproc(); -extern void armci_msg_abort(int code); -extern void armci_msg_init(int *argc, char ***argv); -extern void armci_msg_init_comm(MPI_Comm comm); -extern void armci_msg_finalize(); -extern double armci_timer(); - -extern void armci_msg_clus_brdcst(void *buf, int len); -extern void armci_msg_clus_igop(int *x, int n, char* op); -extern void armci_msg_clus_fgop(float *x, int n, char* op); -extern void armci_msg_clus_lgop(long *x, int n, char* op); -extern void armci_msg_clus_llgop(long long *x, int n, char* op); -extern void armci_msg_clus_dgop(double *x, int n, char* op); - -extern void armci_msg_group_gop_scope(int scope, void *x, int n, char* op, int type, ARMCI_Group *group); -extern void armci_msg_group_igop(int *x, int n, char* op,ARMCI_Group *group); -extern void armci_msg_group_lgop(long *x, int n, char* op,ARMCI_Group *group); -extern void armci_msg_group_llgop(long long *x, int n, char* op,ARMCI_Group *group); -extern void armci_msg_group_fgop(float *x, int n, char* op,ARMCI_Group *group); -extern void armci_msg_group_dgop(double *x, int n,char* op,ARMCI_Group *group); -extern void armci_exchange_address_grp(void *ptr_arr[], int n, ARMCI_Group *group); -extern void armci_msg_group_barrier(ARMCI_Group *group); -extern void armci_msg_group_bcast_scope(int scope, void *buf, int len, int root, ARMCI_Group *group); -extern void armci_grp_clus_brdcst(void *buf, int len, int grp_master, int grp_clus_nproc,ARMCI_Group *mastergroup); - -#if defined(__cplusplus) || defined(c_plusplus) -} -#endif - - -#endif diff --git a/armci/src-portals/mutex.c b/armci/src-portals/mutex.c deleted file mode 100644 index 1927b212e..000000000 --- a/armci/src-portals/mutex.c +++ /dev/null @@ -1,408 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#if HAVE_STDIO_H -# include -#endif - -/* $Id: mutex.c,v 1.24.10.1 2006-12-21 23:50:48 manoj Exp $ */ -#include "armcip.h" -#include "copy.h" -#include "parmci.h" -#include "request.h" - -#define DEBUG 0 -#define MAX_LOCKS 32768 -#define SPINMAX 1000 - -#if defined(LAPI) || defined(GM) -# define SERVER_LOCK -#endif - -double _dummy_work_=0.; -#ifdef LAPI /*fix to if cmpl handler for a pending unlock runs after destroy*/ -int mymutexcount; -double _dummy_server_work_=0.; -#endif -static int num_mutexes=0, *tickets; - -typedef struct { - int mutex; - int turn; - msg_tag_t tag; -} waiting_list_t; - - -/* data structure to store info about blocked (waiting) process for mutex */ -static waiting_list_t* blocked=(waiting_list_t*)0; - - -typedef struct { -int* token; -int* turn; -int* tickets; -int count; -} mutex_entry_t; - -void** mutex_mem_ar; -mutex_entry_t *glob_mutex; - - -int PARMCI_Create_mutexes(int num) -{ -int rc,p, totcount; -int *mutex_count = (int*)armci_internal_buffer; - - if((sizeof(int)*armci_nproc) > armci_getbufsize()){ - mutex_count = (double *)malloc(sizeof(int)*armci_nproc); - } - if (num < 0 || num > MAX_LOCKS) return(FAIL); - if(num_mutexes) armci_die("mutexes already created",num_mutexes); - - if(armci_nproc == 1){ num_mutexes=1; return(0); } - - /* local memory allocation for mutex arrays*/ - mutex_mem_ar = (void*) malloc(armci_nproc*sizeof(void*)); - if(!mutex_mem_ar) armci_die("ARMCI_Create_mutexes: malloc failed",0); - glob_mutex = (void*)malloc(armci_nproc*sizeof(mutex_entry_t)); - if(!glob_mutex){ - free(mutex_mem_ar); - armci_die("ARMCI_Create_mutexes: malloc 2 failed",0); - } - - -/* bzero(mutex_count,armci_nproc*sizeof(int));*/ - bzero((char*)mutex_count,sizeof(int)*armci_nproc); - - /* find out how many mutexes everybody allocated */ - mutex_count[armci_me]=num; - armci_msg_igop(mutex_count, armci_nproc, "+"); - for(p=totcount=0; p< armci_nproc; p++)totcount+=mutex_count[p]; - - tickets = calloc(totcount,sizeof(int)); - if(!tickets) { - free(glob_mutex); - free(mutex_mem_ar); - return(FAIL2); - } - - /* we need memory for token and turn - 2 ints */ - rc = PARMCI_Malloc(mutex_mem_ar,2*num*sizeof(int)); - if(rc){ - free(glob_mutex); - free(mutex_mem_ar); - free(tickets); - return(FAIL3); - } - - if(num)bzero((char*)mutex_mem_ar[armci_me],2*num*sizeof(int)); - - /* setup global mutex array */ - for(p=totcount=0; p< armci_nproc; p++){ - glob_mutex[p].token = mutex_mem_ar[p]; - glob_mutex[p].turn = glob_mutex[p].token + mutex_count[p]; - glob_mutex[p].count = mutex_count[p]; - glob_mutex[p].tickets = tickets + totcount; - totcount += mutex_count[p]; - } - - num_mutexes= totcount; -#ifdef LAPI - mymutexcount = num; -#endif - PARMCI_Barrier(); - - if(DEBUG) - fprintf(stderr,"%d created (%d,%d) mutexes\n",armci_me,num,totcount); - - return(0); -} - - -void armci_serv_mutex_create() -{ - int mem = armci_nproc*sizeof(waiting_list_t); - blocked = (waiting_list_t*)malloc(mem); - if(!blocked) armci_die("armci server:error allocating mutex memory ",0); -} - - -void armci_serv_mutex_close() -{ - if(blocked) free(blocked ); - blocked = (waiting_list_t*)0; -} - - -int PARMCI_Destroy_mutexes() -{ -#ifdef LAPI /*fix to if cmpl handler for a pending unlock runs after destroy*/ - int proc, mutex, i,factor=0; -#endif - if(num_mutexes==0)armci_die("armci_destroy_mutexes: not created",0); - if(armci_nproc == 1) return(0); - - armci_msg_barrier(); - -#ifdef LAPI /*fix to if cmpl handler for a pending unlock runs after destroy*/ - for(proc=0;proc myturn) - armci_die2("armci: problem with tickets",myturn,next_in_line); - - /* apply a linear backoff delay before retrying */ - for(i=0; i< SPINMAX * factor; i++) _dummy_work_ += 1.; - - factor += 1; - - }while (myturn != next_in_line); - - glob_mutex[proc].tickets[mutex] = myturn; /* save ticket value */ -} - - -static void armci_generic_unlock(int mutex, int proc) -{ -int *mutex_ticket= glob_mutex[proc].turn + mutex; -int *newval = glob_mutex[proc].tickets +mutex; -int len=sizeof(int); - - /* update ticket for next process requesting this mutex */ - (*newval) ++; - - /* write new ticket value stored previously in tickets */ - PARMCI_Put(newval, mutex_ticket, len, proc); - MEM_FENCE; -} - - -/*\ Acquire mutex for "proc" - * -must be executed in hrecv/AM handler thread - * -application thread must use generic_lock routine -\*/ -int armci_server_lock_mutex(int mutex, int proc, msg_tag_t tag) -{ -int myturn; -int *mutex_ticket, next_in_line, len=sizeof(int); -int owner = armci_me; - - - if(DEBUG)fprintf(stderr,"SLOCK=%d owner=%d p=%d m=%d\n", - armci_me,owner, proc,mutex); - - mutex_ticket= glob_mutex[owner].turn + mutex; - myturn = register_in_mutex_queue(mutex, owner); - - armci_copy(mutex_ticket, &next_in_line, len); - - if(next_in_line > myturn) - armci_die2("armci-s: problem with tickets",myturn,next_in_line); - - if(next_in_line != myturn){ - if(!blocked)armci_serv_mutex_create(); - blocked[proc].mutex = mutex; - blocked[proc].turn = myturn; - blocked[proc].tag = tag; - if(DEBUG) fprintf(stderr,"SLOCK=%d proc=%d blocked (%d,%d)\n", - armci_me, proc, next_in_line,myturn); - return -1; - - } else { - - if(DEBUG) fprintf(stderr,"SLOCK=%d proc=%d sending ticket (%d)\n", - armci_me, proc, myturn); - - /* send ticket to requesting node */ - /* GA_SEND_REPLY(tag, &myturn, sizeof(int), proc); */ - return (myturn); - } -} - - - -/*\ Release mutex "id" held by proc - * called from hrecv/AM handler AND application thread -\*/ -int armci_server_unlock_mutex(int mutex, int proc, int Ticket, msg_tag_t* ptag) -{ -#define NOBODY -1 -int owner = armci_me; -int i, p=NOBODY, *mutex_ticket= glob_mutex[owner].turn + mutex; -int len=sizeof(int); - - if(DEBUG) fprintf(stderr,"SUNLOCK=%d node=%d mutex=%d ticket=%d\n", - armci_me,proc,mutex,Ticket); - - Ticket++; - armci_copy(&Ticket, mutex_ticket, len); - - /* if mutex is free then nobody is reqistered in queue */ - if(armci_mutex_free(mutex, proc)) return -1; - - /* search for the next process in queue waiting for this mutex */ - for(i=0; i< armci_nproc; i++){ - if(!blocked)break; /* not allocated yet - nobody is waiting */ - if(DEBUG)fprintf(stderr,"SUNLOCK=%d node=%d list=(%d,%d)\n", - armci_me, i, blocked[i].mutex, blocked[i].turn); - if((blocked[i].mutex == mutex) && (blocked[i].turn == Ticket)){ - p = i; - break; - } - } - - /* send Ticket to a process waiting for mutex */ - if(p != NOBODY){ - if(p == armci_me)armci_die("server_unlock: cannot unlock self",0); - else { - - if(DEBUG)fprintf(stderr,"SUNLOCK=%d node=%d unlock ticket=%d go=%d\n", - armci_me, proc, Ticket, p); - - /* GA_SEND_REPLY(blocked[p].tag, &Ticket, sizeof(int), p); */ - *ptag = blocked[p].tag; - return p; - - } - } - - return -1; /* nobody is waiting */ -} - - - -void PARMCI_Lock(int mutex, int proc) -{ -#if defined(SERVER_LOCK) -int direct; -#endif - - if(DEBUG)fprintf(stderr,"%d enter lock\n",armci_me); - - if(!num_mutexes) armci_die("armci_lock: create mutexes first",0); - - if(mutex > glob_mutex[proc].count) - armci_die2("armci_lock: mutex not allocated", mutex, - glob_mutex[proc].count); - - if(armci_nproc == 1) return; - -# if defined(SERVER_LOCK) - direct=SAMECLUSNODE(proc); - if(!direct) - armci_rem_lock(mutex,proc, glob_mutex[proc].tickets + mutex ); - else -# endif - armci_generic_lock(mutex,proc); - - if(DEBUG)fprintf(stderr,"%d leave lock\n",armci_me); -} - - - -void PARMCI_Unlock(int mutex, int proc) -{ - if(DEBUG)fprintf(stderr,"%d enter unlock\n",armci_me); - - if(!num_mutexes) armci_die("armci_lock: create mutexes first",0); - - if(mutex > glob_mutex[proc].count) - armci_die2("armci_lock: mutex not allocated", mutex, - glob_mutex[proc].count); - - if(armci_nproc == 1) return; - -# if defined(SERVER_LOCK) - if(armci_nclus >1) { - if(proc != armci_me) - armci_rem_unlock(mutex, proc, glob_mutex[proc].tickets[mutex]); - else { - int ticket = glob_mutex[proc].tickets[mutex]; - msg_tag_t tag; - int waiting; - - waiting = armci_server_unlock_mutex(mutex, proc, ticket, &tag); - if(waiting >-1) - armci_unlock_waiting_process(tag, waiting, ++ticket); - } - } - else -# endif - armci_generic_unlock(mutex, proc); - - if(DEBUG)fprintf(stderr,"%d leave unlock\n",armci_me); -} diff --git a/armci/src-portals/new_memory.c b/armci/src-portals/new_memory.c deleted file mode 100644 index 43ad9c15e..000000000 --- a/armci/src-portals/new_memory.c +++ /dev/null @@ -1,393 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include -#include -#include -#include -#include -#include -#include "armcip.h" -#include "message.h" - -#define DEBUG_ 0 -#define USE_SHMEM_ -#define SHM_UNIT 1024 - -void armci_print_ptr(void **ptr_arr, int bytes, int size, void* myptr, int off) -{ -int i; -int nproc = armci_clus_info[armci_clus_me].nslave; - - ARMCI_PR_DBG("enter",0); - for(i=0; i< armci_nproc; i++){ - int j; - if(armci_me ==i){ - printf("%d master =%d nproc=%d off=%d\n",armci_me, - armci_master,nproc, off); - printf("%d:bytes=%d mptr=%p s=%d ",armci_me, bytes, myptr,size); - for(j = 0; j< armci_nproc; j++)printf(" %p",ptr_arr[j]); - printf("\n"); fflush(stdout); - } - armci_msg_barrier(); - } - ARMCI_PR_DBG("exit",0); -} - - -/******************************************************************** - * Non-collective Memory Allocation on shared memory systems -\*/ -void armci_shmem_memget(armci_meminfo_t *meminfo, size_t size) { - void *myptr=NULL; - void *armci_ptr=NULL; /* legal ARCMIptr used in ARMCI data xfer ops */ - - /* can malloc if there is no data server process & has 1 process/node*/ -} - -void* armci_shmem_memat(armci_meminfo_t *meminfo) { - return NULL; -} - -void armci_shmem_memctl(armci_meminfo_t *meminfo) { - -} - -/****** End: Non-collective memory allocation on shared memory systems *****/ - -/** - * Local Memory Allocation and Free - */ -void *ARMCI_Malloc_local(armci_size_t bytes) { - void *rptr; - ARMCI_PR_DBG("enter",0); - ARMCI_PR_DBG("exit",0); - return malloc(bytes); -} - -int ARMCI_Free_local(void *ptr) { - ARMCI_PR_DBG("enter",0); - free(ptr); - ARMCI_PR_DBG("exit",0); - return 0; -} - -/*\ A wrapper to shmget. Just to be sure that ID is not 0. -\*/ -static int armci_shmget(size_t size,char *from) -{ -int id; - - id = shmget(IPC_PRIVATE, size, (IPC_CREAT | 00600)); - - /*attaching with id 0 somehow fails (Seen on pentium4+linux24+gm163) - *so if id=0, shmget again. */ - while(id==0){ - /* free id=0 and get a new one */ - if(shmctl((int)id,IPC_RMID,(struct shmid_ds *)NULL)) { - fprintf(stderr,"id=%d \n",id); - armci_die("allocate: failed to _delete_ shared region ",id); - } - id = shmget(IPC_PRIVATE, size, (IPC_CREAT | 00600)); - } - if(DEBUG_){ - printf("\n%d:armci_shmget sz=%ld caller=%s id=%d\n",armci_me,(long)size, - from,id); - fflush(stdout); - } - return(id); -} - - -/*\ Collective Memory Allocation - * returns array of pointers to blocks of memory allocated by everybody - * Note: as the same shared memory region can be mapped at different locations - * in each process address space, the array might hold different values - * on every process. However, the addresses are legitimate - * and can be used in the ARMCI data transfer operations. - * ptr_arr[nproc] -\*/ -#define CLEANUP_CMD(command) sprintf(command,"/usr/bin/ipcrm shm %d",id); -int ARMCI_Malloc(void *ptr_arr[], armci_size_t bytes) -{ -int mynslave = armci_clus_info[armci_clus_me].nslave; -void *servptr,*mynodeptrs[mynslave]; -int id,nodeids[mynslave],mynodeid=armci_me-armci_master; - - ARMCI_PR_DBG("enter",0); -#ifdef DEBUG_MEM - fprintf(stderr,"%d bytes in armci_malloc %d\n",armci_me, (int)bytes); - fflush(stderr); - armci_msg_barrier(); -#endif - if(bytes>0){ - if(mynslave>1){ - -#ifdef DEBUG_MEM - printf("\n%d:%s:mynslave is %d",armci_me,FUNCTION_NAME,mynslave);fflush(stdout); -#endif - bzero((void *)nodeids,sizeof(int)*mynslave); - id =nodeids[mynodeid]= armci_shmget(bytes,"ARMCI_Malloc"); - armci_msg_gop_scope(SCOPE_NODE,nodeids,mynslave,"+",ARMCI_INT); - for(int i=0;i1){ - servptr = armci_server_ptr(id); - } - else servptr = mynodeptrs[mynodeid]; - - } - else{ -#ifdef DEBUG_MEM - printf("\n%d:%s:mynslave is %d, doing malloc",armci_me,FUNCTION_NAME,mynslave);fflush(stdout); -#endif - mynodeptrs[mynodeid] = servptr = malloc(bytes); - } - } - else{ - mynodeptrs[mynodeid] = servptr = NULL; - } - - bzero((char*)ptr_arr,armci_nproc*sizeof(void*)); - /*ptr_arr[armci_me] = servptr;*/ - ptr_arr[armci_me] = mynodeptrs[mynodeid]; - armci_exchange_address(ptr_arr,armci_nproc); - - if(mynslave>1)for(int i=0;i1){ - armci_portals_memsetup((long)servptr-(long)ptr_arr[armci_me]); - } - - ARMCI_PR_DBG("exit",0); - return(0); - -} - - - -int ARMCI_Free(void *ptr) -{ - ARMCI_PR_DBG("enter",0); - if(!ptr)return 1; - - ARMCI_PR_DBG("exit",0); - return 0; -} - - -int ARMCI_Free_memdev(void *ptr) -{ - return ARMCI_Free(ptr); -} - - -int ARMCI_Uses_shm() -{ - int uses=0; - -#if (defined(SYSV) || defined(WIN32) || defined(MMAP) ||defined(HITACHI)) \ - && !defined(NO_SHM) -# ifdef RMA_NEEDS_SHMEM - if(armci_nproc >1) uses= 1; /* always unless serial mode */ -# else - if(armci_nproc != armci_nclus)uses= 1; /* only when > 1 node used */ -# endif -#endif - if(DEBUG_) fprintf(stderr,"%d:uses shmem %d\n",armci_me, uses); - return uses; -} -#ifdef MSG_COMMS_MPI - -int ARMCI_Uses_shm_grp(ARMCI_Group *group) -{ - int uses=0, grp_me, grp_nproc, grp_nclus; - ARMCI_PR_DBG("enter",0); - armci_grp_attr_t *grp_attr=ARMCI_Group_getattr(group); - - ARMCI_Group_size(group, &grp_nproc); - ARMCI_Group_rank(group, &grp_me); - grp_nclus = grp_attr->grp_nclus; - -#if (defined(SYSV) || defined(WIN32) || defined(MMAP) ||defined(HITACHI)) \ - && !defined(NO_SHM) -# ifdef RMA_NEEDS_SHMEM - if(grp_nproc >1) uses= 1; /* always unless serial mode */ -# else - if(grp_nproc != grp_nclus)uses= 1; /* only when > 1 node used */ -# endif -#endif - if(DEBUG_) fprintf(stderr,"%d (grp_id=%d):uses shmem %d\n",armci_me, grp_me, uses); - ARMCI_PR_DBG("exit",0); - return uses; -} - -/*\ ************** Begin Group Collective Memory Allocation ****************** - * returns array of pointers to blocks of memory allocated by everybody - * Note: as the same shared memory region can be mapped at different locations - * in each process address space, the array might hold different values - * on every process. However, the addresses are legitimate - * and can be used in the ARMCI data transfer operations. - * ptr_arr[nproc] -\*/ -int ARMCI_Malloc_group(void *ptr_arr[], armci_size_t bytes, - ARMCI_Group *group) -{ - void *ptr; - int grp_me, grp_nproc; - ARMCI_PR_DBG("enter",0); - ARMCI_Group_size(group, &grp_nproc); - ARMCI_Group_rank(group, &grp_me); - if(DEBUG_)fprintf(stderr,"%d (grp_id=%d) bytes in armci_malloc_group %d\n", - armci_me, grp_me, (int)bytes); - - ARMCI_PR_DBG("exit",0); - return(0); -} - - -int ARMCI_Free_group(void *ptr, ARMCI_Group *group) -{ - int grp_me, grp_nproc, grp_master, grp_clus_me; - armci_grp_attr_t *grp_attr=ARMCI_Group_getattr(group); - ARMCI_PR_DBG("enter",0); - - if(!ptr)return 1; - - ARMCI_Group_size(group, &grp_nproc); - ARMCI_Group_rank(group, &grp_me); - if(grp_me == MPI_UNDEFINED) { /* check if the process is in this group */ - armci_die("armci_malloc_group: process is not a member in this group", - armci_me); - } - /* get the group cluster info */ - grp_clus_me = grp_attr->grp_clus_me; - grp_master = grp_attr->grp_clus_info[grp_clus_me].master; - - ARMCI_PR_DBG("exit",0); - return 0; -} -/* ***************** End Group Collective Memory Allocation ******************/ - -/* ************** Begin Non-Collective Memory Allocation ****************** - * Prototype similar to SysV shared memory. - */ - -/** - * CHECK: On Altix we are forced to use SysV as shmalloc is collective. We - * may use a preallocated shmalloc memory, however, it may NOT still solve - * our problem... - * NOTE: "int memflg" option for future optimiztions. - */ -void ARMCI_Memget(size_t bytes, armci_meminfo_t *meminfo, int memflg) { - - void *myptr=NULL; - void *armci_ptr=NULL; /* legal ARCMI ptr used in ARMCI data xfer ops*/ - size_t size = bytes; - - if(size<=0) armci_die("ARMCI_Memget: size must be > 0", (int)size); - if(meminfo==NULL) armci_die("ARMCI_Memget: Invalid arg #2 (NULL ptr)",0); - if(memflg!=0) armci_die("ARMCI_Memget: Invalid memflg", memflg); - - if( !ARMCI_Uses_shm() ) - { - - /* fill the meminfo structure */ - meminfo->armci_addr = armci_ptr; - meminfo->addr = myptr; - meminfo->size = size; - meminfo->cpid = armci_me; - /* meminfo->attr = NULL; */ - } - else - { - armci_shmem_memget(meminfo, size); - } - - if(DEBUG_){ - printf("%d: ARMCI_Memget: addresses server=%p myptr=%p bytes=%ld\n", - armci_me, meminfo->armci_addr, meminfo->addr, bytes); - fflush(stdout); - } -} - -void* ARMCI_Memat(armci_meminfo_t *meminfo, int memflg) { - void *ptr=NULL; - - if(meminfo==NULL) armci_die("ARMCI_Memget: Invalid arg #2 (NULL ptr)",0); - if(memflg!=0) armci_die("ARMCI_Memget: Invalid memflg", memflg); - - if(meminfo->cpid==armci_me) { ptr = meminfo->addr; return ptr; } - - if( !ARMCI_Uses_shm()) - { - ptr = meminfo->addr; - } - else - { - ptr = armci_shmem_memat(meminfo); - } - - if(DEBUG_) - { - printf("%d:ARMCI_Memat: attached addr mptr=%p size=%ld\n", - armci_me, ptr, meminfo->size); fflush(stdout); - } - - return ptr; -} - -void ARMCI_Memdt(armci_meminfo_t *meminfo, int memflg) { - /** - * Do nothing. May be we need to have reference counting in future. This - * is to avoid the case of dangling pointers when the creator of shm - * segment calls Memctl and other processes are still attached to this - * segment - */ -} - -void ARMCI_Memctl(armci_meminfo_t *meminfo) { - - if(meminfo==NULL) armci_die("ARMCI_Memget: Invalid arg #2 (NULL ptr)",0); - - /* only the creator can delete the segment */ - if(meminfo->cpid == armci_me) - { - if( !ARMCI_Uses_shm() ) - { - void *ptr = meminfo->addr; - } - else - { - armci_shmem_memctl(meminfo); - } - } - - meminfo->addr = NULL; - meminfo->armci_addr = NULL; - /* if(meminfo->attr!=NULL) free(meminfo->attr); */ -} - -/* ***************** End Non-Collective Memory Allocation ******************/ - -#endif diff --git a/armci/src-portals/pack.c b/armci/src-portals/pack.c deleted file mode 100644 index 8db0b76a7..000000000 --- a/armci/src-portals/pack.c +++ /dev/null @@ -1,360 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: pack.c,v 1.36.10.1 2006-12-14 13:24:37 manoj Exp $ */ -#include "armcip.h" -#include - -#if !defined(ACC_COPY) &&!defined(CRAY_YMP) &&!defined(CYGNUS)&&!defined(CYGWIN) &&!defined(BGML) -# define REMOTE_OP -#endif - -#if defined(REMOTE_OP) -# define OP_STRIDED armci_rem_strided -#else -# define OP_STRIDED(_a,_b,_c,_d,_e,_f,_g,_h,_i,_delete1,_j,_hdl)\ - armci_op_strided(_a,_b,_c,_d,_e,_f,_g,_h,_i,_j,_hdl) -#endif - - -/*\ determine if patch fits in the ARMCI buffer, and if not - * at which stride level (patch dim) need to decompose it - * *fit_level is the value of stride level to perform packing at - * *nb means number of elements of count[*fit_level] that fit in buf -\*/ -static void armci_fit_buffer(int count[], int stride_levels, int* fit_level, - int *nb, int bufsize) -{ - int bytes=1, sbytes=1; - int level; - - /* find out at which stride level BUFFER becomes too small */ - for(level=0; level<= stride_levels; level++){ - sbytes = bytes; /* store #bytes at current level to save div cost later */ - bytes *= count[level]; - if(bufsize < bytes) break; - } - - /* buffer big enough for entire patch */ - if(bufsize >= bytes){ - *fit_level = stride_levels; - *nb = count[stride_levels]; - return; - } - - /* buffer too small */ - switch (level){ - case 0: - /* smaller than a single column */ - *fit_level = 0; - *nb = bufsize; - break; - case -1: /* one column fits */ - *fit_level = 0; - *nb = sbytes; - break; - default: - /* it could keep nb instances of (level-1)-dimensional patch */ - *fit_level = level; - *nb = bufsize/sbytes; - } -} - - -/*\ The function decomposes a multi-dimensional patch so that it fits in the - * internal ARMCI buffer. - * It works by recursively reducing patch dimension until some portion of the - * subpatch fits in the buffer. - * The recursive process is controlled by "fit_level" and "nb" arguments, - * which have to be set to -1 at the top-level of the recursion tree. - * - * Argument last and variable looplast are used to indicate to sending/packing - * routine that we are dealing with the last portion of the request. - * Due to the recursive nature of packing code, the algorithm is following: - * if last=1 then internal for loop passes 1 for the last chunk - * else it passes 0 - * -\*/ -int armci_pack_strided(int op, void* scale, int proc, - void *src_ptr, int src_stride_arr[], - void* dst_ptr, int dst_stride_arr[], - int count[], int stride_levels, ext_header_t *h, - int fit_level, int nb, int last,armci_ihdl_t nb_handle) -{ - int rc=0, sn, bufsize=BUFSIZE,noswap=0; - void *src, *dst; -#ifdef REMOTE_OP - int flag=0; -#else - int flag=1; -#endif - int b; - static int call_count; - -#ifdef STRIDED_GET_BUFLEN - if(op==GET) bufsize=STRIDED_GET_BUFLEN; -# ifdef HITACHI - else - if(stride_levels || ARMCI_ACC(op)) bufsize=MSG_BUFLEN_SMALL-PAGE_SIZE; -# endif -#endif - -#if (defined(GM_) || defined(VIA_) || defined(VAPI_)) - /*we cant assume that the entire available buffer will be used for data, - fact that the header and descriptor also go in the same buffer should be - considered while packing. - */ - bufsize-=(sizeof(request_header_t)+(MAX_STRIDE_LEVEL+4)*sizeof(int)+2*sizeof(void *)); -# if defined(PIPE_BUFSIZE) && defined(MAX_PIPELINE_CHUNKS) - bufsize-=8*MAX_PIPELINE_CHUNKS; -# endif -#endif - -#ifdef BALANCE_FACTOR - /* Added the following for balancing buffers */ - if(op==PUT){ - int bytes=1, i; - for(i=0; i<= stride_levels; i++) - bytes *= count[i]; - if(bytes > bufsize && bytes/bufsize < 3 && bytes%bufsize < BALANCE_BUFSIZE){ - /* bytes div bufsize - 1 is to increase the balence factor for 3 buffer case */ - bufsize = bytes/ (bytes/bufsize - 1 + BALANCE_FACTOR); - noswap = 1; /*** yuck: if set to 1, error in buffers.c ***/ - } - bytes = bufsize%8; - bufsize -= bytes; - } -#endif - - /* determine decomposition of the patch to fit in the buffer */ - if(fit_level<0){ - armci_fit_buffer(count, stride_levels, &fit_level, &nb, bufsize); - last = 1; - } - -// printf("%s [cp]: pack_strided: flag=%d, bufsize=%ld; fit_level=%d; stride_level=%d; nb=%d\n",Portals_ID(),flag,(long) bufsize,fit_level,stride_levels,nb); - - if(fit_level == stride_levels){ - - /* we can fit subpatch into the buffer */ - int chunk = count[fit_level]; - int dst_stride, src_stride; - - if(nb == chunk){ /* take shortcut when whole patch fits in the buffer */ - if(h) h->last = last?1:0; - if(nb_handle && call_count ){ - nb_handle->bufid=NB_MULTI; - call_count++; - } - return(OP_STRIDED(op, scale, proc, src_ptr, src_stride_arr, - dst_ptr,dst_stride_arr,count,stride_levels,h,flag,nb_handle)); - } - - if(fit_level){ - dst_stride = dst_stride_arr[fit_level -1]; - src_stride = src_stride_arr[fit_level -1]; - }else{ - dst_stride = src_stride = 1; - } - if(op == GET || noswap == 1) b =nb; - else{ b = chunk%nb; if(b==0)b=nb; } /* put smallest piece first */ - - for(sn = 0; sn < chunk; ){ - src = (char*)src_ptr + src_stride* sn; - dst = (char*)dst_ptr + dst_stride* sn; - count[fit_level] = ARMCI_MIN(b, chunk-sn); /*modify count for this level*/ - - if(h) h->last = (last && ((sn+b)>=chunk))? 1: 0 ; - if(nb_handle)call_count++; - rc = OP_STRIDED( op, scale, proc, src, src_stride_arr, - dst,dst_stride_arr,count,fit_level,h,flag,nb_handle); - if(rc) break; - - sn += b; - b = nb; - } - count[fit_level] = chunk; /* restore original count */ - - } - else { - for(sn = 0; sn < count[stride_levels]; sn++){ - int looplast =0; - src = (char*)src_ptr + src_stride_arr[stride_levels -1]* sn; - dst = (char*)dst_ptr + dst_stride_arr[stride_levels -1]* sn; - - if(last && (sn == count[stride_levels]-1)) looplast =1; - rc = armci_pack_strided(op, scale, proc, src, src_stride_arr, - dst, dst_stride_arr, count, stride_levels -1, - h,fit_level, nb, looplast,nb_handle); - if(rc) return rc; - } - } - if(nb_handle && call_count ) - nb_handle->bufid=NB_MULTI; - return rc; -} - -/*\ decompose strided data into chunks and call func on each chunk -\*/ -void armci_dispatch_strided(void *ptr, int stride_arr[], int count[], - int strides, int fit_level, int nb, int bufsize, - void (*fun)(void*,int*,int*,int,void*), void *arg) -{ - int sn,first_call=0; - void *ptr_upd; - - /* determine decomposition of the patch to fit in the buffer */ - if(fit_level<0){ - first_call=1; - armci_fit_buffer(count, strides, &fit_level, &nb, bufsize); - } - - - if(fit_level == strides){ - - /* we can fit subpatch into the buffer */ - int chunk = count[fit_level]; - int stride_upd; - -# ifdef PIPE_MEDIUM_BUFSIZE_ - /* for first call we adjust nb for performance in medium request */ - if(first_call && strides==0) - if(chunk<2*bufsize && chunk>PIPE_MEDIUM_BUFSIZE) - nb = PIPE_MEDIUM_BUFSIZE; -# endif - - if(nb == chunk){ /* take shortcut when whole patch fits in the buffer */ - fun(ptr, stride_arr, count, strides, arg); - } - - if(fit_level) - stride_upd = stride_arr[fit_level -1]; - else - stride_upd = 1; - - for(sn = 0; sn < chunk; sn += nb){ - - ptr_upd = (char*)ptr + stride_upd* sn; - count[fit_level] = ARMCI_MIN(nb, chunk-sn); /*modify count for this level*/ - fun(ptr_upd, stride_arr, count, fit_level, arg); - } - count[fit_level] = chunk; /* restore original count */ - - }else for(sn = 0; sn < count[strides]; sn++){ - ptr_upd = (char*)ptr + stride_arr[strides -1]* sn; - armci_dispatch_strided(ptr_upd, stride_arr, count, strides -1, - fit_level, nb, bufsize, fun, arg); - } -} - -/* how much space is needed to move data + reduced descriptor ? */ -int armci_vector_bytes( armci_giov_t darr[], int len) -{ -int i, bytes=0; - for(i=0; isrc_ptr_array=NULL; - /* go through the sets looking for set to be split */ - for(s=0;sBUFSIZE1){ - - split =(BUFSIZE1 -bytes-2*sizeof(int))/(darr[s].bytes +sizeof(void*)); - if(split == 0) s--; /* no room available - do not split */ - break; - - }else bytes+=csize; - - if(BUFSIZE1 -bytes < 64) break; /* stop here if almost full */ - } - - if(s==len)s--; /* adjust loop counter should be < number of sets */ - *nlen = s+1; - - if(split){ - - /* save the value to be overwritten only if "save" is not filled */ - if(!save->src_ptr_array)*save= darr[s]; - - /* split the set: reduce # of elems, "extra" keeps info for rest of set*/ - *extra = darr[s]; - darr[s].ptr_array_len = split; - extra->ptr_array_len -= split; - extra->src_ptr_array = &extra->src_ptr_array[split]; - extra->dst_ptr_array = &extra->dst_ptr_array[split]; - } -} - - - -int armci_pack_vector(int op, void *scale, armci_giov_t darr[],int len, - int proc,armci_ihdl_t nb_handle) -{ -armci_giov_t extra; /* keeps data remainder of set to be processed in chunks */ -armci_giov_t save; /* keeps original value of set to be processed in chunks */ -armci_giov_t *ndarr; /* points to first array element to be processed now */ -int rc=0, nlen, count=0; - - ndarr = darr; - - save.src_ptr_array=NULL; /* indicates that save slot is empty */ - while(len){ - - armci_split_dscr_array(ndarr, len, &extra, &nlen, &save); -# if defined(REMOTE_OP) - rc = armci_rem_vector(op, scale, ndarr,nlen,proc,0,nb_handle); -# else - if(ARMCI_ACC(op))rc=armci_acc_vector(op,scale,ndarr,nlen,proc); - else rc = armci_copy_vector(op,ndarr,nlen,proc); -# endif - if(rc) break; - - /* non-NULL pointer indicates that set was split */ - if(extra.src_ptr_array){ - - if(nb_handle) { - nb_handle->bufid = NB_MULTI; /*can be set multiple times here; but not reset here*/ - } - - ndarr[nlen-1]=extra; /* set the pointer to remainder of last set */ - nlen--; /* since last set not done in full need to process it again */ - - }else{ - - if(save.src_ptr_array){ - ndarr[0]=save; - save.src_ptr_array=NULL; /* indicates that save slot is empty */ - } - - if(nlen==0) - armci_die("vector packetization problem:buffer too small",BUFSIZE1); - } - - len -=nlen; - ndarr +=nlen; - count ++; - } - - return rc; -} diff --git a/armci/src-portals/parmci.h b/armci/src-portals/parmci.h deleted file mode 100644 index 72dc51f4e..000000000 --- a/armci/src-portals/parmci.h +++ /dev/null @@ -1,123 +0,0 @@ -#include "armci.h" - -int PARMCI_AccV (int op, void *scale, armci_giov_t * darr, int len, int proc); - -void PARMCI_Barrier (); - -int PARMCI_AccS (int optype, void *scale, void *src_ptr, int *src_stride_arr, - void *dst_ptr, int *dst_stride_arr, int *count, - int stride_levels, int proc); - -void PARMCI_Finalize (); - -int PARMCI_NbPut (void *src, void *dst, int bytes, int proc, - armci_hdl_t * nb_handle); - -int PARMCI_GetValueInt (void *src, int proc); - -int PARMCI_Put_flag (void *src, void *dst, int bytes, int *f, int v, - int proc); - -int PARMCI_NbGetS (void *src_ptr, int *src_stride_arr, void *dst_ptr, - int *dst_stride_arr, int *count, int stride_levels, - int proc, armci_hdl_t * nb_handle); - -void *PARMCI_Malloc_local (armci_size_t bytes); - -int PARMCI_Free_local (void *ptr); - -int PARMCI_Get (void *src, void *dst, int bytes, int proc); - -int PARMCI_Put (void *src, void *dst, int bytes, int proc); - -int PARMCI_Destroy_mutexes (); - -int PARMCI_GetS (void *src_ptr, int *src_stride_arr, void *dst_ptr, - int *dst_stride_arr, int *count, int stride_levels, - int proc); - -int PARMCI_NbAccV (int op, void *scale, armci_giov_t * darr, int len, - int proc, armci_hdl_t * nb_handle); - -float PARMCI_GetValueFloat (void *src, int proc); - -int PARMCI_Malloc (void **ptr_arr, armci_size_t bytes); -int PARMCI_Malloc_memdev (void **ptr_arr, armci_size_t bytes, const char *device); - -int PARMCI_NbAccS (int optype, void *scale, void *src_ptr, - int *src_stride_arr, void *dst_ptr, int *dst_stride_arr, - int *count, int stride_levels, int proc, - armci_hdl_t * nb_handle); - -int PARMCI_PutS (void *src_ptr, int *src_stride_arr, void *dst_ptr, - int *dst_stride_arr, int *count, int stride_levels, - int proc); - -void *PARMCI_Memat (armci_meminfo_t * meminfo, long offset); - -int PARMCI_PutV (armci_giov_t * darr, int len, int proc); - -int PARMCI_Free (void *ptr); - -int PARMCI_Free_memdev (void *ptr); - -int PARMCI_Init_args (int *argc, char ***argv); - -int PARMCI_PutValueInt (int src, void *dst, int proc); - -void PARMCI_Memget (size_t bytes, armci_meminfo_t * meminfo, int memflg); - -void PARMCI_AllFence (); - -int PARMCI_NbPutV (armci_giov_t * darr, int len, int proc, - armci_hdl_t * nb_handle); - -int PARMCI_PutValueDouble (double src, void *dst, int proc); - -int PARMCI_GetV (armci_giov_t * darr, int len, int proc); - -int PARMCI_Test (armci_hdl_t * nb_handle); - -void PARMCI_Unlock (int mutex, int proc); - -void PARMCI_Fence (int proc); - -int PARMCI_Create_mutexes (int num); - -int PARMCI_PutS_flag (void *src_ptr, int *src_stride_arr, void *dst_ptr, - int *dst_stride_arr, int *count, int stride_levels, - int *flag, int val, int proc); - -int PARMCI_WaitProc (int proc); - -void PARMCI_Lock (int mutex, int proc); - -double PARMCI_GetValueDouble (void *src, int proc); - -int PARMCI_NbGetV (armci_giov_t * darr, int len, int proc, - armci_hdl_t * nb_handle); - -int PARMCI_Rmw (int op, int *ploc, int *prem, int extra, int proc); - -int PARMCI_Init (); - -int PARMCI_WaitAll (); - -int PARMCI_NbGet (void *src, void *dst, int bytes, int proc, - armci_hdl_t * nb_handle); - -int PARMCI_PutValueFloat (float src, void *dst, int proc); - -int PARMCI_NbPutS (void *src_ptr, int *src_stride_arr, void *dst_ptr, - int *dst_stride_arr, int *count, int stride_levels, - int proc, armci_hdl_t * nb_handle); - -int PARMCI_PutS_flag_dir (void *src_ptr, int *src_stride_arr, void *dst_ptr, - int *dst_stride_arr, int *count, int stride_levels, - int *flag, int val, int proc); - -int PARMCI_PutValueLong (long src, void *dst, int proc); - -int PARMCI_Wait (armci_hdl_t * nb_handle); - -long PARMCI_GetValueLong (void *src, int proc); diff --git a/armci/src-portals/pendbufs.c b/armci/src-portals/pendbufs.c deleted file mode 100644 index a4a1888e4..000000000 --- a/armci/src-portals/pendbufs.c +++ /dev/null @@ -1,698 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#if defined(PEND_BUFS) - -#include "pendbufs.h" -#include "armcip.h" -#include -#include -#include -#include - -#define DEBUG_SERVER 0 - -/*-------------------Attributes-------------------------*/ - -/**Attributes to control buffer count and sizes. Implement this way to - hide the global variables, and provide get/set methods.*/ - -#define NUM_ATTRIBUTES 4 -#define ATTRIB_IMMBUF_LEN 0 -#define ATTRIB_IMMBUF_NUM 1 -#define ATTRIB_PNDBUF_LEN 2 -#define ATTRIB_PNDBUF_NUM 3 - -/** List of hidden attributes and their operations. - * @param attid IN Attribute id. Choose from the list above - * @param gs IN Get(=0)/Set(=1) - * @param v IN Value (used only when gs==1) - * @return Value of the attribute on return - */ -static int att_ops(int attid, int gs, int v) { - static not_first[NUM_ATTRIBUTES]; /*auto-init to zero*/ - static val[NUM_ATTRIBUTES]; - assert(attid>=0 && attidIMM_BUF_LEN*/ - INIT=7, /**buf; - proc_waitlist_t *info = &pbuf_proc_list_info[msginfo->from]; - assert(msginfo->tag.imm_msg == 0); - pbuf = _armci_serv_pendbuf_getbuf(); - if(pbuf) { - pbuf->status = INIT; - pbuf->avail = 0; - pbuf->vbuf = vbuf; - memcpy(pbuf->buf, vbuf->buf, sizeof(request_header_t)+msginfo->dscrlen); -/* pbuf_proc_list_info[msginfo->from].waiting_on=pbuf; */ - pbuf->order_prev = info->order_tail; - if(info->order_tail) info->order_tail->order_next = pbuf; - info->order_tail = pbuf; - if(!info->order_head) info->order_head = pbuf; - } - return pbuf; -} - - -/**Free a pending buffer - * @param pbuf IN Pointer to Pending buffer to be freed - * @return none - */ -static void _armci_serv_pendbuf_freebuf(pendbuf_t *pbuf){ - const request_header_t *msginfo = (request_header_t *)pbuf->buf; - proc_waitlist_t *info = &pbuf_proc_list_info[msginfo->from]; - ARMCI_PR_DBG("enter",0); - assert(pbuf != NULL); - pbuf->avail=1; - pbuf->status = -1; - pbuf->vbuf = NULL; -/* assert(info->waiting_on == pbuf); */ -/* info->waiting_on = NULL; */ - if(pbuf->order_prev) - pbuf->order_prev->order_next = pbuf->order_next; - if(pbuf->order_next) - pbuf->order_next->order_prev = pbuf->order_prev; - if(info->order_head == pbuf) { - assert(pbuf->order_prev == NULL); - info->order_head = pbuf->order_next; - } - if(info->order_tail == pbuf) { - assert(pbuf->order_next == NULL); - info->order_tail = pbuf->order_prev; - } - pbuf->order_prev = pbuf->order_next = NULL; /*not necessary here*/ - - _nPendBufsUsed -= 1; - assert(_nPendBufsUsed>=0); - ARMCI_PR_DBG("exit",0); -} - -/** Implement ordering between messages. This function needs to be - * implemented in conjunction with @_armci_serv_pendbuf_promote to - * ensure ordered processing of messages. - * @param vbuf IN Message in immediate buffer being checked - * @return 1 if the message can be progressed (either in-place or - * after copying to a pending buffer). 0 therwise. - */ -static int _armci_serv_pendbuf_can_progress(immbuf_t *vbuf) { - const request_header_t *msginfo=(request_header_t*)vbuf->buf; - const int proc = msginfo->from; - const proc_waitlist_t *info = &pbuf_proc_list_info[proc]; - - if(_pbufOrder == ONE_PBUF_MESG) { - /*Only one pending buffer used at any time*/ - if(_nPendBufsUsed>0) - return 0; - return 1; - } - if(_pbufOrder == ONE_PBUF_MESG_PER_PROC) { - /*Only one non-immediate mesg can be assigned to the pending - buffers at any time*/ - if(info->order_head - || (info->immbuf_wlist_head && info->immbuf_wlist_head!=vbuf)) { - return 0;/*other requests from this process remain*/ - } - if(!IS_IMM_MSG(*msginfo) && _nPendBufsUsed==PENDING_BUF_NUM) { - return 0; /*This buffer needs a free pending buffer*/ - } - assert(info->n_pending == 0 || info->immbuf_wlist_head==vbuf); - return 1; - } - if(_pbufOrder == ACC_NO_ORDER) { - /*Messages are processed in-place in immediate buffers or issued - into pending buffers for progress in order (like - ONE_PBUF_PER_MESG). This rule relaxes ONE_PBUF_PER_MESG by - allowing a sequence of ACCs to be processed in-place/issued - without waiting for the prior ones to complete*/ - int i, nwaiting_on, nacc; - pendbuf_t *ptr; - if(!IS_IMM_MSG(*msginfo) && _nPendBufsUsed==PENDING_BUF_NUM) { -/* printf("%d(s): op=%d from=%d datalen=%d waiting for pending buffers\n",armci_me,msginfo->operation,msginfo->from,msginfo->tag.data_len); */ -/* fflush(stdout); */ - return 0; /*This buffer needs a free pending buffer*/ - } -#if 1 /*commented for now: it does work*/ - if(IS_IMM_MSG(*msginfo) && ARMCI_ACC(msginfo->operation)) { - return 1; - } -#endif - if(info->immbuf_wlist_head && info->immbuf_wlist_head!=vbuf) { -/* printf("%d(s): op=%d from=%d datalen=%d not queue head\n",armci_me,msginfo->operation,msginfo->from,msginfo->tag.data_len); */ -/* fflush(stdout); */ - return 0; /*in order issue*/ - } - - if(!ARMCI_ACC(msginfo->operation)) { - if(info->order_head) - return 0; - return 1; - } - - int check = ARMCI_ACC(msginfo->operation); - assert(check); - for(ptr=info->order_head; ptr!=NULL; ptr=ptr->order_next) { - request_header_t *m = (request_header_t *)ptr->buf; - assert(m->from == msginfo->from); - if(!ARMCI_ACC(m->operation)) - break; - } - if(ptr != NULL) - return 0; - return 1; - } - if(_pbufOrder == PUTACC_SPLIT_ORDER) { - if(!IS_IMM_MSG(*msginfo) && _nPendBufsUsed==PENDING_BUF_NUM) { - return 0; /*This buffer needs a free pending buffer*/ - } - if(info->immbuf_wlist_head && info->immbuf_wlist_head!=vbuf) { - return 0; - } - if(msginfo->operation!=PUT && !ARMCI_ACC(msginfo->operation)) { - if(info->order_head) - return 0; - return 1; - } -#if 1 - if(IS_IMM_MSG(*msginfo) && ARMCI_ACC(msginfo->operation)) { - return 1; - } -#endif - if(IS_IMM_MSG(*msginfo) && info->order_head) - return 0; - return 1; - } - armci_die("Unknown pbuf ordering rule",_pbufOrder); - return 0; -} - -/** Goes through the set of immediate buffers waiting to be processed - * and completed, and identifies a buffer that can be processed - * now. Removes it from the list and returns it. Promote also - * considers availability of pending buffers if need be. - * @return Pointer to buffer that can be processed now. NULL if none exists. - */ -static immbuf_t* _armci_serv_pendbuf_promote() { - immbuf_t *immbuf = NULL; - proc_waitlist_t *info; - - ARMCI_PR_DBG("enter",0); - - assert(_nPendBufsUsed>=0); - if(!pbuf_ordering_plist_head) { - return NULL; /*nothing to promote*/ - } - - info = pbuf_ordering_plist_head; - do { - if(info->immbuf_wlist_head==NULL) { - printf("%d(s): Why is info->immbuf_wlist_head NULL\n", armci_me); - fflush(stdout); - pause(); - } - assert(info->immbuf_wlist_head!=NULL); - assert(info->n_pending>0); - if(_armci_serv_pendbuf_can_progress(info->immbuf_wlist_head)) { - immbuf = info->immbuf_wlist_head; - info->immbuf_wlist_head = immbuf->immbuf_list_next; - info->n_pending -= 1; - immbuf->immbuf_list_next = NULL; - if(!info->immbuf_wlist_head) { - info->immbuf_wlist_tail = NULL; - /*remove this proc from proc list*/ - info->prev->next = info->next; - info->next->prev = info->prev; - if(pbuf_ordering_plist_head == info) { - pbuf_ordering_plist_head = (info->next==info)?NULL:info->next; - } - info->prev = info->next = NULL; - } - break; - } - info = info->next; - } while(info != pbuf_ordering_plist_head); - - if(DEBUG_SERVER) if(immbuf) { - request_header_t *msginfo=(request_header_t*)immbuf->buf; - printf("%d:: promoting a buffer immbuf=%p op=%d from=%d n_pending=%d\n", - armci_me,immbuf,msginfo->operation,msginfo->from,info->n_pending); - fflush(stdout); - } - ARMCI_PR_DBG("exit",0); - return immbuf; -} - -/** Enqueue a message. It could be an immediate message that cannot - * make progress or a non-immediate message that cannot make progress - * either due to ordering constraints or lack of pending buffers. - * @param vbuf IN Immediate buffer to be enqueud - * @return Pending buffer into which the message was enqueued. NULL - * if no pending buffer was allocated (which is always the case for - * immediate messages) - */ -static pendbuf_t* _armci_serv_pendbuf_enqueue(immbuf_t *vbuf) { - request_header_t *msginfo=(request_header_t *)vbuf->buf; - int from = msginfo->from; - pendbuf_t *pbuf; - proc_waitlist_t *info = &pbuf_proc_list_info[msginfo->from]; - ARMCI_PR_DBG("enter",0); - -/* printf("%d: Entered serv_pbuf_enqueue\n", armci_me); */ - - pbuf=NULL; - if(msginfo->tag.imm_msg) { - assert(!_armci_serv_pendbuf_can_progress(vbuf)); - } - else if(_armci_serv_pendbuf_can_progress(vbuf)) { - pbuf = _armci_serv_pendbuf_assignbuf(vbuf); - assert(pbuf != NULL); /*can_progress() should ensure this*/ - } - if(pbuf == NULL) { -/* printf("%d(s):: Enqueing op=%d imm=%d from %d. n_pending=%d\n", armci_me, msginfo->operation, msginfo->tag.imm_msg, msginfo->from,info->n_pending); */ -/* fflush(stdout); */ - vbuf->immbuf_list_next = NULL; - assert(info->n_pending < IMM_BUF_NUM); /*How another message now?*/ - info->n_pending += 1; - - if(!info->immbuf_wlist_head) { - assert(!info->immbuf_wlist_tail); - assert(!info->prev && !info->next); - /*insert proc into proc list*/ - if(!pbuf_ordering_plist_head) { - pbuf_ordering_plist_head=info->next=info->prev=info; - } - else { - info->next = pbuf_ordering_plist_head; - info->prev = pbuf_ordering_plist_head->prev; - pbuf_ordering_plist_head->prev->next = info; - pbuf_ordering_plist_head->prev = info; - } - } - /*insert vbuf into immbuf list for this proc*/ - if(info->immbuf_wlist_tail) - info->immbuf_wlist_tail->immbuf_list_next=vbuf; - info->immbuf_wlist_tail = vbuf; - if(!info->immbuf_wlist_head) - info->immbuf_wlist_head = vbuf; - } -/* printf("%d: Leaving serv_pbuf_enqueue\n", armci_me); */ - ARMCI_PR_DBG("exit",0); - return pbuf; -} - -/** Progress GET requests. - * @param pbuf IN Pending buffer containing the GET request - * @return none - */ -static void _armci_serv_pendbuf_progress_get(pendbuf_t *pbuf) { - int index = (pbuf - serv_pendbuf_arr); - request_header_t *msginfo = (request_header_t *)pbuf->buf; - void *buffer =((char *)(msginfo+1)+msginfo->dscrlen); - int *status = &pbuf->status; - - assert(sizeof(request_header_t)+msginfo->dscrlen+msginfo->datalendscrlen <= IMM_BUF_LEN) { - /*Have the header and descriptor; go process*/ - armci_complete_pendbuf(pbuf); - *status = SEND_DATA_PENDING; - } - else { /*Need to get rest of descriptor*/ - const int bytes = sizeof(request_header_t)+msginfo->dscrlen-IMM_BUF_LEN; -#warning "PEND_BUFS: Abusing msginfo->tag.ack_ptr for GETS with large descriptors!" - assert(msginfo->tag.ack_ptr != NULL); /*sanity check. Should point to tag.ack on the client side*/ - void *lptr = ((char *)msginfo)+IMM_BUF_LEN; - void *rptr = ((char *)msginfo->tag.ack_ptr) - (int)(&((request_header_t *)0)->tag.ack) + IMM_BUF_LEN; -/* printf("%d(s):: GET getting rest of descriptor index=%d bytes=%d ptr=%p from=%d\n", */ -/* armci_me,index,bytes,rptr,msginfo->from); */ -/* fflush(stdout); */ - assert(IMM_BUF_LEN+bytes < PENDING_BUF_LEN); - armci_pbuf_start_get(rptr,lptr,bytes,msginfo->from,index); - *status = RECV_DSCR_PENDING; - } - break; - case RECV_DSCR_PENDING: - armci_die("call_data_server should set status to RECV_DSCR_DONE before calling progress",*status); - break; - case SEND_DATA_PENDING: - armci_die("call_data_server should set status to SEND_DATA_DONE before calling progress",*status); - break; - case RECV_DSCR_DONE: -/* printf("%d(s):: GET. Done recving descriptor index=%d op=%d datalen=%d from=%d\n", */ -/* armci_me,index,msginfo->operation,msginfo->datalen,msginfo->from); */ -/* fflush(stdout); */ - armci_complete_pendbuf(pbuf); - *status = SEND_DATA_PENDING; - break; - case SEND_DATA_DONE: - _armci_serv_pendbuf_freebuf(pbuf); - break; - case RECV_DATA_PENDING: - case RECV_DATA_DONE: - default: - armci_die("pendbuf_progress_get: invalid status", *status); - } -} - -/** Progress PUT/ACC requests. - * @param pbuf IN Pending buffer containing the PUT/ACC request - * @return none - */ -static void _armci_serv_pendbuf_progress_putacc(pendbuf_t *pbuf) { - int index = (pbuf - serv_pendbuf_arr); - request_header_t *msginfo = (request_header_t *)pbuf->buf; - void *buffer =((char *)(msginfo+1))+msginfo->dscrlen; - int *status = &pbuf->status; - - int check =ARMCI_ACC(msginfo->operation); - assert(msginfo->operation==PUT || check); - assert(sizeof(request_header_t)+msginfo->dscrlen+msginfo->datalenoperation,msginfo->from); */ -/* fflush(stdout); */ - if(sizeof(request_header_t)+msginfo->dscrlen <= IMM_BUF_LEN) { - /*Have the header and descriptor; go process*/ - assert(sizeof(request_header_t)+msginfo->dscrlen+msginfo->tag.data_len < PENDING_BUF_LEN); - armci_pbuf_start_get(msginfo->tag.data_ptr,buffer,msginfo->tag.data_len, - msginfo->from, index); -/* printf("%d(s): PUT/ACC getting data. pbuf_num=%d data_ptr=%p data_len=%d bytes=%d\n", armci_me,index,msginfo->tag.data_ptr, msginfo->tag.data_len,msginfo->bytes); */ - *status = RECV_DATA_PENDING; - } - else { /*Need to get rest of descriptor*/ - const int bytes = sizeof(request_header_t)+msginfo->dscrlen-IMM_BUF_LEN; -#warning "PEND_BUFS: Abusing msginfo->tag.ack_ptr for GETS with large descriptors!" - assert(msginfo->tag.ack_ptr != NULL); /*sanity check. Should point to tag.ack on the client side*/ - void *lptr = ((char *)msginfo)+IMM_BUF_LEN; - void *rptr = ((char *)msginfo->tag.ack_ptr) - (int)(&((request_header_t *)0)->tag.ack) + IMM_BUF_LEN; -/* printf("%d(s):: PUT getting rest of descriptor index=%d bytes=%d ptr=%p from=%d\n", */ -/* armci_me,index,bytes,rptr,msginfo->from); */ -/* fflush(stdout); */ - assert(IMM_BUF_LEN+bytes < PENDING_BUF_LEN); - armci_pbuf_start_get(rptr,lptr,bytes,msginfo->from,index); - *status = RECV_DSCR_PENDING; - } - break; - case RECV_DSCR_PENDING: - armci_die("call_data_server should set status to RECV_DSCR_DONE before calling progress",*status); - break; - case RECV_DATA_PENDING: - armci_die("call_data_server should set status to RECV_DONE before calling progress",*status); - break; - case RECV_DSCR_DONE: - assert(sizeof(request_header_t)+msginfo->dscrlen+msginfo->tag.data_len < PENDING_BUF_LEN); - armci_pbuf_start_get(msginfo->tag.data_ptr,buffer,msginfo->tag.data_len, - msginfo->from, index); -/* printf("%d(s): PUT/ACC getting data. pbuf_num=%d data_ptr=%p data_len=%d bytes=%d\n", armci_me,index,msginfo->tag.data_ptr, msginfo->tag.data_len,msginfo->bytes); */ - *status = RECV_DATA_PENDING; - break; - case RECV_DATA_DONE: -/* printf("%d(s):: Done PUT/ACC with buf index=%d op=%d datalen=%d from=%d\n", */ -/* armci_me,index,msginfo->operation,msginfo->datalen,msginfo->from); */ -/* fflush(stdout); */ - if(msginfo->operation == PUT && pbuf->order_prev!=NULL) { - assert(pbuf->commit_me == 0); /*Why called so many times in thie - state?*/ - pbuf->commit_me = 1; - break; - } - pbuf->commit_me = 0; - armci_complete_pendbuf(pbuf); - _armci_serv_pendbuf_freebuf(pbuf); - break; - case SEND_DATA_PENDING: - case SEND_DATA_DONE: - default: - armci_die("pendbuf_progress_putacc: invalid status", *status); - } -} - - -/** Make progress on processing a pending buffer. This function, also - * ensures any other waiting messages get processed if they can - * be. Thus, progress and eventual termination is guaranteed by this - * function. - * @param _pbuf IN Pending buffer to make progress on - * @return none - */ -static void _armci_serv_pendbuf_progress(pendbuf_t *_pbuf){ - request_header_t *msginfo = (request_header_t *)_pbuf->buf; - immbuf_t *vbuf = _pbuf->vbuf; - pendbuf_t *pbuf = _pbuf; - - assert(pbuf->vbuf!=NULL); - do { - if(vbuf && !IS_IMM_MSG(*msginfo)) { assert(pbuf->vbuf == vbuf); } -/* printf("%d(s):: progressing op=%d imm=%d from=%d datalen=%d pbuf=%p vbuf=%p n_pending=%d\n", armci_me, */ -/* msginfo->operation,msginfo->tag.imm_msg,msginfo->from,msginfo->datalen, pbuf,vbuf,pbuf_proc_list_info[msginfo->from].n_pending); */ -/* fflush(stdout); */ - if(IS_IMM_MSG(*msginfo)) { - armci_complete_immbuf(vbuf); - } - else { /*non-immediate message*/ - proc_waitlist_t* info = &pbuf_proc_list_info[msginfo->from]; - - do { - assert(pbuf->vbuf == vbuf); - if(msginfo->operation == PUT || ARMCI_ACC(msginfo->operation)) { - _armci_serv_pendbuf_progress_putacc(pbuf); - } - else if (msginfo->operation == GET) { - _armci_serv_pendbuf_progress_get(pbuf); - } - else { - armci_die("pending buffer processing for this op not yet implemented", msginfo->operation); - } - pbuf = info->order_head; - vbuf = pbuf ? pbuf->vbuf : NULL; - } while(info->order_head && info->order_head->commit_me); - } -/* sleep(2); */ - vbuf = _armci_serv_pendbuf_promote(); - if(vbuf) { - msginfo = (request_header_t *)vbuf->buf; - if(!msginfo->tag.imm_msg) { - pbuf = _armci_serv_pendbuf_assignbuf(vbuf); - assert(pbuf != NULL); - } - } - } while(vbuf != NULL); -} - - - -/*----------------External functions--------------------*/ - - -/** Initialize array of pending buffers - * @return none - */ -void armci_pendbuf_init() { - int i; - - ARMCI_PR_DBG("enter",0); - -/* bzero(serv_pendbuf_arr, sizeof(pendbuf_t)*PENDING_BUF_NUM); */ - for(i=0; ibuf; - bzero(pbuf, sizeof(pendbuf_t)); - pbuf->buf = buf; - pbuf->avail=1; - } - - pbuf_ordering_plist_head=NULL; - pbuf_proc_list_info = (proc_waitlist_t *)malloc(sizeof(proc_waitlist_t)*armci_nproc); - assert(pbuf_proc_list_info != NULL); - bzero(pbuf_proc_list_info, sizeof(proc_waitlist_t)*armci_nproc); - ARMCI_PR_DBG("exit",0); -} - -void armci_pendbuf_service_req(immbuf_t *immbuf) { - pendbuf_t *pbuf; - request_header_t *msginfo=(request_header_t*)immbuf->buf; - if(IS_IMM_MSG(*msginfo) && _armci_serv_pendbuf_can_progress(immbuf)) { - /* printf("%d: msg vbuf=%p op=%d from=%d imm=%d datalen=%d bytes=%d data_ptr=%p can progress. Completing it now!\n", */ - /* armci_me, vbuf, msginfo->operation, msginfo->from, msginfo->tag.imm_msg,msginfo->datalen,msginfo->bytes,msginfo->tag.data_ptr); */ - /* fflush(stdout); */ - armci_complete_immbuf(immbuf); - } - else if(pbuf = _armci_serv_pendbuf_enqueue(immbuf)) { - /* printf("%d: msg vbuf=%p op=%d from=%d imm=%d datalen=%d bytes=%d data_ptr=%p got pending buf. Progressing it!\n", */ - /* armci_me, vbuf, msginfo->operation, msginfo->from, msginfo->tag.imm_msg,msginfo->datalen,msginfo->bytes,msginfo->tag.data_ptr); */ - /* fflush(stdout); */ - _armci_serv_pendbuf_progress(pbuf); - } - else { - /* printf("%d: msg vbuf=%p op=%d from=%d imm=%d datalen=%d bytes=%d data_ptr=%p in waitlist!\n", armci_me, vbuf, msginfo->operation, msginfo->from, msginfo->tag.imm_msg,msginfo->datalen,msginfo->bytes,msginfo->tag.data_ptr); */ - /* fflush(stdout); */ - } -} - -/**Network layer reporting to split buffers code that a put completed - * on a pending buffer. - * @param pbufid IN Pending buffer id (specified when starting a - * put). - * @return void - */ -void armci_pendbuf_done_put(int pbufid) { - assert(pbufid>=0 && pbufid=0 && pbufidstatus) { - case RECV_DSCR_PENDING: - pbuf->status = RECV_DSCR_DONE; - break; - case RECV_DATA_PENDING: - pbuf->status = RECV_DATA_DONE; - break; - default: - armci_die("Reporting get done on buf with inappropriate status",pbufid); - } - _armci_serv_pendbuf_progress(pbuf); -} - - -#endif /*PEND_BUFS*/ - diff --git a/armci/src-portals/pendbufs.h b/armci/src-portals/pendbufs.h deleted file mode 100644 index d754b0ec8..000000000 --- a/armci/src-portals/pendbufs.h +++ /dev/null @@ -1,72 +0,0 @@ -/** @file Split buffer implementation. - * @author Sriram Krishnamoorthy - * - * Supports multiple short/immediate buffers posted per client and a - * client-independent number of buffers to handle large messages. - */ -#ifndef _PENDBUFS_H_ -#define _PENDBUFS_H_ - -#if defined(PEND_BUFS) - -#include "armcip.h" -#include "request.h" - - -/**The buf should be the first field in immbuf_t and pendbuf_t. For - example, look at openib.c:armci_rcv_req and maybe other places*/ -typedef struct immbuf_t { - char *buf; /*immediate buffer[IMMBUF_LEN]*/ -/* IMMBUF_NW_T fields; */ - IMMBUF_NW_T - struct immbuf_t *immbuf_list_next; -} immbuf_t; - -typedef struct pendbuf_t { - char *buf; /*pending buffer[PENDBUF_LEN]*/ -/* PENDBUF_NW_T fields; */ - PENDBUF_NW_T - int status; /* -#include "armcip.h" - -/* ---------------------------------------------------------------------------------------------- *\ - global variables -\* ---------------------------------------------------------------------------------------------- */ - ptl_process_id_t *portals_id_map = NULL; - ptl_process_id_t *portals_cloned_id_map = NULL; - - size_t portalsMaxEagerMessageSize; - - MPI_Comm portals_smp_comm; - -/* ---------------------------------------------------------------------------------------------- *\ - static variables for this object -\* ---------------------------------------------------------------------------------------------- */ - static int portals_verbose = 0; - - -/* ---------------------------------------------------------------------------------------------- *\ - portals wrappers -\* ---------------------------------------------------------------------------------------------- */ - - -int -portals_init(ptl_handle_ni_t *nih) -{ - int num_interfaces = 0; - int rc; - - rc = PtlInit(&num_interfaces); - if (rc != PTL_OK) { - printf("PtlInit err %d\n", rc); - return rc; - } - - rc = PtlNIInit(CRAY_UK_SSNAL, PTL_PID_ANY, NULL, NULL, nih); - if (rc != PTL_OK && rc != PTL_IFACE_DUP) { - printf("PtlNIInit err %d\n", rc); - return rc; - } - - portalsMaxEagerMessageSize = PORTALS_MAX_EAGER_MESSAGE_SIZE; - - return PTL_OK; -} - - -int -portals_finalize(ptl_handle_ni_t nih) -{ - PtlNIFini(nih); - PtlFini(); - return PTL_OK; -} - - -int -portals_getid(ptl_handle_ni_t nih, ptl_process_id_t *id) -{ - int rc; - - rc = PtlGetId(nih, id); - if(rc != PTL_OK) { - printf("PtlGetId err %d\n",rc); - return rc; - } - - return PTL_OK; -} - - -int -portals_create_eq(ptl_handle_ni_t nih, ptl_size_t count, ptl_handle_eq_t *eq_handle) -{ - int rc; - - rc = PtlEQAlloc(nih, count, PTL_EQ_HANDLER_NONE, eq_handle); - if (rc != PTL_OK) { - printf("PtlEQAlloc err %d\n", rc); - return rc; - } - - return PTL_OK; -} - - -int -portals_free_eq(ptl_handle_eq_t eq) -{ - int rc; - - rc = PtlEQFree(eq); - if (rc != PTL_OK) { - printf("PtlEQFree err %d\n",rc); - return rc; - } - - return PTL_OK; -} - -/* - permanent buffers - such as unexpected receive buffers or data requests - buffers should not be unlinked. client side buffers, such as large puts/accs - would create a ME in front of the MATCH ALL unexpected buffer/data req ME. - on the client side, the MATCH ALL ME should catch the ACKs -*/ - - -int -portals_me_attach(ptl_handle_ni_t nih, - ptl_process_id_t match_id, - ptl_match_bits_t match_bits, - ptl_match_bits_t ignore_bits, - ptl_handle_me_t *me_handle) -{ - int rc = PtlMEAttach(nih,PORTALS_INDEX,match_id,match_bits,ignore_bits, - PTL_UNLINK,PTL_INS_BEFORE,me_handle); - if (rc != PTL_OK) { - printf("PtlAttach err %d in me_attach\n",rc); - return rc; - } - - return PTL_OK; -} - -int -portals_me_insert(ptl_handle_me_t base, - ptl_process_id_t pe_match_id, - ptl_match_bits_t match_bits, - ptl_match_bits_t ignore_bits, - ptl_handle_me_t *me_handle) -{ - int rc = PtlMEInsert(base,pe_match_id,match_bits,ignore_bits, - PTL_UNLINK,PTL_INS_BEFORE,me_handle); - if (rc != PTL_OK) { - printf("PtlME err %d in portals_push_me\n",rc); - return rc; - } - - return rc; -} - - -int -portals_me_unlink(ptl_handle_me_t meh) -{ - int rc = PtlMEUnlink(meh); - - if(rc != PTL_OK) { - printf("PtlMEUnlink err %d in me_unlink\n",rc); - } - - return rc; -} - - -int -portals_md_attach(ptl_handle_me_t me_handle, - ptl_md_t md, - ptl_unlink_t unlink_op, - ptl_handle_md_t *md_handle) -{ - int rc = PtlMDAttach(me_handle, md, unlink_op, md_handle); - if (rc != PTL_OK) { - printf("PtlMDAttach err %d\n",rc); - return rc; - } - - return PTL_OK; -} - - -int -portals_md_bind(ptl_handle_ni_t nih, - ptl_md_t md, - ptl_unlink_t unlink_op, - ptl_handle_md_t *md_handle) -{ - int rc = PtlMDBind(nih, md, unlink_op, md_handle); - if (rc != PTL_OK) { - printf("PtlMDBind err %d\n",rc); - return rc; - } - - return rc; -} - - -int -portals_eqwait(ptl_handle_eq_t eqh, ptl_event_t *ev) -{ - int rc = PtlEQWait(eqh, ev); - if (rc != PTL_OK) { - printf("PtlEQWait err %d\n",rc); - return rc; - } - - return PTL_OK; -} - - -static int -notify(portals_desc_t *desc, int state, char *name) { - if(desc->state & state) { - desc->state &= ~state; - if(desc->state == 0) desc->done = 1; - return 1; - } else { - printf("event: %s with desc state %x not %x\n",name,desc->state,state); - abort(); - return 0; - } -} - - -int -portals_wait(portals_desc_t *wait_on_desc) { - - int rc; - ptl_event_t ev; - portals_desc_t *desc = NULL; - - while(wait_on_desc->state) { - - rc = portals_eqwait(wait_on_desc->eqh, &ev); - if (rc != PTL_OK) { - printf("eq wait error in portals_wait\n"); - abort(); - } - - desc = (portals_desc_t *) ev.md.user_ptr; - - switch(ev.type) { - - case PTL_EVENT_SEND_START: - if (portals_verbose) printf("%s event: send start\n",Portals_ID()); - notify(desc, STATE_SEND_START, "send start"); - break; - - case PTL_EVENT_SEND_END: - if (portals_verbose) printf("%s event: send end\n",Portals_ID()); - notify(desc, STATE_SEND_END, "send end"); - break; - - case PTL_EVENT_REPLY_START: - if (portals_verbose) printf("%s event: reply start\n",Portals_ID()); - notify(desc, STATE_REPLY_START, "reply start"); - break; - - case PTL_EVENT_REPLY_END: - if (portals_verbose) printf("%s event: reply end\n",Portals_ID()); - notify(desc, STATE_REPLY_END, "reply end"); - break; - - case PTL_EVENT_ACK: - if (portals_verbose) printf("%s event: ack\n",Portals_ID()); - printf("%s event ack: md.threshold=%d\n",Portals_ID(),ev.md.threshold); - notify(desc, STATE_ACK, "ack"); - break; - - case PTL_EVENT_PUT_START: - if (portals_verbose) printf("%s event: put start\n",Portals_ID()); - notify(desc, STATE_PUT_START, "put start"); - break; - - case PTL_EVENT_PUT_END: - if (portals_verbose) printf("%s event: put end\n",Portals_ID()); - if (notify(desc, STATE_PUT_END, "put end")) { - // desc->len = ev.mlength; - // desc->off = ev.offset; - } - break; - - case PTL_EVENT_GET_START: - if (portals_verbose) printf("%s event: get start\n",Portals_ID()); - notify(desc, STATE_GET_START, "get start"); - break; - - case PTL_EVENT_GET_END: - if (portals_verbose) printf("%s event: get end\n",Portals_ID()); - notify(desc, STATE_GET_END, "get end"); - break; - - case PTL_EVENT_UNLINK: - if (portals_verbose) printf("%s event: unlink\n",Portals_ID()); - notify(desc, STATE_UNLINK, "unlink"); - break; - - default: - printf("%s event: %d\n",Portals_ID(), ev.type); - break; - } - - } - - return PTL_OK; -} - - -int -portals_put(portals_desc_t *desc) -{ - int rc; - int threshold = 1; - ptl_md_t md = { 0 }; - ptl_handle_md_t md_handle; - ptl_ack_req_t ack_req = PTL_NOACK_REQ; - - # ifdef PORTALS_PUT_USE_ACK - ack_req = PTL_ACK_REQ; - threshold++; - # endif - - md.start = desc->buffer; - md.length = desc->length; - md.threshold = threshold; - md.options = 0; - # ifndef PORTALS_PUT_USE_START - md.options |= PTL_MD_EVENT_START_DISABLE; - # endif - md.user_ptr = desc; - md.eq_handle = desc->eqh; - - rc = portals_md_bind(desc->nih, md, PTL_UNLINK, &md_handle); - if (rc != PTL_OK) { - printf("failed to bind local md in put; err %d",rc); - Fatal_error(rc); - } - - rc = PtlPut(md_handle, - ack_req, - desc->id, - PORTALS_INDEX, - 0, - desc->mbits, - 0, - desc->hdr); - if (rc != PTL_OK) { - printf("PtlPut err %d\n",rc); - return rc; - } - - desc->done = 0; - desc->state = STATE_SEND_END; - - # ifdef PORTALS_PUT_USE_START - desc->state |= STATE_SEND_START; - # endif - - # ifdef PORTALS_PUT_USE_ACK - desc->state |= STATE_ACK; - # endif - - return PTL_OK; -} - - -int -portals_get(portals_desc_t* desc) -{ - int rc; - ptl_md_t md = { 0 }; - ptl_handle_md_t md_handle; - - md.start = desc->buffer; - md.length = desc->length; - md.threshold = 2; - md.options = 0; - # ifndef PORTALS_GET_USE_START - md.options |= PTL_MD_EVENT_START_DISABLE; - # endif - md.user_ptr = desc; - md.eq_handle = desc->eqh; - - rc = portals_md_bind(desc->nih, md, PTL_UNLINK, &md_handle); - if (rc != PTL_OK) { - printf("failed to bind local md in get; err %d\n",rc); - Fatal_error(rc); - } - - rc = PtlGet(md_handle, - desc->id, - PORTALS_INDEX, - 0, - desc->mbits, - 0); - if (rc != PTL_OK) { - printf("PtlGet err %d\n",rc); - Fatal_error(rc); - } - - desc->done = 0; - desc->state = STATE_REPLY_END | STATE_SEND_END; - - # ifdef PORTALS_GET_USE_START - desc->state |= STATE_REPLY_START; - # endif - - return PTL_OK; -} - - -/* -portals_desc_t* -portals_get_free_desc(void) -{ - int i,rc; - portals_desc_t *desc = NULL; - - while(desc == NULL) { - for(i=0; i= 0;bit--) - { - if ((mask << bit) & (unsigned char)*(data+ptr)) - { - printf("1"); - } - else - { - printf("0"); - } - } - printf(" "); - } - printf("\n"); -} - - -void -portals_print_summary() -{ - printf("PORTALS_MAX_DESCRIPTORS = %d\n",PORTALS_MAX_DESCRIPTORS); - printf("PORTALS_MAX_BUFS = %d\n",PORTALS_MAX_BUFS); - printf("PORTALS_MAX_SMALL_BUFS = %d\n",PORTALS_MAX_SMALL_BUFS); - printf("PORTALS_BUF_SIZE = %d\n",PORTALS_BUF_SIZE); - printf("PORTALS_SMALL_BUF_SIZE = %d\n",PORTALS_SMALL_BUF_SIZE); - printf("PORTALS_NREQUEST_BUFFERS = %d\n",PORTALS_NREQUEST_BUFFERS); - printf("PORTALS_MAX_EAGER_MESSAGE_SIZE = %d\n",PORTALS_MAX_EAGER_MESSAGE_SIZE); - return; -} diff --git a/armci/src-portals/portals.h b/armci/src-portals/portals.h deleted file mode 100644 index 15f59646c..000000000 --- a/armci/src-portals/portals.h +++ /dev/null @@ -1,194 +0,0 @@ -/* ---------------------------------------------------------------------------------------------- *\ - portals.h header -\* ---------------------------------------------------------------------------------------------- */ - # ifndef _PORTALS_H_ - # define _PORTALS_H_ - - # define PORTALS_INDEX 1 - - # define ONE_KB 1024 - # define ONE_MB 1048576 - - # define MAX_DS_MSG_SIZE ONE_MB - - # define PORTALS_MAX_DESCRIPTORS (MAX_BUFS+MAX_SMALL_BUFS) - # define PORTALS_MAX_BUFS MAX_BUFS - # define PORTALS_MAX_SMALL_BUFS MAX_SMALL_BUFS - # define PORTALS_BUF_SIZE MSG_BUFLEN /* defined in requesh.h */ - -/* define small buf length here - formerly request.h */ - # ifdef PORTALS_USE_RENDEZ_VOUS - # define PORTALS_SMALL_BUF_SIZE 1024 /* for use with nwchem only -- will not pass armci test.x */ - # define PORTALS_MAX_EAGER_MESSAGE_SIZE PORTALS_SMALL_BUF_SIZE - # else - # define PORTALS_SMALL_BUF_SIZE 1024 - # define PORTALS_MAX_EAGER_MESSAGE_SIZE PORTALS_BUF_SIZE - # endif - - # define PORTALS_NREQUEST_BUFFERS 40 - # define PORTALS_REQUEST_BUFFER_SIZE_WARNING (128*ONE_MB) - - # define PORTALS_READ_ACCESS 1 - # define PORTALS_WRITE_ACCESS 1000 - - # define MATCH_ALL_MBITS 0x8000000000000000 /* should be set for all data requests */ - # define MATCH_ALL_IBITS ~MATCH_ALL_MBITS /* used to mask out all other bits, but MATCH_ALL */ - - - # define STATE_SEND_START 0x1 - # define STATE_SEND_END 0x2 - # define STATE_REPLY_START 0x4 - # define STATE_REPLY_END 0x8 - # define STATE_ACK 0x10 - # define STATE_PUT_START 0x20 - # define STATE_PUT_END 0x40 - # define STATE_GET_START 0x80 - # define STATE_GET_END 0x100 - # define STATE_UNLINK 0x200 - - - # define DS_RESPONSE_ACK 0x100000000000000 - # define DS_RESPONSE_PUT 0x200000000000000 - # define DS_RESPONSE_GET 0x400000000000000 - - # define PORTALS_ALLOW_NBGETS - # define PORTALS_USE_ARMCI_CLIENT_BUFFERS - - # define PORTALS_PUT_USE_ACK_TURNED_OFF - # define PORTALS_PUT_USE_START_TURNED_OFF - # define PORTALS_GET_USE_START_TURNED_OFF - -/* ---------------------------------------------------------------------------------------------- *\ - portals types -\* ---------------------------------------------------------------------------------------------- */ - typedef struct portals_desc_s { - void* buffer; // used for the md - ptl_size_t length; // used for the md - ptl_process_id_t id; // on whom the operation is acting on - ptl_match_bits_t mbits; // operations destination mbits - ptl_hdr_data_t hdr; // used for puts/unique counter value - - ptl_handle_ni_t nih; // network interface handle - ptl_handle_eq_t eqh; // event handler - ptl_handle_me_t meh; // me handle (if necessary) - ptl_handle_md_t mdh; // md handle (if necessary) - - int state; // track outstanding events remaining on the descriptor - int done; // flag to test whether all work on the descriptor is finished - int noperations; // the number of remote operations allowed on buffer - // this is only used when preposting/pinning CP memory - // for remote operations initiated by the data server - } portals_desc_t; - - - typedef struct portals_ds_req_s { - portals_desc_t req_desc; - portals_desc_t ack_desc; - portals_desc_t data_desc; - ptl_process_id_t dsid; - size_t unique_msg_id; - int active; - int remote_node; - } portals_ds_req_t; - - -/* ---------------------------------------------------------------------------------------------- *\ - portals global variables -\* ---------------------------------------------------------------------------------------------- */ - /*ptl_handle_ni_t cp_nih;*/ - /*ptl_handle_ni_t ds_nih;*/ - /*ptl_handle_eq_t cp_eqh;*/ - /*ptl_handle_eq_t ds_eqh;*/ - ptl_process_id_t *portals_id_map; - ptl_process_id_t *portals_cloned_id_map; - - int portals_ds_ready; - int portals_cp_finished; - - size_t portalsMaxEagerMessageSize; - - -/* ---------------------------------------------------------------------------------------------- *\ - portals prototypes -\* ---------------------------------------------------------------------------------------------- */ - int portals_init(ptl_handle_ni_t*); - int portals_finalize(ptl_handle_ni_t); - int portals_getid(ptl_handle_ni_t,ptl_process_id_t *); - int portals_free_eq(ptl_handle_eq_t); - int portals_create_eq(ptl_handle_ni_t, ptl_size_t, ptl_handle_eq_t*); - int portals_create_matchall_me(ptl_handle_me_t*); - int portals_me_attach(ptl_handle_ni_t,ptl_process_id_t,ptl_match_bits_t,ptl_match_bits_t,ptl_handle_me_t*); - int portals_me_insert(ptl_handle_me_t,ptl_process_id_t,ptl_match_bits_t,ptl_match_bits_t,ptl_handle_me_t*); - int portals_me_unlink(ptl_handle_me_t); - int portals_md_attach(ptl_handle_me_t,ptl_md_t,ptl_unlink_t,ptl_handle_md_t*); - int portals_md_bind(ptl_handle_ni_t,ptl_md_t,ptl_unlink_t,ptl_handle_md_t*); - - int portals_eqwait(ptl_handle_eq_t,ptl_event_t*); - int portals_put(portals_desc_t*); - int portals_get(portals_desc_t*); - int portals_wait(portals_desc_t*); - - - void* portalsCloneDataServer(void *(*func)(void*)); - void portalsSpinLockOnInt(volatile int*, int, int); - - void portals_print_event_details(ptl_event_t *ev); - - void Fatal_error(int); - const char *Portals_ID(); - - void bit_print(const char *,int); - void hex_print(const char *,int); - void portals_print_summary(); - -/* ---------------------------------------------------------------------------------------------- *\ - portals data server prototypes -\* ---------------------------------------------------------------------------------------------- */ - void* portals_ds_thread(void* args); - int portals_ds_init(void); - int portals_ds(void); - int portal_send_test_ack(int to,int val); - int portals_ds_requeue_md(int); - - void portals_ds_get_from_cp(void*,ptl_size_t,ptl_process_id_t,ptl_match_bits_t); - - //void ds_handler(DDI_Patch*,ptl_process_id_t); - -/* ---------------------------------------------------------------------------------------------- *\ - portals compute process prototypes -\* ---------------------------------------------------------------------------------------------- */ - int portals_cp_init(void); - int portals_cp_getid(ptl_process_id_t *id); - - void portals_req_send(void *buffer, size_t size, portals_ds_req_t *req); - void portals_req_nbsend(void *buffer, size_t size, portals_ds_req_t *req); - void portals_req_wait(portals_ds_req_t *req); - - void portals_remote_get(void *buffer, request_header_t *msginfo, int remote_node); - void portals_remote_put(void *buffer, request_header_t *msginfo, int remote_node); - void portals_remote_acc(void *buffer, request_header_t *msginfo, int remote_node); - void portals_remote_rmw(void *buffer, request_header_t *msginfo, int remote_node, portals_ds_req_t *req); - void portals_remote_nbget(void *buffer, request_header_t *msginfo, int remote_node, portals_ds_req_t *req); - void portals_remote_nbput(void *buffer, request_header_t *msginfo, int remote_node, portals_ds_req_t *req); - void portals_remote_nbacc(void *buffer, request_header_t *msginfo, int remote_node, portals_ds_req_t *req); - - - void portalsRemoteOperation(void*,size_t,ptl_process_id_t,portals_ds_req_t*); - void portalsRemoteOperationToRank(void*,size_t,int,portals_ds_req_t*); - void portalsRemoteOperationToNode(void*,size_t,int,portals_ds_req_t*); - - void portalsBlockingRemoteOperationToNode(void*,size_t,int); - - -static inline unsigned int cpuid_ebx(unsigned int op) -{ - unsigned int eax, ebx; - - __asm__("cpuid" - : "=a" (eax), "=b" (ebx) - : "0" (op) - : "cx", "dx" ); - return ebx; -} - - # endif diff --git a/armci/src-portals/portals_cp.c b/armci/src-portals/portals_cp.c deleted file mode 100644 index 9505140bf..000000000 --- a/armci/src-portals/portals_cp.c +++ /dev/null @@ -1,913 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* ---------------------------------------------------------------------------------------------- *\ - portals_cp.c -- compute process portals calls - author: ryan olson - email: ryan@cray.com -\* ---------------------------------------------------------------------------------------------- */ - # include "armcip.h" - # include - # include - # include - # include - -/* ---------------------------------------------------------------------------------------------- *\ -\* ---------------------------------------------------------------------------------------------- */ - static ptl_handle_ni_t cp_nih; - static ptl_handle_eq_t cp_eqh; - static ptl_handle_eq_t cp_tx_eqh; - - static void *portals_eager_send_buffer = NULL; - static size_t portals_unique_msg_counter = 373; - - static int portals_smp_sem = -1; - static int *active_requests_by_node = NULL; - -/* ---------------------------------------------------------------------------------------------- *\ -\* ---------------------------------------------------------------------------------------------- */ - int portals_cp_finished = 0; - - -/* ---------------------------------------------------------------------------------------------- *\ - Implementation -\* ---------------------------------------------------------------------------------------------- */ - -int -portals_cp_init(void) -{ - int rc; - int me; - ptl_process_id_t id; - - rc = portals_init(&cp_nih); - if(rc != PTL_OK) { - printf("error in portals_init: err %d\n",rc); - Fatal_error(rc); - } - - rc = portals_create_eq(cp_nih,10*PORTALS_MAX_DESCRIPTORS,&cp_eqh); - if(rc != PTL_OK) { - printf("failed to create cp event queue; err %d\n",rc); - Fatal_error(911); - } - - rc = portals_create_eq(cp_nih,30,&cp_tx_eqh); - if(rc != PTL_OK) { - printf("failed to create cp_tx event queue; err %d\n",rc); - Fatal_error(911); - } - - rc = portals_cp_getid(&id); - if(rc != PTL_OK) { - printf("failed to get the portals id; err %d\n",rc); - Fatal_error(rc); - } - - /* creating an smp/intra-node communicator */ - MPI_Comm_rank(ARMCI_COMM_WORLD,&me); - MPI_Comm_split(ARMCI_COMM_WORLD,id.nid,me,&portals_smp_comm); - - /* set affinity */ - # ifdef PORTALS_AFFINITY - int smp_np, smp_me; - unsigned long mask; - unsigned int len = sizeof(mask); - unsigned long ncpus; - unsigned int nsockets, siblings; - int cores_per_socket, cps_per_socket; - int verbose = 0; - - MPI_Comm_size(portals_smp_comm,&smp_np); - MPI_Comm_rank(portals_smp_comm,&smp_me); - - - if((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) < 0) { - printf("%d [cp] sysconf(_SC_NPROCESSORS_ONLN) failed; err=%d\n", ncpus); - armci_die("sysconf in init_throttle",911); - } - - - if(sched_getaffinity(0, len, &mask) < 0) { - perror("sched_getaffinity"); - armci_die("getaffinity error in ds_init",911); - } - - if(armci_clus_me == 0 && /* verbose */ 0 ) { - printf("%d [cp]: old affinity = 0x%x, ncpus = %d\n", armci_me, mask, ncpus); - } - - if(smp_me == 0) { - mask = 1 << (ncpus-1); - if(sched_setaffinity(0, len, (cpu_set_t *) &mask) < 0) { - perror("sched_setaffinity to probe the socket count"); - armci_die("setaffinity error in ds_init",911); - } - siblings = cpuid_ebx(1) >>16 & 0xff; - nsockets = ncpus / siblings; - } - MPI_Bcast(&nsockets,1,MPI_INT,0,portals_smp_comm); - - cores_per_socket = ncpus/nsockets; - cps_per_socket = (smp_np / nsockets); - cps_per_socket += (smp_np % nsockets); - if(nsockets > 2) { - armci_die("nsockets > 2 not supported",911); - } - if(smp_me < cps_per_socket) { - mask = 1 << smp_me; - } else { - mask = 1 << (smp_me + (cores_per_socket - cps_per_socket)); - } - - if(sched_setaffinity(0, len, (cpu_set_t *) &mask) < 0) { - perror("sched_setaffinity"); - armci_die("setaffinity error in ds_init",911); - } - - if(sched_getaffinity(0, len, &mask) < 0) { - perror("sched_getaffinity"); - armci_die("getaffinity error (#2) in ds_init",911); - } - - if(armci_clus_me == 0 && verbose) { - printf("%d [cp]: new affinity = 0x%x, ncpus = %d\n", armci_me, mask, ncpus); - } - # endif - - return PTL_OK; -} - - -int -portals_cp_finalize() -{ - int rc; - - # ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - armci_semrm(portals_smp_sem); - # endif - - rc = portals_free_eq(cp_eqh); - if (rc != PTL_OK) { - printf("error freeing cp_eqh; err %d\n",rc); - } - - MPI_Barrier(ARMCI_COMM_WORLD); - MPI_Finalize(); - - portals_cp_finished = 1; - exit(0); - - return PTL_OK; -// return portals_finalize(cp_nih); -} - - -int -portals_cp_getid(ptl_process_id_t *id) -{ - return portals_getid(cp_nih, id); -} - - -static size_t -portals_get_unique_msg_id(void) { - size_t val = armci_me*1000; - portals_unique_msg_counter++; - if(portals_unique_msg_counter == 1000) portals_unique_msg_counter=1; - val += portals_unique_msg_counter; - return val; -} - - -static void -portals_req_clear(portals_ds_req_t *req) -{ - req->active = 0; - req->unique_msg_id = 0; - - req->req_desc.done = 1; - req->req_desc.state = 0; - req->req_desc.eqh = cp_tx_eqh; - - req->ack_desc.done = 1; - req->ack_desc.state = 0; - req->ack_desc.eqh = cp_eqh; - - req->data_desc.done = 1; - req->data_desc.state = 0; - req->data_desc.eqh = cp_eqh; - - req->remote_node = -1; -} - - -static ptl_process_id_t -portals_get_dsid_from_node(int remote_node) -{ - int rank = armci_clus_info[remote_node].master; - if(portals_cloned_id_map) return portals_cloned_id_map[rank]; - else return portals_id_map[rank]; -} - - -static ptl_process_id_t -portals_get_dsid_from_rank(int remote_id) -{ - if(portals_cloned_id_map) return portals_cloned_id_map[remote_id]; - else return portals_id_map[remote_id]; -} - -void -portals_req_nbsend(void *buffer, size_t size, portals_ds_req_t *req) -{ - int rc; - portals_desc_t *desc = &req->req_desc; - - assert(req->unique_msg_id); - assert(size < portalsMaxEagerMessageSize); - assert(req->remote_node >= 0); - - /* ---------------------------------------------------------------------------- *\ - if we get here, we can guarantee that where are no outstanding requests from - this PE to the remote node; however, we can not guarantee that other PEs on - this node aren't talking to the intended data server ... so now we wait on - value in the "shared" array. - \* ---------------------------------------------------------------------------- */ - # ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - int got_lock = 0; - while(!got_lock) { - portalsSpinLockOnInt(&active_requests_by_node[req->remote_node],0,1000); - semaphoreAcquire(portals_smp_sem,1,PORTALS_WRITE_ACCESS); - if(active_requests_by_node[req->remote_node] == 0) { - active_requests_by_node[req->remote_node] = 1; - got_lock = 1; - } - semaphoreRelease(portals_smp_sem,1,PORTALS_WRITE_ACCESS); - } - # endif - - desc->buffer = buffer; - desc->length = size; - desc->id = req->dsid; - desc->mbits = MATCH_ALL_MBITS; - desc->hdr = req->unique_msg_id; - desc->state = 0; - desc->eqh = cp_tx_eqh; - desc->nih = cp_nih; - - rc = portals_put(desc); - if(rc != PTL_OK) { - printf("portals_put err %d\n",rc); - Fatal_error(rc); - } -} - -void -portals_req_send(void *buffer, size_t size, portals_ds_req_t *req) -{ - int rc; - portals_desc_t *desc = &req->req_desc; - - portals_req_nbsend(buffer,size,req); - - rc = portals_wait(desc); - if(rc != PTL_OK) { - printf("portals_wait err %d\n",rc); - Fatal_error(rc); - } -} - - -void -portals_req_wait(portals_ds_req_t *req) -{ - int rc; - - if(req->req_desc.state) { - rc = portals_wait( &(req->req_desc) ); - if(rc != PTL_OK) { - printf("portals wait error on req_desc in req_wait; err=%d\n",rc); - Fatal_error(rc); - } - } - - if(req->ack_desc.state) { - rc = portals_wait( &(req->ack_desc) ); - if(rc != PTL_OK) { - printf("portals wait error on ack_desc in req_wait; err=%d\n",rc); - Fatal_error(rc); - } - } - if(req->data_desc.state) { - rc = portals_wait( &(req->data_desc) ); - if(rc != PTL_OK) { - printf("portals wait error on data_desc in req_wait; err=%d\n",rc); - Fatal_error(rc); - } - } - - req->active = 0; - return; -} - - -void -portalsWaitOnRequest(portals_ds_req_t *req) { - portals_req_wait(req); -} - - -static int -portals_prepost_ack_from_ds(portals_ds_req_t *req) -{ - int rc; - ptl_md_t md; - portals_desc_t *desc = &req->ack_desc; - unsigned long mbits = req->unique_msg_id; - - assert(req->unique_msg_id); - assert(req->remote_node >= 0); - - # ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - desc->buffer = &active_requests_by_node[req->remote_node]; - desc->length = sizeof(int); - # else - desc->buffer = NULL; - desc->length = 0; - # endif - desc->id = req->dsid; - desc->mbits = mbits | DS_RESPONSE_ACK; - desc->hdr = mbits; - desc->eqh = cp_eqh; - - rc = portals_me_attach(cp_nih,desc->id,desc->mbits,0,&desc->meh); - if(rc != PTL_OK) { - printf("me failed in prepost ack\n"); - Fatal_error(rc); - } - - md.start = desc->buffer; - md.length = desc->length; - md.threshold = 1; - md.options = PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE; - md.user_ptr = desc; - md.eq_handle = cp_eqh; - - rc = portals_md_attach(desc->meh,md,PTL_UNLINK,&desc->mdh); - if(rc != PTL_OK) { - printf("md failed in prepost ack\n"); - Fatal_error(rc); - } - - // desc->state = STATE_PUT_END; - // |= needed for rendez-vous gets; put and get using the same descriptor - desc->state |= STATE_PUT_END; - desc->done = 0; -} - - -static int -portals_prepost_put_from_ds(void *buffer, size_t size, portals_ds_req_t *req) -{ - int rc; - int nputs; - ptl_md_t md; - portals_desc_t *desc = &req->data_desc; - unsigned long mbits = req->unique_msg_id; - - assert(req->unique_msg_id); - - desc->buffer = buffer; - desc->length = size; - desc->id = req->dsid; - desc->mbits = mbits | DS_RESPONSE_PUT; - desc->hdr = mbits; - desc->eqh = cp_eqh; - - rc = portals_me_attach(cp_nih,desc->id,desc->mbits,0,&desc->meh); - if(rc != PTL_OK) { - printf("me failed in prepost put\n"); - Fatal_error(rc); - } - - md.start = buffer; - md.length = size; - md.threshold = desc->noperations; - md.options = PTL_MD_OP_PUT - | PTL_MD_EVENT_AUTO_UNLINK_ENABLE - | PTL_MD_EVENT_START_DISABLE - | PTL_MD_EVENT_END_DISABLE; - md.user_ptr = (void *) desc; - md.eq_handle = cp_eqh; - - rc = portals_md_attach(desc->meh,md,PTL_UNLINK,&desc->mdh); - if(rc != PTL_OK) { - printf("md failed in prepost put\n"); - Fatal_error(rc); - } - - // desc->state = STATE_UNLINK; - // |= needed for rendez-vous gets; put and get using the same descriptor - desc->state |= STATE_UNLINK; - desc->done = 0; -} - - -static int -portals_prepost_get_from_ds(void *buffer, size_t size, portals_ds_req_t *req) { - - int rc; - ptl_md_t md; - portals_desc_t *desc = &req->data_desc; - unsigned long mbits = req->unique_msg_id; - - assert(req->unique_msg_id); - - desc->buffer = buffer; - desc->length = size; - desc->id = req->dsid; - desc->mbits = mbits | DS_RESPONSE_GET; - desc->hdr = mbits; - desc->eqh = cp_eqh; - - rc = portals_me_attach(cp_nih,desc->id,desc->mbits,0,&desc->meh); - if(rc != PTL_OK) { - printf("me failed in prepost get\n"); - Fatal_error(rc); - } - - md.start = buffer; - md.length = size; - md.threshold = desc->noperations; - md.options = PTL_MD_OP_GET - | PTL_MD_EVENT_START_DISABLE; - // | PTL_MD_EVENT_AUTO_UNLINK_ENABLE - // | PTL_MD_EVENT_START_DISABLE - // | PTL_MD_EVENT_END_DISABLE; - md.user_ptr = (void *) desc; - md.eq_handle = cp_eqh; - - rc = portals_md_attach(desc->meh,md,PTL_UNLINK,&desc->mdh); - if(rc != PTL_OK) { - printf("md failed in prepost get\n"); - Fatal_error(rc); - } - - // printf("%d: preposted get of lenght=%ld\n",armci_me,size); - // desc->state = STATE_UNLINK; - // desc->state = STATE_GET_END; - // |= needed for rendez-vous gets; put and get using the same descriptor - desc->state |= STATE_GET_END; - desc->done = 0; -} - - -void portalsBlockingRemoteOperationToNode(void *buffer, size_t length, int remote_node) { - portals_ds_req_t req; - portals_req_clear(&req); - portalsRemoteOperationToNode(buffer,length,remote_node,&req); - portalsWaitOnRequest(&req); -} - - -void portalsRemoteOperationToNode(void *buffer, size_t length, int remote_node, portals_ds_req_t *req) -{ - ptl_process_id_t id = portals_get_dsid_from_node(remote_node); - req->remote_node = remote_node; - portalsRemoteOperation(buffer,length,id,req); -} - - -/* -void portalsRemoteOperationToRank(void *buffer, size_t length, int remote_rank, portals_ds_req_t *req) { - ptl_process_id_t id = portals_get_dsid_from_rank(remote_rank); - portalsRemoteOperation(buffer,length,id,req); -} -*/ - - -void -portalsRemoteOperation(void *buffer, size_t length, ptl_process_id_t dsid, portals_ds_req_t *req) -{ - /* --------------------------------------------------------------------- *\ - initialize the data server request - \* --------------------------------------------------------------------- */ - // portals_req_clear(req); - req->active = 1; - req->unique_msg_id = portals_get_unique_msg_id(); - req->dsid = dsid; - - /* --------------------------------------------------------------------- *\ - the only response from the ds will be a 0-byte ack coming in as a put - \* --------------------------------------------------------------------- */ - portals_prepost_ack_from_ds(req); - - /* --------------------------------------------------------------------- *\ - send data request; this is a completely blocking req - \* --------------------------------------------------------------------- */ - portals_req_send(buffer,length,req); -} - - -void -portals_send_oper(int remote_node,int val, portals_ds_req_t *req) -{ - int rc; - request_header_t msg; - - /* --------------------------------------------------------------------- *\ - initialize the data server request - \* --------------------------------------------------------------------- */ - portals_req_clear(req); - req->active = 1; - req->unique_msg_id = portals_get_unique_msg_id(); - req->dsid = portals_get_dsid_from_node(remote_node); - req->remote_node = remote_node; - - /* --------------------------------------------------------------------- *\ - the only response from the ds will be a 0-byte ack coming in as a put - \* --------------------------------------------------------------------- */ - portals_prepost_ack_from_ds(req); - - /* --------------------------------------------------------------------- *\ - prepare data request and send it; this is a completely blocking req - \* --------------------------------------------------------------------- */ - msg.operation = val; - portals_req_send(&msg,sizeof(request_header_t),req); - return; -} - - -void -portals_send_QUIT(int remote_node) -{ - portals_ds_req_t req; - portals_send_oper(remote_node,QUIT,&req); - portals_req_wait(&req); -} - - -static int -portals_determine_remote_op_count(request_header_t *msg) -{ -#ifdef DDI - int nr,nc,np; - int datatype_extent = sizeof(double); - - /* --------------------------------------------------------------------- *\ - previously we have worked with words, but to provide support for - other data types, we must work with bytes. note to developers: - datatype_extent = the size in bytes of the stored datatype - \* --------------------------------------------------------------------- */ - if(msg->size*datatype_extent <= MAX_DS_MSG_SIZE) return 1; - - /* --------------------------------------------------------------------- *\ - the data must be moved in segments; determine patch dimensions - \* --------------------------------------------------------------------- */ - nr = msg->ihi - msg->ilo + 1; - nc = msg->jhi - msg->jlo + 1; - - /* --------------------------------------------------------------------- *\ - each column individually is too long to fit in the buffer - \* --------------------------------------------------------------------- */ - if(nr*datatype_extent < MAX_DS_MSG_SIZE) { - - /* ------------------------------------------------------------------ *\ - np the number of "evenly" sized passed needed to send a column - \* ------------------------------------------------------------------ */ - np = 2; - while(((nr/np)+((nr%np)?1:0)*datatype_extent)>MAX_DS_MSG_SIZE) np++; - - /* ------------------------------------------------------------------ *\ - noperations is np times the number of columns to be sent - \* ------------------------------------------------------------------ */ - return np*nc; - - } - - /* --------------------------------------------------------------------- *\ - determine the number of full columns that can be sent in one pass - break down the subpatch on this metric - \* --------------------------------------------------------------------- */ - else { - - /* ------------------------------------------------------------------ *\ - np is the number of passes needed to send the full patch which - is broken down into "evenly" sized sets of columns that fit in - the allocated buffer region - \* ------------------------------------------------------------------ */ - np = 2; - while(nr*((nc/np)+((nc%np)?1:0))*datatype_extent>MAX_DS_MSG_SIZE) np++; - - /* ------------------------------------------------------------------ *\ - noperations is np - \* ------------------------------------------------------------------ */ - return np; - - } - - assert(0); // should not happen - return -1; -#else - return 1; -#endif -} - -void -portals_remote_rmw(void *buffer, request_header_t *msginfo, int remote_node, portals_ds_req_t *req) -{ - ptl_size_t length; - - /* --------------------------------------------------------------------- *\ - initialize the data server request - \* --------------------------------------------------------------------- */ - portals_req_clear(req); - req->active = 1; - req->unique_msg_id = portals_get_unique_msg_id(); - req->dsid = portals_get_dsid_from_node(remote_node); - req->remote_node = remote_node; - - /* --------------------------------------------------------------------- *\ - prepare the buffer into which the ds will put data - \* --------------------------------------------------------------------- */ - req->data_desc.noperations=portals_determine_remote_op_count(msginfo); - portals_prepost_put_from_ds(buffer,msginfo->datalen,req); - - # ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - portals_prepost_ack_from_ds(req); - # endif - - /* --------------------------------------------------------------------- *\ - send data request - note: from armci_send_req - if get, the value of bytes (local: length) - is msginfo->dscrlen + (hdrlen=sizeof(request_header_t) ... this is - the size of the "data server request message" to be sent - \* --------------------------------------------------------------------- */ - length = sizeof(request_header_t) + msginfo->dscrlen + msginfo->datalen; - portals_req_send(msginfo,length,req); -} - -void -portals_remote_get(void *buffer, request_header_t *msginfo, int remote_node) -{ - portals_ds_req_t req; - portals_remote_nbget(buffer,msginfo,remote_node,&req); - portals_req_wait(&req); -} - -void -portals_remote_nbget(void *buffer, request_header_t *msginfo, int remote_node, portals_ds_req_t *req) -{ - ptl_size_t length; - - /* --------------------------------------------------------------------- *\ - initialize the data server request - \* --------------------------------------------------------------------- */ - portals_req_clear(req); - req->active = 1; - req->unique_msg_id = portals_get_unique_msg_id(); - req->dsid = portals_get_dsid_from_node(remote_node); - req->remote_node = remote_node; - - /* --------------------------------------------------------------------- *\ - prepare the buffer into which the ds will put data - \* --------------------------------------------------------------------- */ - req->data_desc.noperations=portals_determine_remote_op_count(msginfo); - portals_prepost_put_from_ds(buffer,msginfo->datalen,req); - - # ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - portals_prepost_ack_from_ds(req); - # endif - - /* --------------------------------------------------------------------- *\ - send data request - note: from armci_send_req - if get, the value of bytes (local: length) - is msginfo->dscrlen + (hdrlen=sizeof(request_header_t) ... this is - the size of the "data server request message" to be sent - \* --------------------------------------------------------------------- */ - length = sizeof(request_header_t) + msginfo->dscrlen; - - # if defined(PORTALS_USE_RENDEZ_VOUS) - if(length < portalsMaxEagerMessageSize) portals_req_send(msginfo,length,req); - else { - req->data_desc.noperations = 1; - portals_prepost_get_from_ds(msginfo,length,req); - - /* ------------------------------------------------------------------ *\ - send data request: branch here for eager vs. rendez-vous - \* ------------------------------------------------------------------ */ - assert(length <= PORTALS_BUF_SIZE); - portals_req_send(msginfo,sizeof(request_header_t),req); - } - # else - portals_req_send(msginfo,length,req); - # endif -} - - -void -portals_remote_put(void *buffer, request_header_t *msginfo, int remote_node) -{ - portals_ds_req_t req; - portals_remote_nbput(buffer,msginfo,remote_node,&req); - portals_req_wait(&req); -} - - -void -portals_remote_nbput(void *buffer, request_header_t *msginfo, int remote_node, portals_ds_req_t *req) -{ - char *eagerBuffer = NULL; - size_t eagerSendSize = 0; - - /* --------------------------------------------------------------------- *\ - initialize the data server request - \* --------------------------------------------------------------------- */ - portals_req_clear(req); - req->active = 1; - req->unique_msg_id = portals_get_unique_msg_id(); - req->dsid = portals_get_dsid_from_node(remote_node); - req->remote_node = remote_node; - - /* --------------------------------------------------------------------- *\ - prepost ack response from the data server - \* --------------------------------------------------------------------- */ - portals_prepost_ack_from_ds(req); - - /* --------------------------------------------------------------------- *\ - eager vs. rendez-vous messaging - eager: pack and send the message immediate (only for small messages) - developers note: since portals_eager_send_buffer only exists once, - this has to be a blocking send (ie the data is on the wire when - req_send has finished and the buffer can be reused. for greater - overlap, create a set of eager send buffers ... however they have to - be managed ... probably best to do it in a ring. - - note: armci put/acc buffer is prepacked. - \* --------------------------------------------------------------------- */ - eagerSendSize = sizeof(request_header_t) + msginfo->dscrlen + msginfo->datalen; - if(eagerSendSize < portalsMaxEagerMessageSize) { -// printf("sending eager message\n"); - # if 0 /* armci prepacked */ - eagerBuffer = (char *) portals_eager_send_buffer; - memcpy(eagerBuffer,msginfo,sizeof(request_header_t)); - eagerBuffer += sizeof(request_header_t); - memcpy(eagerBuffer,buffer,msginfo->bytes); - # endif - eagerBuffer = (char *) msginfo; /* buffer == msginfo for armci */ - portals_req_send(eagerBuffer,eagerSendSize,req); - } - - /* --------------------------------------------------------------------- *\ - rendez-vous: send the ds a request; ds will "get/pull" data - \* --------------------------------------------------------------------- */ - else { - # ifdef PORTALS_USE_RENDEZ_VOUS - /* ------------------------------------------------------------------ *\ - prepare the buffer into which the ds will put data - \* ------------------------------------------------------------------ */ - // req->data_desc.noperations=portals_determine_remote_op_count(msginfo); - req->data_desc.noperations = 1; - portals_prepost_get_from_ds(msginfo,eagerSendSize,req); - - /* ------------------------------------------------------------------ *\ - send data request: branch here for eager vs. rendez-vous - \* ------------------------------------------------------------------ */ - assert(eagerSendSize <= PORTALS_BUF_SIZE); - portals_req_send(msginfo,sizeof(request_header_t),req); - - # else - printf("%d [cp]: rendez-vous messaging not supported\n",armci_me); - abort(); - # endif - } - - -} - - -#if 0 -void -portals_remote_acc(void *buffer, request_header_t *msginfo, int remote_node) -{ - portals_ds_req_t req; - portals_remote_nbacc(buffer,msginfo,remote_node,&req); - portals_req_wait(&req); -} - - -void -portals_remote_nbacc(void *buffer, request_header_t *msginfo, int remote_node, portals_ds_req_t *req) -{ - char *eagerBuffer = NULL; - size_t eagerSendSize = 0; - - assert(msginfo->bytes); - - /* --------------------------------------------------------------------- *\ - initialize the data server request - \* --------------------------------------------------------------------- */ - portals_req_clear(req); - req->active = 1; - req->unique_msg_id = portals_get_unique_msg_id(); - req->dsid = portals_get_dsid_from_node(remote_node); - - /* --------------------------------------------------------------------- *\ - eager vs. rendez-vous messaging - eager: pack and send the message immediate (only for small messages) - \* --------------------------------------------------------------------- */ - eagerSendSize = msginfo->bytes + sizeof(request_header_t); - if(eagerSendSize < portalsMaxEagerMessageSize) { - - /* ------------------------------------------------------------------ *\ - prepost ack response from the data server - developers note: if you globally fence an array with a collective - operation prior to a section of code and defence it after, then you - don't need to micro manage the fence on a per request basis in that - section; this eliminates the need for a DS ack - \* ------------------------------------------------------------------ */ - portals_prepost_ack_from_ds(req); - - /* ------------------------------------------------------------------ *\ - pack and send eager data request - blocking for now, since portals_eager_send_buffer only exists once - create multiple eager buffers for greater overlap - \* ------------------------------------------------------------------ */ - eagerBuffer = (char *) portals_eager_send_buffer; - memcpy(eagerBuffer,msginfo,sizeof(request_header_t)); - eagerBuffer += sizeof(request_header_t); - memcpy(eagerBuffer,buffer,msginfo->bytes); - eagerBuffer = (char *) portals_eager_send_buffer; - portals_req_send(eagerBuffer,eagerSendSize,req); - } - - /* --------------------------------------------------------------------- *\ - rendez-vous: send the ds a request; ds will "get/pull" data - developers note: a ds ack is not required for a rendez-vous pull, - this is because the ds will not start the pull until a local fence - has been raised (if needed - see note above) - \* --------------------------------------------------------------------- */ - else { - /* ------------------------------------------------------------------ *\ - prepare the buffer from which the ds will pull data - \* ------------------------------------------------------------------ */ - req->data_desc.noperations=portals_determine_remote_op_count(msginfo); - portals_prepost_get_from_ds(buffer,msginfo->bytes,req); - - /* ------------------------------------------------------------------ *\ - send data request - \* ------------------------------------------------------------------ */ - portals_req_send(msginfo,sizeof(request_header_t),req); - portalsWaitOnRequest(req); - } -} -#endif - -extern int armci_shmget(size_t,char*); -extern int armci_semget(int); -extern void *shmat(int,int,int); - -void -portals_cp_init_throttle(int nnodes) -{ - int i, shmid, smp_np, smp_me; - size_t size = nnodes*sizeof(int); - char *buf = NULL; - - MPI_Comm_size(portals_smp_comm,&smp_np); - MPI_Comm_rank(portals_smp_comm,&smp_me); - - - # ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - if(armci_me == armci_master) { - if(smp_me != 0) armci_die("smp_me and armci_master are different",911); - } - - if(smp_me == 0) { - shmid = armci_shmget(size,"portals_cp_init_throttle"); - active_requests_by_node = (int *) shmat(shmid,0,0); - if(active_requests_by_node == (void *) -1) { - printf("%d [cp] shmat failed for shmid %d\n",armci_me,shmid); - armci_die("badness",911); - } - armci_shmrm(shmid); - for(i=0; i - # include - # include - # include - -static ptl_handle_ni_t ds_nih; -static ptl_handle_eq_t ds_eqh; -static ptl_handle_eq_t request_eqh; -static ptl_handle_me_t matchall_meh; - -static int request_buffer_cur_block; -static ptl_md_t request_buffer_md[PORTALS_NREQUEST_BUFFERS]; -static ptl_handle_me_t request_buffer_meh[PORTALS_NREQUEST_BUFFERS]; - -int portals_ds_ready = 0; - -// void *portals_ds_working_buffer = NULL; - -void* -portals_ds_thread(void* args) -{ - portals_ds_init(); - portals_ds(); - portals_ds_finalize(); - portalsSpinLockOnInt(&portals_cp_finished,1,1000); - exit(0); - return NULL; -} - - -int -portals_ds_init() -{ - int i,rc; - size_t bufferSize; - float warningSize; - - portals_ds_ready = 0; - - /* --------------------------------------------------------------------- *\ - unhook set affinity ... data servers can roam - \* --------------------------------------------------------------------- */ - # ifdef PORTALS_AFFINITY - int smp_np, smp_me; - unsigned long mask; - unsigned int len = sizeof(mask); - unsigned long ncpus; - int verbose = 0; - - MPI_Comm_size(portals_smp_comm,&smp_np); - MPI_Comm_rank(portals_smp_comm,&smp_me); - - if((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) < 0) { - printf("%d [ds] sysconf(_SC_NPROCESSORS_ONLN) failed; err=%d\n", armci_me, ncpus); - armci_die("sysconf in init_throttle",911); - } - - if(sched_getaffinity(0, len, &mask) < 0) { - perror("sched_getaffinity"); - armci_die("getaffinity error in ds_init",911); - } - - if(armci_clus_me == 0 && /* verbose */ 0 ) { - printf("%d [ds]: old affinity = 0x%x, ncpus = %d\n", armci_me, mask, ncpus); - } - - if(smp_np == ncpus) { - mask = (1 << ncpus) - 1; /* let the data server roam over all cores */ - } else { - mask = 1 << (ncpus - 1); /* pin the ds to the last core on the node */ - } - - if(sched_setaffinity(0, len, (cpu_set_t *) &mask) < 0) { - perror("sched_setaffinity"); - armci_die("setaffinity error in ds_init",911); - } - - if(sched_getaffinity(0, len, &mask) < 0) { - perror("sched_getaffinity"); - armci_die("getaffinity error (#2) in ds_init",911); - } - - if(armci_clus_me == 0 && verbose) { - printf("%d [ds]: new affinity = 0x%x, ncpus = %d\n", armci_me, mask, ncpus); - } - # endif - - /* --------------------------------------------------------------------- *\ - initialize the network interface - \* --------------------------------------------------------------------- */ - rc = portals_init(&ds_nih); - if (rc != PTL_OK) { - printf("failed to initialize portals on ds; err %d\n",rc); - Fatal_error(rc); - } - - /* --------------------------------------------------------------------- *\ - used for responding to data requests; this keeps the response events - in a separate queue from the multitude of incoming data requests - \* --------------------------------------------------------------------- */ - rc = portals_create_eq(ds_nih, 200, &ds_eqh); - - /* --------------------------------------------------------------------- *\ - used to process incoming data requests. at very large scale we will - have to do some sort of messaging by node group to reduce the worst - case scenario off all to one type operations. use the data server - to message forward from node groups. - \* --------------------------------------------------------------------- */ - i = ARMCI_MAX(6*PORTALS_MAX_DESCRIPTORS*armci_nproc,200); - i = ARMCI_MAX(6*armci_nproc,200); - rc = portals_create_eq(ds_nih, i, &request_eqh); - if (rc != PTL_OK) { - printf("failed to create request event queue"); - Fatal_error(rc); - } - - /* --------------------------------------------------------------------- *\ - create ME list that matches all incoming data requests - this will be a dead ME with no MD ... it will only be used as a - place holder in which the "active" me/md will be placed in front of. - \* --------------------------------------------------------------------- */ - rc = portals_create_matchall_me(&matchall_meh); - if (rc != PTL_OK) { - printf("failed to create matchall ME\n"); - Fatal_error(rc); - } - - /* --------------------------------------------------------------------- *\ - create buffer space for the ds buffer - \* --------------------------------------------------------------------- */ - assert(portalsMaxEagerMessageSize > sizeof(request_header_t)); - bufferSize = portalsMaxEagerMessageSize; - - # ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - bufferSize *= armci_nclus; - # else - bufferSize *= armci_nproc; - # endif - bufferSize = bufferSize/(PORTALS_NREQUEST_BUFFERS-2); - bufferSize = ARMCI_MAX(bufferSize,portalsMaxEagerMessageSize); - - // if(armci_me == 0) printf("%s: bufferSize=%ld\n",Portals_ID(),bufferSize); -/* - if(bufferSize*PORTALS_NREQUEST_BUFFERS > PORTALS_REQUEST_BUFFER_SIZE_WARNING) { - warningSize = (float) bufferSize * PORTALS_NREQUEST_BUFFERS; - warningSize /= ONE_MB; - printf("[data server]: internal request buffer is %.2f MB\n",warningSize); - } -*/ - for(i=0; itag.user_ptr = (void *) &ev; - - if(request->operation == PUT || ARMCI_ACC(request->operation)) { - buffersize = sizeof(request_header_t) + request->dscrlen + request->datalen; - if(buffersize >= portalsMaxEagerMessageSize) { - buffer = (char *) MessageRcvBuffer; - portals_ds_get_from_cp(buffer,buffersize,ev.initiator,ev.hdr_data); - request = (request_header_t *) buffer; - request->tag.user_ptr = (void *) &ev; - armci_data_server(buffer); - // printf("%d: FINISHED RENDEZ-VOUS!\n",armci_me); - break; - } - } - - if(request->operation == GET) { - buffersize = sizeof(request_header_t) + request->dscrlen; - if(buffersize >= portalsMaxEagerMessageSize) { - buffer = (char *) MessageRcvBuffer; - portals_ds_get_from_cp(buffer,buffersize,ev.initiator,ev.hdr_data); - request = (request_header_t *) buffer; - request->tag.user_ptr = (void *) &ev; - armci_data_server(buffer); - // printf("%d: FINISHED RENDEZ-VOUS!\n",armci_me); - break; - } - } - - /* ------------------------------------------------------------- *\ - process request - \* ------------------------------------------------------------- */ - armci_data_server(buffer); - if(request->operation == QUIT) active = 0; - break; - - case PTL_EVENT_UNLINK: -// printf("captured an unlink event!!\n"); -// portals_print_event_details(&ev); - /* - if((long) ev.md.user_ptr != request_buffer_cur_block) { - printf("sanity check failed: user_ptr=%ld; cur_block=%ld\n",(long) ev.md.user_ptr, request_buffer_cur_block); - armci_die("hummm ... unlink issue?",911); - } - */ - portals_ds_requeue_md((long) ev.md.user_ptr); - break; - - default: - printf("unexpected event type %d in recvany\n"); - Fatal_error(911); - break; - } - - } while(active); - -// flush out event q; the only thing that should remain is possibly 1 unlink event; - while( (rc=PtlEQGet(request_eqh, &ev)) != PTL_EQ_EMPTY) { - if(rc == PTL_OK) { - if(ev.type != PTL_EVENT_UNLINK) { - printf("%s: flushing request_eqh: event type=%d\n",Portals_ID(),ev.type); - } else { - portals_ds_requeue_md((long) ev.md.user_ptr); - } - } - else if(rc == PTL_EQ_DROPPED) { - printf("%s: eq dropped\n",Portals_ID()); - } - else { - printf("%s: some error in PtlEQGet; err=%d\n",Portals_ID(),rc); - Fatal_error(rc); - } - } - - return PTL_OK; -} - - - - -int -portals_ds_finalize() -{ - int i,rc; - - // unlink and request buffers - for(i=0; itype, ev->offset, ev->mlength, ev->hdr_data, (long) ev->md.user_ptr); - fflush(stdout); -} - - -int -portals_ds_requeue_md(int i) -{ - int rc; - ptl_handle_me_t meh; - ptl_handle_md_t mdh; - ptl_process_id_t match_id; - ptl_match_bits_t match_bits = MATCH_ALL_MBITS; - ptl_match_bits_t ignore_bits = MATCH_ALL_IBITS; - - match_id.nid = PTL_NID_ANY; - match_id.pid = PTL_PID_ANY; - - rc = portals_me_insert(matchall_meh,match_id,match_bits,ignore_bits,&meh); - if(rc != PTL_OK) { - printf("me insert failed in ds requeue md; err %d\n",rc); - Fatal_error(rc); - } - - rc = portals_md_attach(meh,request_buffer_md[i],PTL_UNLINK,&mdh); - if(rc != PTL_OK) { - printf("md attach failed in ds requeue md; err %d\n",rc); - Fatal_error(rc); - } - - request_buffer_meh[i] = meh; - request_buffer_cur_block++; - if(request_buffer_cur_block == PORTALS_NREQUEST_BUFFERS) request_buffer_cur_block=0; - - return PTL_OK; -} - - -int -portals_create_matchall_me(ptl_handle_me_t* me_handle) -{ - int rc; - ptl_process_id_t match_id; - ptl_match_bits_t match_bits = MATCH_ALL_MBITS; - ptl_match_bits_t ignore_bits = MATCH_ALL_IBITS; - - match_id.nid = PTL_NID_ANY; - match_id.pid = PTL_PID_ANY; - - rc = portals_me_attach(ds_nih,match_id,match_bits,ignore_bits,&matchall_meh); - - if (rc != PTL_OK) { - printf("PtlMEAttachAny err %d in portals_create_melist\n",rc); - return rc; - } - - return rc; -} - - -void -portals_ds_send_ack(ptl_process_id_t id, ptl_match_bits_t mbits) -{ - portals_desc_t desc; - # ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE - static int ack = 0; - desc.buffer = &ack; - desc.length = sizeof(int); - # else - desc.buffer = NULL; - desc.length = 0; - # endif - desc.id = id; - desc.mbits = mbits | DS_RESPONSE_ACK; - desc.hdr = mbits; - desc.state = 0; - desc.eqh = ds_eqh; - desc.nih = ds_nih; - portals_put(&desc); - portals_wait(&desc); -} - - -void -portals_ds_send_put(void *buffer, ptl_size_t length, ptl_process_id_t id, ptl_match_bits_t mbits) -{ - portals_desc_t desc; - desc.buffer = buffer; - desc.length = length; - desc.id = id; - desc.mbits = mbits | DS_RESPONSE_PUT; - desc.hdr = mbits; - desc.state = 0; - desc.eqh = ds_eqh; - desc.nih = ds_nih; - portals_put(&desc); - portals_wait(&desc); -} - - -void -portals_ds_get_from_cp(void *buffer, ptl_size_t length, ptl_process_id_t id, ptl_match_bits_t mbits) -{ - portals_desc_t desc; - desc.buffer = buffer; - desc.length = length; - desc.id = id; - desc.mbits = mbits | DS_RESPONSE_GET; - desc.hdr = mbits; - desc.state = 0; - desc.eqh = ds_eqh; - desc.nih = ds_nih; - portals_get(&desc); - portals_wait(&desc); -} - - -#ifdef DDI -static void -ds_handler(DDI_Patch *request, ptl_process_id_t from) -{ - int i,j,nr,nc; - long array[10],*a; - size_t size; - char *data_ptr; - portals_desc_t desc; - ptl_event_t *ev = (ptl_event_t *) request->user_ptr; - - switch(request->oper) { - - case DDI_GET: -// printf("%s received DDI_GET request of size %d\n",Portals_ID(),request->size); - nr = request->ihi - request->ilo + 1; - nc = request->jhi - request->jlo + 1; - if(nr < 0 || nc < 0 || nr > 10 || nc > 1) { - printf("test get dimension problem\n"); - abort(); - } - - if(nr*sizeof(long) != request->size) { - printf("test get request size does not match\n"); - abort(); - } - - for(i=0,j=317; iinitiator; - desc.mbits = ev->hdr_data | DS_RESPONSE_PUT; - desc.hdr = ev->hdr_data; - desc.state = 0; - desc.eqh = ds_eqh; - desc.nih = ds_nih; - portals_put(&desc); - portals_wait(&desc); - break; - - case DDI_PUT: - nr = request->ihi - request->ilo + 1; - nc = request->jhi - request->jlo + 1; - - data_ptr = NULL; - if(ev->mlength > sizeof(DDI_Patch)) { - printf("recv'ed eager put - size %d\n",ev->mlength-sizeof(DDI_Patch)); - data_ptr = (char *) request; - data_ptr += sizeof(DDI_Patch); - } - - if(request->size != ev->mlength-sizeof(DDI_Patch)) { - printf("eager msg buffer length does not match request size %d\n",request->size); - abort(); - } - - a = (long *) data_ptr; - for(i=0; iinitiator,ev->hdr_data); - break; - - case DDI_QUIT: -// printf("%s received DDI_QUIT request\n",Portals_ID()); - portals_ds_send_ack(ev->initiator,ev->hdr_data); -/* - desc.buffer = NULL; - desc.length = 0; - desc.id = ev->initiator; - desc.mbits = ev->hdr_data | DS_RESPONSE_ACK; - desc.hdr = ev->hdr_data; - desc.state = 0; - portals_put(&desc); - portals_wait(&desc); -*/ - break; - - case DDI_MEMORY: - DDI_Memory_server(request->size); - portals_ds_send_ack(ev->initiator,ev->hdr_data); - break; - - default: - printf("%s unknown operation in request=%d\n",Portals_ID(),request->oper); - abort(); - break; - } - - return; -} -#endif diff --git a/armci/src-portals/request.c b/armci/src-portals/request.c deleted file mode 100644 index db7df55ad..000000000 --- a/armci/src-portals/request.c +++ /dev/null @@ -1,1068 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: request.c,v 1.74.2.11 2007-10-18 06:09:37 d3h325 Exp $ */ -#include "armcip.h" -#include "request.h" -#include "memlock.h" -#include "armci_shmem.h" -#include "copy.h" -#include "gpc.h" -#include -#include - -#define DEBUG_ 0 -#define DEBUG_MEM 0 - -#if 0 -# define MARK_ENTER(func_) { fprintf(stdout, "ENTERING %s\n", func_); fflush(stdout); } -# define MARK_EXIT(func_) { fprintf(stdout, "EXITING %s\n", func_); fflush(stdout); } -#else -# define MARK_ENTER(func_) -# define MARK_EXIT(func_) -#endif - -#if 0 -# define PRNDBG3(m,a1,a2,a3) \ - fprintf(stderr,"DBG %d: " m,armci_me,a1,a2,a3);fflush(stderr) -# define PRNDBG(m) PRNDBG3(m,0,0,0) -# define PRNDBG1(m,a1) PRNDBG3(m,a1,0,0) -# define PRNDBG2(m,a1,a2) PRNDBG3(m,a1,a2,0) -#else -# define PRNDBG(m) -# define PRNDBG1(m,a1) -# define PRNDBG2(m,a1,a2) -# define PRNDBG3(m,a1,a2,a3) -#endif - - -#if !defined(GM) && !defined(VIA) && !defined(LAPI) &&!defined(VAPI) - double _armci_rcv_buf[MSG_BUFLEN_DBL]; - double _armci_snd_buf[MSG_BUFLEN_DBL]; - char* MessageSndBuffer = (char*)_armci_snd_buf; - char* MessageRcvBuffer = (char*)_armci_rcv_buf; -#endif - - -#define MAX_EHLEN 248 -#define ADDBUF(buf,type,val) *(type*)(buf) = (val); (buf) += sizeof(type) -#define GETBUF(buf,type,var) (var) = *(type*)(buf); (buf) += sizeof(type) - -#define ALLIGN8(buf){size_t _adr=(size_t)(buf); \ - _adr>>=3; _adr<<=3; _adr+=8; (buf) = (char*)_adr; } - -#ifndef CLN -# define CLN 1 -#endif -#ifndef SERV -# define SERV 2 -#endif - -/*******************Routines to handle completion descriptor******************/ -/*\ - *Following the the routines to fill a completion descriptor, if necessary - *copy the data to destination based on completion descriptor - *NOTE, THE FOLLOWING ROUTINES ARE FOR CLIENTS ONLY -\*/ - - -/*\Routine to complete a vector request, data is in buf and descriptor in dscr -\*/ -extern int armci_direct_vector_get(request_header_t *msginfo , armci_giov_t darr[], int len, int proc); -static void armci_complete_vector_get(armci_giov_t darr[],int len,void *buf) -{ -int proc; -request_header_t *msginfo = (request_header_t*) buf; - proc = msginfo->to; -#if defined(USE_SOCKET_VECTOR_API) - armci_direct_vector_get(msginfo, darr, len, proc); -#else - armci_rcv_vector_data(proc, msginfo, darr, len); -#endif - FREE_SEND_BUFFER(buf); -} - - - - - - -/*\ Routine called from buffers.c to complete a request for which the buffer was - * used for, so that the buffer can be reused. -\*/ -void armci_complete_req_buf(BUF_INFO_T *info, void *buffer) -{ -request_header_t *msginfo = (request_header_t*) buffer; - ARMCI_PR_DBG("enter",0); - if(info->protocol==0)return; - else if(info->protocol==SDSCR_IN_PLACE){ - char *dscr = info->dscr; - void *loc_ptr; - int stride_levels; - int *loc_stride_arr,*count; - - loc_ptr = *(void**)dscr; dscr += sizeof(void*); - stride_levels = *(int*)dscr; dscr += sizeof(int); - loc_stride_arr = (int*)dscr; dscr += stride_levels*sizeof(int); - count = (int*)dscr; - if(0 || DEBUG_){ - if(armci_me==0){ - printf("\n%d:extracted loc_ptr=%p, stridelevels=%d\n",armci_me, - loc_ptr,stride_levels); - fflush(stdout); - } - } - armci_rcv_strided_data(msginfo->to, msginfo, msginfo->datalen, loc_ptr, - stride_levels,loc_stride_arr,count); - FREE_SEND_BUFFER(msginfo); - } - else if(info->protocol==VDSCR_IN_PLACE || info->protocol==VDSCR_IN_PTR){ - char *dscr; - int len,i; - if(info->protocol==VDSCR_IN_PLACE){ - dscr = info->dscr; - //printf("\n%d:vdscr in place\n",armci_me); - } - else { - dscr = info->ptr.dscrbuf; - //printf("\n%d:vdscr in buf\n",armci_me); - } - GETBUF(dscr, long ,len); - { - armci_giov_t *darr; - darr = (armci_giov_t *)malloc(sizeof(armci_giov_t)*len); - if(!darr)armci_die("malloc in complete_req_buf failed",len); - for(i = 0; i< len; i++){ - int parlen, bytes; - GETBUF(dscr, int, parlen); - GETBUF(dscr, int, bytes); - darr[i].ptr_array_len = parlen; - darr[i].bytes = bytes; - if(msginfo->operation==GET)darr[i].dst_ptr_array=(void **)dscr; - else darr[i].src_ptr_array=(void **)dscr; - dscr+=sizeof(void *)*parlen; - } - if (msginfo->operation==GET) armci_complete_vector_get(darr,len,buffer); - } - } - else - armci_die("armci_complete_req_buf,protocol val invalid",info->protocol); - ARMCI_PR_DBG("exit",0); -} - -extern long x_net_offset(void *,int); -/*\ save a part of strided descriptor needed to complete request - -rmo: it seems as if save_ - -\*/ -void armci_save_strided_dscr(char **bptr, void *rem_ptr,int rem_stride_arr[], - int count[], int stride_levels,int is_nb,int proc) -{ -int i; -char *bufptr=*bptr; -BUF_INFO_T *info=NULL; -long network_offset,tmpoffset; - ARMCI_PR_DBG("enter",0); - - # ifdef PORTALS_UNRESOLVED - if(!is_nb){ - network_offset=x_net_offset(rem_ptr,proc); - if(DEBUG_){printf("\n%d:rem_ptr=%p offset=%d newrem=%p",armci_me,rem_ptr,network_offset,(char *)rem_ptr+network_offset);fflush(stdout);} - rem_ptr = (char *)rem_ptr+network_offset; - } - # endif - - if(is_nb){ - info=BUF_TO_BUFINFO(*bptr); - bufptr = (info->dscr); - } - *(void**)bufptr = rem_ptr; bufptr += sizeof(void*); - *(int*)bufptr = stride_levels; bufptr += sizeof(int); - for(i=0;idscr); - if(armci_me==0) - printf("\n%d:rem_ptr %p=%p stride_levels %d=%d\n",armci_me, - *(void**)bufptr,rem_ptr, - *(int*)(bufptr + sizeof(void*)),stride_levels); - } - /*remote_strided expects the pointer to point to the end of descr hence..*/ - if(is_nb) - info->protocol=SDSCR_IN_PLACE; - else - *bptr=bufptr; - ARMCI_PR_DBG("exit",0); - -} - - -/*\ save a part of vector descriptor needed to complete request -\*/ -void armci_save_vector_dscr(char **bptr,armci_giov_t darr[],int len, - int op,int is_nb, int proc) -{ -int i,size=sizeof(int); -BUF_INFO_T *info; -char *buf,*bufptr=*bptr; -void *rem_ptr; -long offst; - ARMCI_PR_DBG("enter",0); - if(is_nb){ - for(i=0;idscr; - info->protocol=VDSCR_IN_PLACE; - } - else { - info->ptr.dscrbuf = (void *)malloc(size); - buf = (char *)info->ptr.dscrbuf; - info->protocol=VDSCR_IN_PTR; - } - } - else - buf=bufptr; - - ADDBUF(buf,long,len); /* number of sets */ - for(i=0;ibufid to val, else set it to the id of the buf -\*/ -void armci_set_nbhandle_bufid(armci_ihdl_t nb_handle,char *buf,int val) -{ -BUF_INFO_T *info; - if(buf){ - info = BUF_TO_BUFINFO(buf); - val = info->bufid; - } - nb_handle->bufid = val; -} - -/**************End--Routines to handle completion descriptor******************/ - - -/*\ send request to server to LOCK MUTEX -\*/ -void armci_rem_lock(int mutex, int proc, int *ticket) -{ -request_header_t *msginfo; -int *ibuf; -int bufsize = sizeof(request_header_t)+sizeof(int); - - msginfo = (request_header_t*)GET_SEND_BUFFER(bufsize,LOCK,proc); - bzero(msginfo,sizeof(request_header_t)); - - msginfo->datalen = sizeof(int); - msginfo->dscrlen = 0; - msginfo->from = armci_me; - msginfo->to = proc; - msginfo->operation = LOCK; - msginfo->format = mutex; - msginfo->bytes = msginfo->datalen + msginfo->dscrlen; - - ibuf = (int*)(msginfo+1); - *ibuf = mutex; - - armci_send_req(proc, msginfo, bufsize, 0); - - /* receive ticket from server */ - *ticket = *(int*)armci_rcv_data(proc,msginfo,0); - FREE_SEND_BUFFER(msginfo); - - if(DEBUG_)fprintf(stderr,"%d receiving ticket %d\n",armci_me, *ticket); -} - - - - -void armci_server_lock(request_header_t *msginfo) -{ -int *ibuf = (int*)(msginfo+1); -int proc = msginfo->from; -int mutex; -int ticket; - ARMCI_PR_DBG("enter",0); - - mutex = *(int*)ibuf; - - /* acquire lock on behalf of requesting process */ - ticket = armci_server_lock_mutex(mutex, proc, msginfo->tag); - - if(ticket >-1){ - /* got lock */ - msginfo->datalen = sizeof(int); - armci_send_data(msginfo, &ticket); - } - ARMCI_PR_DBG("exit",0); -} - - -/*\ send request to server to UNLOCK MUTEX -\*/ -void armci_rem_unlock(int mutex, int proc, int ticket) -{ -request_header_t *msginfo; -int *ibuf; -int bufsize = sizeof(request_header_t)+sizeof(ticket); - - msginfo = (request_header_t*)GET_SEND_BUFFER(bufsize,UNLOCK,proc); - bzero(msginfo,sizeof(request_header_t)); - - msginfo->dscrlen = msginfo->bytes = sizeof(ticket); - msginfo->datalen = 0; - msginfo->from = armci_me; - msginfo->to = proc; - msginfo->operation = UNLOCK; - msginfo->format = mutex; - ibuf = (int*)(msginfo+1); - *ibuf = ticket; - - if(DEBUG_)fprintf(stderr,"%d sending unlock\n",armci_me); - armci_send_req(proc, msginfo, bufsize,0); -} - - - -/*\ server unlocks mutex and passes lock to the next waiting process -\*/ -void armci_server_unlock(request_header_t *msginfo, char* dscr) -{ - int ticket = *(int*)dscr; - int mutex = msginfo->format; - int proc = msginfo->to; - int waiting; - - waiting = armci_server_unlock_mutex(mutex,proc,ticket,&msginfo->tag); - - if(waiting >-1){ /* -1 means that nobody is waiting */ - - ticket++; - /* pass ticket to the waiting process */ - msginfo->from = waiting; - msginfo->datalen = sizeof(ticket); - armci_send_data(msginfo, &ticket); - - } -} - -void armci_unlock_waiting_process(msg_tag_t tag, int proc, int ticket) -{ -request_header_t header; -request_header_t *msginfo = &header; - - msginfo->datalen = sizeof(int); - msginfo->tag = tag; - msginfo->from = proc; - msginfo->to = armci_me; - armci_send_data(msginfo, &ticket); -} - -void * armci_server_ptr(int id){ -char *buf; -int bufsize = sizeof(int); -request_header_t *msginfo = (request_header_t*)GET_SEND_BUFFER(bufsize,ATTACH,armci_me); - bzero(msginfo,sizeof(request_header_t)); - msginfo->from = armci_me; - msginfo->to = SERVER_NODE(armci_clus_me); - msginfo->dscrlen = 0; - msginfo->datalen = sizeof(int); - msginfo->operation = ATTACH; - msginfo->bytes = msginfo->dscrlen+ msginfo->datalen; - armci_copy(&id, msginfo +1, sizeof(int)); - if(DEBUG_MEM){ - printf("\n%d:attach req:sending id %d \n",armci_me,id);fflush(stdout); - } - armci_send_req(armci_master, msginfo, bufsize,0); - buf= armci_rcv_data(armci_master,msginfo,sizeof(void *));/* receive response */ - if(DEBUG_MEM){ - printf("\n%d:attach req:got %p \n",armci_me,buf);fflush(stdout); - } - FREE_SEND_BUFFER(msginfo); - ARMCI_PR_DBG("exit",0); - return (void *)buf; - -} - -/*\ control message to the server, e.g.: ATTACH to shmem, return ptr etc. -\*/ -void armci_serv_attach_req(void *info, int ilen, long size, void* resp,int rlen) -{ -char *buf; - ARMCI_PR_DBG("enter",0); -int bufsize = 2*sizeof(request_header_t)+ilen + sizeof(long)+sizeof(rlen); -long *idlist=(long *)info; -request_header_t *msginfo = (request_header_t*)GET_SEND_BUFFER(bufsize,ATTACH,armci_me); - bzero(msginfo,sizeof(request_header_t)); - - msginfo->from = armci_me; - msginfo->to = SERVER_NODE(armci_clus_me); - msginfo->dscrlen = ilen; - msginfo->datalen = sizeof(long)+sizeof(int); - msginfo->operation = ATTACH; - msginfo->bytes = msginfo->dscrlen+ msginfo->datalen; - - armci_copy(info, msginfo +1, ilen); - if(DEBUG_MEM){printf("\n%d:sending idlist+1 %d, size %d, idlist[0] %d, idlist[1] %d\n",armci_me,idlist+1,size,idlist[0],idlist[1]);} - buf = ((char*)msginfo) + ilen + sizeof(request_header_t); - *((long*)buf) =size; - *(int*)(buf+ sizeof(long)) = rlen; - armci_send_req(armci_master, msginfo, bufsize,0); - if(rlen){ - buf= armci_rcv_data(armci_master, msginfo,rlen); /* receive response */ - bcopy(buf, resp, rlen); - FREE_SEND_BUFFER(msginfo); - - if(DEBUG_MEM){printf("%d:client attaching got ptr=%p %d bytes\n",armci_me,buf,rlen); - fflush(stdout); - } - } - ARMCI_PR_DBG("exit",0); -} - - -/*\ server initializes its copy of the memory lock data structures -\*/ -static void server_alloc_memlock(void *ptr_myclus) -{ -int i; - - /* for protection, set pointers for processes outside local node NULL */ - memlock_table_array = calloc(armci_nproc,sizeof(void*)); - if(!memlock_table_array) armci_die("malloc failed for ARMCI lock array",0); - - /* set pointers for processes on local cluster node - * ptr_myclus - corresponds to the master process - */ - for(i=0; i< armci_clus_info[armci_clus_me].nslave; i++){ - memlock_table_array[armci_master +i] = ((char*)ptr_myclus) - + MAX_SLOTS*sizeof(memlock_t)*i; - } - - /* set pointer to the use flag */ -#ifdef MEMLOCK_SHMEM_FLAG - armci_use_memlock_table = (int*) (MAX_SLOTS*sizeof(memlock_t) + - (char*) memlock_table_array[armci_clus_last]); - - if(DEBUG_) - fprintf(stderr,"server initialized memlock %p\n",armci_use_memlock_table); -#endif -} - - -static int allocate_memlock=1; - -/*\ server actions triggered by client request to ATTACH -\*/ -void armci_server_ipc(request_header_t* msginfo, void* descr, - void* buffer, int buflen) -{ -double *ptr; -long *idlist = (long*)descr; -long size = *(long*)buffer; -int rlen = *(int*)(sizeof(long)+(char*)buffer); -extern int **_armci_int_mutexes; - ARMCI_PR_DBG("enter",0); - if(size<0) armci_die("armci_server_ipc: size<0",(int)size); - if(DEBUG_MEM)printf("\n%d:got idlist+1 %p, size %d, idlist[0] %d, idlist[1] %d",armci_me,idlist+1,size,idlist[0],idlist[1]); - ptr=(double*)Attach_Shared_Region(idlist+1,size,idlist[0]); - if(!ptr)armci_die("armci_server_ipc: failed to attach",0); - /* provide data server with access to the memory lock data structures */ - if(allocate_memlock){ - allocate_memlock = 0; - server_alloc_memlock(ptr); - } - if(_armci_int_mutexes==NULL){ - printf("unresolved portals external\n"); - abort(); - # ifdef PORTALS_UNRESOLVED - extern int _armci_server_mutex_ready; - extern void *_armci_server_mutex_ptr; - if(_armci_server_mutex_ready){ - _armci_int_mutexes=(int **)_armci_server_mutex_ptr; - } - # endif - } - if(size>0)armci_set_mem_offset(ptr); - - if(msginfo->datalen != sizeof(long)+sizeof(int)) - armci_die("armci_server_ipc: bad msginfo->datalen ",msginfo->datalen); - - if(rlen==sizeof(ptr)){ - msginfo->datalen = rlen; - armci_send_data(msginfo, &ptr); - } - else armci_die("armci_server_ipc: bad rlen",rlen); - ARMCI_PR_DBG("exit",0); -} - - -/*\ send RMW request to server -\*/ -void armci_rem_rmw(int op, void *ploc, void *prem, int extra, int proc) -{ -request_header_t *msginfo; -char *buf; -void *buffer; -int bufsize = sizeof(request_header_t)+sizeof(long)+sizeof(void*); -long offst; - - ARMCI_PR_DBG("enter",0); - msginfo = (request_header_t*)GET_SEND_BUFFER(bufsize,op,proc); - bzero(msginfo,sizeof(request_header_t)); - - msginfo->dscrlen = sizeof(void*); - msginfo->from = armci_me; - msginfo->to = proc; - msginfo->operation = op; - msginfo->datalen = sizeof(long); - # ifdef PORTALS_UNRESOLVED - offst=x_net_offset(prem,proc); - prem = ((char *)prem+offst); - # endif - buf = (char*)(msginfo+1); - ADDBUF(buf, void*, prem); /* pointer is shipped as descriptor */ - - /* data field: extra argument in fetch&add and local value in swap */ - if(op==ARMCI_SWAP){ - ADDBUF(buf, int, *((int*)ploc)); - }else if(op==ARMCI_SWAP_LONG) { - ADDBUF(buf, long, *((long*)ploc) ); - msginfo->datalen = sizeof(long); - }else { - ADDBUF(buf, int, extra); - } - - msginfo->bytes = msginfo->datalen+msginfo->dscrlen ; - - if(DEBUG_){ - printf("%d sending RMW request %d to %d\n",armci_me,op,proc); - fflush(stdout); - } - armci_send_req(proc, msginfo, bufsize,0); - buffer = armci_rcv_data(proc,msginfo,0); /* receive response */ - - if(op==ARMCI_FETCH_AND_ADD || op== ARMCI_SWAP) - *(int*)ploc = *(int*)buffer; - else - *(long*)ploc = *(long*)buffer; - - FREE_SEND_BUFFER(msginfo); - ARMCI_PR_DBG("exit",0); -} - - -/*\ server response to RMW -\*/ -void armci_server_rmw(request_header_t* msginfo,void* ptr, void* pextra) -{ -long lold; -int iold; -void *pold=0; -int op = msginfo->operation; - - ARMCI_PR_DBG("enter",0); - if(DEBUG_){ - printf("%d server: executing RMW from %d. op=%d pextra=%p\n",armci_me,msginfo->from, op, pextra); - fflush(stdout); - } - if(msginfo->datalen != sizeof(long)) - armci_die2("armci_server_rmw: bad datalen=",msginfo->datalen,op); - - /* for swap operations *pextra has the value to swap - * for fetc&add it carries the increment argument - */ - switch(op){ - case ARMCI_SWAP: - iold = *(int*) pextra; - case ARMCI_FETCH_AND_ADD: - pold = &iold; - break; - - case ARMCI_SWAP_LONG: - lold = *(long*) pextra; - case ARMCI_FETCH_AND_ADD_LONG: - pold = &lold; - break; - - default: - armci_die("armci_server_rmw: bad operation code=",op); - } - - armci_generic_rmw(op, pold, *(int**)ptr, *(int*) pextra, msginfo->to); - - armci_send_data(msginfo, pold); - ARMCI_PR_DBG("exit",0); -} - -extern int armci_direct_vector_snd(request_header_t *msginfo , armci_giov_t darr[], int len, int proc); -extern int armci_direct_vector(request_header_t *msginfo , armci_giov_t darr[], int len, int proc); -int armci_rem_vector(int op, void *scale, armci_giov_t darr[],int len,int proc,int flag, armci_ihdl_t nb_handle) -{ - char *buf,*buf0; - request_header_t *msginfo; - int bytes =0, s, slen=0; - size_t adr; - int bufsize = sizeof(request_header_t); - int tag=0; - - if(nb_handle)tag=nb_handle->tag; - - /* compute size of the buffer needed */ - for(s=0; stag,0); - if(nb_handle->bufid == NB_NONE) - armci_set_nbhandle_bufid(nb_handle,buf,0); - } - - buf += sizeof(request_header_t); - - /* fill vector descriptor */ - armci_save_vector_dscr(&buf,darr,len,op,0,proc); - - /* align buf for doubles (8-bytes) before copying data */ - adr = (size_t)buf; - adr >>=3; - adr <<=3; - adr +=8; - buf = (char*)adr; - - msginfo->ehlen = 0; - - /* fill message header */ - msginfo->dscrlen = buf - buf0 - sizeof(request_header_t); - msginfo->from = armci_me; - msginfo->to = proc; - msginfo->operation = op; - msginfo->format = VECTOR; - msginfo->datalen = bytes; - - /* put scale for accumulate */ - switch(op){ - case ARMCI_ACC_INT: - *(int*)buf = *(int*)scale; slen= sizeof(int); break; - case ARMCI_ACC_DCP: - ((double*)buf)[0] = ((double*)scale)[0]; - ((double*)buf)[1] = ((double*)scale)[1]; - slen=2*sizeof(double);break; - case ARMCI_ACC_DBL: - *(double*)buf = *(double*)scale; slen = sizeof(double); break; - case ARMCI_ACC_CPL: - ((float*)buf)[0] = ((float*)scale)[0]; - ((float*)buf)[1] = ((float*)scale)[1]; - slen=2*sizeof(float);break; - case ARMCI_ACC_FLT: - *(float*)buf = *(float*)scale; slen = sizeof(float); break; - default: slen=0; - } - buf += slen; - msginfo->datalen += slen; - msginfo->bytes = msginfo->datalen+msginfo->dscrlen; - - - /* for put and accumulate copy data into buffer */ - if(op != GET){ -/* fprintf(stderr,"sending %lf\n",*(double*)darr[0].src_ptr_array[0]);*/ - armci_vector_to_buf(darr, len, buf); - } - - armci_send_req(proc, msginfo, bufsize,tag); - /*x_buf_send_complete(buf0);*/ - - if(nb_handle && op==GET) armci_save_vector_dscr(&buf0,darr,len,op,1,proc); - if(op == GET&& !nb_handle){ - armci_complete_vector_get(darr,len,msginfo); - } - - return 0; -} - -#define CHUN_ (8*8096) -#define CHUN 200000 - -/*\ client version of remote strided operation -\*/ -int armci_rem_strided(int op, void* scale, int proc, - void *src_ptr, int src_stride_arr[], - void* dst_ptr, int dst_stride_arr[], - int count[], int stride_levels, - ext_header_t *h, int flag,armci_ihdl_t nb_handle) -{ - char *buf, *buf0; - request_header_t *msginfo; - int i, slen=0, bytes; - void *rem_ptr; - int *rem_stride_arr; - int bufsize = sizeof(request_header_t); - int ehlen =0; - msg_tag_t msg_tag; - int tag=0; - - /* we send ext header only for last chunk */ -#if 0 - if(h) ehlen = h->len; -#else - if(h) if(h->last) ehlen = h->len; -#endif - if(ehlen>MAX_EHLEN || ehlen <0) - armci_die2("armci_rem_strided ehlen out of range",MAX_EHLEN,ehlen); - /* calculate size of the buffer needed */ - for(i=0, bytes=1;i<=stride_levels;i++)bytes*=count[i]; - bufsize += bytes+sizeof(void*)+2*sizeof(int)*(stride_levels+1) +ehlen - +2*sizeof(double) + 16; /* +scale+alignment */ - - if (flag){ - if(op==GET)bufsize -=bytes; - } - - buf = buf0= GET_SEND_BUFFER((bufsize),op,proc); - msginfo = (request_header_t*)buf; - bzero(msginfo,sizeof(request_header_t)); - - - if(nb_handle) -#ifdef ACC_SMP - if(!ARMCI_ACC(op)) -#endif - { - // printf("%s: non-blocking ops not yet supported\n",Portals_ID()); - // abort(); -/* INIT_SENDBUF_INFO(nb_handle,buf,op,proc); same as _armci_buf_set_tag, why here? */ - _armci_buf_set_tag(buf,nb_handle->tag,0); - if(nb_handle->bufid == NB_NONE) - armci_set_nbhandle_bufid(nb_handle,buf,0); - tag = nb_handle->tag; - } - - if(op == GET){ - rem_ptr = src_ptr; - rem_stride_arr = src_stride_arr; - }else{ - rem_ptr = dst_ptr; - rem_stride_arr = dst_stride_arr; - } - - msginfo->datalen=bytes; - - /* fill strided descriptor */ - buf += sizeof(request_header_t); - /*this function fills the dscr into buf and also moves the buf ptr to the - end of the dscr*/ - armci_save_strided_dscr(&buf,rem_ptr,rem_stride_arr,count,stride_levels,0,proc); - - /* align buf for doubles (8-bytes) before copying data */ - ALLIGN8(buf); - - /* fill message header */ - msginfo->from = armci_me; - msginfo->to = proc; - msginfo->format = STRIDED; - msginfo->operation = op; - - /* put scale for accumulate */ - switch(op){ - case ARMCI_ACC_INT: - *(int*)buf = *(int*)scale; slen= sizeof(int); break; - case ARMCI_ACC_DCP: - ((double*)buf)[0] = ((double*)scale)[0]; - ((double*)buf)[1] = ((double*)scale)[1]; - slen=2*sizeof(double);break; - case ARMCI_ACC_DBL: - *(double*)buf = *(double*)scale; slen = sizeof(double); break; - case ARMCI_ACC_CPL: - ((float*)buf)[0] = ((float*)scale)[0]; - ((float*)buf)[1] = ((float*)scale)[1]; - slen=2*sizeof(float);break; - case ARMCI_ACC_FLT: - *(float*)buf = *(float*)scale; slen = sizeof(float); break; - case ARMCI_ACC_LNG: - *(long*)buf = *(long*)scale; slen = sizeof(long); break; - default: slen=0; - } - - /* - if(ARMCI_ACC(op))printf("%d client len=%d alpha=%lf data=%lf,%lf\n", - armci_me, buf-(char*)msginfo,((double*)buf)[0],*((double*)src_ptr), ((double*)buf)[1]); - */ - - buf += slen; - - /**** add extended header *******/ - if(ehlen){ - bcopy(h->exthdr,buf,ehlen); - i = ehlen%8; ehlen += (8-i); /* make sure buffer is still alligned */ - buf += ehlen; - } - - msginfo->ehlen = ehlen; - msginfo->dscrlen = buf - buf0 - sizeof(request_header_t); - msginfo->bytes = msginfo->datalen+msginfo->dscrlen; - - if(op == GET){ - /* - if(nb_handle) { - printf("%s rem_strided: nb gets not yet available\n",Portals_ID()); - abort(); - } - */ - armci_send_req(proc, msginfo, bufsize,tag); - armci_save_strided_dscr(&buf0,dst_ptr,dst_stride_arr,count, - stride_levels,1,proc); - - # ifdef PORTALS_ALLOW_NBGETS - if(!nb_handle){ - # endif - armci_rcv_strided_data(proc, msginfo, msginfo->datalen, - dst_ptr, stride_levels, dst_stride_arr, count); - FREE_SEND_BUFFER(msginfo); - # ifdef PORTALS_ALLOW_NBGETS - } - # endif - } else { - /* for put and accumulate send data */ - armci_send_strided(proc,msginfo, buf, - src_ptr, stride_levels, src_stride_arr, count,tag); - } - - return 0; -} - - -void armci_process_extheader(request_header_t *msginfo, char *dscr, char* buf, int buflen) -{ - armci_flag_t *h; - int *flag; - - h = (armci_flag_t*)(dscr + msginfo->dscrlen - msginfo->ehlen); -#if 0 - if(msginfo->ehlen)printf("%d:server from=%d len=%d: ptr=%p val=%d\n",armci_me,msginfo->from, msginfo->ehlen,h->ptr,h->val); - fflush(stdout); -#endif - flag = (int*)(h->ptr); - *flag = h->val; -} - -void armci_server(request_header_t *msginfo, char *dscr, char* buf, int buflen) -{ -int buf_stride_arr[MAX_STRIDE_LEVEL+1]; -int *loc_stride_arr,slen; -int *count, stride_levels; -void *buf_ptr, *loc_ptr; -void *scale; -char *dscr_save = dscr; -int rc, i,proc; -int stat; - - ARMCI_PR_DBG("enter",msginfo->datalen);fflush(stdout); - /*return if using readv/socket for put*/ - if(msginfo->operation==PUT && msginfo->datalen==0){ - if(msginfo->ehlen) /* process extra header if available */ - armci_process_extheader(msginfo, dscr, buf, buflen); - return; - } - - /* unpack descriptor record */ - loc_ptr = *(void**)dscr; dscr += sizeof(void*); - stride_levels = *(int*)dscr; dscr += sizeof(int); - loc_stride_arr = (int*)dscr; dscr += stride_levels*sizeof(int); - count = (int*)dscr; - - /* compute stride array for buffer */ - buf_stride_arr[0]=count[0]; - for(i=0; i< stride_levels; i++) - buf_stride_arr[i+1]= buf_stride_arr[i]*count[i+1]; - - /* get scale for accumulate, adjust buf to point to data */ - switch(msginfo->operation){ - case ARMCI_ACC_INT: slen = sizeof(int); break; - case ARMCI_ACC_DCP: slen = 2*sizeof(double); break; - case ARMCI_ACC_DBL: slen = sizeof(double); break; - case ARMCI_ACC_CPL: slen = 2*sizeof(float); break; - case ARMCI_ACC_FLT: slen = sizeof(float); break; - case ARMCI_ACC_LNG: slen = sizeof(long); break; - default: slen=0; - } - - scale = dscr_save+ (msginfo->dscrlen - slen -msginfo->ehlen); -/* - if(ARMCI_ACC(msginfo->operation)) - fprintf(stderr,"%d in server len=%d slen=%d alpha=%lf data=%lf\n", - armci_me, msginfo->dscrlen, slen, *(double*)scale,*(double*)buf); -*/ - - buf_ptr = buf; /* data in buffer */ - - proc = msginfo->to; - - if(msginfo->operation == GET){ - armci_send_strided_data(proc, msginfo, buf, - loc_ptr, stride_levels, loc_stride_arr, count); - /* fprintf(stderr, "GET response sent with tag: %d\n, msginfo->tag", - msginfo->tag); */ - } else{ - if((rc = armci_op_strided(msginfo->operation, scale, proc, - buf_ptr, buf_stride_arr, loc_ptr, loc_stride_arr, - count, stride_levels, 1,NULL))) - armci_die("server_strided: op from buf failed",rc); - } - - if(msginfo->ehlen) /* process extra header if available */ - armci_process_extheader(msginfo, dscr_save, buf, buflen); - ARMCI_PR_DBG("exit",0); -} - - -void armci_server_vector( request_header_t *msginfo, - char *dscr, char* buf, int buflen) -{ - int proc; - long len; - void *scale; - int i,s; - char *sbuf = buf; - if(msginfo->operation==PUT && msginfo->datalen==0)return;/*return if using readv/socket for put*/ - /* unpack descriptor record */ - GETBUF(dscr, long ,len); - - /* get scale for accumulate, adjust buf to point to data */ - scale = buf; - switch(msginfo->operation){ - case ARMCI_ACC_INT: buf += sizeof(int); break; - case ARMCI_ACC_DCP: buf += 2*sizeof(double); break; - case ARMCI_ACC_DBL: buf += sizeof(double); break; - case ARMCI_ACC_CPL: buf += 2*sizeof(float); break; - case ARMCI_ACC_FLT: buf += sizeof(float); break; - } - - proc = msginfo->to; - - /*fprintf(stderr,"scale=%lf\n",*(double*)scale);*/ - /* execute the operation */ - - switch(msginfo->operation) { - case GET: -/* fprintf(stderr, "%d:: Got a vector message!!\n", armci_me); */ - if(msginfo->ehlen) { - armci_die("Unexpected vector message with non-zero ehlen. GPC call?", - msginfo->ehlen); - } - else { - for(i = 0; i< len; i++){ - int parlen, bytes; - void **ptr; - GETBUF(dscr, int, parlen); - GETBUF(dscr, int, bytes); - /* fprintf(stderr,"len=%d bytes=%d parlen=%d\n",len,bytes,parlen);*/ - ptr = (void**)dscr; dscr += parlen*sizeof(char*); - for(s=0; s< parlen; s++){ - armci_copy(ptr[s], buf, bytes); - buf += bytes; - } - } -/* fprintf(stderr,"%d:: VECTOR GET. server sending buffer %p datalen=%d\n",armci_me, sbuf, msginfo->datalen); */ - armci_send_data(msginfo, sbuf); - } - break; - - case PUT: - -/* fprintf(stderr,"received in buffer %lf\n",*(double*)buf);*/ - for(i = 0; i< len; i++){ - int parlen, bytes; - void **ptr; - GETBUF(dscr, int, parlen); - GETBUF(dscr, int, bytes); - ptr = (void**)dscr; dscr += parlen*sizeof(char*); - for(s=0; s< parlen; s++){ -/* - armci_copy(buf, ptr[s], bytes); -*/ - bcopy(buf, ptr[s], (size_t)bytes); - buf += bytes; - } - } - break; - - default: - - /* this should be accumulate */ - if(!ARMCI_ACC(msginfo->operation)) - armci_die("v server: wrong op code",msginfo->operation); - -/* fprintf(stderr,"received first=%lf last =%lf in buffer\n",*/ -/* *((double*)buf),((double*)buf)[99]);*/ - - for(i = 0; i< len; i++){ - int parlen, bytes; - void **ptr; - GETBUF(dscr, int, parlen); - GETBUF(dscr, int, bytes); - ptr = (void**)dscr; dscr += parlen*sizeof(char*); - armci_lockmem_scatter(ptr, parlen, bytes, proc); - for(s=0; s< parlen; s++){ - armci_acc_2D(msginfo->operation, scale, proc, buf, ptr[s], - bytes, 1, bytes, bytes, 0); - buf += bytes; - } - ARMCI_UNLOCKMEM(proc); - } - } -} diff --git a/armci/src-portals/request.h b/armci/src-portals/request.h deleted file mode 100644 index 10eed571b..000000000 --- a/armci/src-portals/request.h +++ /dev/null @@ -1,383 +0,0 @@ -#ifndef _REQUEST_H_ -#define _REQUEST_H_ - - -/******** client buffer managment ops ****************************/ -extern void _armci_buf_init(); -extern char* _armci_buf_get(int size, int operation, int to); -extern void _armci_buf_release(void *buf); -extern int _armci_buf_to_index(void *buf); -extern char* _armci_buf_ptr_from_id(int id); -extern void _armci_buf_ensure_one_outstanding_op_per_node(void *buf, int node); -#if defined(SERV_QUEUE) -extern void _armci_buf_ensure_pend_outstanding_op_per_node(void *buf, int node); -#endif -extern void _armci_buf_complete_nb_request(int bufid,unsigned int tag, int *retcode); -extern void _armci_buf_test_nb_request(int bufid,unsigned int tag, int *retcode); -extern void _armci_buf_set_tag(void *bufptr,unsigned int tag,short int protocol); -extern void _armci_buf_clear_all(); -extern void x_buf_send_complete(void *); - -extern INLINE char *_armci_buf_get_clear_busy(int size, int operation, int to); -extern INLINE void _armci_buf_set_busy(void *buf, int state); -extern INLINE void _armci_buf_set_busy_idx(int tbl_idx, int state); -extern INLINE int _armci_buf_cmpld(int bufid); -extern INLINE void _armci_buf_set_cmpld(void *buf, int state); -extern INLINE void _armci_buf_set_cmpld_idx(int idx, int state); - -#ifdef LAPI -# include "lapidefs.h" -#elif PORTALS -# include "armci_portals.h" -#elif defined(GM) -# include "myrinet.h" -#elif defined(DOELAN4) -# include "elandefs.h" -#elif defined(QUADRICS) -# include - typedef void* msg_tag_t; -# ifdef _ELAN_PUTGET_H -# define NB_CMPL_T ELAN_EVENT* -# endif -#elif defined(VIA) -# include "via.h" - typedef void* msg_tag_t; -#elif defined(VAPI) -# include "armci-vapi.h" -#elif defined(SOCKETS) -# include "sockets.h" - typedef long msg_tag_t; - typedef unsigned short msg_id_t; -# define DTAG_ ((1<<(sizeof(msg_id_t)*8))-1) -# define NB_SOCKETS_ /* define NB_SOCKETS to allow non-blocking path */ -#elif defined(HITACHI) -# include "sr8k.h" -#elif defined(BGML) -# include "bgml.h" -# include "bgmldefs.h" -# define NB_CMPL_T BG1S_t - typedef long msg_tag_t; -#elif defined(MPI_SPAWN) -# include "mpi2.h" -# define MSG_BUFLEN_DBL 500000 - typedef long msg_tag_t; -#else - typedef long msg_tag_t; -#endif - -#ifndef CLEAR_HNDL_FIELD -# define CLEAR_HNDL_FIELD(_x) -#endif - -#define ACK_QUIT 0 -#define QUIT 33 -#define ATTACH 34 -#define REGISTER 35 - -/*\ the internal request structure for non-blocking api. -\*/ -typedef struct{ - unsigned int tag; - short int bufid; - short int agg_flag; - int op; - int proc; -#ifdef NB_CMPL_T - NB_CMPL_T cmpl_info; -#endif -} armci_ireq_t; -/*\ the internal request structure for non-blocking api. -\*/ -typedef armci_ireq_t* armci_ihdl_t; -extern void armci_set_nbhandle_bufid(armci_ihdl_t nb_handle, char *buf, int val); -extern void set_nbhandle(armci_ihdl_t *nbh, armci_hdl_t *nb_handle, - int op, int proc); - -typedef struct { - int to; /* message recipient */ - int from; /* message sender */ - int operation; /* operation code */ - int format; /* data format used */ - int bytes; /* number of bytes requested */ - int datalen; /* >0 in lapi means that data is included */ - int ehlen; /* size of extra header and the end of descr */ - int dscrlen; /* >0 in lapi means that descriptor is included */ - msg_tag_t tag; /* message tag for response to this request, MUST BE LAST */ -}request_header_t; - -#include "portals.h" - -typedef struct _buf_ackresp{ - long val,valc; - portals_ds_req_t req; - struct _buf_ackresp *next, *previous; -} _buf_ackresp_t; - -/*******gpc call strctures*************/ -#include -#define MAX_GPC_REQ 1 -#define MAX_GPC_REPLY_LEN (64*1024) -#define MAX_GPC_SEND_LEN (64*1024) -#define GPC_COMPLETION_SIGNAL SIGUSR1 - -typedef struct { - int hndl; - int hlen, dlen; - void *hdr, *data; - int rhlen, rdlen; - void *rhdr, *rdata; -} gpc_call_t; - -typedef struct { - int active; -/* int zombie; */ - request_header_t msginfo; - gpc_call_t call; - char send[MAX_GPC_SEND_LEN]; - char reply[MAX_GPC_REPLY_LEN]; -} gpc_buf_t; - -/* gpc_buf_t *gpc_req; */ -extern gpc_buf_t *gpc_req; - -extern void block_pthread_signal(int signo); -extern void unblock_pthread_signal(int signo); - -/*******structures copied from async.c for storing cmpl dscr for nb req*******/ -#define UBUF_LEN 112 - -typedef struct { - unsigned int tag; /* request id*/ - _buf_ackresp_t ar; - short int bufid; /* communication buffer id */ - short int protocol; /* what does this buf hold?*/ - union { - void *dscrbuf; /*in case dscr below is not enough, do a*/ - double pad; /*malloc, save pointer in dscrbuf and use it*/ - }ptr; - char dscr[UBUF_LEN]; /*place to store the dscr*/ -}_buf_info_t; - -#define BUF_INFO_T _buf_info_t -extern BUF_INFO_T *_armci_buf_to_bufinfo(void *buf); -#define BUF_TO_BUFINFO _armci_buf_to_bufinfo - -void armci_complete_req_buf(BUF_INFO_T *info, void *buffer); -extern INLINE BUF_INFO_T *_armci_id_to_bufinfo(int bufid); - -#if 0 && defined(DATA_SERVER) && defined(SOCKETS) -#define MAX_BUFS 1 -#define MAX_SMALL_BUFS 1 -#else -#if defined(SERV_QUEUE) -#define MAX_BUFS 8 -#define MAX_SMALL_BUFS 16 -#else - -# ifdef PORTALS_USE_RENDEZ_VOUS -# ifdef PORTALS_LIMIT_REMOTE_REQUESTS_BY_NODE -# define MAX_BUFS 4 -# define MAX_SMALL_BUFS 8 -# else -# define MAX_BUFS 4 -# define MAX_SMALL_BUFS 8 -# endif -# else -# define MAX_BUFS 4 -# define MAX_SMALL_BUFS 8 -# endif - -#endif -#endif - -/* tracks sockets used for receiving responces from data server (GET) */ -typedef struct { - int socks[MAX_BUFS+MAX_SMALL_BUFS]; /* sock # or -1 if not used */ - int ready[MAX_BUFS+MAX_SMALL_BUFS]; /* 1 - ready, 0 - not */ -} active_socks_t; - - - -/*valid values for the element protocol in BUF_INFO_T*/ -#define SDSCR_IN_PLACE 1 /*indicated that strided descriptor is in place*/ -#define VDSCR_IN_PLACE 2 /*indicated that vector descriptor is in place*/ -#define VDSCR_IN_PTR 3 /*indicates that the vector descriptor in allocated - and pointer stored in dscrbuf */ -/****************************************************************************/ - -/* this effects: buf_ext_t, portalsEagerMessageSendSize, portals ds buffer size */ -/* note: MSG_BUFLEN_DBL is being defined earlier in armci-portals.h */ -#ifndef MSG_BUFLEN_DBL -# if defined(HITACHI) -# define MSG_BUFLEN_DBL 0x50000 -# else -# ifdef PORTALS_USE_RENDEZ_VOUS -# define MSG_BUFLEN_DBL 50000 /* for rendez-vous, this can go bigger i think */ -# else -# define MSG_BUFLEN_DBL 8192 /* this is smaller when rendez-vous is off */ -# endif -# endif -#endif - -#define MSG_BUFLEN sizeof(double)*MSG_BUFLEN_DBL -extern char* MessageRcvBuffer; -extern char* MessageSndBuffer; - -#ifdef LAPI -# define GET_SEND_BUFFER_(_size)(MessageSndBuffer+sizeof(lapi_cmpl_t));\ - CLEAR_COUNTER(*((lapi_cmpl_t*)MessageSndBuffer));\ - SET_COUNTER(*((lapi_cmpl_t*)MessageSndBuffer),1); -# define GET_SEND_BUFFER _armci_buf_get -# define GA_SEND_REPLY armci_lapi_send -#else -# ifdef SOCKETS -# define GA_SEND_REPLY(tag, buf, len, p) armci_sock_send(p,buf,len) -# else -# define GA_SEND_REPLY(tag, buf, len, p) -# endif -#endif - -#ifdef QUADRICS_ -# define GET_SEND_BUFFER(_size,_op,_to) MessageSndBuffer;\ - while(((request_header_t*)MessageSndBuffer)->tag)\ - armci_util_spin(100, MessageSndBuffer) -# define FREE_SEND_BUFFER(_ptr) ((request_header_t*)MessageSndBuffer)->tag = (void*)0 -#endif - -#ifndef GET_SEND_BUFFER -# define GET_SEND_BUFFER(_size,_op,_to) MessageSndBuffer -#endif - -#ifndef FREE_SEND_BUFFER -#define FREE_SEND_BUFFER(_ptr) -#endif - -#ifndef INIT_SENDBUF_INFO -#define INIT_SENDBUF_INFO(_hdl,_buf,_op,_proc) -#endif - -typedef struct { - char *buf; char* buf_posted; int count; int proc; int op; int extra; -} buf_arg_t; - -/*includes for SERVER_LOCK*/ -#if defined(SERVER_THREAD) && !defined(VIA) - extern void armci_rem_lock(int mutex, int proc, int *ticket); - extern void armci_rem_unlock(int mutex, int proc, int ticket); - extern void armci_unlock_waiting_process(msg_tag_t tag,int proc, int ticket); -#endif - - -#ifdef PIPE_BUFSIZE - extern void armcill_pipe_post_bufs(void *ptr, int stride_arr[], int count[], - int strides, void* argvoid); - extern void armcill_pipe_extract_data(void *ptr,int stride_arr[],int count[], - int strides, void* argvoid); - extern void armcill_pipe_send_chunk(void *data, int stride_arr[],int count[], - int strides, void* argvoid); -#endif - -extern void armci_send_strided(int proc, request_header_t *msginfo, char *bdata, - void *ptr, int strides, int stride_arr[], int count[],int tag); - -extern void armci_rcv_hdlr(request_header_t* msginfo); - -extern char *armci_rcv_data(int proc, request_header_t *msginfo, int rcvlen); -extern void armci_rcv_strided_data_bypass(int proc, request_header_t *msginfo, - void *ptr, int stride_levels); -extern void armci_send_strided_data_bypass(int proc, request_header_t *msginfo, - void *loc_buf, int msg_buflen, void *loc_ptr, int *loc_stride_arr, - void *rem_ptr, int *rem_stride_arr, int *count, int stride_levels); - -extern void armci_rcv_strided_data(int proc, request_header_t* msginfo, - int datalen, void *ptr, int strides,int stride_arr[],int count[]); -extern void armci_send_strided_data(int proc, request_header_t *msginfo, - char *bdata, void *ptr, int strides, int stride_arr[], int count[]); -extern void armci_send_req(int proc, request_header_t* msginfo, int len,int tag); -extern void armci_server_rmw(request_header_t* msginfo,void* ptr, void* pextra); -extern int armci_rem_vector(int op, void *scale, armci_giov_t darr[],int len, - int proc,int flag,armci_ihdl_t nb_handle); -extern int armci_rem_strided(int op, void* scale, int proc, - void *src_ptr, int src_stride_arr[], - void* dst_ptr, int dst_stride_arr[], - int count[], int stride_levels, - ext_header_t *h, int lockit,armci_ihdl_t nb_handle); - -extern void armci_rem_rmw(int op, void *ploc, void *prem, int extra, int proc); -extern void armci_rem_ack(int clus); -extern void armci_server(request_header_t *msginfo, char *dscr, char* buf, - int buflen); -extern void armci_server_vector(request_header_t *msginfo, - char *dscr, char* buf, int buflen); -extern void *armci_server_ptr(int); -extern void armci_serv_attach_req(void *info, int ilen, long size, - void* resp,int rlen); -extern void armci_server_lock(request_header_t *msginfo); -extern void armci_server_unlock(request_header_t *msginfo, char* dscr); -extern void armci_create_server_thread ( void* (* func)(void*) ); -extern int armci_server_lock_mutex(int mutex, int proc, msg_tag_t tag); -extern void armci_send_data(request_header_t* msginfo, void *data); -extern int armci_server_unlock_mutex(int mutex, int p, int tkt, msg_tag_t* tag); -extern void armci_rcv_vector_data(int p, request_header_t* msginfo, armci_giov_t dr[], int len); - -#if !defined(LAPI) -extern void armci_wait_for_server(); -extern void armci_start_server(); -extern void armci_transport_cleanup(); -extern int armci_send_req_msg(int proc, void *buf, int bytes,int tag); -extern void armci_WriteToDirect(int proc, request_header_t* msginfo, void *buf); -extern char *armci_ReadFromDirect(int proc, request_header_t *msginfo, int len); -extern void armci_init_connections(); -extern void *armci_server_code(void *data); -extern void armci_rcv_req(void *mesg, void *phdr, void *pdescr, - void *pdata, int *buflen); -extern void armci_client_connect_to_servers(); -extern void armci_data_server(void *mesg); -extern void armci_server_initial_connection(); -extern void armci_call_data_server(); -#endif -#ifdef SOCKETS -extern void armci_ReadStridedFromDirect(int proc, request_header_t* msginfo, - void *ptr, int strides, int stride_arr[], int count[]); -extern void armci_WriteStridedToDirect(int proc, request_header_t* msginfo, - void *ptr, int strides, int stride_arr[], int count[]); -extern void armci_serv_quit(); -extern int armci_send_req_msg_strided(int proc, request_header_t *msginfo, - char *ptr, int strides, int stride_arr[],int count[]); -extern void armci_server_goodbye(request_header_t* msginfo); -#endif -#ifdef MPI_SPAWN -extern void armci_serv_quit(); -extern void armci_server_goodbye(request_header_t* msginfo); -#endif -#ifdef HITACHI -extern void armci_server_goodbye(request_header_t* msginfo); -extern void armci_serv_quit(); -#endif -extern void armci_server_ipc(request_header_t* msginfo, void* descr, - void* buffer, int buflen); - -#ifdef PIPE_BUFSIZE -extern void armci_pipe_prep_receive_strided(request_header_t *msginfo,char *buf, - int strides, int stride_arr[], int count[], int bufsize); -extern void armci_pipe_receive_strided(request_header_t* msginfo, void *ptr, - int stride_arr[], int count[], int strides); -extern void armci_pipe_send_req(int proc, void *buf, int bytes); -#endif - -extern void armci_rcv_strided_data_bypass_both(int, request_header_t*,void*, int*, int); -extern int armci_rem_get(int proc, void *src_ptr, int src_stride_arr[], - void* dst_ptr, int dst_stride_arr[], int count[], int stride_levels, - armci_ihdl_t nb_handle,void *mhloc,void *mhrem); - -#if defined(ALLOW_PIN) && defined(VAPI) -extern int armci_two_phase_send(int proc,void *src_ptr,int src_stride_arr[], - void *dst_ptr,int dst_stride_arr[],int count[], - int stride_levels,void ** context_ptr,armci_ihdl_t nbhandle, - ARMCI_MEMHDL_T *mhloc); -extern int armci_two_phase_get(int proc, void*src_ptr, int src_stride_arr[], - void*dst_ptr,int dst_stride_arr[], int count[], - int stride_levels, void**context_ptr, - armci_ihdl_t nbhandle, ARMCI_MEMHDL_T *mhloc); -#endif -#endif diff --git a/armci/src-portals/rmw.c b/armci/src-portals/rmw.c deleted file mode 100644 index e405d58ca..000000000 --- a/armci/src-portals/rmw.c +++ /dev/null @@ -1,140 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: rmw.c,v 1.24.2.5 2007-08-29 17:32:47 manoj Exp $ */ -#include "armcip.h" -#include "locks.h" -#include "copy.h" -#include -#if defined(__i386__) || defined(__x86_64__) -# include "atomics-i386.h" -#endif - -#ifdef LIBELAN_ATOMICS - -ELAN_ATOMIC *a; - -int elan_int_fadd(int *target, int inc, int vp) -{ - int result; - - elan_wait(elan_atomic32(a, ELAN_ATOMIC_ADD, target, inc, 0, vp, &result), elan_base->waitType); - return(result); -} - -int elan_long_fadd(long *target, long inc, int vp) -{ - long result; - -#ifdef _LP64 - elan_wait(elan_atomic64(a, ELAN_ATOMIC_ADD, target, inc, 0, vp, &result), elan_base->waitType); -#else - elan_wait(elan_atomic32(a, ELAN_ATOMIC_ADD, target, inc, 0, vp, &result), elan_base->waitType); -#endif - - return(result); -} - -int elan_int_swap(int *target, int value, int vp) -{ - int result; - - elan_wait(elan_atomic32(a, ELAN_ATOMIC_SWAP, target, value, 0, vp, &result), elan_base->waitType); - return(result); -} - -int elan_long_swap(long *target, long value, int vp) -{ - long result; - -#ifdef _LP64 - elan_wait(elan_atomic64(a, ELAN_ATOMIC_SWAP, target, value, 0, vp, &result), elan_base->waitType); -#else - elan_wait(elan_atomic32(a, ELAN_ATOMIC_SWAP, target, value, 0, vp, &result), elan_base->waitType); -#endif - - return(result); -} -#endif /* LIBELAN_ATOMICS */ - -/* enable use of newer interfaces in SHMEM */ -#ifndef CRAY -#ifndef LIBELAN_ATOMICS -/* manpages for shmem_fadd exist on the T3E but library code does not */ -#define SHMEM_FADD -#endif -#endif - - -/* global scope to prevent compiler optimization of volatile code */ -int _a_temp; -long _a_ltemp; - -void armci_generic_rmw(int op, void *ploc, void *prem, int extra, int proc) -{ -#if defined(CLUSTER) && !defined(SGIALTIX) - int lock = (proc-armci_clus_info[armci_clus_id(proc)].master)%NUM_LOCKS; -#else - int lock = 0; -#endif - - ARMCI_PR_DBG("enter",0); - NATIVE_LOCK(lock,proc); - switch (op) { - case ARMCI_FETCH_AND_ADD: - armci_get(prem,ploc,sizeof(int),proc); - _a_temp = *(int*)ploc + extra; - armci_put(&_a_temp,prem,sizeof(int),proc); - break; - case ARMCI_FETCH_AND_ADD_LONG: - armci_get(prem,ploc,sizeof(long),proc); - _a_ltemp = *(long*)ploc + extra; - armci_put(&_a_ltemp,prem,sizeof(long),proc); - break; - case ARMCI_SWAP: -#if (defined(__i386__) || defined(__x86_64__)) - if(SERVER_CONTEXT || armci_nclus==1){ - atomic_exchange(ploc, prem, sizeof(int)); - } - else -#endif - { - armci_get(prem,&_a_temp,sizeof(int),proc); - armci_put(ploc,prem,sizeof(int),proc); - *(int*)ploc = _a_temp; - } - break; - case ARMCI_SWAP_LONG: - armci_get(prem,&_a_ltemp,sizeof(long),proc); - armci_put(ploc,prem,sizeof(long),proc); - *(long*)ploc = _a_ltemp; - break; - default: armci_die("rmw: operation not supported",op); - } - /*TODO memfence here*/ - NATIVE_UNLOCK(lock,proc); - ARMCI_PR_DBG("exit",0); -} - - -int PARMCI_Rmw(int op, void *ploc, void *prem, int extra, int proc) -{ - if(!SAMECLUSNODE(proc)){ - armci_rem_rmw(op, ploc, prem, extra, proc); - return 0; - } - - switch (op) { - case ARMCI_FETCH_AND_ADD: - case ARMCI_FETCH_AND_ADD_LONG: - case ARMCI_SWAP: - case ARMCI_SWAP_LONG: - armci_generic_rmw(op, ploc, prem, extra, proc); - break; - default: armci_die("rmw: operation not supported",op); - } - - return 0; -} - diff --git a/armci/src-portals/rtinfo.c b/armci/src-portals/rtinfo.c deleted file mode 100644 index 41165682f..000000000 --- a/armci/src-portals/rtinfo.c +++ /dev/null @@ -1,26 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: rtinfo.c,v 1.1 2003-03-20 00:57:27 d3h325 Exp $ Run-time system configuration */ - -#include -#include - -/*\ determine number of CPUs on the current SMP node- Linux version for now -\*/ -int armci_getnumcpus(void) -{ -int numproc=0; -FILE* fp; -char line[80]; - fp=fopen("/proc/cpuinfo","r"); - if(fp==NULL) return -1; - while(!feof(fp)){ - fgets(line,80,fp); - if(strncmp(line,"processor",9)==0) numproc++; - } - fclose(fp); - return(numproc); -} - diff --git a/armci/src-portals/semaphores.c b/armci/src-portals/semaphores.c deleted file mode 100644 index cbc2b136f..000000000 --- a/armci/src-portals/semaphores.c +++ /dev/null @@ -1,98 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: semaphores.c,v 1.12 2005-03-10 19:11:23 vinodtipparaju Exp $ */ -#include "semaphores.h" -#include -#include - -int num_sem_alloc=0; -void perror(); -#ifdef SUN -int fprintf(); -void fflush(); -int semget(),semctl(); -#endif - -extern void armci_die(char*, int); - -struct sembuf sops; -int semaphoreID; - -int SemGet(num_sem) - int num_sem; -{ - semaphoreID = semget(IPC_PRIVATE,num_sem, IPC_CREAT | 0600); - if(semaphoreID<0){ - fprintf(stderr," Semaphore Allocation Failed \nsuggestions to fix the problem: \n"); - fprintf(stderr," 1. run ipcs and ipcrm -s commands to clean any semaphore ids\n"); - fprintf(stderr," 2. verify if constant SEMMSL defined in file semaphore.h is set correctly for your system\n"); - fprintf(stderr," 3. recompile semaphore.c\n"); - sleep(1); - perror("Error message from failed semget:"); - armci_die(" exiting ...", num_sem); - } - - num_sem_alloc = num_sem; - return(semaphoreID); -} - -void SemInit(id,value) - int id,value; -{ - int i, semid, num_sem; - union semun semctl_arg; - - semctl_arg.val = value; - - if(id == ALL_SEMS){ semid = 0; num_sem = num_sem_alloc;} - else { semid = id; num_sem = 1;} - - for(i=0; i< num_sem; i++){ - if( semctl(semaphoreID, semid, SETVAL,semctl_arg )<0){ - perror((char*)0); - armci_die("SemInit error",id); - } - semid++; - } -} - - -/* release semaphore(s) */ -void SemDel() -{ - union semun dummy; - - /* this is only to avoid compiler whinning about the unitialized variable*/ - dummy.val=0; - - (void) semctl(semaphoreID,0,IPC_RMID,dummy); -} - - -void Sem_CreateInitLocks(int num, lockset_t *id) -{ - *id = SemGet(num); - SemInit(ALL_SEMS,1); -} - - -void Sem_InitLocks(int num, lockset_t id) -{ - semaphoreID = id; - num_sem_alloc = num; -} - - -void Sem_DeleteLocks(lockset_t id) -{ - union semun dummy; - - /* this is only to avoid compiler whinning about the unitialized variable*/ - dummy.val=0; - - (void) semctl(id,0,IPC_RMID,dummy); -} - - diff --git a/armci/src-portals/semaphores.h b/armci/src-portals/semaphores.h deleted file mode 100644 index 379f95466..000000000 --- a/armci/src-portals/semaphores.h +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef _SEMAPHORES_H_ -#define _SEMAPHORES_H_ - -#if HAVE_SYS_TYPES_H -# include -#endif -#if HAVE_SYS_IPC_H -# include -#endif -#if HAVE_SYS_SEM_H -# include -#endif - -#if !HAVE_UNION_SEMUN -union semun { - int val; /* value for SETVAL */ - struct semid_ds *buf; /* buffer for IPC_STAT, IPC_SET */ - unsigned short int *array; /* array for GETALL, SETALL */ - struct seminfo *__buf; /* buffer for IPC_INFO */ -}; -#endif - -/* how many semaphores are available ? */ -#ifndef SEMMSL -# ifdef AIX -# define SEMMSL 8094 -# else -# define SEMMSL 16 -# endif -#endif - -/* on HPUX 10.2 SEMMSL is much bigger than realistically we can allocate */ -#ifdef HPUX -#undef SEMMSL -#define SEMMSL 64 -#endif - -extern struct sembuf sops; -extern int semaphoreID; -int semop(); -#define ALL_SEMS -1 - -#define _P_code -1 -#define _V_code 1 -#define P_semaphore(s) \ -{\ - sops.sem_num = (s);\ - sops.sem_op = _P_code;\ - sops.sem_flg = 0; \ - semop(semaphoreID,&sops,1);\ -} -#define V_semaphore(s) \ -{\ - sops.sem_num = (s);\ - sops.sem_op = _V_code;\ - sops.sem_flg = 0; \ - semop(semaphoreID,&sops,1);\ -} - -typedef int lockset_t; - -#endif diff --git a/armci/src-portals/shmalloc.c b/armci/src-portals/shmalloc.c deleted file mode 100644 index f5e1c4311..000000000 --- a/armci/src-portals/shmalloc.c +++ /dev/null @@ -1,89 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: shmalloc.c,v 1.10 2002-06-20 23:34:17 vinod Exp $ */ -#include -#include -#include "armcip.h" -#include "message.h" -#include "kr_malloc.h" - -static long *offset_arr; - -void armci_shmalloc_exchange_offsets(context_t *ctx_local) -{ - void **ptr_arr; - void *ptr; - armci_size_t bytes = 128; - int i; - - ptr_arr = (void**)malloc(armci_nproc*sizeof(void*)); - offset_arr = (long*) malloc(armci_nproc*sizeof(long)); - if(!ptr_arr || !offset_arr) armci_die("armci_shmalloc_get_offsets: malloc failed", 0); - - /* get memory with same size on all procs */ - ptr = kr_malloc(bytes, ctx_local); - if(!ptr) armci_die("armci_shmalloc_get_offsets: kr_malloc failed",bytes); - - bzero((char*)ptr_arr,armci_nproc*sizeof(void*)); - ptr_arr[armci_me] = ptr; - - /* now combine individual addresses into a single array */ - armci_exchange_address(ptr_arr, armci_nproc); - - /* identify offets */ - for (i=0; i -#include -#include -#include -#include -#include -#include -#include -#include "armci_shmem.h" -#include "kr_malloc.h" -#include "shmlimit.h" -#include "message.h" -#include "armcip.h" - -#ifdef ALLOC_MUNMAP -#include -#include -static size_t pagesize=0; -static int logpagesize=0; -/* allow only that big shared memory segment (in MB)- incresed from 128 11/02 */ -#define MAX_ALLOC_MUNMAP 128 -#define MAX_ALLOC_MUNMAP_ 368 -static long max_alloc_munmap=MAX_ALLOC_MUNMAP; -#endif - -#if defined(SUN) - extern char *shmat(); -#endif - -#define SHM_UNIT (1024) - - -/* Need to determine the max shmem segment size. There are 2 alternatives: - * 1. use predefined SHMMAX if available or set some reasonable values, or - * 2. trial-and-error search for a max value (default) - * case a) fork a process to determine shmmax size (more accurate) - * case b) search w/o forking until success (less accurate) - */ - -/* under Myrinet GM, we cannot fork */ -#if defined(GM) || defined(VAPI) -# define SHMMAX_SEARCH_NO_FORK -#endif -#if defined(LAPI) || defined(AIX) || defined(SHMMAX_SEARCH_NO_FORK) || defined(CRAY_XT) -# define NO_SHMMAX_SEARCH -#endif - -/* Limits for the largest shmem segment are in Kilobytes to avoid passing - * Gigavalues to kr_malloc - * the limit for the KSR is lower than SHMMAX in sys/param.h because - * shmat would fail -- SHMMAX cannot be trusted (a bug) - */ -#define _SHMMAX 4*1024 - -#if defined(SUN)||defined(SOLARIS) -# undef _SHMMAX -# define _SHMMAX (1024) /* memory in KB */ -#elif defined(SGI64) || defined(AIX) || defined(CONVEX) -# undef _SHMMAX -# define _SHMMAX ((unsigned long)512*1024) -#elif defined(SGI) && !defined(SGI64) -# undef _SHMMAX -# define _SHMMAX ((unsigned long)128*1024) -#elif defined(KSR) -# undef _SHMMAX -# define _SHMMAX ((unsigned long)512*1024) -#elif defined(HPUX) -# undef _SHMMAX -# define _SHMMAX ((unsigned long)64*1024) -#elif defined(__FreeBSD__) -# undef _SHMMAX -# define _SHMMAX ((unsigned long)3*1024) -#elif defined(LINUX) -# if !defined(SHMMAX) /* Red Hat does not define SHMMAX */ -# undef _SHMMAX -# if defined(__sparc__) || defined(__powerpc__) -# define _SHMMAX ((unsigned long)16*1024) -# elif defined(__alpha__) -# define _SHMMAX ((unsigned long)4072) -# else - /* Intel */ -# define _SHMMAX ((unsigned long)32*1024) -# endif -# endif -#elif defined(SHMMAX) -# undef _SHMMAX -# define _SHMMAX (((unsigned long)SHMMAX)>>10) -#endif - -static unsigned long MinShmem_per_core = 0; -static unsigned long MaxShmem_per_core = 0; -static unsigned long MinShmem = _SHMMAX; -static unsigned long MaxShmem = MAX_REGIONS*_SHMMAX; -static context_t ctx_shmem; /* kr_malloc context */ -static context_t *ctx_shmem_global; /* kr_malloc context stored in shmem */ -static int create_call=0; - -#ifdef SHMMAX_SEARCH_NO_FORK -static char *ptr_search_no_fork = (char*)0; -static int id_search_no_fork=0; -#endif - - -#ifdef LINUX -#define CLEANUP_CMD(command) sprintf(command,"/usr/bin/ipcrm shm %d",id); -#elif defined(SOLARIS) -#define CLEANUP_CMD(command) sprintf(command,"/bin/ipcrm -m %d",id); -#elif defined(SGI) -#define CLEANUP_CMD(command) sprintf(command,"/usr/sbin/ipcrm -m %d",id); -#else -#define CLEANUP_CMD(command) sprintf(command,"/usr/bin/ipcrm -m %d",id); -#endif - - -#ifdef ALLOC_MUNMAP -#ifdef QUADRICS -# include -# include - static char *armci_elan_starting_address = (char*)0; - -# ifdef __ia64__ -# define ALLOC_MUNMAP_ALIGN 1024*1024 -# else -# define ALLOC_MUNMAP_ALIGN 64*1024 -# endif - -# define ALGN_MALLOC(s,a) elan_allocMain(elan_base->state, (a), (s)) -#else -# define ALGN_MALLOC(s,a) malloc((s)) -#endif - -static char* alloc_munmap(size_t size) -{ -char *tmp; -unsigned long iptr; -size_t bytes = size+pagesize-1; - - if(armci_elan_starting_address){ - tmp = armci_elan_starting_address; - armci_elan_starting_address += size; -# ifdef ALLOC_MUNMAP_ALIGN - armci_elan_starting_address += ALLOC_MUNMAP_ALIGN; -# endif - if(DEBUG_) {printf("%d: address for shm attachment is %p size=%ld\n", - armci_me,tmp,(long)size); fflush(stdout); } - } else { - tmp = ALGN_MALLOC(bytes, getpagesize()); - if(tmp){ - iptr = (unsigned long)tmp + pagesize-1; - iptr >>= logpagesize; iptr <<= logpagesize; - if(DEBUG_) printf("%d:unmap ptr=%p->%p size=%d pagesize=%d\n",armci_me, - tmp,(char*)iptr,(int)size,pagesize); - tmp = (char*)iptr; - if(munmap(tmp, size) == -1) armci_die("munmap failed",0); - if(DEBUG_){printf("%d: unmap OK\n",armci_me); fflush(stdout);} - }else armci_die("alloc_munmap: malloc failed",(int)size); - } - return tmp; -} -#endif - -/*\ A wrapper to shmget. Just to be sure that ID is not 0. -\*/ -int armci_shmget(size_t size,char *from) -{ -int id; - - id = shmget(IPC_PRIVATE, size, (IPC_CREAT | 00600)); - - /*attaching with id 0 somehow fails (Seen on pentium4+linux24+gm163) - *so if id=0, shmget again. */ - while(id==0){ - /* free id=0 and get a new one */ - if(shmctl((int)id,IPC_RMID,(struct shmid_ds *)NULL)) { - fprintf(stderr,"id=%d \n",id); - armci_die("allocate: failed to _delete_ shared region ",id); - } - id = shmget(IPC_PRIVATE, size, (IPC_CREAT | 00600)); - } - if(DEBUG_){ - printf("\n%d:armci_shmget sz=%ld caller=%s id=%d\n",armci_me,(long)size, - from,id); - fflush(stdout); - } - return(id); -} - -static int -Semget(key_t key,int nsems,int semflg) { - int ret; - - if((ret = semget(key,nsems,semflg)) == -1) { - switch(errno) { - case EACCES: fprintf(stdout," semget errno=EACCES.\n"); break; - case EINVAL: fprintf(stdout," semget errno=EINVAL.\n"); break; - case ENOENT: fprintf(stdout," semget errno=ENOENT.\n"); break; - case ENOSPC: fprintf(stdout," semget errno=ENOSPC -- check system limit for sysv semaphores.\n"); break; - case ENOMEM: fprintf(stdout," semget errno=ENOMEM.\n"); break; - case EEXIST: fprintf(stdout," semget errno=EEXIST.\n"); break; - default: - fprintf(stdout," unusual semget errno=%d\n",errno); break; - } - armci_die("semget failed",errno); - } - - return ret; -} - - -int armci_semget(int count) -{ - int id; - id = Semget(IPC_PRIVATE,2,0600); - return id; -} - -int armci_semrm(int id) -{ - semctl(id,0,IPC_RMID); -} - -int armci_shmrm(int id) -{ - int ret; - if((ret = shmctl(id,IPC_RMID,NULL)) != 0) { - fprintf(stdout,"[cp]: shmctl return an error.\n"); - switch(errno) { - case EINVAL: - fprintf(stdout," Error EINVAL: shmid is not a valid shared memory segment.\n"); - break; - case EFAULT: - fprintf(stdout," Error EFAULT: argument 3 is not a valid struct shmid_ds.\n"); - break; - case EPERM: - fprintf(stdout," Error EPREM: permission to access/change shared mem segment denied.\n"); - break; - default: - fprintf(stdout," unusual shmctl errno=%d\n",errno); break; - break; - } - armci_die("error deleting shmid",id); - } - return ret; -} - -/*\ test is a shared memory region of a specified size can be allocated - * return 0 (no) or 1 (yes) -\*/ -int armci_test_allocate(long size) -{ - char *ptr; - int id = armci_shmget((size_t)size,"armci_test_allocate"); - if (id <0) return 0; - - /* attach to segment */ - ptr = shmat(id, (char *) NULL, 0); - - /* delete segment id */ - if(shmctl( id, IPC_RMID, (struct shmid_ds *)NULL)) - fprintf(stderr,"failed to remove shm id=%d\n",id); - - /* test pointer */ - if (((long)ptr) == -1L) return 0; - else return 1; -} - - -/*\ try to allocate a shared memory region of a specified size; return pointer -\*/ -static int armci_shmalloc_try(long size) -{ -#ifdef SHMMAX_SEARCH_NO_FORK - char *ptr; - int id = armci_shmget((size_t) size,"armci_shmalloc_try"); - if (id <0) return 0; - - /* attach to segment */ - ptr = shmat(id, (char *) NULL, 0); - - /* test pointer */ - if (((long)ptr) == -1L) return 0; - - ptr_search_no_fork = ptr; - id_search_no_fork = id; -#endif - return 1; -} - - - - -/* parameters that define range and granularity of search for shm segment size - * UBOUND is chosen to be < 2GB to avoid overflowing on 32-bit systems - * smaller PAGE gives more accurate results but with more search steps - * LBOUND is set to minimum amount for our purposes - * change UBOUND=512MB if you need larger arrays than 512 MB - */ -#define PAGE (16*65536L) -#define LBOUND 1048576L -#if defined(MULTI_CTX) && defined(QUADRICS) -#define UBOUND 256*LBOUND -#else -#define UBOUND 512*LBOUND -#endif - -static long get_user_shmmax() -{ -char *uval; -long x=0; - uval = getenv("ARMCI_DEFAULT_SHMMAX"); - if(uval != NULL){ - sscanf(uval,"%ld",&x); - if(x<1L || x> 2048L){ - fprintf(stderr,"incorrect ARMCI_DEFAULT_SHMMAX should be <1,2048>mb and 2^N Found=%ld\n",x); - x=0; - } - } - return x*1048576; /* return value in bytes */ -} - -/*\ determine the max shmem segment size using bisection -\*/ -int armci_shmem_test() -{ -long x; -int i,rc; -long upper_bound=UBOUND; -long lower_bound=0; - - x = get_user_shmmax(); - if(!x) x = upper_bound; - else upper_bound =x; - - if(DEBUG_){printf("%d: x = %ld upper_bound=%ld\n",armci_me, x, upper_bound); fflush(stdout);} - - for(i=1;;i++){ - long step; - rc = armci_test_allocate(x); - if(DEBUG_) - printf("%d:test %d size=%ld bytes status=%d\n",armci_me,i,x,rc); - if(rc){ - lower_bound = x; - step = (upper_bound -x)>>1; - if(step < PAGE) break; - x += step; - }else{ - upper_bound = x; - step = (x-lower_bound)>>1; - if(step>=20; - x <<=20; - } - - if(!lower_bound){ - /* try if can get LBOUND - necessary if search starts from UBOUND */ - lower_bound=LBOUND; - rc = armci_test_allocate(lower_bound); - if(!rc) return(0); - } - - if(DEBUG_) printf("%ld bytes segment size, %d calls \n",lower_bound,i); - return (int)( lower_bound>>20); /* return shmmax in mb */ -} - - -#ifdef SHMMAX_SEARCH_NO_FORK -/*\ determine the max shmem segment size by halving -\*/ -static int armci_shmem_test_no_fork() -{ -long x; -int i,rc; -long lower_bound=_SHMMAX*SHM_UNIT; -#define UBOUND_SEARCH_NO_FORK (256*SHM_UNIT*SHM_UNIT) - - x = get_user_shmmax(); - if(!x) x = UBOUND_SEARCH_NO_FORK; - - for(i=1;;i++){ - - rc = armci_shmalloc_try(x); - if(DEBUG_) - printf("%d:test by halving size=%ld bytes rc=%d\n",armci_me,x,rc); - - if(rc){ - lower_bound = x; - break; - }else{ - x >>= 1 ; - if(x>20); /* return shmmax in mb */ -} -#endif - - -#ifdef MULTI_CTX -void armci_nattach_preallocate_info(int* segments, int *segsize) -{ - int x; - char *uval; - uval = getenv("LIBELAN_NATTACH"); - if(uval != NULL){ - sscanf(uval,"%d",&x); - if(x<2 || x>8) armci_die("Error in LIBELAN_NATTACH <8, >1 ",(int)x); - }else - armci_die("Inconsistent configuration: ARMCI needs LIBELAN_NATTACH",0); - *segments =x; - *segsize = (int) (SHM_UNIT * MinShmem); - -} -#endif - -/* Create shared region to store kr_malloc context in shared memory */ -void armci_krmalloc_init_ctxshmem() { - void *myptr=NULL; - long idlist[SHMIDLEN]; - long size; - int offset = sizeof(void*)/sizeof(int); - - /* to store shared memory context and myptr */ - size = SHMEM_CTX_MEM; - - if(armci_me == armci_master ){ - myptr = Create_Shared_Region(idlist+1,size,idlist); - if(!myptr && size>0 ) armci_die("armci_krmalloc_init_ctxshmem: could not create", (int)(size>>10)); - if(size) *(volatile void**)myptr = myptr; - if(DEBUG_){ - printf("%d:armci_krmalloc_init_ctxshmem addr mptr=%p ref=%p size=%ld\n", armci_me, myptr, *(void**)myptr, size); - fflush(stdout); - } - - /* Bootstrapping: allocate storage for ctx_shmem_global. NOTE:there is - offset,as master places its address at begining for others to see */ - ctx_shmem_global = (context_t*) ( ((int*)myptr)+offset ); - *ctx_shmem_global = ctx_shmem; /*master copies ctx into shared region */ - } - - /* broadcast shmem id to other processes on the same cluster node */ - armci_msg_clus_brdcst(idlist, SHMIDLEN*sizeof(long)); - - if(armci_me != armci_master){ - myptr=(double*)Attach_Shared_Region(idlist+1,size,idlist[0]); - if(!myptr)armci_die("armci_krmalloc_init_ctxshmem: could not attach", (int)(size>>10)); - - /* now every process in a SMP node needs to find out its offset - * w.r.t. master - this offset is necessary to use memlock table - */ - if(size) armci_set_mem_offset(myptr); - if(DEBUG_){ - printf("%d:armci_krmalloc_init_ctxshmem attached addr mptr=%p ref=%p size=%ld\n", armci_me,myptr, *(void**)myptr,size); fflush(stdout); - } - /* store context info */ - ctx_shmem_global = (context_t*) ( ((int*)myptr)+offset ); - if(DEBUG_){ - printf("%d:armci_krmalloc_init_ctxshmem: shmid=%d off=%ld size=%ld\n", armci_me, ctx_shmem_global->shmid, ctx_shmem_global->shmoffset, - (long)ctx_shmem_global->shmsize); - fflush(stdout); - } - } -} - -void armci_shmem_init() -{ - -#ifdef ALLOC_MUNMAP - -#if defined(QUADRICS) -# if (defined(__ia64__) || defined(__alpha)) && !defined(DECOSF) - - /* this is to determine size of Elan Main memory allocator for munmap */ - long x; - char *uval; - uval = getenv("LIBELAN_ALLOC_SIZE"); - if(uval != NULL){ - sscanf(uval,"%ld",&x); - if((x>80000000) && (x< 4*1024*1024*1024L)){ - max_alloc_munmap = (x>>20) - 72; - if(DEBUG_){ - printf("%d: max_alloc_munmap is %ld\n",armci_me,max_alloc_munmap); - fflush(stdout); - } - } - } - - /* an alternative approach is to use MMAP area where we get - the address from the Elan environment variable in qsnetlibs 1.4+ */ - uval = getenv("LIBELAN3_MMAPBASE"); - if(uval != NULL){ - sscanf(uval,"%p",&armci_elan_starting_address); - } - -# endif -# if defined(__ia64__) - /* need aligment on 1MB boundary rather than the actual pagesize */ - pagesize = 1024*1024; - logpagesize = 20; -# else - /* determine log2(pagesize) needed for address alignment */ - int tp=512; - logpagesize = 9; - pagesize = getpagesize(); - if(tp>pagesize)armci_die("armci_shmem_init:pagesize",pagesize); - - while(tpmax_alloc_munmap && !armci_elan_starting_address) x=max_alloc_munmap; -# else - x = 10; /* mb */ -# endif -# endif - - if(DEBUG_){ - printf("%d:shmem_init: %d mbytes max segment size\n",armci_me,x);fflush(stdout);} - - MinShmem = (long)(x<<10); /* make sure it is in kb: mb <<10 */ - MaxShmem = MAX_REGIONS*MinShmem; -# ifdef REPORT_SHMMAX - printf("%d using x=%d SHMMAX=%ldKB\n", armci_me,x, MinShmem); - fflush(stdout); -# endif -#else - - /* nothing to do here - limits were given */ - -#endif - } - - armci_krmalloc_init_ctxshmem(); - if(DEBUG_)printf("%d: out of shmem_init\n",armci_me); -} - -void armci_set_shmem_limit_per_node(int nslaves) -{ - if (MaxShmem_per_core > 0) MaxShmem = nslaves*MaxShmem_per_core; - if (MinShmem_per_core > 0) MinShmem = nslaves*MinShmem_per_core; -} - -void armci_set_shmem_limit_per_core(unsigned long shmemlimit) -{ - MaxShmem_per_core = (shmemlimit + SHM_UNIT - 1)/SHM_UNIT; - MinShmem_per_core = (shmemlimit + SHM_UNIT - 1)/SHM_UNIT; -} - -/*\ application can reset the upper limit (bytes) for memory allocation -\*/ -void armci_set_shmem_limit(unsigned long shmemlimit) -{ - unsigned long kbytes; - kbytes = (shmemlimit + SHM_UNIT -1)/SHM_UNIT; - if(MaxShmem > kbytes) MaxShmem = kbytes; - if(MinShmem > kbytes) MinShmem = kbytes; -} - - -static void shmem_errmsg(size_t size) -{ -long sz=(long)size; - printf("******************* ARMCI INFO ************************\n"); - printf("The application attempted to allocate a shared memory segment "); - printf("of %ld bytes in size. This might be in addition to segments ",sz); - printf("that were allocated succesfully previously. "); - printf("The current system configuration does not allow enough "); - printf("shared memory to be allocated to the application.\n"); - printf("This is most often caused by:\n1) system parameter SHMMAX "); - printf("(largest shared memory segment) being too small or\n"); - printf("2) insufficient swap space.\n"); - printf("Please ask your system administrator to verify if SHMMAX "); - printf("matches the amount of memory needed by your application and "); - printf("the system has sufficient amount of swap space. "); - printf("Most UNIX systems can be easily reconfigured "); - printf("to allow larger shared memory segments,\n"); - printf("see https://hpc.pnl.gov/globalarrays/support.shtml\n"); - printf("In some cases, the problem might be caused by insufficient swap space.\n"); - printf("*******************************************************\n"); -} - - -static struct shm_region_list{ - char *addr; - long id; - long sz; - long attached; -}region_list[MAX_REGIONS]; -static int alloc_regions=0; -static long occup_blocks=0; - -/* Terminology - * region - actual piece of shared memory allocated from OS - * block - a part of allocated shmem that is given to the requesting process - */ - - -static int last_allocated=-1; - - -unsigned long armci_max_region() -{ - return MinShmem; -} - - -int find_regions(char *addrp, long* id, int *region) -{ -int nreg, reg; - - if(last_allocated!=-1){ - reg=last_allocated; - last_allocated = -1; - } else{ - - for(reg=-1,nreg=0;nreg= region_list[nreg].addr && - addrp < (region_list[nreg].addr + region_list[nreg].sz)) - { - reg = nreg; - break; - } - } - - if(reg == -1) - armci_die("find_regions: failed to locate shared region", 0L); - } - - *region = reg; - *id = region_list[reg].id; - - return 1; -} - -/* returns the shmem info based on the addr */ -int armci_get_shmem_info(char *addrp, int* shmid, long *shmoffset, - size_t *shmsize) -{ - int region; long id; - - find_regions(addrp, &id, ®ion); - *shmid = id; - *shmoffset = (long)(addrp - region_list[region].addr); - *shmsize = region_list[region].sz; - - return 1; -} - -long armci_shm_reg_size(int i, long id) -{ - if(i<0 || i>= MAX_REGIONS)armci_die("armci_shmem_reg_size: bad i",i); - return region_list[i].sz; -} - -void* armci_shm_reg_ptr(int i) -{ - if(i<0 || i>= MAX_REGIONS)armci_die("armci_shmem_reg_ptr: bad i",i); - return (void *)region_list[i].addr; -} - -Header *armci_get_shmem_ptr(int shmid, long shmoffset, size_t shmsize) -{ -/* returns, address of the shared memory region based on shmid, offset. - * (i.e. return_addr = stating address of shmid + offset)*/ - long idlist[SHMIDLEN]; - Header *p = NULL; - - idlist[1] = (long)shmid; - idlist[0] = shmoffset; - idlist[IDLOC+1] = shmsize; /* CHECK : idlist in CreateShmem????*/ - - if(!(p=(Header*)Attach_Shared_Region(idlist+1, shmsize, idlist[0]))) - armci_die("kr_malloc:could not attach",(int)(p->s.shmsize>>10)); -#if DEBUG_ - printf("%d: armci_get_shmem_ptr: %d %ld %ld %p\n", - armci_me, idlist[1], idlist[0], shmsize, p); - fflush(stdout); -#endif - return p; -} - - -char *Attach_Shared_Region(id, size, offset) - long *id, offset, size; -{ -int reg, found, shmflag=0; -static char *temp; - - if(alloc_regions>=MAX_REGIONS) - armci_die("Attach_Shared_Region: to many regions ",0); - - if(DEBUG_){ - printf("%d:AttachSharedRegion %d:size=%ld id=%ld\n", - armci_me, create_call++, size,*id); - fflush(stdout); - } - - - /* under Linux we can get valid id=0 */ -#ifndef LINUX - if(!*id) armci_die("Attach_Shared_Region: shmem ID=0 ",(int)*id); -#endif - - /* first time needs to initialize region_list structure */ - if(!alloc_regions){ - for(reg=0;reg= MAX_REGIONS) - armci_die("Create_Shared_Region:allocate:too many regions allocated ",0); - - last_allocated = alloc_regions; - -#ifdef SHMMAX_SEARCH_NO_FORK - if (ptr_search_no_fork){ - temp = ptr_search_no_fork; - id = id_search_no_fork; - ptr_search_no_fork = (char*)0; /* do not look at it again */ - }else -#endif - { - if ( (id = armci_shmget(sz,"armci_allocate")) < 0 ) { - fprintf(stderr,"id=%d size=%ld\n",id, size); - shmem_errmsg(sz); - armci_die("allocate: failed to create shared region ",id); - } - - if ( (long)( (temp = shmat(id, pref_addr, shmflag))) == -1L){ - char command[64]; - CLEANUP_CMD(command); - if(system(command) == -1) - printf("Please clean shared memory (id=%d): see man ipcrm\n",id); - armci_die("allocate: failed to attach to shared region id=",id); - } - if(DEBUG_){ - printf("%d:allocate:attach:id=%d paddr=%p size=%ld\n",armci_me,id,temp,size); - fflush(stdout); - } -#if 1 - /* delete segment id so that OS cleans it when all attached processes are gone */ - if(shmctl( id, IPC_RMID, (struct shmid_ds *)NULL)) - fprintf(stderr,"failed to remove shm id=%d\n",id); -#endif - - } - POST_ALLOC_CHECK(temp,sz); - - region_list[alloc_regions].addr = temp; - region_list[alloc_regions].id = id; - region_list[alloc_regions].attached=1; - region_list[alloc_regions].sz=sz; - alloc_regions++; - - if(DEBUG2_){ - printf("%d:allocate:id=%d addr=%p size=%ld\n",armci_me,id,temp,size); - fflush(stdout); - } - - return (void*) (temp); -} - -/******************** common code for the two versions *********************/ - - -/*\ Allocate a block of shared memory - called by master process -\*/ -char *Create_Shared_Region(long *id, long size, long *offset) -{ - char *temp; -int reg, refreg=0,nreg; - - if(alloc_regions>=MAX_REGIONS) - armci_die("Create_Shared_Region: to many regions ",0); - - if(DEBUG_){ - printf("%d:CreateSharedRegion %d:size=%ld\n",armci_me,create_call++,size); - fflush(stdout); - } - - /*initialization: 1st allocation request */ - if(!alloc_regions){ - for(reg=0;reg -#include -#include -#include -#include -#include -#include "shmlimit.h" - -#define DEBUG_ 0 - -#if defined(DECOSF) || defined(SOLARIS64) || defined(HPUX) -#define PIPE_AFTER_FORK_BUG -#endif - -void (*armci_sig_chld_orig)(); -static int status=0; -int armci_shmlimit_caught_sigchld=0; - -#if defined(SUN) && !defined(SOLARIS) -static void SigChldHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -static void SigChldHandler(sig) -#endif - int sig; -{ -#ifdef DISABLED - int pid; - pid = wait(&status); -#endif - armci_shmlimit_caught_sigchld=1; -} - -static void TrapSigChld() -{ - if ( (armci_sig_chld_orig = signal(SIGCHLD, SigChldHandler)) == SIG_ERR) - armci_die("TrapSigChld: error from signal setting SIGCHLD",0); -} - - -static void RestoreSigChld() -{ - if ( signal(SIGCHLD, armci_sig_chld_orig) == SIG_ERR) - armci_die("Restore_SigChld: error from restoring signal SIGChld",0); -} - - -static int child_finished() -{ - return armci_shmlimit_caught_sigchld; -} - - -int armci_child_shmem_init() -{ - pid_t pid; - int x; -#ifdef PIPE_AFTER_FORK_BUG - int i; -#endif - - int y; - int fd[2]; - - if(pipe(fd)==-1) armci_die("armci shmem_test pipe failed",0); - - TrapSigChld(); - - if ( (pid = fork() ) < 0) - - armci_die("armci shmem_test fork failed", (int)pid); - - else if(pid == 0){ - - x= armci_shmem_test(); - -#ifdef PIPE_AFTER_FORK_BUG - /* due to a bug in OSF1 V4.0/1229/alpha first item written gets hosed*/ - for(i=0;i<2;i++) -#endif - if(write(fd[1],&x,sizeof(int)) -#include -#ifndef WIN32 -#include -#include -#include -#include -#include -#endif - -#define PAUSE_ON_ERROR - -#define Error armci_die - -#if (defined(ENCORE) || defined(SEQUENT) || defined(ARDENT)) -# define SigType int -#else -# define SigType void -#endif - -#ifndef SIG_ERR -# define SIG_ERR (SigType (*)())-1 -#endif - -#if defined(SUN) || defined(ALLIANT) || defined(ENCORE) || defined(SEQUENT) || \ - defined(AIX) || defined(NEXT) -#include -#endif - -extern void Error(); -extern int armci_me; - -int AR_caught_sigint=0; -int AR_caught_sigterm=0; -int AR_caught_sigchld=0; -int AR_caught_sigsegv=0; -int AR_caught_sig=0; - -SigType (*SigChldOrig)(), (*SigIntOrig)(), (*SigHupOrig)(), (*SigTermOrig)(); -SigType (*SigSegvOrig)(); - - -/*********************** SIGINT *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigIntHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigIntHandler(sig) -#endif - int sig; -{ - AR_caught_sigint = 1; - AR_caught_sig= sig; - Error("SigIntHandler: interrupt signal was caught",(int) sig); -} - -void TrapSigInt() -/* - Trap the signal SIGINT so that we can propagate error - conditions and also tidy up shared system resources in a - manner not possible just by killing everyone -*/ -{ - if ( (SigIntOrig = signal(SIGINT, SigIntHandler)) == SIG_ERR) - Error("TrapSigInt: error from signal setting SIGINT",0); -} - -void RestoreSigInt() -/* - Restore the original signal handler -*/ -{ - if(AR_caught_sigint) SigIntOrig(SIGINT); - if ( signal(SIGINT, SigIntOrig) == SIG_ERR) - Error("RestoreSigInt: error from restoring signal SIGINT",0); -} - - -/*********************** SIGABORT *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigAbortHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigAbortHandler(sig) -#endif - int sig; -{ - AR_caught_sig= sig; - Error("SigIntHandler: abort signal was caught: cleaning up",(int) sig); -} - -void TrapSigAbort() -/* - Trap the signal SIGINT so that we can propagate error - conditions and also tidy up shared system resources in a - manner not possible just by killing everyone -*/ -{ - if ( signal(SIGINT, SigAbortHandler) == SIG_ERR) - Error("TrapSigAbort: error from signal setting SIGABORT",0); -} - - - -/*********************** SIGCHLD *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigChldHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigChldHandler(sig) -#endif - int sig; -{ - int status; -#if defined(ALLIANT) || defined(ENCORE) || defined(SEQUENT) || defined(NEXT) - union wait ustatus; -#endif - -#if defined(LINUX) - pid_t ret; - /* Trap signal as soon as possible to avoid race */ - if ( (SigChldOrig = signal(SIGCHLD, SigChldHandler)) == SIG_ERR) - Error("SigChldHandler: error from signal setting SIGCHLD",0); -#endif - -#if defined(ALLIANT) || defined(ENCORE) || defined(SEQUENT) || defined(NEXT) - -# if defined(LINUX) - ret = wait(&ustatus); - if((ret == 0) || ((ret == -1) && (errno == ECHILD))) { return; } -# else - (void) wait(&ustatus); -# endif - status = ustatus.w_status; - -#else - -# if defined(LINUX) - ret = waitpid(0, &status, WNOHANG); - if((ret == 0) || ((ret == -1) && (errno == ECHILD))) { return; } -# else - (void)wait(&status); -# endif - -#endif - AR_caught_sigchld=1; - AR_caught_sig= sig; - Error("Child process terminated prematurely, status=",(int) status); -} - -void TrapSigChld() -/* - Trap SIGCHLD so that can tell if children die unexpectedly. -*/ -{ - if ( (SigChldOrig = signal(SIGCHLD, SigChldHandler)) == SIG_ERR) - Error("TrapSigChld: error from signal setting SIGCHLD",0); -} - - -void RestoreSigChld() -/* - Restore the original signal handler -*/ -{ - if(AR_caught_sigchld) SigChldOrig(SIGCHLD); - if ( signal(SIGCHLD, SigChldOrig) == SIG_ERR) - Error("RestoreSigChld: error from restoring signal SIGChld",0); -} - - -void RestoreSigChldDfl() -{ -(void) signal(SIGCHLD, SIG_DFL); -} - - -/*********************** SIGBUS *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigBusHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigBusHandler(sig) -#endif - int sig; -{ - AR_caught_sig= sig; -#ifdef PAUSE_ON_ERROR - fprintf(stderr,"%d(%d): Bus Error ... pausing\n", - armci_me, getpid() );pause(); -#endif - Error("Bus error, status=",(int) sig); -} - -void TrapSigBus() -/* - Trap SIGBUS -*/ -{ - if ( signal(SIGBUS, SigBusHandler) == SIG_ERR) - Error("TrapSigBus: error from signal setting SIGBUS", 0); -} - - - - -/*********************** SIGFPE *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigFpeHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigFpeHandler(sig) -#endif - int sig; -{ - AR_caught_sig= sig; - Error("Floating Point Exception error, status=",(int) sig); -} - -void TrapSigFpe() -/* - Trap SIGFPE -*/ -{ - if ( signal(SIGFPE, SigFpeHandler) == SIG_ERR) - Error("TrapSigFpe: error from signal setting SIGFPE", 0); -} - - - - -/*********************** SIGILL *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigIllHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigIllHandler(sig) -#endif - int sig; -{ - AR_caught_sig= sig; - Error("Illegal Instruction error, status=",(int) sig); -} - -void TrapSigIll() -/* - Trap SIGILL -*/ -{ - if ( signal(SIGILL, SigIllHandler) == SIG_ERR) - Error("TrapSigIll: error from signal setting SIGILL", 0); -} - - - - -/*********************** SIGSEGV *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigSegvHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigSegvHandler(sig) -#endif - int sig; -{ - AR_caught_sig= sig; - AR_caught_sigsegv=1; -#ifdef PAUSE_ON_ERROR - fprintf(stderr,"%d(%d): Segmentation Violation ... pausing\n", - armci_me, getpid() );pause(); -#endif - - Error("Segmentation Violation error, status=",(int) sig); -} -#ifdef DO_CKPT -static void * signal_arr[100]; -SigType SigSegvActionSa(int sig,siginfo_t *sinfo, void *ptr) -{ - int (*func)(); - AR_caught_sig= sig; - AR_caught_sigsegv=1; - func = signal_arr[sig]; - /*printf("\n%d:in sigaction %p, %d\n",armci_me,sinfo->si_addr,sinfo->si_errno);fflush(stdout);*/ - - if(func(sinfo->si_addr,sinfo->si_errno,sinfo->si_fd)) - Error("Segmentation Violation error, status=",(int) SIGSEGV); -} - -void TrapSigSegvSigaction() -{ - struct sigaction sa; - sa.sa_sigaction = (void *)SigSegvActionSa; - sigemptyset(&sa.sa_mask); - sa.sa_flags = SA_RESTART; - sigaction(SIGSEGV, &sa, NULL); -} -#endif - -void TrapSigSegv() -/* - Trap SIGSEGV -*/ -{ - if ( (SigSegvOrig=signal(SIGSEGV, SigSegvHandler)) == SIG_ERR) - Error("TrapSigSegv: error from signal setting SIGSEGV", 0); -} - - -void RestoreSigSegv() -/* - Restore the original signal handler -*/ -{ -/* - if(AR_caught_sigsegv) SigSegvOrig(SIGSEGV); -*/ -#ifdef DO_CKPT__ - struct sigaction sa; - sa.sa_handler = (void *)SigSegvOrig; - sigemptyset(&sa.sa_mask); - sa.sa_flags = SA_RESTART; - sigaction(SIGSEGV, &sa, NULL); - sigaction(SIGSEGV,&sa,NULL); -#else - if ( signal(SIGSEGV,SigSegvOrig) == SIG_ERR) - Error("RestoreSigSegv: error from restoring signal SIGSEGV",0); -#endif -} - - -/*********************** SIGSYS *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigSysHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigSysHandler(sig) -#endif - int sig; -{ - AR_caught_sig= sig; - Error("Bad Argument To System Call error, status=",(int) sig); -} - -void TrapSigSys() -/* - Trap SIGSYS -*/ -{ -#ifndef LINUX - if ( signal(SIGSYS, SigSysHandler) == SIG_ERR) - Error("TrapSigSys: error from signal setting SIGSYS", 0); -#endif -} - - - -/*********************** SIGTRAP *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigTrapHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigTrapHandler(sig) -#endif - int sig; -{ - AR_caught_sig= sig; - Error("Trace Trap error, status=",(int) sig); -} - -void TrapSigTrap() -/* - Trap SIGTRAP -*/ -{ - if ( signal(SIGTRAP, SigTrapHandler) == SIG_ERR) - Error("TrapSigTrap: error from signal setting SIGTRAP", 0); -} - - - -/*********************** SIGHUP *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigHupHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigHupHandler(sig) -#endif - int sig; -{ - AR_caught_sig= sig; - Error("Hangup error, status=",(int) sig); -} - -void TrapSigHup() -/* - Trap SIGHUP -*/ -{ - if ( (SigHupOrig = signal(SIGHUP, SigHupHandler)) == SIG_ERR) - Error("TrapSigHup: error from signal setting SIGHUP", 0); -} - - -void RestoreSigHup() -/* - Restore the original signal handler -*/ -{ - if(AR_caught_sig== SIGHUP) SigHupOrig(SIGHUP); - if ( signal(SIGHUP, SigHupOrig) == SIG_ERR) - Error("RestoreSigHUP: error from restoring signal SIGHUP",0); -} - - - -/*********************** SIGTERM *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigTermHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigTermHandler(sig) -#endif - int sig; -{ - AR_caught_sigterm = 1; - AR_caught_sig= sig; - Error("Terminate signal was sent, status=",(int) sig); -} - -void TrapSigTerm() -/* - Trap SIGTERM -*/ -{ - if ( (SigTermOrig = signal(SIGTERM, SigTermHandler)) == SIG_ERR) - Error("TrapSigTerm: error from signal setting SIGTERM", 0); -} - -void RestoreSigTerm() -/* - Restore the original signal handler -*/ -{ - if(AR_caught_sigterm && (SigTermOrig != SIG_DFL) ) SigTermOrig(SIGTERM); - if ( signal(SIGTERM, SigTermOrig) == SIG_ERR) - Error("RestoreSigTerm: error from restoring signal SIGTerm",0); -} - - -/*********************** SIGIOT *************************************/ -#ifdef SIGIOT -#if defined(SUN) && !defined(SOLARIS) -SigType SigIotHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigIotHandler(sig) -#endif - int sig; -{ - AR_caught_sig= sig; - Error("IOT signal was sent, status=",(int) sig); -} - -void TrapSigIot() -/* - Trap SIGIOT -*/ -{ - if ( signal(SIGIOT, SigIotHandler) == SIG_ERR) - Error("TrapSigIot: error from signal setting SIGIOT", 0); -} -#endif - - - -/*********************** SIGCONT *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigContHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigContHandler(sig) -#endif - int sig; -{ -/* Error("Trace Cont error, status=",(int) sig);*/ - AR_caught_sig= sig; -} - -void TrapSigCont() -/* - Trap SIGCONT -*/ -{ - if ( signal(SIGCONT, SigContHandler) == SIG_ERR) - Error("TrapSigCont: error from signal setting SIGCONT", 0); -} - -/*********************** SIGXCPU *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigXcpuHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigXcpuHandler(sig) -#endif - int sig; -{ - AR_caught_sig= sig; - Error("Terminate signal was sent, status=",(int) sig); -} - -void TrapSigXcpu() -/* - Trap SIGXCPU -*/ -{ - if ( signal(SIGXCPU, SigXcpuHandler) == SIG_ERR) - Error("TrapSigXcpu: error from signal setting SIGXCPU", 0); -} - -/******************* external API *********************************/ - -void ARMCI_ChildrenTrapSignals() -{ -#ifndef LAPI - TrapSigBus(); -#endif - TrapSigFpe(); - TrapSigIll(); -#ifdef DO_CKPT - TrapSigSegvSigaction(); -#else - TrapSigSegv(); -#endif - TrapSigSys(); - TrapSigTrap(); - TrapSigAbort(); - TrapSigTerm(); - TrapSigInt(); - -#if defined(LAPI) || defined(SGI) - TrapSigIot(); -#endif - -#ifdef SGI - TrapSigXcpu(); -#endif - -} - - -void ARMCI_ParentTrapSignals() -{ -#ifndef LAPI - TrapSigChld(); -#endif - TrapSigHup(); -} - - -void ARMCI_RestoreSignals() -{ - RestoreSigTerm(); - RestoreSigInt(); - RestoreSigSegv(); -} - - -void ARMCI_ParentRestoreSignals() -{ -#ifndef LAPI - RestoreSigChld(); -#endif - ARMCI_RestoreSignals(); - RestoreSigHup(); -} - -#ifdef DO_CKPT -/*user can register a function with 3 parameters, 1st offending address - * 2nd err number and third file descriptor*/ -void ARMCI_Register_Signal_Handler(int sig, void (*func)()) -{ - signal_arr[sig]=func; -} -#endif diff --git a/armci/src-portals/signaltrap.h b/armci/src-portals/signaltrap.h deleted file mode 100644 index 7e961826e..000000000 --- a/armci/src-portals/signaltrap.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _SIGNALTRAP_H_ -#define _SIGNALTRAP_H_ - -#ifdef SYSV - extern void ARMCI_ChildrenTrapSignals(); - extern void ARMCI_ParentTrapSignals(); - extern void ARMCI_ParentRestoreSignals(); - extern void ARMCI_RestoreSignals(); -#else -# define ARMCI_ChildrenTrapSignals() -# define ARMCI_ParentTrapSignals() -# define ARMCI_ParentRestoreSignals() -#endif - -#endif diff --git a/armci/src-portals/sockets.h b/armci/src-portals/sockets.h deleted file mode 100644 index e76b5d277..000000000 --- a/armci/src-portals/sockets.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef SOCKETS_H_ -#define SOCKETS_H_ -#include "armci.h" -#ifndef WIN32 -#define USE_SOCKET_VECTOR_API -#endif -#if defined(USE_SOCKET_VECTOR_API) -# include -#endif -extern int tcp_sendrcv_bufsize; -extern int armci_PollSocket(int sock); -extern int armci_WaitSock(int *socklist, int num, int *ready); -extern int armci_ReadFromSocket(int sock, void* buffer, int lenbuf); -extern int armci_WriteToSocket (int sock, void* buffer, int lenbuf); - -#if defined(USE_SOCKET_VECTOR_API) -extern int armci_RecvStridedFromSocket(int sock,void* buffer,int *str_arr,int *cnt,int str_level,struct iovec *iov); -extern int armci_SendStridedToSocket(int sock,void* buffer,int *str_arr,int *cnt,int str_level,struct iovec *iov); -extern int armci_RecvVectorFromSocket(int sock,armci_giov_t darr[], int len,struct iovec *iov); -extern int armci_SendVectorToSocket(int sock,armci_giov_t darr[], int len,struct iovec *iov); -extern int armci_ReadVFromSocket(int sock,struct iovec *iov, int iovlength, int totalsize); -extern int armci_WriteVToSocket (int sock,struct iovec *iov, int iovlength, int totalsize); -#endif -extern void armci_ListenSockAll(int* socklist, int num); -extern void armci_AcceptSockAll(int* socklist, int num); -extern int armci_CreateSocketAndConnect(char *hostname, int port); -extern void armci_ShutdownAll(int socklist[], int num); -extern void armci_CreateSocketAndBind(int *sock, int *port); -#define PACKET_SIZE tcp_sendrcv_bufsize -#define TIMEOUT_ACCEPT 60 -#define GET_SEND_BUFFER _armci_buf_get_clear_busy -#define FREE_SEND_BUFFER _armci_buf_release - -#ifndef UIO_MAXIOV -#define MAX_IOVEC 8 -#else -#define MAX_IOVEC (UIO_MAXIOV>100?100:UIO_MAXIOV) -#endif -#endif diff --git a/armci/src-portals/spawn.c b/armci/src-portals/spawn.c deleted file mode 100644 index a1d7d5157..000000000 --- a/armci/src-portals/spawn.c +++ /dev/null @@ -1,147 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* OS specific server process/thread creation and destruction - * JN/03.25.2000 - */ -#if HAVE_STDIO_H -# include -#endif -#if HAVE_ERRNO_H -# include -#endif -#include "armcip.h" - -#ifdef WIN32 -/************************** Windows threads **************************/ -#if HAVE_WINDOWS_H -# include -#endif -#if HAVE_PROCESS_H -# include -#endif - -thread_id_t armci_serv_tid; -unsigned long armci_serv_handle; -#ifndef NO_STDLIBC -#define NEWTHREAD CreateThread -#else -#define NEWTHREAD _beginthreadex -#endif - -unsigned __stdcall armci_wrap_func(void *arg) -{ -void (*func)(void*); - func = arg; - - /* boost the server thread priority be better responsiveness */ - (void)SetThreadPriority(GetCurrentThread(),THREAD_PRIORITY_HIGHEST); - - func(NULL); - - return (unsigned)0; -} - - -void armci_create_server_thread ( void* (* func)(void*) ) -{ - /* as we need to use std C rt library we cannot use CreateThread */ - armci_serv_handle = NEWTHREAD(NULL, 0, armci_wrap_func, (void*)func, - 0, &armci_serv_tid); - if(!armci_serv_handle) - armci_die("armci_create_server_thread: create failed",0); - - - -} - -void armci_terminate_server_thread() -{ -/*int rc;*/ -/* TerminateThread(armci_serv_handle,&rc);*/ -} - -/****************************** PTHREADS *****************************/ -#elif defined(PTHREADS) -#include - -thread_id_t armci_serv_tid; - -void armci_create_server_thread ( void* (* func)(void*) ) -{ -pthread_attr_t attr; -int rc; - - if(pthread_attr_init(&attr)) - armci_die("armci_create_server_thread: attr init failed",0); - -#if defined(AIX) || defined(SOLARIS) - pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM); -#endif - - rc = pthread_create(&armci_serv_tid, &attr, func, NULL); - if(rc) armci_die("armci_create_server_thread: create failed",errno); - - pthread_attr_destroy(&attr); -} - - -void armci_terminate_server_thread() -{ - if(pthread_join(armci_serv_tid,NULL)) - armci_die("armci_terminate_server_thread: failed",0); -} - - -#else -/**************************** Unix processes ******************************/ - -#if HAVE_UNISTD_H -# include -#endif -#if HAVE_SYS_TYPES_H -# include -#endif -#if HAVE_SYS_WAIT_H -# include -#endif - -pid_t server_pid= (pid_t)0; - - -char child_stack[256*1024]; -char *child_stack_top = &child_stack[256*1024-1]; - -void armci_create_server_process ( void* (* func)(void*) ) -{ -pid_t pid; -/* - if ( (pid = fork() ) < 0) - armci_die("fork failed", (int)pid); -*/ - pid = clone(func, (void*)child_stack_top, - CLONE_THREAD|CLONE_SIGHAND|CLONE_VM, NULL); - - if (pid == -1) { - armci_die("fork failed", (int)pid); - } - - server_pid = pid; -} - - -void armci_wait_server_process() -{ - int stat; - pid_t rc; - - if(!server_pid) return; - rc = wait (&stat); - if (rc != server_pid){ - perror("ARMCI master: wait for child process (server) failed:"); - } - server_pid = (pid_t)0; -} - -#endif diff --git a/armci/src-portals/spinlock.h b/armci/src-portals/spinlock.h deleted file mode 100644 index a6b306f10..000000000 --- a/armci/src-portals/spinlock.h +++ /dev/null @@ -1,235 +0,0 @@ -/** - * @file spinlock.h - * - * This file attempts to implement spin locks for various platforms and/or CPU - * instruction sets. - */ -#ifndef SPINLOCK_H -#define SPINLOCK_H - -#define DEBUG_SPINLOCK 0 - -#define OPENPA 0 - -#if OPENPA -# if DEBUG_SPINLOCK -# warning SPINLOCK: openpa -# endif -# define SPINLOCK -# include "opa_primitives.h" -# define LOCK_T OPA_int_t -# define TESTANDSET(x) OPA_swap_int((x), 1) -# define MEMORY_BARRIER OPA_read_write_barrier - -#elif (defined(PPC) || defined(__PPC__) || defined(__PPC)) -# if DEBUG_SPINLOCK -# warning SPINLOCK: PPC -# endif -# define SPINLOCK -# include "asm-ppc.h" -//# define TESTANDSET testandset -//# define TESTANDSET acquireLock -# define armci_acquire_spinlock acquire_spinlock -# define armci_release_spinlock release_spinlock -# define MEMORY_BARRIER memory_barrier -static int testandset(void *spinlock) { - int v=1; - atomic_exchange(&v,spinlock,sizeof(int)); - return v; -} -static void memory_barrier() { - __asm__ __volatile__ ("sync" : : : "memory"); -} - -#elif defined(__i386__) || defined(__x86_64__) -# if DEBUG_SPINLOCK -# warning SPINLOCK: x86_64 -# endif -# define SPINLOCK -# include "atomics-i386.h" -static int testandset(void *spinlock) { - int v=1; - atomic_exchange(&v,spinlock,sizeof(int)); - return v; -} -# define TESTANDSET testandset - -#elif defined(__ia64) -# if DEBUG_SPINLOCK -# warning SPINLOCK: ia64 -# endif -# define SPINLOCK -# include "atomic_ops_ia64.h" -static int testandset(void *spinlock) { - int val=1; - int res; - atomic_swap_int(spinlock, val, &res); - return res; -} -# define TESTANDSET testandset - -#elif defined(DECOSF) -# if DEBUG_SPINLOCK -# warning SPINLOCK: DECOSF -# endif -# error "no implementation" - -#elif defined(SGI) -# if DEBUG_SPINLOCK -# warning SPINLOCK: SGI -# endif -# include -# define SPINLOCK -# define TESTANDSET(x) __lock_test_and_set((x), 1) -# define RELEASE_SPINLOCK __lock_release - -/*#elif defined(AIX)*/ -#elif HAVE_SYS_ATOMIC_OP_H -# if DEBUG_SPINLOCK -# warning SPINLOCK: sys/atomic_op.h (AIX) -# endif -# include -# define SPINLOCK -# define TESTANDSET(x) (_check_lock((x), 0, 1)==TRUE) -# define RELEASE_SPINLOCK(x) _clear_lock((x),0) - -#elif defined(SOLARIS) -# if DEBUG_SPINLOCK -# warning SPINLOCK: SOLARIS -# endif -# include -# include -# define SPINLOCK -# define TESTANDSET(x) (!_lock_try((x))) -# define RELEASE_SPINLOCK _lock_clear - -#elif defined(MACX) - -#elif defined(HPUX__) -# if DEBUG_SPINLOCK -# warning SPINLOCK: HPUX__ -# endif -extern int _acquire_lock(); -extern void _release_lock(); -# define SPINLOCK -# define TESTANDSET(x) (!_acquire_lock((x))) -# define RELEASE_SPINLOCK _release_lock - -#elif defined(HPUX) && defined(__ia64) /* HPUX on IA64, non gcc */ -# if DEBUG_SPINLOCK -# warning SPINLOCK: HPUX ia64 -# endif -# define SPINLOCK -typedef unsigned int slock_t; -# include -# define TESTANDSET(lock) _Asm_xchg(_SZ_W, lock, 1, _LDHINT_NONE) -# define RELEASE_SPINLOCK(lock) (*((volatile LOCK_T *) (lock)) = 0) - -#elif defined(NEC) -# if DEBUG_SPINLOCK -# warning SPINLOCK: NEC -# endif -extern ullong ts1am_2me(); -# define LOCK_T ullong -# define _LKWD (1ULL << 63) -# define SPINLOCK -# define TESTANDSET(x) ((_LKWD & ts1am_2me(_LKWD, 0xffULL, (ullong)(x)))) -# define MEMORY_BARRIER mpisx_clear_cache -extern void mpisx_clear_cache(); -# define RELEASE_SPINLOCK(x) ts1am_2me(0ULL, 0xffULL, (ullong)x); - -#endif - -#ifdef SPINLOCK - -#if DEBUG_ -# if HAVE_STDIO_H -# include -# endif -#endif - -#if HAVE_UNISTD_H -# include -#endif - -#ifndef DBL_PAD -# define DBL_PAD 16 -#endif - -/* make sure that locks are not sharing the same cache line */ -typedef struct{ - double lock[DBL_PAD]; -}pad_lock_t; - -#ifndef LOCK_T -# define LOCK_T int -#endif -#define PAD_LOCK_T pad_lock_t - -static inline void armci_init_spinlock(LOCK_T *mutex) -{ -#if OPENPA - OPA_store_int(mutex, 0); -#else - *mutex =0; -#endif -} - -#ifdef TESTANDSET - -static inline void armci_acquire_spinlock(LOCK_T *mutex) -{ -#if defined(BGML) || defined(DCMF) - return; -#else - int loop=0, maxloop =10; - - while (TESTANDSET(mutex)){ - loop++; - if(loop==maxloop){ -# if DEBUG_ - extern int armci_me; - printf("%d:spinlock sleeping\n",armci_me); fflush(stdout); -# endif - usleep(1); - loop=0; - } - } -#endif -} - -#ifdef RELEASE_SPINLOCK -# ifdef MEMORY_BARRIER -# define armci_release_spinlock(x) MEMORY_BARRIER(); RELEASE_SPINLOCK(x) -# else -# define armci_release_spinlock(x) RELEASE_SPINLOCK(x) -# endif -#else -static inline void armci_release_spinlock(LOCK_T *mutex) -{ -#if defined(BGML) || defined(DCMF) - return; -#else -# ifdef MEMORY_BARRIER - MEMORY_BARRIER (); -# endif -#if OPENPA - OPA_store_int(mutex, 0); -#else - *mutex =0; -#endif -# ifdef MEMORY_BARRIER - MEMORY_BARRIER (); -# endif -# if (defined(MACX)||defined(LINUX)) && defined(__GNUC__) && defined(__ppc__) - __asm__ __volatile__ ("isync" : : : "memory"); -# endif -#endif -} -#endif /* RELEASE_SPINLOCK */ - -#endif /* TESTANDSET */ - -#endif /* SPINLOCK */ - -#endif /* SPINLOCK_H */ diff --git a/armci/src-portals/strided.c b/armci/src-portals/strided.c deleted file mode 100644 index 1dfcffd6b..000000000 --- a/armci/src-portals/strided.c +++ /dev/null @@ -1,1499 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include -#include - -#include "armcip.h" -#include "copy.h" -#include "acc.h" -#include "memlock.h" -#include -#include - -#define DATA_SERVER_ 1 - -#ifdef ORNL_USE_DS_FOR_REMOTE_GETS -#define DATA_SERVER_GET_ 1 -#else -#define DATA_SERVER_GET_ 0 -#endif - -#define ARMCI_OP_2D(op, scale, proc, src, dst, bytes, count, src_stride, dst_stride,lockit)\ -if(op == GET || op ==PUT)\ - armci_copy_2D(op, proc, src, dst, bytes, count, src_stride,dst_stride);\ -else if(count==1) armci_acc_1D(op, scale, proc, src, dst, bytes,lockit);\ -else\ - armci_acc_2D(op, scale, proc, src, dst, bytes, count, src_stride,dst_stride,lockit) - -/* macro supports run-time selection of request sending scheme */ -#if defined(CLIENT_BUF_BYPASS) -#define CAN_REQUEST_DIRECTLY _armci_bypass -#else -# if defined(HITACHI) -# define CAN_REQUEST_DIRECTLY 0 -# else -# define CAN_REQUEST_DIRECTLY 1 -# endif -#endif - -#define PREPROCESS_STRIDED(tmp_count) {\ - tmp_count=0;\ - if(stride_levels) \ - for(;stride_levels;stride_levels--)if(count[stride_levels]>1)break;\ - if(stride_levels&&(count[0]==src_stride_arr[0]&&count[0]==dst_stride_arr[0])){\ - tmp_count=seg_count[1];\ - count = seg_count+1;\ - seg_count[1] = seg_count[0] * seg_count[1];\ - stride_levels --;\ - src_stride_arr ++; dst_stride_arr++ ;\ - }\ -} -#define POSTPROCESS_STRIDED(tmp_count) if(tmp_count)seg_count[1]=tmp_count - -#define SERVER_GET 1 -#define SERVER_NBGET 2 -#define DIRECT_GET 3 -#define DIRECT_NBGET 4 -#define SERVER_PUT 5 -#define SERVER_NBPUT 6 -#define DIRECT_PUT 7 -#define DIRECT_NBPUT 8 - - -# define DO_FENCE(__proc,__prot) if(__prot==SERVER_GET);\ - else if(__prot==SERVER_PUT);\ - else if(__prot==DIRECT_GET || __prot==DIRECT_NBGET){\ - if(armci_prot_switch_fence[__proc]==SERVER_PUT)\ - ARMCI_DoFence(__proc);\ - }\ - else if(__prot==DIRECT_PUT || __prot==DIRECT_NBPUT){\ - if(armci_prot_switch_fence[__proc]==SERVER_PUT)\ - ARMCI_DoFence(__proc);\ - }\ - else;\ - armci_prot_switch_fence[__proc]=__prot - -#ifndef REGIONS_REQUIRE_MEMHDL -# define ARMCI_MEMHDL_T void -#endif - -ARMCI_MEMHDL_T *mhloc=NULL,*mhrem=NULL; - -#ifdef REGIONS_REQUIRE_MEMHDL - int armci_region_both_found_hndl(void *loc, void *rem, int size, int node, - ARMCI_MEMHDL_T **loc_memhdl,ARMCI_MEMHDL_T **rem_memhdl); -# define ARMCI_REGION_BOTH_FOUND(_s,_d,_b,_p) \ - armci_region_both_found_hndl((_s),(_d),(_b),(_p),&mhloc,&mhrem) -#else -# define ARMCI_REGION_BOTH_FOUND(_s,_d,_b,_p) \ - armci_region_both_found((_s),(_d),(_b),(_p)) -#endif - -#ifdef HAS_RDMA_GET - -# ifdef REGIONS_REQUIRE_MEMHDL - void armci_client_direct_get(int p, void *src_buf, void *dst_buf, int len, - void** cptr,int nbtag,ARMCI_MEMHDL_T *lochdl,ARMCI_MEMHDL_T *remhdl); -# else - void armci_client_direct_get(int p, void *src_buf, void *dst_buf, int len, - void** contextptr,int nbtag,void *mhdl,void *mhdl1); -# endif -# define ARMCI_NBREM_GET(_p,_s,_sst,_d,_dst,_cou,_lev,_hdl) \ - armci_client_direct_get((_p),(_s),(_d),(_cou)[0],&((_hdl)->cmpl_info),(_hdl)->tag,(void *)mhloc,(void *)mhrem); \ - -# define ARMCI_REM_GET(_p,_s,_sst,_d,_dst,_cou,_lev,_hdl) \ - armci_client_direct_get((_p),(_s),(_d),(_cou)[0],NULL,0,(void *)mhloc,(void *)mhrem); \ - -#else - -# define ARMCI_REM_GET(_p,_s,_sst,_d,_dst,_cou,_lev,_hdl) \ - armci_rem_get((_p),(_s),(_sst),(_d),(_dst),(_cou),(_lev),(_hdl),(void *)mhloc,(void *)mhrem) -# define ARMCI_NBREM_GET ARMCI_REM_GET - -#endif - - extern int* armci_prot_switch_fence; - extern int armci_prot_switch_preproc; - extern int armci_prot_switch_preop; - - -int armci_iwork[MAX_STRIDE_LEVEL]; - -/*\ 2-dimensional array copy -\*/ -static void armci_copy_2D(int op, int proc, void *src_ptr, void *dst_ptr, - int bytes, int count, int src_stride, int dst_stride) -{ - int armci_th_idx = ARMCI_THREAD_IDX; - -#ifdef LAPI2__ -# define COUNT 1 -#else -# define COUNT count -#endif - -#ifdef __crayx1 - int shmem = 1; -#else - int shmem = SAMECLUSNODE(proc); -#endif - if(shmem) { - /* data is in local/shared memory -- can use memcpy */ -// printf("%s: shmem==true; count==%d\n",Portals_ID(),count); - if(count==1){ - armci_copy(src_ptr, dst_ptr, bytes); -// printf("%s: shmem==true; finished\n",Portals_ID(),count); - }else { - char *ps=(char*)src_ptr; - char *pd=(char*)dst_ptr; - int j; - for (j = 0; j < count; j++){ - bcopy(ps,pd,bytes); - ps += src_stride; - pd += dst_stride; - } - } - } else { - - /* data not in local/shared memory-access through global address space*/ - - if(op==PUT){ - - printf("%s: pre UPDATE_FENCE_STATE\n",Portals_ID()); - UPDATE_FENCE_STATE(proc, PUT, COUNT); - printf("%s: post UPDATE_FENCE_STATE\n",Portals_ID()); -#ifdef LAPI - SET_COUNTER(ack_cntr[armci_th_idx],COUNT); -#endif - if(count==1){ - armci_put(src_ptr, dst_ptr, bytes, proc); - }else{ - armci_put2D(proc, bytes, count, src_ptr, src_stride, - dst_ptr, dst_stride); - } - - }else{ - -#ifdef LAPI - SET_COUNTER(get_cntr[armci_th_idx], COUNT); -#endif - if(count==1){ - armci_get(src_ptr, dst_ptr, bytes, proc); - }else{ - armci_get2D(proc, bytes, count, src_ptr, src_stride, - dst_ptr, dst_stride); - } - } - } -} - - -#if (defined(CRAY) && !defined(__crayx1)) || defined(FUJITSU) -#ifdef CRAY -# define DAXPY SAXPY -#else -# define DAXPY daxpy_ -#endif - -static int ONE=1; -#define THRESH_ACC 32 - -static void daxpy_2d_(void* alpha, int *rows, int *cols, void *a, int *ald, - void* b, int *bld) -{ - int c,r; - double *A = (double*)a; - double *B = (double*)b; - double Alpha = *(double*)alpha; - - if(*rows < THRESH_ACC) - for(c=0;c<*cols;c++) - for(r=0;r<*rows;r++) - A[c* *ald+ r] += Alpha * B[c* *bld+r]; - else for(c=0;c<*cols;c++) - DAXPY(rows, alpha, B + c* *bld, &ONE, A + c* *ald, &ONE); -} -#endif - - -void armci_acc_1D(int op, void *scale, int proc, void *src, void *dst, int bytes, int lockit) -{ -int rows; -void (ATR *func)(void*, void*, void*, int*); - ARMCI_PR_DBG("enter",0); - switch (op){ - case ARMCI_ACC_INT: - rows = bytes/sizeof(int); - func = I_ACCUMULATE_1D; - break; - case ARMCI_ACC_LNG: - rows = bytes/sizeof(long); - func = L_ACCUMULATE_1D; - break; - case ARMCI_ACC_DBL: - rows = bytes/sizeof(double); - func = D_ACCUMULATE_1D; - break; - case ARMCI_ACC_DCP: - rows = bytes/(2*sizeof(double)); - func = Z_ACCUMULATE_1D; - break; - case ARMCI_ACC_CPL: - rows = bytes/(2*sizeof(float)); - func = C_ACCUMULATE_1D; - break; - case ARMCI_ACC_FLT: - rows = bytes/sizeof(float); - func = F_ACCUMULATE_1D; - break; - default: armci_die("ARMCI accumulate: operation not supported",op); - func = F_ACCUMULATE_1D; /*avoid compiler whining */ - } - - - if(lockit){ - ARMCI_LOCKMEM(dst, bytes + (char*)dst, proc); - } - func(scale, dst, src, &rows); - if(lockit)ARMCI_UNLOCKMEM(proc); - ARMCI_PR_DBG("exit",0); -} - -/*\ 2-dimensional accumulate -\*/ -void armci_acc_2D(int op, void* scale, int proc, void *src_ptr, void *dst_ptr, - int bytes, int cols, int src_stride, int dst_stride, int lockit) -{ -int rows, lds, ldd, span; -void (ATR *func)(void*, int*, int*, void*, int*, void*, int*); - - ARMCI_PR_DBG("enter",0); - -/* - if((long)src_ptr%ALIGN)armci_die("src not aligned",(long)src_ptr); - if((long)dst_ptr%ALIGN)armci_die("src not aligned",(long)dst_ptr); -*/ - - switch (op){ - case ARMCI_ACC_INT: - rows = bytes/sizeof(int); - ldd = dst_stride/sizeof(int); - lds = src_stride/sizeof(int); - func = I_ACCUMULATE_2D; - break; - case ARMCI_ACC_LNG: - rows = bytes/sizeof(long); - ldd = dst_stride/sizeof(long); - lds = src_stride/sizeof(long); - func = L_ACCUMULATE_2D; - break; - case ARMCI_ACC_DBL: - rows = bytes/sizeof(double); - ldd = dst_stride/sizeof(double); - lds = src_stride/sizeof(double); - func = D_ACCUMULATE_2D; - break; - case ARMCI_ACC_DCP: - rows = bytes/(2*sizeof(double)); - ldd = dst_stride/(2*sizeof(double)); - lds = src_stride/(2*sizeof(double)); - func = Z_ACCUMULATE_2D; - break; - case ARMCI_ACC_CPL: - rows = bytes/(2*sizeof(float)); - ldd = dst_stride/(2*sizeof(float)); - lds = src_stride/(2*sizeof(float)); - func = C_ACCUMULATE_2D; - break; - case ARMCI_ACC_FLT: - rows = bytes/sizeof(float); - ldd = dst_stride/sizeof(float); - lds = src_stride/sizeof(float); - func = F_ACCUMULATE_2D; - break; - case ARMCI_ACC_RA: - rows = bytes/sizeof(long); - ldd = dst_stride/sizeof(long); - lds = src_stride/sizeof(long); - func = RA_ACCUMULATE_2D; - break; - default: armci_die("ARMCI accumulate: operation not supported",op); - func = F_ACCUMULATE_2D; /*avoid compiler whining */ - } - - - if(lockit){ - span = cols*dst_stride; - ARMCI_LOCKMEM(dst_ptr, span + (char*)dst_ptr, proc); - } - func(scale, &rows, &cols, dst_ptr, &ldd, src_ptr, &lds); - if(lockit)ARMCI_UNLOCKMEM(proc); - ARMCI_PR_DBG("exit",0); - -} - - -/*\ compute range of strided data AND lock it -\*/ -static void -armci_lockmem_patch(void* dst_ptr, int dst_stride_arr[], int count[], int stride_levels, int proc) -{ - long span = count[stride_levels]; - ARMCI_PR_DBG("enter",0); - span *= dst_stride_arr[stride_levels-1]; - - /* lock region of remote memory */ - ARMCI_LOCKMEM(dst_ptr, span + (char*)dst_ptr, proc); - ARMCI_PR_DBG("exit",0); -} - - -/*\ strided accumulate on top of remote memory copy: - * copies remote data to local buffer, accumulates, puts it back - * Note: if we are here then remote patch must fit in the ARMCI buffer -\*/ -int armci_acc_copy_strided(int optype, void* scale, int proc, - void* src_ptr, int src_stride_arr[], - void* dst_ptr, int dst_stride_arr[], - int count[], int stride_levels) -{ - void *buf_ptr = armci_internal_buffer; - int rc, i, *buf_stride_arr = armci_iwork; - ARMCI_PR_DBG("enter",0); - armci_lockmem_patch(dst_ptr,dst_stride_arr, count, stride_levels, proc); - - /* setup stride array for internal buffer */ - buf_stride_arr[0]=count[0]; - for(i=0; i< stride_levels; i++) { - buf_stride_arr[i+1]= buf_stride_arr[i]*count[i+1]; - } - - /* get remote data to local buffer */ - rc = armci_op_strided(GET, scale, proc, dst_ptr, dst_stride_arr, buf_ptr, - buf_stride_arr, count, stride_levels, 0,NULL); - - if(rc) { ARMCI_UNLOCKMEM(proc); return(rc); } - - /* call local accumulate with lockit=0 (we locked it already) and proc=me */ - rc = armci_op_strided(optype, scale, armci_me, src_ptr, src_stride_arr, - buf_ptr,buf_stride_arr, count, stride_levels,0,NULL); - if(rc) { ARMCI_UNLOCKMEM(proc); return(rc); } - - /* put data back from the buffer to remote location */ - rc = armci_op_strided(PUT, scale, proc, buf_ptr, buf_stride_arr, dst_ptr, - dst_stride_arr, count, stride_levels,0,NULL); - - FENCE_NODE(proc); /* make sure put completes before unlocking */ - ARMCI_UNLOCKMEM(proc); /* release memory lock */ - ARMCI_PR_DBG("exit",0); - - return(rc); -} - - - -/*\ Strided operation -\*/ -int armci_op_strided(int op, void* scale, int proc,void *src_ptr, - int src_stride_arr[], void* dst_ptr, int dst_stride_arr[], - int count[], int stride_levels, int lockit, - armci_ihdl_t nb_handle) -{ -char *src = (char*)src_ptr, *dst=(char*)dst_ptr; -int s2, s3, i,j, unlockit=0; -int total_of_2D; -int index[MAX_STRIDE_LEVEL], unit[MAX_STRIDE_LEVEL]; - ARMCI_PR_DBG("enter",op); -# if defined(ACC_COPY) - -# ifdef ACC_SMP - if(ARMCI_ACC(op) && !(SAMECLUSNODE(proc)) ) -# else - if ( ARMCI_ACC(op) && proc!=armci_me) -# endif - /* copy remote data, accumulate, copy back*/ - return (armci_acc_copy_strided(op,scale, proc, src_ptr, src_stride_arr, - dst_ptr, dst_stride_arr, count, stride_levels)); - - else; /* do it directly through shared/local memory */ -# endif - - - if(ARMCI_ACC(op) && (stride_levels>2) && lockit){ - /* we need one lock operation only - must be done outside 2d acc */ - armci_lockmem_patch(dst_ptr,dst_stride_arr, count, stride_levels, proc); - unlockit=1; - lockit =0; - } -/* if(proc!=armci_me) INTR_OFF;*/ - if(armci_me>=0 && !SAMECLUSNODE(proc)) { - printf("%s network_strided not supported (in op_strided)\n",Portals_ID()); - abort(); - armci_network_strided(op,scale,proc,src_ptr,src_stride_arr,dst_ptr, - dst_stride_arr,count,stride_levels,nb_handle); - } - else { -// printf("%s in large switch stmt in op_strided (stride_levels=%d)\n",Portals_ID(),stride_levels); - switch (stride_levels) { - case 0: /* 1D copy */ - - ARMCI_OP_2D(op, scale, proc, src_ptr, dst_ptr, count[0], 1, - count[0], count[0], lockit); - - break; - - case 1: /* 2D op */ - ARMCI_OP_2D(op, scale, proc, src_ptr, dst_ptr, count[0], count[1], - src_stride_arr[0], dst_stride_arr[0], lockit); - break; - - case 2: /* 3D op */ - for (s2= 0; s2 < count[2]; s2++){ /* 2D copy */ - ARMCI_OP_2D(op, scale, proc, src+s2*src_stride_arr[1], - dst+s2*dst_stride_arr[1], count[0], count[1], - src_stride_arr[0], dst_stride_arr[0], lockit ); - } - break; - - case 3: /* 4D op */ - for(s3=0; s3< count[3]; s3++){ - src = (char*)src_ptr + src_stride_arr[2]*s3; - dst = (char*)dst_ptr + dst_stride_arr[2]*s3; - for (s2= 0; s2 < count[2]; s2++){ /* 3D copy */ - ARMCI_OP_2D(op, scale, proc, src+s2*src_stride_arr[1], - dst+s2*dst_stride_arr[1], - count[0], count[1],src_stride_arr[0], - dst_stride_arr[0],lockit); - } - } - break; - - default: /* N-dimensional */ - { - /* stride_levels is not the same as ndim. it is ndim-1 - * For example a 10x10x10... array, suppose the datatype is byte - * the stride_arr is 10, 10x10, 10x10x10 .... - */ - index[2] = 0; unit[2] = 1; total_of_2D = count[2]; - for(j=3; j<=stride_levels; j++) { - index[j] = 0; unit[j] = unit[j-1] * count[j-1]; - total_of_2D *= count[j]; - } - - for(i=0; i= count[j]) index[j] = 0; - } - - ARMCI_OP_2D(op, scale, proc, src, dst, count[0], count[1], - src_stride_arr[0], dst_stride_arr[0], lockit); - } - - } - } - } // ends else block - -// printf("%s after switch stmt; prior to fence/lock\n",Portals_ID()); - - if(unlockit){ -# if defined(ACC_COPY) - FENCE_NODE(proc); -# endif - ARMCI_UNLOCKMEM(proc); /* release memory lock */ - } - -// printf("%s after fence/lock; leaving op_strided\n",Portals_ID()); - ARMCI_PR_DBG("exit",op); - return 0; -} - - -int PARMCI_PutS( void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int seg_count[], /* number of segments at each stride - levels: count[0]=bytes*/ - int stride_levels, /* number of stride levels */ - int proc /* remote process(or) ID */ - ) -{ -int rc=0, direct=1; -int *count=seg_count, tmp_count=0; - - ARMCI_PR_DBG("enter",proc); - if(src_ptr == NULL || dst_ptr == NULL) return FAIL; - if(count[0]<0)return FAIL3; - if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4; - if(proc<0)return FAIL5; - - ORDER(PUT,proc); /* ensure ordering */ - PREPROCESS_STRIDED(tmp_count); - -#if DATA_SERVER_ - if(stride_levels) direct=SAMECLUSNODE(proc); - direct=SAMECLUSNODE(proc); -#endif - - // printf("%s direct=%d, proc=%d\n",Portals_ID(),direct,proc); - - if(!direct){ - DO_FENCE(proc,SERVER_PUT); -// printf("%s calling pack_strided in PARMCI_PutS\n",Portals_ID()); - rc = armci_pack_strided(PUT, NULL, proc, src_ptr, src_stride_arr,dst_ptr, - dst_stride_arr, count, stride_levels, NULL, -1, -1, -1,NULL); - } - else - { - if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_PUT); -// printf("%s calling op_strided in PARMCI_PutS\n",Portals_ID()); - rc = armci_op_strided( PUT, NULL, proc, src_ptr, src_stride_arr, - dst_ptr, dst_stride_arr,count,stride_levels, - 0,NULL); - } - POSTPROCESS_STRIDED(tmp_count); - - ARMCI_PR_DBG("exit",proc); - if(rc) return FAIL6; - else return 0; - -} - -int PARMCI_PutS_flag( - void* src_ptr, /* pointer to 1st segment at source */ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination */ - int dst_stride_arr[], /* array of strides at destination */ - int count[], /* number of units at each stride level, - count[0] = #bytes */ - int stride_levels, /* number of stride levels */ - int *flag, /* pointer to remote flag */ - int val, /* value to set flag upon completion of - data transfer */ - int proc /* remote process(or) ID */ - ) -{ - int bytes; - /* Put local data on remote processor */ - PARMCI_PutS(src_ptr, src_stride_arr, dst_ptr, dst_stride_arr, - count, stride_levels, proc); - - /* Send signal to remote processor that data transfer has - * been completed. */ - bytes = sizeof(int); - ARMCI_Put(&val, flag, bytes, proc); - return 1; -} - - -int PARMCI_Put_flag(void *src, void* dst,int bytes,int *f,int v,int proc) { - return PARMCI_PutS_flag(src, NULL, dst, NULL, &bytes, 0, f, v, proc); -} - - -int PARMCI_PutS_flag_dir(void *src_ptr, int src_stride_arr[], - void* dst_ptr, int dst_stride_arr[], - int seg_count[], int stride_levels, - int *flag, int val, int proc) { - return PARMCI_PutS_flag(src_ptr, src_stride_arr,dst_ptr,dst_stride_arr, - seg_count, stride_levels, flag, val, proc); -} - - -int PARMCI_GetS( void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int seg_count[], /* number of segments at each stride - levels: count[0]=bytes*/ - int stride_levels, /* number of stride levels */ - int proc /* remote process(or) ID */ - ) -{ -int rc,direct=1; -int *count=seg_count, tmp_count=0; - ARMCI_PR_DBG("enter",proc); - - if(src_ptr == NULL || dst_ptr == NULL) return FAIL; - if(seg_count[0]<0)return FAIL3; - if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4; - if(proc<0||proc>=armci_nproc){printf("\n%d:%s:proc=%d",armci_me,FUNCTION_NAME,proc);fflush(stdout);return FAIL5;} - - ORDER(GET,proc); /* ensure ordering */ - PREPROCESS_STRIDED(tmp_count); - -#if DATA_SERVER_GET_ - if(stride_levels)direct=SAMECLUSNODE(proc); - direct=SAMECLUSNODE(proc); -#endif - if(!direct){ - DO_FENCE(proc,SERVER_GET); - rc = armci_pack_strided(GET, NULL, proc, src_ptr, src_stride_arr, - dst_ptr,dst_stride_arr,count,stride_levels, - NULL,-1,-1,-1,NULL); - - }else{ - if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_GET); - rc = armci_op_strided(GET, NULL, proc, src_ptr, src_stride_arr, dst_ptr, - dst_stride_arr,count, stride_levels,0,NULL); - } - - POSTPROCESS_STRIDED(tmp_count); - ARMCI_PR_DBG("exit",proc); - if(rc) return FAIL6; - else return 0; -} - - - - -int PARMCI_AccS( int optype, /* operation */ - void *scale, /* scale factor x += scale*y */ - void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int seg_count[], /* number of segments at each stride - levels: count[0]=bytes*/ - int stride_levels, /* number of stride levels */ - int proc /* remote process(or) ID */ - ) -{ -int rc, direct=1; -int *count=seg_count, tmp_count=0; - - ARMCI_PR_DBG("enter",proc); - if(src_ptr == NULL || dst_ptr == NULL) return FAIL; - if(src_stride_arr == NULL || dst_stride_arr ==NULL) return FAIL2; - if(count[0]<0)return FAIL3; - if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4; - if(proc<0)return FAIL5; - - ORDER(optype,proc); /* ensure ordering */ - PREPROCESS_STRIDED(tmp_count); - -#if DATA_SERVER_ - direct=SAMECLUSNODE(proc); -#endif - -# if defined(ACC_COPY) && !defined(ACC_SMP) - if(armci_me != proc) direct=0; -# endif - if(direct) - rc = armci_op_strided(optype,scale, proc, src_ptr, src_stride_arr,dst_ptr, - dst_stride_arr, count, stride_levels,1,NULL); - else{ - DO_FENCE(proc,SERVER_PUT); - rc = armci_pack_strided(optype,scale,proc,src_ptr, src_stride_arr,dst_ptr, - dst_stride_arr,count,stride_levels,NULL,-1,-1,-1,NULL); - } - POSTPROCESS_STRIDED(tmp_count); - ARMCI_PR_DBG("exit",proc); - if(rc) return FAIL6; - else return 0; -} - - -/* - whatever original put and get functions were here have been - replaced with the proper ones from the main armci branch. - the old functions were entirely responsible for causing the - test_vector_acc test to fail in test.x -*/ - -int PARMCI_Put(void *src, void* dst, int bytes, int proc) { - int rc=0; -//ARMCI_PROFILE_START_STRIDED(&bytes, 0, proc, ARMCI_PROF_PUT); - rc = PARMCI_PutS(src, NULL, dst, NULL, &bytes, 0, proc); -//ARMCI_PROFILE_STOP_STRIDED(ARMCI_PROF_PUT); - assert(rc==0); - return rc; -} - -int PARMCI_Get(void *src, void* dst, int bytes, int proc) { - int rc=0; -//ARMCI_PROFILE_START_STRIDED(&bytes, 0, proc, ARMCI_PROF_GET); - -#ifdef __crayx1 - memcpy(dst,src,bytes); -#else - rc = PARMCI_GetS(src, NULL, dst, NULL, &bytes, 0, proc); -#endif -//ARMCI_PROFILE_STOP_STRIDED(ARMCI_PROF_GET); -//dassert(1,rc==0); - assert(rc==0); - return rc; -} - -int PARMCI_Acc(int optype, void *scale, void *src, void* dst, int bytes, int proc) { - int rc=0; - rc = PARMCI_AccS(optype, scale, src, NULL, dst, NULL, &bytes, 0, proc); - return rc; -} - -#define PACK1D 1 - -#if PACK1D -# define armci_read_strided1 armci_read_strided -# define armci_write_strided1 armci_write_strided -#else -# define armci_read_strided2 armci_read_strided -# define armci_write_strided2 armci_write_strided -#endif - -void armci_write_strided1(void *ptr, int stride_levels, int stride_arr[], - int count[], char *buf) -{ - int i, j; - long idx; /* index offset of current block position to ptr */ - int n1dim; /* number of 1 dim block */ - int bvalue[MAX_STRIDE_LEVEL], bunit[MAX_STRIDE_LEVEL]; - int bytes = count[0]; - ARMCI_PR_DBG("enter",stride_levels); - - /* number of n-element of the first dimension */ - n1dim = 1; - for(i=1; i<=stride_levels; i++) - n1dim *= count[i]; - - /* calculate the destination indices */ - bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1; - for(i=2; i<=stride_levels; i++) { - bvalue[i] = 0; - bunit[i] = bunit[i-1] * count[i-1]; - } - - for(i=0; i (count[j]-1)) bvalue[j] = 0; - } - - armci_copy( ((char*)ptr)+idx, buf, bytes); - buf += count[0]; - } - ARMCI_PR_DBG("exit",stride_levels); -} - - -void armci_write_strided2(void *ptr, int stride_levels, int stride_arr[], - int count[], char *buf) -{ - int i, j; - int total; /* number of 2 dim block */ - int index[MAX_STRIDE_LEVEL], unit[MAX_STRIDE_LEVEL]; - ARMCI_PR_DBG("enter",stride_levels); - - if(stride_levels == 0){ - armci_copy( ptr, buf, count[0]); - }else if (count[0]%ALIGN_SIZE || (unsigned long)ptr%ALIGN_SIZE ) - armci_write_strided1(ptr,stride_levels, stride_arr,count,buf); - else { - int rows, ld, idx, ldd; - char *src; - rows = count[0]/8; - ld = stride_arr[0]/8; - switch(stride_levels){ - case 1: - DCOPY21(&rows, count+1, ptr, &ld, buf, &idx); - break; - case 2: - ldd = stride_arr[1]/stride_arr[0]; - DCOPY31(&rows, count+1, count+2, ptr, &ld, &ldd, buf,&idx); - - break; - default: - index[2] = 0; unit[2] = 1; total = count[2]; - for(j=3; j<=stride_levels; j++) { - index[j] = 0; unit[j] = unit[j-1] * count[j-1]; - total *= count[j]; - } - for(i=0; i= count[j]) index[j] = 0; - } - DCOPY21(&rows, count+1,src, &ld, buf, &idx); - buf = (char*) ((double*)buf + idx); - } - } /*switch */ - } /*else */ - ARMCI_PR_DBG("exit",stride_levels); -} - - -void armci_read_strided1(void *ptr, int stride_levels, int stride_arr[], - int count[], char *buf) -{ - int i, j; - long idx; /* index offset of current block position to ptr */ - int n1dim; /* number of 1 dim block */ - int bvalue[MAX_STRIDE_LEVEL], bunit[MAX_STRIDE_LEVEL]; - int bytes = count[0]; - - ARMCI_PR_DBG("enter",stride_levels); - /* number of n-element of the first dimension */ - n1dim = 1; - for(i=1; i<=stride_levels; i++) - n1dim *= count[i]; - - /* calculate the destination indices */ - bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1; - for(i=2; i<=stride_levels; i++) { - bvalue[i] = 0; - bunit[i] = bunit[i-1] * count[i-1]; - } - - for(i=0; i (count[j]-1)) bvalue[j] = 0; - } - - armci_copy(buf, ((char*)ptr)+idx,bytes); - buf += count[0]; - } - ARMCI_PR_DBG("exit",stride_levels); -} - - -void armci_read_strided2(void *ptr, int stride_levels, int stride_arr[], - int count[], char *buf) -{ - int i, j; - int total; /* number of 2 dim block */ - int index[MAX_STRIDE_LEVEL], unit[MAX_STRIDE_LEVEL]; - - ARMCI_PR_DBG("enter",stride_levels); - if(stride_levels == 0){ - armci_copy( buf, ptr, count[0]); - }else if (count[0]%ALIGN_SIZE || (unsigned long)ptr%ALIGN_SIZE) - armci_read_strided1(ptr,stride_levels, stride_arr,count,buf); - else { - int rows, ld, idx, ldd; - char *src; - rows = count[0]/8; - ld = stride_arr[0]/8; - switch(stride_levels){ - case 1: - DCOPY12(&rows, count+1, ptr, &ld, buf, &idx); - break; - case 2: - ldd = stride_arr[1]/stride_arr[0]; - DCOPY13(&rows, count+1, count+2, ptr, &ld, &ldd, buf,&idx); - break; - default: - index[2] = 0; unit[2] = 1; total = count[2]; - for(j=3; j<=stride_levels; j++) { - index[j] = 0; unit[j] = unit[j-1] * count[j-1]; - total *= count[j]; - } - for(i=0; i= count[j]) index[j] = 0; - } - DCOPY12(&rows, count+1,src, &ld, buf, &idx); - buf = (char*) ((double*)buf + idx); - } - } /*switch */ - } /*else */ - ARMCI_PR_DBG("exit",stride_levels); -} - -/*\Non-Blocking API -\*/ -int PARMCI_NbPutS( void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* pointer to 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int seg_count[], /* number of segments at each stride - levels: count[0]=bytes*/ - int stride_levels, /* number of stride levels */ - int proc, /* remote process(or) ID */ - armci_hdl_t* usr_hdl /* armci non-blocking call handle*/ - ) -{ -armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl; -int *count=seg_count, tmp_count=0; -int rc=0, direct=1; - ARMCI_PR_DBG("enter",proc); - if(src_ptr == NULL || dst_ptr == NULL) return FAIL; - if(count[0]<0)return FAIL3; - if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4; - if(proc<0)return FAIL5; - - PREPROCESS_STRIDED(tmp_count); - -#if DATA_SERVER_ - if(stride_levels)direct=SAMECLUSNODE(proc); - direct=SAMECLUSNODE(proc); -#endif - - /* aggregate put */ - if(nb_handle && nb_handle->agg_flag == SET) { - if(!direct){ - rc= armci_agg_save_strided_descriptor(src_ptr, src_stride_arr, - dst_ptr, dst_stride_arr, - count, stride_levels, proc, - PUT, nb_handle); - POSTPROCESS_STRIDED(tmp_count); - return(rc); - } - } - else { - UPDATE_FENCE_INFO(proc); - - /*set tag and op in the nb handle*/ - if(nb_handle){ - nb_handle->tag = GET_NEXT_NBTAG(); - nb_handle->op = PUT; - nb_handle->proc= proc; - nb_handle->bufid=NB_NONE; - } - else - nb_handle = armci_set_implicit_handle(PUT, proc); - } - - if(!direct){ - DO_FENCE(proc,SERVER_NBPUT); - rc = armci_pack_strided(PUT, NULL, proc, src_ptr, src_stride_arr,dst_ptr, - dst_stride_arr, count, stride_levels,NULL,-1,-1,-1,nb_handle); - } - else{ - if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_NBPUT); - rc = armci_op_strided( PUT, NULL, proc, src_ptr, src_stride_arr, - dst_ptr,dst_stride_arr,count,stride_levels, 0,nb_handle); - } - - POSTPROCESS_STRIDED(tmp_count); - ARMCI_PR_DBG("exit",proc); - if(rc) return FAIL6; - else return 0; -} - -int PARMCI_NbGetS( void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int seg_count[], /* number of segments at each stride - levels: byte_count[0]=bytes*/ - int stride_levels, /* number of stride levels */ - int proc, /* remote process(or) ID */ - armci_hdl_t* usr_hdl /* armci non-blocking call handle*/ - ) -{ -armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl; -int rc=0,direct=1; -int *count=seg_count, tmp_count=0; - - ARMCI_PR_DBG("enter",proc); - if(src_ptr == NULL || dst_ptr == NULL) return FAIL; - if(seg_count[0]<0)return FAIL3; - if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4; - if(proc<0)return FAIL5; - -#if DATA_SERVER_GET_ - if(stride_levels)direct=SAMECLUSNODE(proc); - direct=SAMECLUSNODE(proc); -#endif - - PREPROCESS_STRIDED(tmp_count); - - if(nb_handle && nb_handle->agg_flag == SET) { - if(!direct){ - rc= armci_agg_save_strided_descriptor(src_ptr, src_stride_arr, - dst_ptr, dst_stride_arr, - count, stride_levels, proc, - GET, nb_handle); - POSTPROCESS_STRIDED(tmp_count); - return(rc); - } - } - else { - /* ORDER(GET,proc); ensure ordering */ - /*set tag and op in the nb handle*/ - if(nb_handle){ - nb_handle->tag = GET_NEXT_NBTAG(); - nb_handle->op = GET; - nb_handle->proc= proc; - nb_handle->bufid=NB_NONE; - } - else - nb_handle = armci_set_implicit_handle(GET, proc); - } - - if(!direct){ - DO_FENCE(proc,SERVER_NBGET); - rc = armci_pack_strided(GET, NULL, proc, src_ptr, src_stride_arr, - dst_ptr,dst_stride_arr,count,stride_levels, - NULL,-1,-1,-1,nb_handle); - } - else{ - if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_GET); - rc = armci_op_strided(GET, NULL, proc, src_ptr, src_stride_arr, dst_ptr, - dst_stride_arr,count, stride_levels,0,nb_handle); - } - - POSTPROCESS_STRIDED(tmp_count); - - ARMCI_PR_DBG("exit",proc); - if(rc) return FAIL6; - else return 0; -} - - -int PARMCI_NbAccS( int optype, /* operation */ - void *scale, /* scale factor x += scale*y */ - void *src_ptr, /* pointer to 1st segment at source*/ - int src_stride_arr[], /* array of strides at source */ - void* dst_ptr, /* 1st segment at destination*/ - int dst_stride_arr[], /* array of strides at destination */ - int seg_count[], /* number of segments at each stride - levels: count[0]=bytes*/ - int stride_levels, /* number of stride levels */ - int proc, /* remote process(or) ID */ - armci_hdl_t* usr_hdl /* armci non-blocking call handle*/ - ) -{ -armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl; -int *count=seg_count, tmp_count=0; -int rc, direct=1; - - ARMCI_PR_DBG("enter",proc); - if(src_ptr == NULL || dst_ptr == NULL) return FAIL; - if(src_stride_arr == NULL || dst_stride_arr ==NULL) return FAIL2; - if(count[0]<0)return FAIL3; - if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4; - if(proc<0)return FAIL5; - - UPDATE_FENCE_INFO(proc); - PREPROCESS_STRIDED(tmp_count); - -#if DATA_SERVER_ - direct=SAMECLUSNODE(proc); -#endif - -# if defined(ACC_COPY) && !defined(ACC_SMP) - if(armci_me != proc) direct=0; -# endif - - /*set tag and op in the nb handle*/ - if(nb_handle){ - nb_handle->tag = GET_NEXT_NBTAG(); - nb_handle->op = optype; - nb_handle->proc= proc; - nb_handle->bufid=NB_NONE; - } - else - nb_handle = armci_set_implicit_handle(optype, proc); - - - if(direct){ - rc = armci_op_strided(optype,scale, proc, src_ptr, src_stride_arr,dst_ptr, - dst_stride_arr, count, stride_levels,1,NULL); - } - else{ - DO_FENCE(proc,SERVER_NBPUT); - rc = armci_pack_strided(optype,scale,proc,src_ptr, src_stride_arr,dst_ptr, - dst_stride_arr,count,stride_levels,NULL,-1,-1,-1,nb_handle); - } - - POSTPROCESS_STRIDED(tmp_count); - - ARMCI_PR_DBG("exit",proc); - if(rc) return FAIL6; - else return 0; -} - - -#if !defined(ACC_COPY)&&!defined(CRAY_YMP)&&!defined(CYGNUS)&&!defined(CYGWIN) &&!defined(BGML) -# define REMOTE_OP -#endif - -#define INIT_NB_HANDLE(nb,o,p) if(nb){\ - (nb)->tag = 0;\ - (nb)->op = (o); (nb)->proc= (p);\ - (nb)->bufid=NB_NONE;}\ - else { (nb)=armci_set_implicit_handle(o, p); (nb)->tag=0; } - -void set_nbhandle(armci_ihdl_t *nbh, armci_hdl_t *nb_handle, int op, - int proc) -{ - if(nb_handle) - { - *nbh=(armci_ihdl_t)nb_handle; - } - else - { - *nbh=armci_set_implicit_handle(op, proc); - } -} - - -int PARMCI_NbPut(void *src, void* dst, int bytes, int proc,armci_hdl_t* uhandle) -{ - -int rc=0, direct=0; -armci_ihdl_t nb_handle = (armci_ihdl_t)uhandle; - ARMCI_PR_DBG("enter",proc); - - if(src == NULL || dst == NULL) return FAIL; - - direct =SAMECLUSNODE(proc); - - /* aggregate put */ - if(nb_handle && nb_handle->agg_flag == SET) { - if(direct) { armci_copy(src,dst,bytes); rc=0; } - else - rc=armci_agg_save_descriptor(src,dst,bytes,proc,PUT,0,nb_handle); - return rc; - } - - if(direct) { - /*armci_wait needs proc to compute direct*/ - INIT_NB_HANDLE(nb_handle,PUT,proc); - armci_copy(src,dst,bytes); - } - else{ - # ifdef PORTALS - rc=PARMCI_NbPutS(src, NULL,dst,NULL, &bytes,0,proc,uhandle); - # else -# ifdef ARMCI_NB_PUT - INIT_NB_HANDLE(nb_handle,PUT,proc); - UPDATE_FENCE_STATE(proc, PUT, 1); - ARMCI_NB_PUT(src, dst, bytes, proc, &nb_handle->cmpl_info); -# else - rc=PARMCI_NbPutS(src, NULL,dst,NULL, &bytes,0,proc,uhandle); -# endif - # endif - } - - ARMCI_PR_DBG("exit",proc); - return(rc); -} - - -int PARMCI_NbGet(void *src, void* dst, int bytes, int proc,armci_hdl_t* uhandle) -{ - -int rc=0, direct=0; -armci_ihdl_t nb_handle = (armci_ihdl_t)uhandle; - ARMCI_PR_DBG("enter",proc); - - if(src == NULL || dst == NULL) return FAIL; - - direct =SAMECLUSNODE(proc); - - if(nb_handle && nb_handle->agg_flag == SET) { - if(direct) { armci_copy(src,dst,bytes); rc=0; } - else - rc=armci_agg_save_descriptor(src,dst,bytes,proc,GET,0,nb_handle); - return rc; - } - - if(direct) { - /*armci_wait needs proc to compute direct*/ - INIT_NB_HANDLE(nb_handle,PUT,proc); - armci_copy(src,dst,bytes); - }else{ - - # ifdef PORTALS - rc=PARMCI_NbGetS(src, NULL,dst,NULL, &bytes,0,proc,uhandle); - # else -# ifdef ARMCI_NB_GET - /*set tag and op in the nb handle*/ - INIT_NB_HANDLE(nb_handle,GET,proc); - - ARMCI_NB_GET(src, dst, bytes, proc, &nb_handle->cmpl_info); -# else - rc=PARMCI_NbGetS(src, NULL,dst,NULL, &bytes,0,proc,uhandle); -# endif - # endif - } - ARMCI_PR_DBG("exit",proc); - return(rc); -} - - -static void _armci_rem_value(int op, void *src, void *dst, int proc, - int bytes) { - int rc=0; - int armci_th_idx = ARMCI_THREAD_IDX; - - ORDER(op,proc); /* ensure ordering */ - -#if defined(REMOTE_OP) && !defined(QUADRICS) - rc = armci_rem_strided(op, NULL, proc, src, NULL, dst, NULL, - &bytes, 0, NULL, 0, NULL); - if(rc) armci_die("ARMCI_Value: armci_rem_strided incomplete", FAIL6); -#else - - if(op==PUT) { - UPDATE_FENCE_STATE(proc, PUT, 1); -# ifdef LAPI - SET_COUNTER(ack_cntr[armci_th_idx], 1); -# endif -#if defined(BGML) - /* fprintf(stderr,"bytes: %d\n",bytes); */ - /* this call is blocking, so local count is fine */ - BG1S_t req; - unsigned count=1; - BGML_Callback_t cb_wait={wait_callback, &count}; - BG1S_Memput(&req, proc, src, 0, dst, bytes, &cb_wait, 1); - BGML_Wait(&count); -#else - - armci_put(src, dst, bytes, proc); -#endif - } - else { -# ifdef LAPI - SET_COUNTER(get_cntr[armci_th_idx], 1); -# endif -#if defined(BGML) - /* fprintf(stderr,"before memget\n"); */ - BG1S_t req; - unsigned count=1; - BGML_Callback_t cb_wait={wait_callback, &count}; - BG1S_Memget(&req, proc, dst, 0, src, bytes, &cb_wait, 1); - BGML_Wait(&count); - -#else - armci_get(src, dst, bytes, proc); -#endif - } - - /* deal with non-blocking loads and stores */ -# if defined(LAPI) || defined(_ELAN_PUTGET_H) - if(proc != armci_me){ - if(op == GET){ - WAIT_FOR_GETS; /* wait for data arrival */ - }else { - WAIT_FOR_PUTS; /* data must be copied out*/ - } - } -#endif -#endif -} - -/* non-blocking remote value put/get operation */ -static void _armci_nb_rem_value(int op, void *src, void *dst, int proc, - int bytes, armci_ihdl_t nb_handle) { - int rc=0, pv=0; - int armci_th_idx = ARMCI_THREAD_IDX; - - if(nb_handle && nb_handle->agg_flag == SET) { - if(op==PUT) pv = 1; - (void)armci_agg_save_descriptor(src,dst,bytes,proc,op,pv,nb_handle); - return; - } - else { - if(op==PUT) UPDATE_FENCE_INFO(proc); - - /*set tag and op in the nb handle*/ - if(nb_handle){ - nb_handle->tag = GET_NEXT_NBTAG(); - nb_handle->op = op; - nb_handle->proc= proc; - nb_handle->bufid=NB_NONE; - } - else - nb_handle = armci_set_implicit_handle(op, proc); - } - -#if defined(REMOTE_OP) && !defined(QUADRICS) - rc = armci_rem_strided(op, NULL, proc, src, NULL, dst, NULL, - &bytes, 0, NULL, 0, nb_handle); - if(rc) armci_die("ARMCI_Value: armci_rem_strided incomplete", FAIL6); -#else - - if(op==PUT) { - UPDATE_FENCE_STATE(proc, PUT, 1); -# ifdef LAPI - SET_COUNTER(ack_cntr[armci_th_idx], 1); -# endif - armci_put(src, dst, bytes, proc); - } - else { -# ifdef LAPI - SET_COUNTER(get_cntr[armci_th_idx], 1); -# endif - armci_get(src, dst, bytes, proc); - } - - /* deal with non-blocking loads and stores */ -# if defined(LAPI) || defined(_ELAN_PUTGET_H) -# ifdef LAPI - if(!nb_handle) -# endif - { - if(proc != armci_me){ - if(op == GET){ - WAIT_FOR_GETS; /* wait for data arrival */ - }else { - WAIT_FOR_PUTS; /* data must be copied out*/ - } - } - } -# endif -#endif -} - - -#define CHK_ERR(dst, proc) \ - if(dst==NULL) armci_die("PARMCI_PutValue: NULL pointer passed",FAIL); \ - if(proc<0) armci_die("PARMCI_PutValue: Invalid process rank", proc); - -#define CHK_ERR_GET(src, dst, proc, bytes) \ - if(src==NULL || dst==NULL) armci_die("PARMCI_GetValue: NULL pointer passed",FAIL); \ - if(proc<0) armci_die("PARMCI_GetValue: Invalid process rank", proc); \ - if(bytes<0) armci_die("PARMCI_GetValue: Invalid size", bytes); - -/** - * Register-Originated Put. - */ -int PARMCI_PutValueInt(int src, void *dst, int proc) -{ - CHK_ERR(dst, proc); - if( SAMECLUSNODE(proc) ) *(int *)dst = src; - else _armci_rem_value(PUT, &src, dst, proc, sizeof(int)); - return 0; -} - -int PARMCI_PutValueLong(long src, void *dst, int proc) -{ - CHK_ERR(dst, proc); - if( SAMECLUSNODE(proc) ) *(long *)dst = src; - else _armci_rem_value(PUT, &src, dst, proc, sizeof(long)); - return 0; -} - -int PARMCI_PutValueFloat(float src, void *dst, int proc) -{ - CHK_ERR(dst, proc); - if( SAMECLUSNODE(proc) ) *(float *)dst = src; - else _armci_rem_value(PUT, &src, dst, proc, sizeof(float)); - return 0; -} - -int PARMCI_PutValueDouble(double src, void *dst, int proc) -{ - CHK_ERR(dst, proc); - if( SAMECLUSNODE(proc) ) *(double *)dst = src; - else _armci_rem_value(PUT, &src, dst, proc, sizeof(double)); - return 0; -} - -/** - * Non-Blocking register-originated put. - */ -int PARMCI_NbPutValueInt(int src, void *dst, int proc, armci_hdl_t* usr_hdl) -{ - CHK_ERR(dst, proc); - if( SAMECLUSNODE(proc) ) *(int *)dst = src; - else _armci_nb_rem_value(PUT,&src,dst,proc,sizeof(int),(armci_ihdl_t)usr_hdl); - return 0; -} - -int PARMCI_NbPutValueLong(long src, void *dst, int proc, armci_hdl_t* usr_hdl) -{ - CHK_ERR(dst, proc); - if( SAMECLUSNODE(proc) ) *(long *)dst = src; - else _armci_nb_rem_value(PUT,&src,dst,proc,sizeof(long),(armci_ihdl_t)usr_hdl); - return 0; -} - -int PARMCI_NbPutValueFloat(float src, void *dst, int proc, armci_hdl_t* usr_hdl) -{ - CHK_ERR(dst, proc); - if( SAMECLUSNODE(proc) ) *(float *)dst = src; - else _armci_nb_rem_value(PUT,&src,dst,proc,sizeof(float),(armci_ihdl_t)usr_hdl); - return 0; -} - -int PARMCI_NbPutValueDouble(double src, void *dst, int proc, armci_hdl_t* usr_hdl) -{ - CHK_ERR(dst, proc); - if( SAMECLUSNODE(proc) ) *(double *)dst = src; - else _armci_nb_rem_value(PUT,&src,dst,proc,sizeof(double),(armci_ihdl_t)usr_hdl); - return 0; - } - -#if 1 -/** - * Register-Originated Get. - */ -int PARMCI_GetValueInt(void *src, int proc) -{ - int dst; - if( SAMECLUSNODE(proc) ) return *(int *)src; - else _armci_rem_value(GET, src, &dst, proc, sizeof(int)); - return dst; -} - -long PARMCI_GetValueLong(void *src, int proc) -{ - long dst; - if( SAMECLUSNODE(proc) ) return *(long *)src; - else _armci_rem_value(GET, src, &dst, proc, sizeof(long)); - return dst; -} - -float PARMCI_GetValueFloat(void *src, int proc) -{ - float dst; - if( SAMECLUSNODE(proc) ) return *(float *)src; - else _armci_rem_value(GET, src, &dst, proc, sizeof(float)); - return dst; -} - -double PARMCI_GetValueDouble(void *src, int proc) -{ - double dst; - if( SAMECLUSNODE(proc) ) return *(double *)src; - else _armci_rem_value(GET, src, &dst, proc, sizeof(double)); - return dst; -} - -#endif - -#if 0 -/** - * Register-Originated Get. - */ -int PARMCI_GetValue(void *src, void *dst, int proc, int bytes) -{ - CHK_ERR_GET(src, dst, proc, bytes); - if( SAMECLUSNODE(proc) ) { armci_copy(src, dst, bytes); } - else _armci_rem_value(GET, src, dst, proc, bytes); - return 0; -} - -/** - * Non-Blocking register-originated get. - */ -int PARMCI_NbGetValue(void *src, void *dst, int proc, int bytes, armci_hdl_t* usr_hdl) -{ - CHK_ERR_GET(src, dst, proc, bytes); - if( SAMECLUSNODE(proc) ) { armci_copy(src, dst, bytes); } - else _armci_nb_rem_value(GET, src, dst, proc, bytes, (armci_ihdl_t)usr_hdl); - return 0; -} -#endif - diff --git a/armci/src-portals/threads.c b/armci/src-portals/threads.c deleted file mode 100644 index 68857520a..000000000 --- a/armci/src-portals/threads.c +++ /dev/null @@ -1,117 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: threads.c,v 1.1.2.5 2007-08-28 21:29:46 manoj Exp $ */ - -#if 0 -# define PRNDBG3(m,a1,a2,a3) \ - fprintf(stderr,"DBG %d: " m,armci_me,a1,a2,a3);fflush(stderr) -# define PRNDBG(m) PRNDBG3(m,0,0,0) -# define PRNDBG1(m,a1) PRNDBG3(m,a1,0,0) -# define PRNDBG2(m,a1,a2) PRNDBG3(m,a1,a2,0) -#else -# define PRNDBG(m) -# define PRNDBG1(m,a1) -# define PRNDBG2(m,a1,a2) -# define PRNDBG3(m,a1,a2,a3) -#endif - - -#include -#include "armcip.h" - -armci_user_threads_t armci_user_threads; - -void armci_init_threads() -{ - int i, bytes; - char *uval = getenv("ARMCI_MAX_THREADS"); - - armci_user_threads.max = 1; - armci_user_threads.avail = 0; - - if (uval != NULL) sscanf(uval, "%d", &armci_user_threads.max); - - if (armci_user_threads.max < 1 || - armci_user_threads.max > ARMCI_THREADS_LIMIT) - { - printf("Error: Only 1-%d threads are supported. ",ARMCI_THREADS_LIMIT); - printf("Set ARMCI_MAX_THREADS appropriately\n"); fflush(stdout); - armci_die("armci_init_threads: failed", 0); - } - - bytes = sizeof(thread_id_t) * armci_user_threads.max; - if ( !(armci_user_threads.ids = (thread_id_t*) malloc(bytes)) ) - { - armci_die("armci_init_threads: armci_user_threads.ids malloc failed", - armci_user_threads.max); - } - memset(armci_user_threads.ids, 0, bytes); - -#if 0 /* spinlock has void return value */ - if (THREAD_LOCK_INIT(armci_user_threads.lock) || - THREAD_LOCK_INIT(armci_user_threads.buf_lock) || - THREAD_LOCK_INIT(armci_user_threads.net_lock)) - armci_die("armci_init_threads:locks initialization failed", 0); -#else - THREAD_LOCK_INIT(armci_user_threads.lock); - THREAD_LOCK_INIT(armci_user_threads.buf_lock); - THREAD_LOCK_INIT(armci_user_threads.net_lock); -#endif - -#if 0 - /* using one lock per socket for now, it might be feasible (and usefull) - * to use two (one for sending and one for receiving) */ - armci_user_threads.sock_locks = malloc(armci_nclus *sizeof(thread_lock_t)); - for (i = 0; i < armci_nclus; i++) - if (THREAD_LOCK_INIT(armci_user_threads.sock_locks[i])) - armci_die("armci_init_threads:sock locks initialization failed", i); -#endif -} - -void armci_finalize_threads() -{ - THREAD_LOCK_DESTROY(armci_user_threads.lock); - THREAD_LOCK_DESTROY(armci_user_threads.net_lock); - THREAD_LOCK_DESTROY(armci_user_threads.buf_lock); - free(armci_user_threads.ids); -} - -/* calling armci_thread_idx for every function that accesses thread-private data - * might be expensive -- needs optiomization */ -INLINE int armci_thread_idx() -{ - int i, n = ARMCI_MIN(armci_user_threads.avail, armci_user_threads.max); - thread_id_t id = THREAD_ID_SELF(); - - for (i = 0; i < n; i++) if (id == armci_user_threads.ids[i]) { - /*PRNDBG2("thread id=%ld already registered, idx=%d\n", id, i);*/ - return i; - } - - /* see this thread for the first time */ - return armci_register_thread(id); -} - -INLINE int armci_register_thread(thread_id_t id) -{ - int i; - - THREAD_LOCK(armci_user_threads.lock); - - i = armci_user_threads.avail; - armci_user_threads.avail++; - - THREAD_UNLOCK(armci_user_threads.lock); - - if (i < armci_user_threads.max) - armci_user_threads.ids[i] = id; - else - armci_die("armci_thread_idx: too many threads, adjust ARMCI_MAX_THREADS", - armci_user_threads.avail); - - PRNDBG2("registered a new thread: idx=%d, id=%ld\n", i, id); - return i; -} - diff --git a/armci/src-portals/timer.c b/armci/src-portals/timer.c deleted file mode 100644 index 6c1a2977b..000000000 --- a/armci/src-portals/timer.c +++ /dev/null @@ -1,43 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: timer.c,v 1.3 2004-04-09 22:03:51 manoj Exp $ */ -#ifdef WIN32 - static double msec; /* reference for timer */ -# include -#else -# include -# include - static unsigned firstsec=0; /* Reference for timer */ - static unsigned firstusec=0; /* Reference for timer */ -#endif -static int first_call=1; - -double armci_timer() -{ -#ifdef WIN32 - double t0 = (double)GetCurrentTime(); - if(first_call){ - first_call=0; msec=t0; return 0.0; - } - t0 -=msec; - if(t0<0.0)t0 += (double)0xffffffff; - return 0.01*t0; -#else - double low, high; - struct timeval tp; - struct timezone tzp; - (void) gettimeofday(&tp,&tzp); - - if (first_call) { - firstsec = tp.tv_sec; - firstusec = tp.tv_usec; - first_call = 0; - } - low = (double)(tp.tv_usec>>1) - (double) (firstusec>>1); - high = (double) (tp.tv_sec - firstsec); - return high + 1.0e-6*(low+low); -#endif -} - diff --git a/armci/src-portals/utils.c b/armci/src-portals/utils.c deleted file mode 100644 index df43fc60a..000000000 --- a/armci/src-portals/utils.c +++ /dev/null @@ -1,230 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* - * A barrier causes threads to wait until a set of threads has - * all "reached" the barrier. The number of threads required is - * set when the barrier is initialized, and cannot be changed - * except by reinitializing. - * - * The barrier_init() and barrier_destroy() functions, - * respectively, allow you to initialize and destroy the - * barrier. - * - * The barrier_wait() function allows a thread to wait for a - * barrier to be completed. One thread (the one that happens to - * arrive last) will return from barrier_wait() with the status - * -1 on success -- others will return with 0. The special - * status makes it easy for the calling code to cause one thread - * to do something in a serial region before entering another - * parallel section of code. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "utils.h" - -#define DEBUG_ - -int mt_size; /* number of processes: needed for collective mt ops */ -int mt_tpp; /* number of threads used for collective ops */ -thread_barrier_t mt_barrier; /* static barrier used for multi-threaded MT_BARRIER */ - -int armci_malloc_mt(void *ptr[], int bytes) -{ - int rc, th_size, i, j; - - th_size = mt_size * mt_tpp; - if (thread_barrier_wait(&mt_barrier)==-1) { - rc = PARMCI_Malloc(ptr, bytes * mt_tpp); -#ifdef DEBUG - printf("bytes=%d\n", bytes); - for (i = 0; i < mt_size; i++) printf("ptr[%d]=%p\n",i,ptr[i]); -#endif - /* at this point proc ptrs are at beggining of the list */ - for (i = mt_size - 1; i >= 0; i--) for (j = mt_tpp - 1; j >= 0; j--) { -#ifdef DEBUG - printf("mt_size=%d,mt_tpp=%d,i=%d,j=%d,ptr[%d]=%p+%d\n", - mt_size,mt_tpp,i,j,i*mt_tpp+j,ptr[i],j*bytes); - fflush(stdout); -#endif - ptr[i * mt_tpp + j] = ((char*)ptr[i]) + j * bytes; - } - } - thread_barrier_wait(&mt_barrier); - - return rc; -} - -int armci_free_mt(void *ptr, int th_idx) -{ -} - -#ifdef POSIX_THREADS -/* - * Initialize a barrier for use. - */ -int thread_barrier_init (thread_barrier_t *barrier, int count) -{ - int status; - - barrier->threshold = barrier->counter = count; - barrier->cycle = 0; - status = pthread_mutex_init (&barrier->mutex, NULL); - if (status != 0) - return status; - status = pthread_cond_init (&barrier->cv, NULL); - if (status != 0) { - pthread_mutex_destroy (&barrier->mutex); - return status; - } - barrier->valid = BARRIER_VALID; - return 0; -} - -/* - * Destroy a barrier when done using it. - */ -int thread_barrier_destroy (thread_barrier_t *barrier) -{ - int status, status2; - - if (barrier->valid != BARRIER_VALID) - return EINVAL; - - status = pthread_mutex_lock (&barrier->mutex); - if (status != 0) - return status; - - /* - * Check whether any threads are known to be waiting; report - * "BUSY" if so. - */ - if (barrier->counter != barrier->threshold) { - pthread_mutex_unlock (&barrier->mutex); - return EBUSY; - } - - barrier->valid = 0; - status = pthread_mutex_unlock (&barrier->mutex); - if (status != 0) - return status; - - /* - * If unable to destroy either 1003.1c synchronization - * object, return the error status. - */ - status = pthread_mutex_destroy (&barrier->mutex); - status2 = pthread_cond_destroy (&barrier->cv); - return (status == 0 ? status : status2); -} - -/* - * Wait for all members of a barrier to reach the barrier. When - * the count (of remaining members) reaches 0, broadcast to wake - * all threads waiting. - */ -int thread_barrier_wait (thread_barrier_t *barrier) -{ - int status, cancel, tmp, cycle; - - if (barrier->valid != BARRIER_VALID) - return EINVAL; - - status = pthread_mutex_lock (&barrier->mutex); - if (status != 0) - return status; - - cycle = barrier->cycle; /* Remember which cycle we're on */ - - if (--barrier->counter == 0) { - barrier->cycle = !barrier->cycle; - barrier->counter = barrier->threshold; - status = pthread_cond_broadcast (&barrier->cv); - /* - * The last thread into the barrier will return status - * -1 rather than 0, so that it can be used to perform - * some special serial code following the barrier. - */ - if (status == 0) - status = -1; - } else { - /* - * Wait with cancellation disabled, because barrier_wait - * should not be a cancellation point. - */ - pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, &cancel); - - /* - * Wait until the barrier's cycle changes, which means - * that it has been broadcast, and we don't want to wait - * anymore. - */ - while (cycle == barrier->cycle) { - status = pthread_cond_wait ( - &barrier->cv, &barrier->mutex); - if (status != 0) break; - } - - pthread_setcancelstate (cancel, &tmp); - } - /* - * Ignore an error in unlocking. It shouldn't happen, and - * reporting it here would be misleading -- the barrier wait - * completed, after all, whereas returning, for example, - * EINVAL would imply the wait had failed. The next attempt - * to use the barrier *will* return an error, or hang, due - * to whatever happened to the mutex. - */ - pthread_mutex_unlock (&barrier->mutex); - return status; /* error, -1 for waker, or 0 */ -} -#endif - -#if 0 - -/*** - NAME - timing.c - PURPOSE - Timing routines for calculating the execution time: - void start_timer(void); Set the timer. - double elapsed_time(void); Return the timing elapsed since - the timer has been set. - NOTES - Jialin Ju - Oct 16, 1995 Created. -***/ - -/* Timing routines that use standard Unix gettingofday() */ -static struct timezone tz; -static struct timeval start_time, finish_time; - -/* Start measuring a time delay */ -void start_timer(void) -{ - gettimeofday( &start_time, &tz); -} - -/* Retunrn elapsed time in milliseconds */ -double elapsed_time(void) -{ - gettimeofday( &finish_time, &tz); - return(1000.0*(finish_time.tv_sec - start_time.tv_sec) + - (finish_time.tv_usec - start_time.tv_usec)/1000.0 ); -} - -/* Return the stopping time in milliseconds */ -double stop_time(void) -{ - gettimeofday( &finish_time, &tz); - return(1000.0*finish_time.tv_sec + finish_time.tv_usec/1000.0); -} - -#endif diff --git a/armci/src-portals/utils.h b/armci/src-portals/utils.h deleted file mode 100644 index d975e8b97..000000000 --- a/armci/src-portals/utils.h +++ /dev/null @@ -1,116 +0,0 @@ -/* $Id: utils.h,v 1.1.2.3 2007-07-02 05:35:31 d3p687 Exp $ - * - * primitives for transparent handling of multi-threading - */ - -#ifndef UTILS_H -#define UTILS_H - -/* - * This header file describes the "barrier" synchronization - * construct. The type barrier_t describes the full state of the - * barrier including the POSIX 1003.1c synchronization objects - * necessary. - * - * A barrier causes threads to wait until a set of threads has - * all "reached" the barrier. The number of threads required is - * set when the barrier is initialized, and cannot be changed - * except by reinitializing. - */ - - -#ifdef THREAD_SAFE -# ifdef POSIX_THREADS - -# include - -#if 1 - typedef pthread_mutex_t thread_lock_t; -# define THREAD_LOCK_INIT(x) pthread_mutex_init(&x,NULL) -# define THREAD_LOCK_DESTROY(x) pthread_mutex_destroy(&x) -# define THREAD_LOCK(x) pthread_mutex_lock(&x) -# define THREAD_UNLOCK(x) pthread_mutex_unlock(&x) -#else - -#ifndef INLINE -# define INLINE -# include "spinlock.h" -# undef INLINE -#else -# include "spinlock.h" -#endif - - typedef LOCK_T thread_lock_t; -# define THREAD_LOCK_INIT(x) armci_init_spinlock(&x) -# define THREAD_LOCK_DESTROY(x) 0 -# define THREAD_LOCK(x) armci_acquire_spinlock(&x) -# define THREAD_UNLOCK(x) armci_release_spinlock(&x) -#endif - typedef pthread_t thread_t; -# define THREAD_CREATE(th_,func_,arg_) pthread_create(th_,NULL,func_,arg_) -# define THREAD_JOIN(th_,ret_) pthread_join(th_,ret_) - - /* structure describing a barrier */ - typedef struct thread_barrier_tag { - pthread_mutex_t mutex; /* Control access to barrier */ - pthread_cond_t cv; /* wait for barrier */ - int valid; /* set when valid */ - int threshold; /* number of threads required */ - int counter; /* current number of threads */ - int cycle; /* alternate wait cycles (0 or 1) */ - } thread_barrier_t; - -# define BARRIER_VALID 0xdbcafe - - /* support static initialization of barriers */ -# define BARRIER_INITIALIZER(cnt) {\ - PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER,\ - BARRIER_VALID, cnt, cnt, 0} - -# else -# error ONLY PTHREADS SUPPORT HAS BEEN IMPLEMENTED -# endif - -# define TH2PROC(th_) (th_/mt_tpp) /* computes processor from thread id */ - - /* barrier functions */ - int thread_barrier_init (thread_barrier_t *barrier, int count); - int thread_barrier_destroy (thread_barrier_t *barrier); - int thread_barrier_wait (thread_barrier_t *barrier); - - /* multi-threaded memory functions */ - int armci_malloc_mt(void *ptr[], int bytes); - int armci_free_mt(void *ptr, int th_idx); -# define ARMCI_MALLOC_MT armci_malloc_mt -# define ARMCI_FREE_MT armci_free_mt - - -# define TH_INIT(p_,t_) mt_size=p_;mt_tpp=t_;\ - thread_barrier_init(&mt_barrier,mt_tpp) -# define TH_FINALIZE() thread_barrier_destroy(&mt_barrier) -# define MT_BARRIER() if (thread_barrier_wait(&mt_barrier)==-1) armci_msg_barrier();\ - thread_barrier_wait(&mt_barrier) - - extern int mt_size; - extern int mt_tpp; - extern thread_barrier_t mt_barrier; -#else -# define THREAD_LOCK_INIT(x) -# define THREAD_LOCK_DESTROY(x) -# define THREAD_LOCK(x) -# define THREAD_UNLOCK(x) -# define TH_INIT(p_,t_) -# define TH_FINALIZE() -# define MT_BARRIER armci_msg_barrier -# define ARMCI_MALLOC_MT PARMCI_Malloc -# define ARMCI_FREE_MT(p_,th_) PARMCI_Free(p_) -#endif - - - - - - -#endif/*UTILS_H*/ - - diff --git a/armci/src-portals/vector.c b/armci/src-portals/vector.c deleted file mode 100644 index 79f55fcb6..000000000 --- a/armci/src-portals/vector.c +++ /dev/null @@ -1,603 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: vector.c,v 1.32.6.4 2007-08-29 17:32:32 manoj Exp $ */ -#include "armcip.h" -#include "copy.h" -#include "acc.h" -#include "memlock.h" -#include -#include - -#define SERVER_GET 1 -#define SERVER_NBGET 2 -#define DIRECT_GET 3 -#define DIRECT_NBGET 4 -#define SERVER_PUT 5 -#define SERVER_NBPUT 6 -#define DIRECT_PUT 7 -#define DIRECT_NBPUT 8 - - -# define DO_FENCE(__proc,__prot) if(__prot==SERVER_GET);\ - else if(__prot==SERVER_PUT);\ - else if(__prot==DIRECT_GET || __prot==DIRECT_NBGET){\ - if(armci_prot_switch_fence[__proc]==SERVER_PUT)\ - ARMCI_DoFence(__proc);\ - }\ - else if(__prot==DIRECT_PUT || __prot==DIRECT_NBPUT){\ - if(armci_prot_switch_fence[__proc]==SERVER_PUT)\ - ARMCI_DoFence(__proc);\ - }\ - else;\ - armci_prot_switch_fence[__proc]=__prot - -/* -typedef struct { - float real; - float imag; -} complex_t; - -typedef struct { - double real; - double imag; -} dcomplex_t; -*/ - -/* -void I_ACCUMULATE(void* scale, int elems, void*src, void* dst) -{ - int j; - int *a=(int*)dst, *b=(int*)src; - int alpha = *(int*)scale; - - for(j=0;j BUFSIZE/2){ - /* for large segments use strided implementation */ - for(j=0; j< dr.ptr_array_len; j++){ - rc = armci_acc_copy_strided(op, scale,proc, - dr.src_ptr_array[j], NULL, dr.dst_ptr_array[j],NULL, - &dr.bytes, 0); - if(rc)return(rc); - } - }else{ - armci_giov_t dl; - /*lock memory:should optimize it to lock only a chunk at a time*/ - armci_lockmem_scatter(dr.dst_ptr_array, dr.ptr_array_len, dr.bytes, proc); - /* copy as many blocks as possible into the local buffer */ - dl.bytes = dr.bytes; - nb = ARMCI_MIN(PWORKLEN,BUFSIZE/dr.bytes); - for(j=0; j< dr.ptr_array_len; j+= nb){ - int nblocks = ARMCI_MIN(nb, dr.ptr_array_len -j); - int k; - /* setup vector descriptor for remote memory copy - to bring data into buffer*/ - dl.ptr_array_len = nblocks; - dl.src_ptr_array = dr.dst_ptr_array + j; /* GET destination becomes source for copy */ - for(k=0; k< nblocks; k++) pwork[k] = k*dl.bytes + (char*)armci_internal_buffer; - dl.dst_ptr_array = pwork; - /* get data to the local buffer */ - rc = armci_copy_vector(GET, &dl, 1, proc); - if(rc){ ARMCI_UNLOCKMEM(proc); return(rc);} - /* update source array for accumulate */ - dl.src_ptr_array = dr.src_ptr_array +j; - /* do scatter accumulate updating copy of data in buffer */ - armci_scatter_acc(op, scale, dl, armci_me, 0); - /* modify descriptor-now source becomes destination for PUT*/ - dl.dst_ptr_array = dr.dst_ptr_array + j; - dl.src_ptr_array = pwork; - /* put data back */ - rc = armci_copy_vector(PUT, &dl, 1, proc); - FENCE_NODE(proc); - if(rc){ ARMCI_UNLOCKMEM(proc); return(rc);} - } - ARMCI_UNLOCKMEM(proc); - } - }/*endfor*/ - } -#endif - - return 0; -} - - - - -int armci_copy_vector(int op, /* operation code */ - armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc /* remote process(or) ID */ - ) -{ - int i,s,shmem= SAMECLUSNODE(proc); - int armci_th_idx = ARMCI_THREAD_IDX; - - if(shmem){ - /* local/shared memory copy */ - for(i = 0; i< len; i++){ - for( s=0; s< darr[i].ptr_array_len; s++){ - armci_copy(darr[i].src_ptr_array[s],darr[i].dst_ptr_array[s],darr[i].bytes); - } - } - - }else { - switch(op){ - case PUT: - - for(i = 0; i< len; i++){ - - UPDATE_FENCE_STATE(proc, PUT, darr[i].ptr_array_len); - - for( s=0; s< darr[i].ptr_array_len; s++){ - armci_put(darr[i].src_ptr_array[s],darr[i].dst_ptr_array[s], - darr[i].bytes, proc); - } - } - break; - case GET: - for(i = 0; i< len; i++){ - for( s=0; s< darr[i].ptr_array_len; s++){ - armci_get(darr[i].src_ptr_array[s],darr[i].dst_ptr_array[s], - darr[i].bytes,proc); - } - } - break; - default: - armci_die("armci_copy_vector: wrong optype",op); - } - } - - return 0; -} - - -void armci_vector_to_buf(armci_giov_t darr[], int len, void* buf) -{ -int i,s; -char *ptr = (char*)buf; - for(i = 0; i< len; i++){ - for( s=0; s< darr[i].ptr_array_len; s++){ - armci_copy(darr[i].src_ptr_array[s],ptr,darr[i].bytes); - ptr += darr[i].bytes; - } - } -} - - -void armci_vector_from_buf(armci_giov_t darr[], int len, void* buf) -{ -int i,s; -char *ptr = (char*)buf; - - for(i = 0; i< len; i++){ - for( s=0; s< darr[i].ptr_array_len; s++){ - armci_copy(ptr, darr[i].dst_ptr_array[s],darr[i].bytes); - ptr += darr[i].bytes; - } - } -} - -int PARMCI_PutV( armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc /* remote process(or) ID */ - ) -{ - int rc=0, i,direct=1; - if(len<1) return FAIL; - for(i=0;i= armci_nproc)return FAIL5; - - ORDER(PUT,proc); /* ensure ordering */ - direct=SAMECLUSNODE(proc); - - if(direct){ - if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_PUT); - rc = armci_copy_vector(PUT, darr, len, proc); - } - else{ - DO_FENCE(proc,SERVER_PUT); - rc = armci_pack_vector(PUT, NULL, darr, len, proc,NULL); - } - - if(rc) return FAIL6; - else return 0; - -} - - -int PARMCI_GetV( armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc /* remote process(or) ID */ - ) -{ - int rc=0, i,direct=1; - - if(len<1) return FAIL; - for(i=0;i= armci_nproc)return FAIL5; - - ORDER(GET,proc); /* ensure ordering */ -#ifndef QUADRICS - direct=SAMECLUSNODE(proc); -#endif - - if(direct){ - if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_GET); - rc = armci_copy_vector(GET, darr, len, proc); - } - else{ - DO_FENCE(proc,SERVER_GET); - rc = armci_pack_vector(GET, NULL, darr, len, proc,NULL); - } - - if(rc) return FAIL6; - else return 0; -} - - - - -int PARMCI_AccV( int op, /* oeration code */ - void *scale, /*scaling factor for accumulate */ - armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc /* remote process(or) ID */ - ) -{ - int rc=0, i,direct=0; - - if(len<1) return FAIL; - for(i=0;i= armci_nproc)return FAIL5; - - ORDER(op,proc); /* ensure ordering */ - direct=SAMECLUSNODE(proc); -# if defined(ACC_COPY) && !defined(ACC_SMP) - if(armci_me != proc) direct=0; -# error "grrr" -# endif - if(direct) { - rc = armci_acc_vector( op, scale, darr, len, proc); - } else { - DO_FENCE(proc,SERVER_PUT); - rc = armci_pack_vector(op, scale, darr, len, proc,NULL); - } - - if(rc) return FAIL6; - else return 0; -} - - -/*****************************************************************************/ - -/*\ Non-blocking vector API -\*/ -int PARMCI_NbPutV( armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc, /* remote process(or) ID */ - armci_hdl_t* usr_hdl /*non-blocking request handle*/ - ) -{ - armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl; - int rc=0, i,direct=1; - - if(len<1) return FAIL; - for(i=0;i= armci_nproc)return FAIL5; - - direct=SAMECLUSNODE(proc); - /* aggregate put */ - if(nb_handle && nb_handle->agg_flag == SET) { - if(!direct) { - rc=armci_agg_save_giov_descriptor(darr, len, proc, PUT, nb_handle); - return rc; - } - } - else { - - /*ORDER(PUT,proc); ensure ordering */ - UPDATE_FENCE_INFO(proc); - - /*set tag and op in the nb handle*/ - if(nb_handle){ - nb_handle->tag = GET_NEXT_NBTAG(); - nb_handle->op = PUT; - nb_handle->proc= proc; - nb_handle->bufid=NB_NONE; - } - else - nb_handle = armci_set_implicit_handle(PUT, proc); - } - - if(direct){ - if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_PUT); - rc = armci_copy_vector(PUT, darr, len, proc); - } - else{ - DO_FENCE(proc,SERVER_NBPUT); - rc = armci_pack_vector(PUT, NULL, darr, len, proc,nb_handle); - } - - if(rc) return FAIL6; - else return 0; -} - -int PARMCI_NbGetV( armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc, /* remote process(or) ID */ - armci_hdl_t* usr_hdl /*non-blocking request handle*/ - ) -{ - armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl; - int rc=0, i,direct=1; - - if(len<1) return FAIL; - for(i=0;i= armci_nproc)return FAIL5; - - direct=SAMECLUSNODE(proc); - - /* aggregate get */ - if(nb_handle && nb_handle->agg_flag == SET) { - if(!direct) { - rc=armci_agg_save_giov_descriptor(darr, len, proc, GET, nb_handle); - return rc; - } - } - else { - /* ORDER(GET,proc); ensure ordering */ - if(nb_handle){ - nb_handle->tag = GET_NEXT_NBTAG(); - nb_handle->op = GET; - nb_handle->proc= proc; - nb_handle->bufid=NB_NONE; - } - else - nb_handle = armci_set_implicit_handle(GET, proc); - } - - if(direct){ - if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_GET); - rc = armci_copy_vector(GET, darr, len, proc); - } - else{ - DO_FENCE(proc,SERVER_NBGET); - rc = armci_pack_vector(GET, NULL, darr, len, proc,nb_handle); - } - - if(rc) return FAIL6; - else return 0; -} - - -int PARMCI_NbAccV( int op, /* oeration code */ - void *scale, /*scaling factor for accumulate */ - armci_giov_t darr[], /* descriptor array */ - int len, /* length of descriptor array */ - int proc, /* remote process(or) ID */ - armci_hdl_t* usr_hdl /*non-blocking request handle*/ - ) -{ - armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl; - int rc=0, i,direct=1; - - if(len<1) return FAIL; - for(i=0;i= armci_nproc)return FAIL5; - - UPDATE_FENCE_INFO(proc); - direct=SAMECLUSNODE(proc); - - if(nb_handle){ - nb_handle->tag = GET_NEXT_NBTAG(); - nb_handle->op = op; - nb_handle->proc= proc; - nb_handle->bufid=NB_NONE; - } - else - nb_handle = armci_set_implicit_handle(op, proc); - -# if defined(ACC_COPY) && !defined(ACC_SMP) - if(armci_me != proc) direct=0; -# endif - - if(direct) - rc = armci_acc_vector( op, scale, darr, len, proc); - else{ - DO_FENCE(proc,SERVER_NBPUT); - rc = armci_pack_vector(op, scale, darr, len, proc,nb_handle); - } - - if(rc) return FAIL6; - else return 0; -} -/*****************************************************************************/ diff --git a/armci/src/collectives/message.c b/armci/src/collectives/message.c index 290ef0d03..443d0c3e5 100644 --- a/armci/src/collectives/message.c +++ b/armci/src/collectives/message.c @@ -3,11 +3,7 @@ #endif /* $Id: message.c,v 1.58.6.4 2007-04-24 10:08:26 vinod Exp $ */ -#if defined(BGML) -# include "bgml.h" -#elif defined(PVM) -# include -#elif defined(TCGMSG) +#if defined(TCGMSG) # include static void tcg_brdcst(long type, void *buf, long lenbuf, long originator) { @@ -61,10 +57,8 @@ static void tcg_rcv(long type, void *buf, long lenbuf, long *lenmes, # include #endif #ifdef _POSIX_PRIORITY_SCHEDULING -#ifndef HITACHI # include #endif -#endif #include "armci.h" #include "acc.h" @@ -89,7 +83,7 @@ static int _armci_gop_init=0; /* tells us if we have a buffers allocated */ static int _armci_gop_shmem =0; /* tells us to use shared memory for gops */ extern void armci_util_wait_int(volatile int *, int , int ); static int empty=EMPTY,full=FULL; -#if !defined(SGIALTIX) && defined(SYSV) || defined(MMAP) || defined(WIN32) +#if defined(SYSV) || defined(MMAP) || defined(WIN32) static void **ptr_arr=NULL; #endif @@ -115,17 +109,6 @@ static bufstruct *_gop_buffer; #ifdef NEED_MEM_SYNC # ifdef AIX # define SET_SHM_FLAG(_flg,_val) _clear_lock((int *)(_flg),_val); -# elif defined(NEC) -# define SET_SHM_FLAG(_flg,_val) MEM_FENCE; *(_flg)=(_val) -# elif defined(__ia64) -# if defined(__GNUC__) && !defined (__INTEL_COMPILER) -# define SET_SHM_FLAG(_flg,_val)\ - __asm__ __volatile__ ("mf" ::: "memory"); *(_flg)=(_val) -# else /* Intel Compiler */ - extern void _armci_ia64_mb(); -# define SET_SHM_FLAG(_flg,_val)\ - _armci_ia64_mb(); *(_flg)=(_val); -# endif # elif defined(MACX) # if defined(__GNUC__) # define SET_SHM_FLAG(_flg,_val)\ @@ -195,12 +178,7 @@ int armci_msg_generate_tree(int *idlist,int idlen,int *id_tree,int TREE) /*\ * ************************************************************* \*/ -#ifdef CRAY -char *mp_group_name = (char *)NULL; -#else char *mp_group_name = "mp_working_group"; -#endif - static void _allocate_mem_for_work(){ work = (double *)malloc(sizeof(double)*BUF_SIZE); @@ -224,15 +202,12 @@ void armci_msg_gop_init() memory from malloc because of a problem with cc on SV1 */ if(work==NULL)_allocate_mem_for_work(); -#if !defined(SGIALTIX) && defined(SYSV) || defined(MMAP) || defined(WIN32) +#if defined(SYSV) || defined(MMAP) || defined(WIN32) if(ARMCI_Uses_shm()){ char *tmp; int size = sizeof(bufstruct); int bytes = size * armci_clus_info[armci_clus_me].nslave; -#ifdef LAPI - void armci_msg_barr_init(); - armci_msg_barr_init(); -#endif + bytes += size*2; /* extra for brdcst */ ptr_arr = (void**)malloc(armci_nproc*sizeof(void*)); @@ -259,34 +234,14 @@ void armci_msg_gop_init() _armci_gop_shmem = 1; } #endif - /*stuff needed for barrier and binomial bcast/reduce*/ -#ifdef LAPI - if(!_armci_barrier_shmem){ - int size = 2*sizeof(int); - /*allocate memory to send/rcv data*/ - barr_snd_ptr = (void **)malloc(sizeof(void *)*armci_nproc); - barr_rcv_ptr = (void **)malloc(sizeof(void *)*armci_nproc); - - if(PARMCI_Malloc(barr_snd_ptr,size))armci_die("malloc barrinit failed",0); - if(PARMCI_Malloc(barr_rcv_ptr,size))armci_die("malloc barrinit failed",0); - if(barr_rcv_ptr[armci_me]==NULL || barr_snd_ptr[armci_me]==NULL) - armci_die("problems in malloc barr_init",0); - powof2nodes=1; - LnB = floor(log(armci_nclus)/log(2))+1; - if(pow(2,LnB-1)flag1,FULL,100000); - SET_SHM_FLAG(&(BAR_BUF(i)->flag1),empty); - } - if(armci_nclus>1){ - last = ((int)pow(2,(LnB-1)))^armci_clus_me; - if(last>=0 && lastarmci_clus_me){ /*the pow2 set of procs*/ - if(last=0 && next armci_me){ - armci_msg_snd(ARMCI_TAG, srcp,4,next_node); - armci_msg_rcv(ARMCI_TAG, dstn,4,NULL,next_node); - } - else{ - /*would we gain anything by doing a snd,rcv instead of rcv,snd*/ - armci_msg_rcv(ARMCI_TAG, dstn,4,NULL,next_node); - armci_msg_snd(ARMCI_TAG, srcp,4,next_node); - } - armci_util_wait_int((volatile int *)dstn,barr_count,100000); - } - } - if(last1*/ - for(i=1;iflag2),full); - } - else { /*if not master, partake in the smp barrier,only*/ - i=armci_me-armci_master; - SET_SHM_FLAG(&(BAR_BUF(i)->flag1),full); - armci_util_wait_int(&BAR_BUF(i)->flag2,FULL,100000); - SET_SHM_FLAG(&(BAR_BUF(i)->flag2),empty); - } -} - -#endif /*barrier enabled only for lapi*/ + void parmci_msg_barrier() { -#ifdef BGML - bgml_barrier (3); /* this is always faster than MPI_Barrier() */ -#elif defined(MSG_COMMS_MPI) - MPI_Barrier(ARMCI_COMM_WORLD); -# elif defined(PVM) - pvm_barrier(mp_group_name, armci_nproc); -# elif defined(LAPI) -#if !defined(NEED_MEM_SYNC) - if(_armci_barrier_init) - _armci_msg_barrier(); - else +#if defined(MSG_COMMS_MPI) + MPI_Barrier(ARMCI_COMM_WORLD); +#else + tcg_synch(ARMCI_TAG); #endif - { - tcg_synch(ARMCI_TAG); - } -# else - { - tcg_synch(ARMCI_TAG); - } -# endif } /***********************End Barrier Code*************************************/ @@ -492,13 +345,11 @@ void armci_msg_init(int *argc, char ***argv) if (!TCGREADY_()) { tcgi_pbegin(*argc,*argv); } -#elif defined(BGML) - /* empty */ #elif defined(MSG_COMMS_MPI) int flag=0; MPI_Initialized(&flag); if (!flag) { -# if defined(DCMF) || defined(MPI_MT) +# if defined(MPI_MT) int provided; MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, &provided); # else @@ -523,11 +374,7 @@ void armci_msg_init_comm(MPI_Comm comm) int armci_msg_me() { -#ifdef BGML - return BGML_Messager_rank(); -#elif defined(DCMF) - return DCMF_Messager_rank(); -#elif defined(MSG_COMMS_MPI) +#if defined(MSG_COMMS_MPI) static int counter = 0; if (counter == 0) { int me; @@ -537,8 +384,6 @@ int armci_msg_me() } return armci_me; -#elif defined(PVM) - return(pvm_getinst(mp_group_name,pvm_mytid())); #else return (int)NODEID_(); #endif @@ -547,11 +392,7 @@ int armci_msg_me() int armci_msg_nproc() { -#ifdef BGML - return BGML_Messager_size(); -#elif defined(DCMF) - return DCMF_Messager_size(); -#elif defined(MSG_COMMS_MPI) +#if defined(MSG_COMMS_MPI) static int counter = 0; if (counter == 0) { int nproc; @@ -560,48 +401,27 @@ int armci_msg_nproc() counter = 1; } return armci_nproc; -#elif defined(PVM) - return(pvm_gsize(mp_group_name)); #else return (int)NNODES_(); #endif } -#ifdef CRAY_YMP -#define BROKEN_MPI_ABORT -#endif - -#ifndef PVM double armci_timer() { -#ifdef BGML - return BGML_Timer(); -#elif defined(DCMF) - return DCMF_Timer(); -#elif defined(MSG_COMMS_MPI) - +#if defined(MSG_COMMS_MPI) return MPI_Wtime(); #else return TCGTIME_(); #endif } -#endif void armci_msg_abort(int code) { -#ifdef BGML - fprintf(stderr,"ARMCI aborting [%d]\n", code); -#elif defined(DCMF) - fprintf(stderr,"ARMCI aborting [%d]\n", code); -#elif defined(MSG_COMMS_MPI) +#if defined(MSG_COMMS_MPI) # ifndef BROKEN_MPI_ABORT MPI_Abort(ARMCI_COMM_WORLD,code); # endif -#elif defined(PVM) - char error_msg[25]; - sprintf(error_msg, "ARMCI aborting [%d]", code); - pvm_halt(); #else Error("ARMCI aborting",(long)code); #endif @@ -664,23 +484,19 @@ void armci_msg_bcast_scope(int scope, void *buf, int len, int root) int up, left, right, Root; if(!buf)armci_die("armci_msg_bcast: NULL pointer", len); -#ifdef BGML - BGTr_Bcast(root, buf, len, 3); -#else armci_msg_bintree(scope, &Root, &up, &left, &right); if(root !=Root){ if(armci_me == root) armci_msg_snd(ARMCI_TAG, buf,len, Root); if(armci_me ==Root) armci_msg_rcv(ARMCI_TAG, buf, len, NULL, root); } - - /* printf("%d: scope=%d left=%d right=%d up=%d\n",armci_me, scope, + + /* printf("%d: scope=%d left=%d right=%d up=%d\n",armci_me, scope, left, right, up);*/ if(armci_me != Root && up!=-1) armci_msg_rcv(ARMCI_TAG, buf, len, NULL, up); if (left > -1) armci_msg_snd(ARMCI_TAG, buf, len, left); if (right > -1) armci_msg_snd(ARMCI_TAG, buf, len, right); -#endif } @@ -836,9 +652,6 @@ int nslave = armci_clus_info[armci_clus_me].nslave; #if defined(MPI_SPAWN) || defined(MPI_MT) armci_msg_bcast_scope(SCOPE_ALL, (buf), (len), (root)); return; -#endif -#ifdef LAPI - if(_armci_gop_init){_armci_msg_binomial_bcast(buf,len,root);return;} #endif /* inter-node operation between masters */ if(armci_nclus>1)armci_msg_bcast_scope(SCOPE_MASTERS, buf, len, root); @@ -860,39 +673,27 @@ void armci_msg_brdcst(void* buffer, int len, int root) { if(!buffer)armci_die("armci_msg_brdcast: NULL pointer", len); -#ifdef BGML - BGTr_Bcast(root, buffer, len, PCLASS); -# elif defined(MSG_COMMS_MPI) - MPI_Bcast(buffer, len, MPI_CHAR, root, ARMCI_COMM_WORLD); -# elif defined(PVM) - armci_msg_bcast(buffer, len, root); -# else +#if defined(MSG_COMMS_MPI) + MPI_Bcast(buffer, len, MPI_CHAR, root, ARMCI_COMM_WORLD); +#else { long ttag=ARMCI_TAG, llen=len, rroot=root; tcg_brdcst(ttag, buffer, llen, rroot); } -# endif +#endif } void armci_msg_snd(int tag, void* buffer, int len, int to) { -# ifdef MSG_COMMS_MPI - MPI_Send(buffer, len, MPI_CHAR, to, tag, ARMCI_COMM_WORLD); -# elif defined(PVM) - pvm_psend(pvm_gettid(mp_group_name, to), tag, buffer, len, PVM_BYTE); -# elif defined(BGML) - /* We don't actually used armci_msg_snd in ARMCI. we use optimized - * collectives where - * armci_msg_snd is used. If you build Global Arrays, the MSG_COMMS_MPI flag is - * set, so that - * will work fine - */ - armci_die("bgl shouldn't use armci_msg_snd", armci_me); -# else - long ttag=tag, llen=len, tto=to, block=1; - tcg_snd(ttag, buffer, llen, tto, block); -# endif +#ifdef MSG_COMMS_MPI + MPI_Send(buffer, len, MPI_CHAR, to, tag, ARMCI_COMM_WORLD); +#else + { + long ttag=tag, llen=len, tto=to, block=1; + tcg_snd(ttag, buffer, llen, tto, block); + } +#endif } @@ -900,22 +701,15 @@ void armci_msg_snd(int tag, void* buffer, int len, int to) \*/ void armci_msg_rcv(int tag, void* buffer, int buflen, int *msglen, int from) { -# ifdef MSG_COMMS_MPI - MPI_Status status; - MPI_Recv(buffer, buflen, MPI_CHAR, from, tag, ARMCI_COMM_WORLD, &status); - if(msglen) MPI_Get_count(&status, MPI_CHAR, msglen); -# elif defined(PVM) - int src, rtag,mlen; - pvm_precv(pvm_gettid(mp_group_name, from), tag, buffer, buflen, PVM_BYTE, - &src, &rtag, &mlen); - if(msglen)*msglen=mlen; -#elif defined(BGML) - armci_die("bgl shouldn't use armci_msg_rcv", armci_me); -# else - long ttag=tag, llen=buflen, mlen, ffrom=from, sender, block=1; - tcg_rcv(ttag, buffer, llen, &mlen, ffrom, &sender, block); - if(msglen)*msglen = (int)mlen; -# endif +#ifdef MSG_COMMS_MPI + MPI_Status status; + MPI_Recv(buffer, buflen, MPI_CHAR, from, tag, ARMCI_COMM_WORLD, &status); + if(msglen) MPI_Get_count(&status, MPI_CHAR, msglen); +#else + long ttag=tag, llen=buflen, mlen, ffrom=from, sender, block=1; + tcg_rcv(ttag, buffer, llen, &mlen, ffrom, &sender, block); + if(msglen)*msglen = (int)mlen; +#endif } @@ -932,13 +726,6 @@ int armci_msg_rcvany(int tag, void* buffer, int buflen, int *msglen) if(msglen)if(MPI_SUCCESS!=MPI_Get_count(&status, MPI_CHAR, msglen)) armci_die("armci_msg_rcvany: count failed ", tag); return (int)status.MPI_SOURCE; -# elif defined(PVM) - int src, rtag,mlen; - pvm_precv(-1, tag, buffer, buflen, PVM_BYTE, &src, &rtag, &mlen); - if(msglen)*msglen=mlen; - return(pvm_getinst(mp_group_name,src)); -# elif defined (BGML) - armci_die("bgl shouldn't use armci_msg_rcvany", armci_me); # else long ttag=tag, llen=buflen, mlen, ffrom=-1, sender, block=1; tcg_rcv(ttag, buffer, llen, &mlen, ffrom, &sender, block); @@ -1508,45 +1295,12 @@ static void fdoop2(int n, char *op, float *x, float* work, float* work2) \*/ void armci_msg_gop_scope(int scope, void *x, int n, char* op, int type) { -int root, up, left, right, size; -int tag=ARMCI_TAG; -int ndo, len, lenmes, orign =n, ratio; -void *origx =x; - if(!x)armci_die("armci_msg_gop: NULL pointer", n); - if(work==NULL)_allocate_mem_for_work(); -#ifdef BGML - BGML_Dt dt; - BGML_Op theop; - - if(n > 0 && (strncmp(op, "+", 1) == 0) && (type==ARMCI_INT || type==ARMCI_DOUBLE)) - { - theop=BGML_SUM; - if(type==ARMCI_INT) - dt=BGML_SIGNED_INT; - else if(type==ARMCI_DOUBLE) - dt=BGML_DOUBLE; - BGTr_Allreduce(origx, x, n, dt, theop, -1, PCLASS); - } - else if(n > 0 && (strncmp(op, "max", 3) == 0) && (type==ARMCI_INT || type==ARMCI_DOUBLE)) - { - theop=BGML_MAX; - if(type==ARMCI_INT) - dt=BGML_SIGNED_INT; - else if(type==ARMCI_DOUBLE) - dt=BGML_DOUBLE; - BGTr_Allreduce(origx, x, n, dt, theop, -1, PCLASS); - } - else if(n > 0 && (strncmp(op, "min", 3) == 0) && (type==ARMCI_INT || type==ARMCI_DOUBLE)) - { - theop=BGML_MIN; - if(type==ARMCI_INT) - dt=BGML_SIGNED_INT; - else if(type==ARMCI_DOUBLE) - dt=BGML_DOUBLE; - BGTr_Allreduce(origx, x, n, dt, theop, -1, PCLASS); - } - else -#endif + int root, up, left, right, size; + int tag=ARMCI_TAG; + int ndo, len, lenmes, orign =n, ratio; + void *origx =x; + if(!x)armci_die("armci_msg_gop: NULL pointer", n); + if(work==NULL)_allocate_mem_for_work(); { armci_msg_bintree(scope, &root, &up, &left, &right); @@ -1557,7 +1311,7 @@ void *origx =x; else size = sizeof(double); ratio = sizeof(double)/size; - + while ((ndo = (n<=BUF_SIZE*ratio) ? n : BUF_SIZE*ratio)) { len = lenmes = ndo*size; @@ -1587,7 +1341,7 @@ void *origx =x; /* Now, root broadcasts the result down the binary tree */ len = orign*size; armci_msg_bcast_scope(scope, origx, len, root); - } + } } @@ -1837,104 +1591,36 @@ void armci_msg_reduce(void *x, int n, char* op, int type) /* inter-node operation between masters */ if(armci_nclus>1){ -#ifdef LAPI - if(_armci_gop_init) - _armci_msg_binomial_reduce(x,n,op,type); - else -#endif - armci_msg_reduce_scope(SCOPE_MASTERS, x, n, op, type); + armci_msg_reduce_scope(SCOPE_MASTERS, x, n, op, type); } } static void armci_msg_gop2(void *x, int n, char* op, int type) { -int size, root=0; - if(work==NULL)_allocate_mem_for_work(); - if(type==ARMCI_INT) size = sizeof(int); - else if(type==ARMCI_LONG) size = sizeof(long); - else if(type==ARMCI_LONG_LONG) size = sizeof(long long); - else if(type==ARMCI_FLOAT) size = sizeof(float); - else size = sizeof(double); -#ifdef BGML /*optimize what we can at the message layer */ - void *origx=x; - BGML_Dt dt; - BGML_Op rop; - - if(n>0 && (strncmp(op, "+", 1) == 0)) - { - rop=BGML_SUM; - if(type == ARMCI_INT) - { - dt=BGML_SIGNED_INT; - BGTr_Allreduce(origx, x, n, dt, rop, -1, 3); - } - else if(type == ARMCI_LONG || type == ARMCI_LONG_LONG) - { - armci_msg_reduce(x, n, op, type); - armci_msg_bcast(x, size*n, root); -/* dt=BGML_UNSIGNED_LONG; */ -/* BGTr_Allreduce(origx, x, n, dt, rop, -1, 3);*/ - } - else if(type == ARMCI_DOUBLE) - { - dt=BGML_DOUBLE; - BGTr_Allreduce(origx, x, n, dt, rop, -1, 3); - } - else if(type == ARMCI_FLOAT) - { - armci_msg_reduce(x, n, op, type); - armci_msg_bcast(x, size*n, root); - } - else - { - fprintf(stderr,"Unknown data type\n"); - exit(1); - } - } - - else if(n>0 && ((strncmp(op, "max", 3) == 0) || (strncmp(op, "min", 3) ==0 ))) - { - if(strncmp(op, "max", 3) == 0) - rop=BGML_MAX; - else - rop=BGML_MIN; - - if(type == ARMCI_INT) - dt=BGML_SIGNED_INT; - else if(type == ARMCI_DOUBLE) - dt=BGML_DOUBLE; - else if(type == ARMCI_FLOAT) - dt=BGML_FLOAT; - else if(type == ARMCI_LONG) - dt=BGML_SIGNED_LONG; - else if(type == ARMCI_LONG_LONG) - { - armci_msg_reduce(x, n, op, type); - armci_msg_bcast(x, size*n, root); - } - else - { - fprintf(stderr,"Unknown data type\n"); - exit(1); - } - if(type != ARMCI_LONG_LONG) - BGTr_Allreduce(origx, x, n, dt, rop, -1, 3); - } - - else -#endif - { /* brackets needed for final gelse clause of bgml */ - - armci_msg_reduce(x, n, op, type); - armci_msg_bcast(x, size*n, root); - } + int size, root=0; + if (work==NULL) { + _allocate_mem_for_work(); + } + if (type==ARMCI_INT) { + size = sizeof(int); + } else if (type==ARMCI_LONG) { + size = sizeof(long); + } else if (type==ARMCI_LONG_LONG) { + size = sizeof(long long); + } else if (type==ARMCI_FLOAT) { + size = sizeof(float); + } else { + size = sizeof(double); + } + armci_msg_reduce(x, n, op, type); + armci_msg_bcast(x, size*n, root); } static void armci_sel(int type, char *op, void *x, void* work, int n) { -int selected=0; + int selected=0; switch (type) { case ARMCI_INT: if(strncmp(op,"min",3) == 0){ @@ -1975,9 +1661,9 @@ int selected=0; \*/ void armci_msg_sel_scope(int scope, void *x, int n, char* op, int type, int contribute) { -int root, up, left, right; -int tag=ARMCI_TAG; -int len, lenmes, min; + int root, up, left, right; + int tag=ARMCI_TAG; + int len, lenmes, min; min = (strncmp(op,"min",3) == 0); if(!min && (strncmp(op,"max",3) != 0)) @@ -2027,30 +1713,11 @@ int len, lenmes, min; /*\ combine array of longs/ints/doubles accross all processes \*/ -#if defined(NEC) - -void armci_msg_igop(int *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_ALL,x, n, op, ARMCI_INT); } - -void armci_msg_lgop(long *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_ALL,x, n, op, ARMCI_LONG); } - -void armci_msg_llgop(long long *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_ALL,x, n, op, ARMCI_LONG_LONG); } - -void armci_msg_dgop(double *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_ALL,x, n, op, ARMCI_DOUBLE); } - -void armci_msg_fgop (float *x, int n, char* op) -{ armci_msg_gop_scope(SCOPE_ALL,x, n, op, ARMCI_FLOAT);} - -#else void armci_msg_igop(int *x, int n, char* op) { armci_msg_gop2(x, n, op, ARMCI_INT); } void armci_msg_lgop(long *x, int n, char* op) { armci_msg_gop2(x, n, op, ARMCI_LONG); } void armci_msg_llgop(long long *x, int n, char* op) { armci_msg_gop2(x, n, op, ARMCI_LONG_LONG); } void armci_msg_fgop(float *x, int n, char* op) { armci_msg_gop2(x, n, op, ARMCI_FLOAT); } void armci_msg_dgop(double *x, int n, char* op) { armci_msg_gop2(x, n, op, ARMCI_DOUBLE); } -#endif /*\ add array of longs/ints within the same cluster node @@ -2313,19 +1980,3 @@ void armci_msg_group_dgop(double *x, int n, char* op,ARMCI_Group *group) # endif /* ifdef MSG_COMMS_MPI */ /*********************** End ARMCI Groups Code ****************************/ - -#ifdef PVM -/* set the group name if using PVM */ -void ARMCI_PVM_Init(char *mpgroup) -{ -#ifdef CRAY - mp_group_name = (char *)NULL; -#else - if(mpgroup != NULL) { -/* free(mp_group_name); */ - mp_group_name = (char *)malloc(25 * sizeof(char)); - strcpy(mp_group_name, mpgroup); - } -#endif -} -#endif diff --git a/armci/src/common/aggregate.c b/armci/src/common/aggregate.c index 95af57b6e..4c17ae9b6 100644 --- a/armci/src/common/aggregate.c +++ b/armci/src/common/aggregate.c @@ -307,40 +307,22 @@ void armci_agg_complete(armci_ihdl_t nb_handle, int condition) { armci_me, nb_handle->proc, index, aggr[index]->request_len); #endif - /* complete the data transfer. NOTE: in LAPI, Non-blocking calls - (followed by wait) performs better than blocking put/get */ - if(aggr[index]->request_len) { - switch(nb_handle->op) { -#ifdef LAPI - armci_hdl_t usr_hdl; + /* complete the data transfer. NOTE: in some APIs, Non-blocking calls + (followed by wait) performs better than blocking put/get */ + if(aggr[index]->request_len) { + switch(nb_handle->op) { case PUT: - ARMCI_INIT_HANDLE(&usr_hdl); - if((rc=PARMCI_NbPutV(aggr[index]->darr, aggr[index]->request_len, - nb_handle->proc, (armci_hdl_t*)&usr_hdl))) - ARMCI_Error("armci_agg_complete: nbputv failed",rc); - PARMCI_Wait((armci_hdl_t*)&usr_hdl); - break; - case GET: - ARMCI_INIT_HANDLE(&usr_hdl); - if((rc=PARMCI_NbGetV(aggr[index]->darr, aggr[index]->request_len, - nb_handle->proc, (armci_hdl_t*)&usr_hdl))) - ARMCI_Error("armci_agg_complete: nbgetv failed",rc); - PARMCI_Wait((armci_hdl_t*)&usr_hdl); - break; -#else - case PUT: - if((rc=PARMCI_PutV(aggr[index]->darr, aggr[index]->request_len, - nb_handle->proc))) - ARMCI_Error("armci_agg_complete: putv failed",rc); - break; + if((rc=PARMCI_PutV(aggr[index]->darr, aggr[index]->request_len, + nb_handle->proc))) + ARMCI_Error("armci_agg_complete: putv failed",rc); + break; case GET: - if((rc=PARMCI_GetV(aggr[index]->darr, aggr[index]->request_len, - nb_handle->proc))) - ARMCI_Error("armci_agg_complete: getv failed",rc); - break; -#endif - } + if((rc=PARMCI_GetV(aggr[index]->darr, aggr[index]->request_len, + nb_handle->proc))) + ARMCI_Error("armci_agg_complete: getv failed",rc); + break; } + } /* setting request length to zero, as the requests are completed */ aggr[index]->request_len = 0; diff --git a/armci/src/common/armci.c b/armci/src/common/armci.c index 5d4876dd4..3fbd93cfd 100644 --- a/armci/src/common/armci.c +++ b/armci/src/common/armci.c @@ -37,16 +37,6 @@ #if HAVE_STDARG_H # include #endif -#if defined(CRAY) && !defined(__crayx1) -# include -# include -# if HAVE_UNISTD_H -# include -# endif -#endif -#ifdef LAPI -# include "lapidefs.h" -#endif #if HAVE_ERRNO_H # include #endif @@ -56,26 +46,6 @@ #include "armci_shmem.h" #include "signaltrap.h" -#ifdef ARMCIX -#include "armcix.h" -#endif -#ifdef BGML -#include "bgml.h" -#if HAVE_ASSERT_H -# include -#endif -#include "bgmldefs.h" -extern void armci_msg_barrier(void); -#endif - -#ifdef CRAY_SHMEM -# ifdef CRAY_XT -# include -# else -# include -# endif -#endif - /* global variables -- Initialized in PARMCI_Init() and never modified*/ int armci_me, armci_nproc; int armci_clus_me, armci_nclus, armci_master; @@ -87,10 +57,10 @@ int *_armci_argc=NULL; char ***_armci_argv=NULL; thread_id_t armci_usr_tid; -#if !defined(HITACHI) && !defined(THREAD_SAFE) +#if !defined(THREAD_SAFE) double armci_internal_buffer[BUFSIZE_DBL]; #endif -#if defined(SYSV) || defined(WIN32) || defined(MMAP) || defined(HITACHI) || defined(CATAMOUNT) || defined(BGML) +#if defined(SYSV) || defined(WIN32) || defined(MMAP) # include "locks.h" lockset_t lockid; #endif @@ -101,12 +71,6 @@ int armci_prot_switch_preproc = -1; int armci_prot_switch_preop = -1; #endif -#ifdef BGML -/* void armci_allocate_locks(); */ - void armci_init_memlock(); -#endif - - typedef struct{ int sent; int received; @@ -115,18 +79,12 @@ typedef struct{ armci_notify_t **_armci_notify_arr; -#ifdef CRAY_XT -int _armci_malloc_local_region; -#endif - void ARMCI_Cleanup() { -#if (defined(SYSV) || defined(WIN32) || defined(MMAP))&& !defined(HITACHI) +#if (defined(SYSV) || defined(WIN32) || defined(MMAP)) Delete_All_Regions(); if(armci_nproc>1) -#if !defined(LAPI) - DeleteLocks(lockid); -#endif + DeleteLocks(lockid); /* in case of an error notify server that it is time to quit */ #if defined(DATA_SERVER) @@ -168,29 +126,17 @@ static void armci_perror_msg() } -#if defined(IBM) || defined(IBM64) -int AR_caught_sigint; -int AR_caught_sigterm; -#else extern int AR_caught_sigint; extern int AR_caught_sigterm; -#endif void armci_abort(int code) { -#if !defined(BGML) armci_perror_msg(); -#endif ARMCI_Cleanup(); /* data server process cannot use message-passing library to abort * it simply exits, parent will get SIGCHLD and abort the program */ -#if defined(IBM) || defined(IBM64) - /* hack for a problem in POE signal handlers in non-LAPI MPI */ - if(AR_caught_sigint || AR_caught_sigterm) - _exit(1); -#endif #if defined(DATA_SERVER) if(armci_me<0) @@ -212,12 +158,7 @@ void ARMCI_Error(char *msg, int code) void armci_allocate_locks() { - /* note that if ELAN_ACC is defined the scope of locks is limited to SMP */ -#if !defined(CRAY_SHMEM) && \ - ( defined(HITACHI) || defined(CATAMOUNT) || \ - (defined(QUADRICS) && defined(_ELAN_LOCK_H) && !defined(ELAN_ACC)) ) - armcill_allocate_locks(NUM_LOCKS); -#elif (defined(SYSV) || defined(WIN32) || defined(MMAP)) && !defined(HITACHI) +#if (defined(SYSV) || defined(WIN32) || defined(MMAP)) if(armci_nproc == 1)return; # if defined(SPINLOCK) || defined(PMUTEX) || defined(PSPIN) CreateInitLocks(NUM_LOCKS, &lockid); @@ -232,7 +173,7 @@ void armci_allocate_locks() void ARMCI_Set_shm_limit(unsigned long shmemlimit) { -#if (defined(SYSV) || defined(WIN32) || defined(MMAP)) && !defined(HITACHI) +#if (defined(SYSV) || defined(WIN32) || defined(MMAP)) #define EXTRASHM 1024 /* extra shmem used internally in ARMCI */ unsigned long limit; limit = shmemlimit+EXTRASHM; @@ -264,13 +205,6 @@ void armci_init_memlock() bzero(memlock_table_array[armci_me],bytes); -#ifdef BGML - bgml_init_locks ((void *) memlock_table_array[armci_me]); -#elif ARMCIX - ARMCIX_init_memlock ((memlock_t *) memlock_table_array[armci_me]); -#endif - - #ifdef MEMLOCK_SHMEM_FLAG /* armci_use_memlock_table is a pointer to local memory variable=1 * we overwrite the pointer with address of shared memory variable @@ -290,25 +224,6 @@ void armci_init_memlock() armci_msg_barrier(); } - -#if defined(SYSV) || defined(WIN32) || defined(MMAP) -# if defined(QUADRICS) && !defined(NO_SHM) -static void armci_check_shmmax() -{ - long mylimit, limit; - mylimit = limit = (long) armci_max_region(); - armci_msg_bcast_scope(SCOPE_MASTERS, &limit, sizeof(long), 0); - if(mylimit != limit){ - printf("%d:Shared mem limit in ARMCI is %ld bytes on node %s vs %ld on %s\n", - armci_me,mylimit<<10,armci_clus_info[armci_clus_me].hostname, - limit<<10, armci_clus_info[0].hostname); - fflush(stdout); sleep(1); - armci_die("All nodes must have the same SHMMAX limit if NO_SHM is not defined",0); - } -} -# endif -#endif - extern void armci_region_shm_malloc(void *ptr_arr[], size_t bytes); #ifdef ENABLE_CHECKPOINT @@ -406,7 +321,7 @@ int _armci_init(MPI_Comm comm) "instead of PARMCI_Init(). Please replace PARMCI_Init() " " with PARMCI_Init_args(&argc, &argv) as in the API docs", 0L); #endif -#if defined(MPI_MT) || defined(DCMF) +#if defined(MPI_MT) { int provided; MPI_Query_thread(&provided); @@ -429,23 +344,7 @@ int _armci_init(MPI_Comm comm) } } #endif - -#ifdef BGML - BGML_Messager_Init(); - BG1S_Configuration_t config; - config=BG1S_Configure(NULL); - config.consistency= BG1S_ConsistencyModel_Weak; - BG1S_Configure(&config); - - unsigned long long available = BGML_Messager_available(); - if (available & BGML_MESSAGER_GI) - bgml_barrier = (BGML_Barrier) BGGI_Barrier; - else - bgml_barrier = (BGML_Barrier) BGTr_Barrier; -#endif -#ifdef ARMCIX - ARMCIX_Init (); -#endif + armci_nproc = armci_msg_nproc(); armci_me = armci_msg_me(); armci_usr_tid = THREAD_ID_SELF(); /*remember the main user thread id */ @@ -457,31 +356,6 @@ int _armci_init(MPI_Comm comm) printf("WARNING: PARMCI_Init is called from thread %d, should be 0\n",th_idx); #endif -#ifdef _CRAYMPP - cmpl_proc=-1; -#endif -#ifdef LAPI -# ifdef AIX - { - char *tmp1 = getenv("RT_GRQ"), *tmp2 = getenv("AIXTHREAD_SCOPE"); - if(tmp1 == NULL || strcmp((const char *)tmp1,"ON")) - armci_die("Armci_Init: environment variable RT_GRQ not set. It should be set as RT_GRQ=ON, to restore original thread scheduling LAPI relies upon",0); - if(tmp2 == NULL || strcmp((const char *)tmp2,"S")) - armci_die("Armci_Init: environment variable AIXTHREAD_SCOPE=S should be set to assure correct operation of LAPI", 0); - } -# endif - armci_init_lapi(); -#endif - -#ifdef PORTALS - armci_init_portals(); - shmem_init(); -#endif - -#ifdef CRAY_SHMEM - shmem_init(); -#endif - armci_init_clusinfo(); #ifdef MSG_COMMS_MPI @@ -500,16 +374,9 @@ int _armci_init(MPI_Comm comm) #if defined(SYSV) || defined(WIN32) || defined(MMAP) /* init shared/K&R memory */ if(ARMCI_Uses_shm() ) { -# ifdef SGIALTIX - armci_altix_shm_init(); -# else - armci_shmem_init(); -# endif + armci_shmem_init(); } -# if defined(QUADRICS) && !defined(NO_SHM) - if(armci_me == armci_master)armci_check_shmmax(); -# endif #endif #ifdef REGION_ALLOC @@ -522,42 +389,6 @@ int _armci_init(MPI_Comm comm) } #endif -#ifdef MULTI_CTX - /* this is a hack for the Elan-3 multi-tiled memory (qsnetlibs v 1.4.10) - * we need to allocate and then free memory to satisfy libelan requirements - * for symmetric memory addresses - */ - if(armci_nclus >1){ - int segments, segsize, seg; - void **addr; - armci_nattach_preallocate_info(&segments, &segsize); - - segsize -= 1024*1024; /* leave some for the K&RM headers */ - if(armci_me!=armci_master)segsize=0; /* only one allocates mem on node*/ - - addr = (void*) malloc(segments*armci_nproc*sizeof(void*)); - if(!addr)armci_die("armci_init:addr malloc failed",segments*armci_nproc); - - for(seg=0; seg< segments; seg++) /* allocate segments */ - if(PARMCI_Malloc(addr+armci_nproc*seg,segsize)) - armci_die("problem in Elan-3 mem preallocation",seg); - - for(seg=0; seg< segments; seg++) /* return to free pool */ - if(armci_me==armci_master) - if(PARMCI_Free(*(addr+armci_nproc*seg+armci_me))) - armci_die("problem in Elan-3 mem preallocation - free stage",seg); - free(addr); - -#if 0 - if(armci_me==armci_master){ - printf("%d:preallocated %d segments %d bytes each\n",armci_me, - segments, segsize); fflush(stdout); - } -#endif - - } -#endif - /* allocate locks: we need to do it before server is started */ armci_allocate_locks(); armci_init_fence(); @@ -578,16 +409,14 @@ int _armci_init(MPI_Comm comm) if(armci_nclus >1) armci_start_server(); # endif -#if defined(GM) || defined(VAPI) || defined(PORTALS) || (defined(LAPI) && defined(LAPI_RDMA)) +#if defined(VAPI) /* initialize registration of memory */ armci_region_init(); #endif armci_msg_barrier(); armci_init_memlock(); /* allocate data struct for locking memory areas */ -#if !defined(GM) armci_notify_init(); -#endif armci_msg_barrier(); armci_msg_gop_init(); @@ -646,12 +475,6 @@ void PARMCI_Finalize() } #endif -#ifdef PORTALS - armci_fini_portals(); -#endif -#ifdef LAPI - armci_term_lapi(); -#endif #ifdef ALLOW_PIN free(armci_prot_switch_fence); #endif @@ -661,9 +484,6 @@ void PARMCI_Finalize() #ifdef MSG_COMMS_MPI armci_group_finalize(); #endif -#ifdef ARMCIX - ARMCIX_Finalize (); -#endif #ifdef MSG_COMMS_MPI MPI_Comm_free(&ARMCI_COMM_WORLD); /*SK: free at last*/ #endif @@ -854,7 +674,7 @@ char *ptr; nb_handle = NULL; } -#if defined(LAPI) || defined(GM) || defined(VAPI) || defined(QUADRICS) +#if defined(VAPI) if(armci_rem_gpc(GET, darr, 2, &send, proc, 1, nb_handle)) #endif return FAIL2; diff --git a/armci/src/common/clusterinfo.c b/armci/src/common/clusterinfo.c index 99b54c5c7..8b08de065 100644 --- a/armci/src/common/clusterinfo.c +++ b/armci/src/common/clusterinfo.c @@ -32,14 +32,6 @@ * Must define NO_SHMMAX_SEARCH in shmem.c to prevent depleting shared memory * due to a gready shmem request by the master process on cluster node 0. */ -#if defined(DECOSF) && defined(QUADRICS) -# if !defined(REGION_ALLOC) -# define NO_SHMEM - extern int armci_enable_alpha_hack(); -# endif -#else -# define armci_enable_alpha_hack() 1 -#endif #define DEBUG 0 #define MAX_HOSTNAME 80 @@ -68,41 +60,7 @@ /*** stores cluster configuration. Initialized before user threads are created and then read-only ***/ armci_clus_t *armci_clus_info; -#ifdef HITACHI -#include -# define GETHOSTNAME sr_gethostname -ndes_t _armci_group; - -static int sr_gethostname(char *name, int len) -{ -int no; -pid_t ppid; - - if(hmpp_nself (&_armci_group,&no,&ppid,0,NULL) <0) - return -1; - - if(len<6)armci_die("len too small",len); - if(no>1024)armci_die("expected node id <1024",no); - sprintf(name,"n%d",no); - return 0; -} -#elif defined(SGIALTIX) -# define GETHOSTNAME altix_gethostname -static int altix_gethostname(char *name, int len) { - sprintf(name,"altix"); - return 0; -} -#elif defined(CRAY_XT) /* && !defined(PORTALS) */ -#define GETHOSTNAME cnos_gethostname -static int cnos_gethostname(char *name, int len) -{ - int size,rank; - size=PMI_Get_rank(&rank); - sprintf(name,"%d",rank); -} -#else # define GETHOSTNAME gethostname -#endif static char* merge_names(char *name) { @@ -304,7 +262,7 @@ static void print_clus_info() int i; if(PRINT_CLUSTER_INFO && armci_nclus > 1 && armci_me ==0){ -#if defined(DATA_SERVER) || defined(SERVER_THREAD) || defined(PORTALS) +#if defined(DATA_SERVER) || defined(SERVER_THREAD) printf("ARMCI configured for %d cluster nodes. Network protocol is '%s'.\n", armci_nclus, network_protocol); #else @@ -372,11 +330,9 @@ void armci_init_clusinfo() #endif #ifdef NO_SHMEM - if(armci_enable_alpha_hack()) { name[len]='0'+armci_me; name[len+1]='\0'; len++; - } #endif if(DEBUG) @@ -390,20 +346,18 @@ void armci_init_clusinfo() process_hostlist(name); /* compute cluster info */ #endif -#if (defined(SYSV) || defined(WIN32) || defined(MMAP)) && !defined(HITACHI) +#if (defined(SYSV) || defined(WIN32) || defined(MMAP)) armci_set_shmem_limit_per_node(armci_clus_info[0].nslave); #endif armci_master = armci_clus_info[armci_clus_me].master; #ifdef NO_SHMEM - if(armci_enable_alpha_hack()) { int i; for(i=0;idatalen); arg.buf_posted = arg.buf = buf; -#ifdef HITACHI - arg.count = 0; -#else arg.count = bufsize; -#endif arg.proc = (msginfo->operation==GET)?msginfo->to:msginfo->from; arg.op = msginfo->operation; @@ -121,10 +115,8 @@ void armci_pipe_receive_strided(request_header_t* msginfo, void *ptr, { buf_arg_t arg; int packsize = PACK_SIZE(msginfo->datalen); -#if defined(GM) - arg.buf_posted = msginfo->tag.data_ptr; -#endif -#if (defined(VIA) && defined(VIA_USES_RDMA)) || defined(VAPI) + +#if defined(VAPI) arg.buf_posted = msginfo->tag; #endif @@ -143,10 +135,7 @@ void armci_pipe_send_strided(request_header_t *msginfo, void *buf, int buflen, buf_arg_t arg; int packsize = PACK_SIZE(msginfo->datalen); -#if defined(GM) || defined(HITACHI) - arg.buf_posted = msginfo->tag.data_ptr; -#endif -#if (defined(VIA) && defined(VIA_USES_RDMA)) || defined(VAPI) +#if defined(VAPI) arg.buf_posted = msginfo->tag; #endif @@ -157,15 +146,12 @@ int packsize = PACK_SIZE(msginfo->datalen); armci_dispatch_strided(ptr, stride_arr, count, strides, -1, -1, packsize, armcill_pipe_send_chunk, &arg); -#ifdef GM - armci_serv_send_nonblocking_complete(0); -#endif } #endif /**************************** end of pipelining for medium size msg ***********/ -#if defined(CLIENT_BUF_BYPASS) && !defined(GM) +#if defined(CLIENT_BUF_BYPASS) /**************** NOTE: for now this code can only handle contiguous data *****/ void armci_send_strided_data_bypass(int proc, request_header_t *msginfo, void *loc_buf, int msg_buflen, @@ -773,7 +759,7 @@ void armci_send_data(request_header_t* msginfo, void *data) { int to = msginfo->from; -#if defined(VIA) || defined(GM) || defined(VAPI) +#if defined(VAPI) /* if the data is in the pinned buffer: MessageRcvBuffer */ #if defined(PEND_BUFS) extern int armci_data_in_serv_buf(void *); @@ -786,10 +772,6 @@ void armci_send_data(request_header_t* msginfo, void *data) armci_WriteToDirect(to, msginfo, data); else { /* copy the data to the MessageRcvBuffer */ -#ifdef GM - /* leave space for header ack */ - char *buf = MessageRcvBuffer + sizeof(long); -#else char *buf = MessageRcvBuffer; # if defined(PEND_BUFS) fprintf(stderr, "%d:: op=%d len=%d ptr=%p working on unpinned memory. aborting!\n", armci_me, msginfo->operation,msginfo->datalen, data); @@ -798,21 +780,12 @@ void armci_send_data(request_header_t* msginfo, void *data) /* extern char *armci_openib_get_msg_rcv_buf(int); */ /* buf = armci_openib_get_msg_rcv_buf(msginfo->from); */ # endif -#endif assert(buf != NULL); armci_copy(data, buf, msginfo->datalen); armci_WriteToDirect(to, msginfo, buf); } #else -#ifdef DOELAN4 - /*this is because WriteToDirect is a no-op in elan4.c so we have - * to do a put. This will not cause problems anywhere else in the - * code and this part on elan4 will only be invoked in a GPC - */ - PARMCI_Put(data,msginfo->tag.data_ptr,msginfo->datalen,to); -#else - armci_WriteToDirect(to, msginfo, data); -#endif + armci_WriteToDirect(to, msginfo, data); #endif } @@ -961,7 +934,7 @@ void armci_data_server(void *mesg) } armci_server_ipc(msginfo, descr, buffer, buflen); break; -#if defined(SOCKETS) || defined(HITACHI) || defined(MPI_SPAWN) || defined(MPI_MT) +#if defined(SOCKETS) || defined(MPI_SPAWN) || defined(MPI_MT) case QUIT: if(DEBUG_){ printf("%d(serv):got QUIT request from %d\n",armci_me, from); @@ -1083,7 +1056,7 @@ void armci_start_server() void *armci_server_code(void *data) { #ifdef SERVER_THREAD -#if (defined(GM) || defined(VAPI) || defined(QUADRICS)) && ARMCI_ENABLE_GPC_CALLS +#if (defined(VAPI)) && ARMCI_ENABLE_GPC_CALLS # ifdef PTHREADS extern pthread_t data_server; data_server = pthread_self(); diff --git a/armci/src/common/gpc.c b/armci/src/common/gpc.c index 6822fbe5b..a9b34fa96 100644 --- a/armci/src/common/gpc.c +++ b/armci/src/common/gpc.c @@ -263,12 +263,6 @@ int example_func(int to, int from, void *hdr, int hlen, int rtype); -#ifdef LAPI -void armci_gpc_set_serverpid(){ -} -#endif - - /*\ * Translate pointer to memory on processor "proc" * to be used in a callback function send by processor "from" @@ -283,7 +277,7 @@ return ptr; \*/ void ARMCI_Gpc_lock(int proc) { -#if defined(CLUSTER) && !defined(SGIALTIX) +#if defined(CLUSTER) int lock = (proc-armci_clus_info[armci_clus_id(proc)].master)%NUM_LOCKS; #else int lock = 0; @@ -306,7 +300,7 @@ return 0; \*/ void ARMCI_Gpc_unlock(int proc) { -#if defined(CLUSTER) && !defined(SGIALTIX) +#if defined(CLUSTER) int lock = (proc-armci_clus_info[armci_clus_id(proc)].master)%NUM_LOCKS; #else int lock = 0; diff --git a/armci/src/common/noncont.c b/armci/src/common/noncont.c deleted file mode 100644 index 509b911c4..000000000 --- a/armci/src/common/noncont.c +++ /dev/null @@ -1,312 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: noncont.c,v 1.3.2.2 2007-05-04 16:43:35 d3p687 Exp $ - * noncont.c - * - * Developed by Andriy Kot - * Copyright (c) 2006 Pacific Northwest National Laboratory - * - * Alternative version of non-contiguous calls using non-blocking ones - * - * Changelog: - * 2006-09-08 - created - * - */ - -#include "armcip.h" -#include "copy.h" -#include "acc.h" -#include "memlock.h" -#if HAVE_STDIO_H -# include -#endif -#ifdef PORTALS -#include "armci_portals.h" -#endif - - -#if 0 -# define PRN_DBG_MSG3(m,a1,a2,a3) \ - fprintf(stderr,"DBG %d: " m,armci_me,a1,a2,a3);fflush(stderr) -# define PRN_DBG_MSG(m) PRN_DBG_MSG3(m,0,0,0) -# define PRN_DBG_MSG1(m,a1) PRN_DBG_MSG3(m,a1,0,0) -# define PRN_DBG_MSG2(m,a1,a2) PRN_DBG_MSG3(m,a1,a2,0) -#else -# define PRN_DBG_MSG(m) -# define PRN_DBG_MSG1(m,a1) -# define PRN_DBG_MSG2(m,a1,a2) -# define PRN_DBG_MSG3(m,a1,a2,a3) -#endif - -#if 0 -# define CALL_IN(_func) { if (armci_me == 0) printf("ENTERED %s\n", _func); fflush(stdout); } -# define CALL_OUT(_func) { if (armci_me == 0) printf("EXITING %s\n", _func); fflush(stdout); } -#else -# define CALL_IN(_func) -# define CALL_OUT(_func) -#endif - -#ifdef NB_NONCONT - -#if defined(QUADRICS) -typedef ELAN_EVENT *HTYPE; -#define SHMEM_HANDLE_SUPPORTED -#elif defined(CRAY_SHMEM) -typedef void *HTYPE; -#else -typedef armci_ireq_t HTYPE; -#endif - -#define MAX_SLOTS_LL 64 -#define MIN_OUTSTANDING 6 -static int max_pending = 16; /* throttle number of outstanding nb calls */ - -/* might have to use MAX_SLOTS_LL < MAX_PENDING due to throttling problem */ -#define MAX_PENDING 6 -#define ZR (HTYPE)0 - -static HTYPE put_dscr[MAX_SLOTS_LL]; -static HTYPE get_dscr[MAX_SLOTS_LL]; -/* static variables alreay initialize to 0 (?) -static HTYPE put_dscr[MAX_SLOTS_LL]= { -ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR, -ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR}; - -static HTYPE get_dscr[MAX_SLOTS_LL] = { -ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR, -ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR,ZR}; -*/ - -#if defined(PORTALS) -extern ARMCI_MEMHDL_T *mhloc; -extern ARMCI_MEMHDL_T *mhrem; -# define INI_HDL(_hdl, _op, _proc) { \ - (_hdl).tag = GET_NEXT_NBTAG(); \ - (_hdl).op = _op; \ - (_hdl).proc = _proc; \ - (_hdl).bufid = NB_NONE; \ - } -# define CLR_HDL(_hdl) ((_hdl).tag = 0) -# define CHK_HDL(_hdl) (_hdl.tag) -#else -# define CLR_HDL(_hdl) ((_hdl) = ZR) -# define CHK_HDL(_hdl) (_hdl) -# define INI_HDL(_hdl, _op, _proc) -#endif - -static int cur_get=0; -static int cur_put=0; -static int pending_get=0; -static int pending_put=0; - -/* strided put, nonblocking */ -void armcill_put2D(int proc, int bytes, int count, void* src_ptr,int src_stride, - void* dst_ptr,int dst_stride) -{ - CALL_IN("armcill_put2D"); - - int _j, i, batch, issued=0; - char *ps=src_ptr, *pd=dst_ptr; - - for (_j = 0; _j < count; ){ - /* how big a batch of requests can we issue */ - batch = (count - _j )=max_pending)cur_put=0; - } - } - - if(issued != count) - armci_die2("armcill_put2D: mismatch %d %d \n", count,issued); - - CALL_OUT("armcill_put2D"); -} - - -/* blocking vector put */ -void armcill_putv(int proc, int bytes, int count, void* src[], void* dst[]) -{ - int _j, i, batch, issued=0; - void *ps, *pd; - - for (_j = 0; _j < count; ){ - /* how big a batch of requests can we issue */ - batch = (count - _j )=max_pending)cur_put=0; - } - } - if(issued != count) - armci_die2("armcill_putv: mismatch\n", count,issued); - -#ifdef SHMEM_HANDLE_SUPPORTED - for(i=0; i=max_pending)cur_get=0; - } - } - - if(issued != count) - armci_die2("armcill_get2D: mismatch %d %d \n", count,issued); - - CALL_OUT("armcill_get2D"); -} - - -/* blocking vector get */ -void armcill_getv(int proc, int bytes, int count, void* src[], void* dst[]) -{ - int _j, i, batch, issued=0; - void *ps, *pd; - - for (_j = 0; _j < count; ){ - /* how big a batch of requests can we issue */ - batch = (count - _j )=max_pending)cur_get=0; - } - } - if(issued != count) - armci_die2("armcill_getv: mismatch %d %d \n", count,issued); - -#ifdef SHMEM_HANDLE_SUPPORTED - for(i=0; i #endif -#if !defined(ACC_COPY) &&!defined(CRAY_YMP) &&!defined(CYGNUS)&&!defined(CYGWIN) &&!defined(BGML) &&!defined(DCMF) +#if !defined(ACC_COPY) &&!defined(CYGNUS)&&!defined(CYGWIN) # define REMOTE_OP #endif @@ -97,13 +97,9 @@ int armci_pack_strided(int op, void* scale, int proc, #ifdef STRIDED_GET_BUFLEN if(op==GET)bufsize=STRIDED_GET_BUFLEN; -# ifdef HITACHI - else - if(stride_levels || ARMCI_ACC(op))bufsize=MSG_BUFLEN_SMALL-PAGE_SIZE; -# endif #endif -#if (defined(GM_) || defined(VIA_) || defined(VAPI_)) +#if (defined(GM_) || defined(VAPI_)) /*we cant assume that the entire available buffer will be used for data, fact that the header and descriptor also go in the same buffer should be considered while packing. diff --git a/armci/src/common/regions.c b/armci/src/common/regions.c index a7444de46..e99073725 100644 --- a/armci/src/common/regions.c +++ b/armci/src/common/regions.c @@ -21,9 +21,6 @@ * rdma. Coincidentally all these networks also used a server thread. * so server_regions were allocated and enabled when REGIONS_REQUIRE_MEMHDL * was defined. - * With Catamount, we require portals memory descriptors to be stored - * there is no server but we still need the server_regions to post match all - * md to accept all incomming requests */ #include "armcip.h" @@ -35,12 +32,8 @@ #endif #include "copy.h" -/*this should match similar def in portals.c vapi.c and openib.c */ -#ifdef PORTALS -# define MAX_REGIONS 24 -#else +/*this should match similar def in vapi.c and openib.c */ # define MAX_REGIONS 8 -#endif typedef struct { void *start; @@ -94,27 +87,11 @@ static int armci_region_record(void *start, void *end, armci_reglist_t *reg) static void armci_region_register(void *start, long size, armci_reglist_t *reg) { -#ifdef REGIONS_REQUIRE_MEMHDL -# if defined(PORTALS) - int regid = reg->n; -# endif -#endif ARMCI_PR_DBG("enter",0); if(reg->n >= MAX_REGIONS) return; if(armci_nclus<=1)return; -#ifdef REGIONS_REQUIRE_MEMHDL -# if defined(PORTALS) - /*we really shouldn't have network specific ifdef's here but this is an - * exception to avoid significant code change in the portals layer - * ARMCI portals layer maintains a list of memory descriptors for each - * region allocated. It uses them in a round robin fashion. We store it - * in the memhdl to identify which memory region the memory used by a - * communication call corresponds to. - */ - (reg->list+(regid))->memhdl.regid=regid; -# endif - +#ifdef REGIONS_REQUIRE_MEMHDL if(!armci_pin_contig_hndl(start, size, &((reg->list+reg->n)->memhdl))){ printf("%d pin failed %p bytes=%ld\n",armci_me,start,size); fflush(stdout); return; @@ -132,54 +109,23 @@ static void armci_region_register(void *start, long size, armci_reglist_t *reg) void armci_region_register_shm(void *start, long size) { -#ifdef PORTALS -armci_reglist_t *reg = clus_regions+armci_clus_me; -#endif if(allow_pin) armci_region_register(start, size, clus_regions+armci_clus_me); else{ needs_pin_shmptr = start; needs_pin_shmsize= size; } - -#ifdef PORTALS - /* we mark the region as local region so that portals layer uses - * the md from memhdl instead of any region list*/ - (reg->list+(reg->n-1))->memhdl.islocal=0; -#endif - -#if 0 - if(allow_pin){ - printf("\n%d:%d registering shm %p bytes=%ld\n",armci_me,allow_pin,start,size); - fflush(stdout); - } -#endif } void armci_region_register_loc(void *start, long size) { -#ifdef PORTALS - armci_reglist_t *reg = &loc_regions_arr; -#endif if(allow_pin)armci_region_register(start, size, &loc_regions_arr); else{ needs_pin_ptr = start; needs_pin_size= size; } -#ifdef PORTALS - { - extern int _armci_malloc_local_region; - if(_armci_malloc_local_region){ - (reg->list+(reg->n-1))->memhdl.islocal=1; - _armci_malloc_local_region=0; - } - else - (reg->list+(reg->n-1))->memhdl.islocal=0; - } -#endif - #ifdef DEBUG_ if(allow_pin){ printf("\n%d:%d registered local %p bytes=%ld\n",armci_me,allow_pin,start,size); @@ -335,14 +281,7 @@ int armci_region_both_found_hndl(void *loc, void *rem, int size, int node, } } -#ifdef PORTALS - if(found!=1){ - *loc_memhdl=NULL; - found=1; - } -#else if(!found) return 0; -#endif else {*loc_memhdl=&((reg->list+i)->memhdl);} /* now check remote shared */ @@ -567,18 +506,11 @@ void armci_global_region_exchange(void *start, long size) clreglist = &(loc_regions_arr); else clreglist = (clus_regions+armci_clus_me); -#if defined(DATA_SERVER) || defined(PORTALS) -# if defined(PORTALS) - ((reglist->list+reglist->n)->memhdl).regid=(reglist->n); -# endif +#if defined(DATA_SERVER) armci_serv_register_req((clreglist->list+foundclus)->start,((char *)(clreglist->list+foundclus)->end-(char *)((clreglist->list+foundclus)->start)),&((reglist->list+reglist->n)->memhdl)); #endif (void)armci_region_record((clreglist->list+foundclus)->start,(clreglist->list+foundclus)->end,reglist); -#ifdef LAPI_RDMA - armci_copy(&(clreglist->list+foundclus)->memhdl, &(reglist->list+foundclus)->memhdl, sizeof(ARMCI_MEMHDL_T)); -#endif - #if DEBUG printf("\n%d:serv recording st=%p end=%p sz=%d from %d n=%d sz=%d\n",armci_me,(clreglist->list+foundclus)->start,(clreglist->list+foundclus)->end,(clreglist->list+foundclus)->end-(clreglist->list+foundclus)->start,armci_clus_me,reglist->n,sizeof(ARMCI_MEMHDL_T));fflush(stdout); #endif diff --git a/armci/src/common/request.c b/armci/src/common/request.c index 8fb5d01b7..35dafab8f 100644 --- a/armci/src/common/request.c +++ b/armci/src/common/request.c @@ -45,7 +45,7 @@ extern void armci_sock_send(int to, void *data, int len); #endif -#if !defined(GM) && !defined(VIA) && !defined(LAPI) &&!defined(VAPI) +#if !defined(VAPI) double _armci_rcv_buf[MSG_BUFLEN_DBL]; double _armci_snd_buf[MSG_BUFLEN_DBL]; char* MessageSndBuffer = (char*)_armci_snd_buf; @@ -121,7 +121,7 @@ request_header_t *msginfo = (request_header_t*) buffer; } } -#if defined(ALLOW_PIN) || defined(LAPI2) +#if defined(ALLOW_PIN) if(msginfo->pinned && msginfo->bypass){ armci_rcv_strided_data_bypass_both(msginfo->to,msginfo,loc_ptr,count, stride_levels); @@ -408,9 +408,7 @@ request_header_t *msginfo = (request_header_t*)GET_SEND_BUFFER(bufsize,REGISTER, msginfo->datalen = sizeof(ARMCI_MEMHDL_T); msginfo->operation = REGISTER; msginfo->bytes = msginfo->dscrlen+ msginfo->datalen; -#ifndef LAPI msginfo->tag.ack = 0; -#endif buf = (char *)(msginfo+1); ADDBUF(buf,void*,ptr); ADDBUF(buf,long,sz); @@ -864,9 +862,6 @@ int armci_rem_strided(int op, void* scale, int proc, #endif if(nb_handle) -#ifdef ACC_SMP - if(!ARMCI_ACC(op)) -#endif { /* INIT_SENDBUF_INFO(nb_handle,buf,op,proc); same as _armci_buf_set_tag, why here? */ _armci_buf_set_tag(buf,nb_handle->tag,0); @@ -909,10 +904,6 @@ int armci_rem_strided(int op, void* scale, int proc, msginfo->pinned=0; } # endif -#ifdef LAPI2 - msginfo->bypass=0; - msginfo->pinned=0; -#endif /* align buf for doubles (8-bytes) before copying data */ ALLIGN8(buf); @@ -1011,9 +1002,7 @@ int armci_rem_strided(int op, void* scale, int proc, armci_send_req(proc, msginfo, bufsize); } #if !defined(MPI_SPAWN) && !defined(MPI_MT) -#ifdef ACC_SMP - if(!ARMCI_ACC(op)) -#endif + armci_save_strided_dscr(&buf0,dst_ptr,dst_stride_arr,count, stride_levels,1); #endif @@ -1232,7 +1221,7 @@ extern void armci_wait_for_blocking_scatter(); #endif -#if (defined(ALLOW_PIN) || defined(LAPI2)) && !defined(HAS_RDMA_GET) +#if defined(ALLOW_PIN) && !defined(HAS_RDMA_GET) /*\ client version of remote strided get \*/ int armci_rem_get(int proc, @@ -1307,14 +1296,10 @@ int armci_rem_get(int proc, msginfo->bytes = msginfo->dscrlen; -#if defined(GM) || defined(VAPI) || defined(QUADRICS) +#if defined(VAPI) /* prepare for set the stamp at the end of the user buffer */ if(count[0]bypass){ dscr += (1+stride_levels)*sizeof(int); /* move past count */ GETBUF(dscr,void*,client_ptr); @@ -1412,7 +1397,7 @@ void armci_server(request_header_t *msginfo, char *dscr, char* buf, int buflen) if(msginfo->operation == GET){ -# if defined(CLIENT_BUF_BYPASS) || defined(LAPI2) || defined(VAPI) +# if defined(CLIENT_BUF_BYPASS) || defined(VAPI) /* This path was not updated */ if(msginfo->bypass){ armci_send_strided_data_bypass(proc, msginfo, buf, buflen, loc_ptr, loc_stride_arr, @@ -1467,7 +1452,7 @@ void armci_server(request_header_t *msginfo, char *dscr, char* buf, int buflen) armci_process_extheader(msginfo, dscr_save, buf, buflen); } -#if ARMCI_ENABLE_GPC_CALLS && (defined(LAPI) || defined(GM) || defined(VAPI) || defined(DOELAN4) || defined(SOCKETS)) +#if ARMCI_ENABLE_GPC_CALLS && (defined(VAPI) || defined(SOCKETS)) static int gpc_call_process( request_header_t *msginfo, int len, char *dscr, char* buf, int buflen, char *sbuf); #endif @@ -1503,7 +1488,7 @@ void armci_server_vector( request_header_t *msginfo, case GET: /* fprintf(stderr, "%d:: Got a vector message!!\n", armci_me); */ if(msginfo->ehlen) { -#if ARMCI_ENABLE_GPC_CALLS && (defined(LAPI) || defined(GM) || defined(VAPI) || defined(DOELAN4)) +#if ARMCI_ENABLE_GPC_CALLS && defined(VAPI) gpc_call_process(msginfo, len, dscr, buf, buflen, sbuf); #else armci_die("Unexpected vector message with non-zero ehlen. GPC call?", @@ -1582,22 +1567,15 @@ void armci_server_vector( request_header_t *msginfo, /**Server side routine to handle a GPC call request**/ /*===============Register this memory=====================*/ #if ARMCI_ENABLE_GPC_CALLS -#if defined(LAPI) || defined(GM) || defined(VAPI) || defined(QUADRICS) +#if defined(VAPI) gpc_buf_t *gpc_req; -/*VT: I made the change below because DATA_SERVER is not defined for elan4 - *VT: This will only be invoked in case of GPC call and should not intefere - *VT: with any other call - */ -#if (defined(DOELAN4) || defined(DATA_SERVER)) && defined(SERVER_THREAD) +#if defined(DATA_SERVER) && defined(SERVER_THREAD) # ifdef PTHREADS pthread_t data_server; # else # error Threading other than pthreads not yet implemented # endif #endif -#if defined(LAPI) -pthread_t data_server; -#endif void block_thread_signal(int signo) { sigset_t mask; diff --git a/armci/src/common/signaltrap.c b/armci/src/common/signaltrap.c index 757e5294b..1ca20d47a 100644 --- a/armci/src/common/signaltrap.c +++ b/armci/src/common/signaltrap.c @@ -41,12 +41,7 @@ #if !defined(armci_die) extern void Error(); #endif - -#if (defined(ENCORE) || defined(SEQUENT) || defined(ARDENT)) -# define SigType int -#else # define SigType void -#endif #ifndef SIG_ERR # define SIG_ERR (SigType (*)())-1 @@ -65,14 +60,7 @@ SigType (*SigSegvOrig)(); /*********************** SIGINT *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigIntHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else SigType SigIntHandler(sig) -#endif int sig; { AR_caught_sigint = 1; @@ -103,14 +91,7 @@ void RestoreSigInt() /*********************** SIGABORT *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigAbortHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else SigType SigAbortHandler(sig) -#endif int sig; { AR_caught_sig= sig; @@ -131,21 +112,11 @@ void TrapSigAbort() /*********************** SIGCHLD *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigChldHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else SigType SigChldHandler(sig) -#endif int sig; { int status; -#if defined(ALLIANT) || defined(ENCORE) || defined(SEQUENT) || defined(NEXT) - union wait ustatus; -#endif - + #if defined(LINUX) pid_t ret; /* Trap signal as soon as possible to avoid race */ @@ -153,18 +124,6 @@ SigType SigChldHandler(sig) Error("SigChldHandler: error from signal setting SIGCHLD",0); #endif -#if defined(ALLIANT) || defined(ENCORE) || defined(SEQUENT) || defined(NEXT) - -# if defined(LINUX) - ret = wait(&ustatus); - if((ret == 0) || ((ret == -1) && (errno == ECHILD))) { return; } -# else - (void) wait(&ustatus); -# endif - status = ustatus.w_status; - -#else - # if defined(LINUX) ret = waitpid(0, &status, WNOHANG); if((ret == 0) || ((ret == -1) && (errno == ECHILD))) { return; } @@ -172,7 +131,6 @@ SigType SigChldHandler(sig) (void)wait(&status); # endif -#endif AR_caught_sigchld=1; AR_caught_sig= sig; Error("Child process terminated prematurely, status=",(int) status); @@ -204,14 +162,7 @@ void RestoreSigChldDfl() /*********************** SIGBUS *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigBusHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else SigType SigBusHandler(sig) -#endif int sig; { AR_caught_sig= sig; @@ -235,14 +186,7 @@ void TrapSigBus() /*********************** SIGFPE *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigFpeHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else SigType SigFpeHandler(sig) -#endif int sig; { AR_caught_sig= sig; @@ -267,14 +211,7 @@ void TrapSigFpe() /*********************** SIGILL *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigIllHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else SigType SigIllHandler(sig) -#endif int sig; { AR_caught_sig= sig; @@ -294,14 +231,7 @@ void TrapSigIll() /*********************** SIGSEGV *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigSegvHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else SigType SigSegvHandler(sig) -#endif int sig; { AR_caught_sig= sig; @@ -371,14 +301,7 @@ void RestoreSigSegv() /*********************** SIGSYS *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigSysHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else SigType SigSysHandler(sig) -#endif int sig; { AR_caught_sig= sig; @@ -399,14 +322,7 @@ void TrapSigSys() /*********************** SIGTRAP *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigTrapHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else SigType SigTrapHandler(sig) -#endif int sig; { AR_caught_sig= sig; @@ -425,14 +341,7 @@ void TrapSigTrap() /*********************** SIGHUP *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigHupHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else SigType SigHupHandler(sig) -#endif int sig; { AR_caught_sig= sig; @@ -462,14 +371,7 @@ void RestoreSigHup() /*********************** SIGTERM *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigTermHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else SigType SigTermHandler(sig) -#endif int sig; { AR_caught_sigterm = 1; @@ -499,14 +401,7 @@ void RestoreSigTerm() /*********************** SIGIOT *************************************/ #ifdef SIGIOT -#if defined(SUN) && !defined(SOLARIS) -SigType SigIotHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else SigType SigIotHandler(sig) -#endif int sig; { AR_caught_sig= sig; @@ -526,14 +421,7 @@ void TrapSigIot() /*********************** SIGCONT *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigContHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else SigType SigContHandler(sig) -#endif int sig; { /* Error("Trace Cont error, status=",(int) sig);*/ @@ -550,14 +438,7 @@ void TrapSigCont() } /*********************** SIGXCPU *************************************/ -#if defined(SUN) && !defined(SOLARIS) -SigType SigXcpuHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else SigType SigXcpuHandler(sig) -#endif int sig; { AR_caught_sig= sig; @@ -577,9 +458,7 @@ void TrapSigXcpu() void ARMCI_ChildrenTrapSignals() { -#ifndef LAPI TrapSigBus(); -#endif TrapSigFpe(); TrapSigIll(); #ifdef ENABLE_CHECKPOINT @@ -592,23 +471,12 @@ void ARMCI_ChildrenTrapSignals() TrapSigAbort(); TrapSigTerm(); TrapSigInt(); - -#if defined(LAPI) || defined(SGI) - TrapSigIot(); -#endif - -#ifdef SGI - TrapSigXcpu(); -#endif - } void ARMCI_ParentTrapSignals() { -#ifndef LAPI TrapSigChld(); -#endif TrapSigHup(); } @@ -623,9 +491,7 @@ void ARMCI_RestoreSignals() void ARMCI_ParentRestoreSignals() { -#ifndef LAPI RestoreSigChld(); -#endif ARMCI_RestoreSignals(); RestoreSigHup(); } diff --git a/armci/src/common/spawn.c b/armci/src/common/spawn.c index b45febf5c..09f13e3b7 100644 --- a/armci/src/common/spawn.c +++ b/armci/src/common/spawn.c @@ -76,7 +76,7 @@ int rc; if(pthread_attr_init(&attr)) armci_die("armci_create_server_thread: attr init failed",0); -#if defined(AIX) || defined(SOLARIS) +#if defined(AIX) pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM); #endif diff --git a/armci/src/devices/lapi/lapi.c b/armci/src/devices/lapi/lapi.c deleted file mode 100644 index acf473908..000000000 --- a/armci/src/devices/lapi/lapi.c +++ /dev/null @@ -1,754 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* initialization of data structures and setup of lapi internal parameters */ - -#include -#if HAVE_STDIO_H -# include -#endif -#if HAVE_STRINGS_H -# include -#endif -#include "lapidefs.h" -#include "armcip.h" -#include "copy.h" -#ifdef AIX -#include -#endif -#if (defined(PPC) || defined(__PPC__) || defined(__PPC)) -# include "asm-ppc.h" -#endif - -#define DEBUG_ 0 -#define ERROR(str,val) armci_die((str),(val)) -#define BUF_TO_EVBUF(buf) ((lapi_cmpl_t*)(((char*)buf) - sizeof(lapi_cmpl_t))) - -char err_msg_buf[LAPI_MAX_ERR_STRING]; /* for error msg returned by LAPI */ - -/* -** macro to check return code of function calls. keeps return -** code checking logic from needing to be in main logic -*/ -#define CHECK(func_and_args) \ -{ \ - int rc; \ - if ((rc = (func_and_args)) != LAPI_SUCCESS) { \ - LAPI_Msg_string(rc, err_msg_buf); \ - fprintf(stderr, \ - "LAPI ERROR: %s, rc = %d\n", err_msg_buf, rc); \ - armci_die("LAPI Error", 0); \ - } \ -} - -#if ARMCI_ENABLE_GPC_CALLS - extern gpc_buf_t *gpc_req; -#endif -int lapi_max_uhdr_data_sz; /* max data payload */ -lapi_cmpl_t *cmpl_arr; /* completion state array, dim=NPROC */ -lapi_cmpl_t hdr_cntr; /* AM header buffer counter */ -lapi_cmpl_t buf_cntr; /* AM data buffer counter */ -lapi_cmpl_t* ack_cntr; /* ACK counter used in handshaking protocols - between origin and target */ -lapi_cmpl_t* get_cntr; /* counter used with lapi_get */ -lapi_user_cxt_t *lapi_remote_cxt; /* Remote context for RDMA call */ - -int intr_status; -lapi_info_t lapi_info; -#ifndef TCGMSG -lapi_handle_t lapi_handle; -#endif -pthread_mutex_t _armci_mutex_thread=PTHREAD_MUTEX_INITIALIZER; - - -double _armci_rcv_buf[MSG_BUFLEN_DBL]; -char* MessageRcvBuffer = (char*)_armci_rcv_buf; -char* MessageSndBuffer = (char*)0; - -extern void armci_waitsome(int factor); - -/************* LAPI Active Message handlers *******************************/ - -volatile static int hndlcnt=0, header_cnt=0; -static int hhnum=0; -static long num_malloc=0; /* trace and limit the number malloc calls in HH */ -#define MAX_NUM_MALLOC 100 - -/* trace state of accumulate lock */ -int kevin_ok=1; /* "1" indicates that no other thread is holding the lock */ - - -void armci_completion_handler(lapi_handle_t *t_hndl, void *save) -{ - lapi_handle_t hndl = *t_hndl; - int need_data; - void *message; - int whofrom, msglen; - request_header_t *msginfo = (request_header_t *)save; - char *descr= (char*)(msginfo+1), *buf=MessageRcvBuffer; - int buflen=MSG_BUFLEN; -#if ARMCI_ENABLE_GPC_CALLS - extern pthread_t data_server; - data_server = pthread_self(); -#endif - - if(DEBUG_) - fprintf(stderr,"%d:CH:op=%d from=%d datalen=%d dscrlen=%d\n", armci_me, - msginfo->operation, msginfo->from,msginfo->datalen,msginfo->dscrlen); - - /*** assure that descriptor and data are in the right format and place ***/ - if( msginfo->dscrlen < 0 || msginfo->datalen <0 ){ - /* for large put/acc/scatter need to get the data */ - int rc; - lapi_cntr_t req_cntr; - int bytes=0; - char *origin_ptr = msginfo->tag.buf; - - if (msginfo->dscrlen<0) { - descr =MessageRcvBuffer; - msginfo->dscrlen = -msginfo->dscrlen; - buf = descr + msginfo->dscrlen; - buflen += msginfo->dscrlen; - bytes += msginfo->dscrlen; - - } - if (msginfo->datalen <0){ - msginfo->datalen = -msginfo->datalen; - bytes += msginfo->datalen; - } - - if(rc=LAPI_Setcntr(hndl, &req_cntr, 0)) ERROR("CH:setcntr failed",rc); - if(rc=LAPI_Get(hndl, (uint)msginfo->from, bytes, - origin_ptr, MessageRcvBuffer, - msginfo->tag.cntr,&req_cntr))ERROR("CH:LAPI_Get failed",rc); - - if(rc=LAPI_Waitcntr(hndl, &req_cntr,1,NULL))ERROR("CH:Waitcntr failed",rc); - - - } else{ - - /* desc is in save, data could be but not for GET */ - if(msginfo->operation !=GET)buf = descr + msginfo->dscrlen; - buflen = MSG_BUFLEN; - } - - /* fprintf(stderr,"CH: val=%lf\n",*(double*)(buf+msginfo->datalen -8));*/ - - - /*** dispatch request to the appropriate handler function ***/ - switch(msginfo->operation){ - case LOCK: armci_server_lock(msginfo); - break; - case UNLOCK: armci_server_unlock(msginfo, descr); - break; - default: - if(msginfo->format == STRIDED) - armci_server(msginfo, descr, buf, buflen); - else - armci_server_vector(msginfo, descr, buf, buflen); - } - - free(msginfo); -#ifdef LINUX - (void)fetch_and_add(&num_malloc, (long)-1); -#else - (void)fetch_and_addlp(&num_malloc, (long)-1); -#endif -} - - - - -void* armci_header_handler(lapi_handle_t *t_hndl, void *uhdr, uint *t_uhdrlen, - uint *msglen, compl_hndlr_t **handler, void** psave) -{ - lapi_handle_t hndl = *t_hndl; - uint uhdrlen = *t_uhdrlen; - request_header_t *msginfo = (request_header_t *)uhdr; - - if(DEBUG_) - fprintf(stderr,"%d:HH: op=%d from %d\n",armci_me,msginfo->operation, - msginfo->from); - if(msginfo->to != armci_me) - armci_die("wrong message delivered",msginfo->to); - - /* process small requests that do not require comms in header handler */ - if(msginfo->datalen >0 && msginfo->dscrlen>0 && msginfo->operation != GET - && msginfo->operation != LOCK && msginfo->operation != UNLOCK){ - - /* If another thread is in accumulate use compl. handler path: - * Try to avoid blocking inside HH which degrades Lapi performance. - * The completion handler path requires malloc to save request info. - * Only up to approx. MAX_NUM_MALLOC requests can be rescheduled to - * run in CH instead of HH. - * MAX_NUM_MALLOC is a soft limit to avoid cost of locking when reading - */ - - if( msginfo->operation==PUT || num_malloc>MAX_NUM_MALLOC || kevin_ok){ - - char *descr = (char*)(msginfo+1); - char *buf = descr + msginfo->dscrlen; - int buflen = uhdrlen - sizeof(request_header_t) - msginfo->dscrlen; - - if(DEBUG_) - fprintf(stderr,"%d:HH: buf =%lf\n",armci_me,*(double*)buf); - if(msginfo->format == STRIDED) - armci_server(msginfo, descr, buf, buflen); - else - armci_server_vector(msginfo, descr, buf, buflen); - - /* fprintf(stderr,"%d:HH: getting out of server\n",armci_me);*/ - *psave = NULL; - *handler = NULL; - return(NULL); - } - } - -#ifdef LINUX - (void)fetch_and_add(&num_malloc, (long)1); -#else - (void)fetch_and_addlp(&num_malloc, (long)1); /* AIX atomic increment */ -#endif - - msginfo = (request_header_t*) malloc(uhdrlen); /* recycle pointer */ - if(!msginfo) ERROR("HH: malloc failed in header handler",num_malloc); - - /* save the request info for processing in compl. handler */ - memcpy((char*)msginfo, uhdr, uhdrlen); - *psave = msginfo; - *handler = armci_completion_handler; - - return(NULL); -} - - -void armci_send_req(int proc, request_header_t* msginfo, int len) -{ - int msglen = sizeof(request_header_t); - lapi_cntr_t *pcmpl_cntr, *pcntr = &(BUF_TO_EVBUF(msginfo)->cntr); - int rc; - - msginfo->tag.cntr= pcntr; -#if ARMCI_ENABLE_GPC_CALLS - if(msginfo->operation==GET && msginfo->format==VECTOR && msginfo->ehlen){ - msginfo->tag.buf = (char *)(msginfo+1)+msginfo->dscrlen; - } - else -#endif - msginfo->tag.buf = msginfo+1; - - if(msginfo->operation==GET || msginfo->operation==LOCK){ - - SET_COUNTER(*(lapi_cmpl_t*)pcntr,1);/*dataarrive in same buf*/ - /*The GPC case. Note that we don't use the parameter len*/ - if(msginfo->format==VECTOR && msginfo->ehlen > 0) - msglen += msginfo->datalen; - if(lapi_max_uhdr_data_sz < msginfo->dscrlen){ - - msginfo->dscrlen = -msginfo->dscrlen; /* no room for descriptor */ - pcntr = NULL; /* GET(descr) from CH will increment buf cntr */ - - }else msglen += msginfo->dscrlen; - - /* - we should send the mutex, too. When op==LOCK, Value of len parameter - is already sizeof(reqest_header_t)+sizeof(int), since we dont use - len but construct our own msglen, we need to add sizeof(int). - */ - if(msginfo->operation==LOCK) msglen += sizeof(int); - - pcmpl_cntr=NULL; /* don't trace completion status for load ops */ - - }else if (msginfo->operation==UNLOCK){ - - msglen += msginfo->dscrlen; - pcmpl_cntr=NULL; /* don't trace completion status for unlock */ - - }else{ - - if(lapi_max_uhdr_data_sz < (msginfo->datalen + msginfo->dscrlen)){ - - msginfo->datalen = -msginfo->datalen; - msginfo->dscrlen = -msginfo->dscrlen; - pcntr = NULL; /* GET/LOCK from CH will increment buf cntr */ - - }else msglen += msginfo->dscrlen+msginfo->datalen; - - /* trace completion of store ops */ - pcmpl_cntr = &cmpl_arr[msginfo->to].cntr; - - } - - if(msginfo->operation==PUT || ARMCI_ACC(msginfo->operation)) - UPDATE_FENCE_STATE(msginfo->to, msginfo->operation, 1); - - if((rc=LAPI_Amsend(lapi_handle,(uint)msginfo->to, - (void*)armci_header_handler, msginfo, msglen, NULL, 0, - NULL, pcntr, pcmpl_cntr))) armci_die("AM failed",rc); - - if(DEBUG_) fprintf(stderr,"%d sending req=%d to %d\n", - armci_me, msginfo->operation, proc); -} - - - -/*\ client sends strided data + request to server - \*/ -void armci_send_strided(int proc, request_header_t *msginfo, char *bdata, - void *ptr, int strides, int stride_arr[], int count[]) -{ - - armci_write_strided(ptr, strides, stride_arr, count, bdata); - armci_send_req(proc,msginfo,msginfo->bytes + sizeof(request_header_t)); -} - - -/*\ server sends data back to client - \*/ -void armci_send_data(request_header_t* msginfo, void *data) -{ - armci_lapi_send(msginfo->tag, data, msginfo->datalen, msginfo->from); -} - - -/*\ server sends strided data back to client - \*/ -void armci_send_strided_data(int proc, request_header_t *msginfo, char *bdata, - void *ptr, int strides, int stride_arr[], int count[]) -{ - armci_write_strided(ptr, strides, stride_arr, count, bdata); - armci_lapi_send(msginfo->tag, bdata, msginfo->datalen, msginfo->from); -} - - -char* armci_rcv_data(int proc, request_header_t *msginfo) -{ - lapi_cmpl_t *pcntr=BUF_TO_EVBUF(msginfo); - CLEAR_COUNTER((*pcntr)); -#if ARMCI_ENABLE_GPC_CALLS - if(msginfo->operation==GET && msginfo->format==VECTOR && msginfo->ehlen){ - return((char *)(msginfo+1)+msginfo->dscrlen); - } - else -#endif - return (char*)(msginfo+1); -} - - - -/*\ client receives strided data from server - \*/ -void armci_rcv_strided_data(int proc, request_header_t* msginfo, int datalen, - void *ptr, int strides, int stride_arr[], int count[]) -{ - lapi_cmpl_t *pcntr=BUF_TO_EVBUF(msginfo); - CLEAR_COUNTER((*pcntr)); - armci_read_strided(ptr, strides, stride_arr, count, (char*)(msginfo+1)); -} - - - -/*\ client receives vector data from server to buffer and unpacks it - \*/ -void armci_rcv_vector_data(int proc, request_header_t* msginfo, - armci_giov_t darr[], int len) -{ - char *buf; - buf = armci_rcv_data(proc,msginfo); - armci_vector_from_buf(darr, len, buf); -} - - -/*\ initialization of LAPI related data structures - \*/ -void armci_init_lapi() -{ - int rc, p; - int lapi_max_uhdr_sz; - lapi_cmpl_t *pcntr; - lapi_remote_cxt_t util_cxt; /* For call to obtain rCxt */ - -#ifndef TCGMSG - rc = LAPI_Init(&lapi_handle, &lapi_info); - if(rc) ERROR("lapi_init failed",rc); -#endif - - /* set the max limit for AM header data length */ - rc = LAPI_Qenv(lapi_handle,MAX_UHDR_SZ, &lapi_max_uhdr_sz); - if(rc) ERROR("armci_init_lapi: LAPI_Qenv failed", rc); - - /* fprintf(stderr,"max header size = %d\n",lapi_max_uhdr_sz);*/ - - /* how much data can fit into AM header ? */ - lapi_max_uhdr_data_sz = lapi_max_uhdr_sz - sizeof(request_header_t); - - /* allocate memory for completion state array */ - cmpl_arr = (lapi_cmpl_t*)malloc(armci_nproc*sizeof(lapi_cmpl_t)); - if(cmpl_arr==NULL) ERROR("armci_init_lapi:malloc for cmpl_arr failed",0); - - /* allocate memory for ack and get counters, 1 if not thread safe */ -#ifdef THREAD_SAFE - ack_cntr = calloc(armci_user_threads.max, sizeof(lapi_cmpl_t)); - get_cntr = calloc(armci_user_threads.max, sizeof(lapi_cmpl_t)); -#else - ack_cntr = calloc(1, sizeof(lapi_cmpl_t)); - get_cntr = calloc(1, sizeof(lapi_cmpl_t)); -#endif - if (!(ack_cntr && get_cntr)) - ERROR("armci_init_lapi:calloc for ack or get counters failed",0); - - /* initialize completion state array */ - for(p = 0; p< armci_nproc; p++){ - rc = LAPI_Setcntr(lapi_handle, &cmpl_arr[p].cntr, 0); - if(rc) ERROR("armci_init_lapi: LAPI_Setcntr failed (arr)",rc); - cmpl_arr[p].oper = -1; - cmpl_arr[p].val = 0; - } - - /* initialize ack/buf/hdr counters */ -#ifdef THREAD_SAFE -# define N armci_user_threads.max -#else -# define N 1 -#endif - for (p = 0; p < N; p++) { - rc = LAPI_Setcntr(lapi_handle, &(ack_cntr[p].cntr), 0); - if(rc) ERROR("armci_init_lapi: LAPI_Setcntr failed (ack)",rc); - ack_cntr[p].val = 0; - - rc = LAPI_Setcntr(lapi_handle, &(get_cntr[p].cntr), 0); - if(rc) ERROR("armci_init_lapi: LAPI_Setcntr failed (get)",rc); - get_cntr[p].val = 0; - } - rc = LAPI_Setcntr(lapi_handle, &hdr_cntr.cntr, 0); - if(rc) ERROR("armci_init_lapi: LAPI_Setcntr failed (hdr)",rc); - hdr_cntr.val = 0; - rc = LAPI_Setcntr(lapi_handle, &buf_cntr.cntr, 0); - if(rc) ERROR("armci_init_lapi: LAPI_Setcntr failed (buf)",rc); - buf_cntr.val = 0; -#if 0 - pcntr = (lapi_cmpl_t*)MessageSndBuffer; - rc = LAPI_Setcntr(lapi_handle, &pcntr->cntr, 0); - if(rc) ERROR("armci_init_lapi: LAPI_Setcntr failed (bufcntr)",rc); - pcntr->val = 0; -#endif - -#ifdef LAPI_RDMA - /* allocate rCxt */ - lapi_remote_cxt = (lapi_user_cxt_t*)malloc(armci_nproc * - sizeof(lapi_user_cxt_t)); - if(lapi_remote_cxt==NULL) ERROR("armci_init_lapi: rCxt malloc failed",0); - - /* obtain remote context "rCxt" for RDMA Operation of all procs */ - for(p = 0; p< armci_nproc; p++){ - if(p==armci_me) continue; - util_cxt.Util_type = LAPI_REMOTE_RCXT; - util_cxt.operation = LAPI_RDMA_ACQUIRE; - util_cxt.dest = p; - CHECK(LAPI_Util(lapi_handle, (lapi_util_t *) &util_cxt)); - lapi_remote_cxt[p] = util_cxt.usr_rcxt; - } -#endif - -#if !defined(LAPI2) - - /* for high performance, disable LAPI internal error checking */ - LAPI_Senv(lapi_handle, ERROR_CHK, 0); - -#endif - - /* make sure that interrupt mode is on */ - LAPI_Senv(lapi_handle, INTERRUPT_SET, 1); - - /* initialize buffer managment module */ - _armci_buf_init(); - -#ifdef LAPI_RDMA - CHECK((LAPI_Gfence(lapi_handle))); -#endif -#if ARMCI_ENABLE_GPC_CALLS - gpc_req = (gpc_buf_t *)malloc(sizeof(gpc_buf_t)*MAX_GPC_REQ); - if(gpc_req==NULL)armci_die("malloc for gpc failed",sizeof(gpc_buf_t)); - gpc_init(); -#endif -} - - -void armci_term_lapi() -{ - int p; - lapi_remote_cxt_t util_cxt; /* For call to obtain rCxt */ - -#ifdef LAPI_RDMA - CHECK((LAPI_Gfence(lapi_handle))); - - /* release remote context "rCxt" for RDMA Operation of all procs */ - for(p = 0; p< armci_nproc; p++){ - if(p==armci_me) continue; - util_cxt.Util_type = LAPI_REMOTE_RCXT; - util_cxt.operation = LAPI_RDMA_RELEASE; - util_cxt.dest = p; - util_cxt.usr_rcxt = lapi_remote_cxt[p]; - CHECK(LAPI_Util(lapi_handle, (lapi_util_t *) &util_cxt)); - } - free(lapi_remote_cxt); -#endif - -#ifndef TCGMSG - CHECK((LAPI_Term(lapi_handle))); /* terminate the LAPI handle */ -#endif - free(cmpl_arr); - free(ack_cntr); - free(get_cntr); -} - -/* primitive pseudo message-passing on top of lapi */ - -/* send data to remote process using p specified message tag */ -/* tag contains address of receive buffer guarded by cntr at process p */ -void armci_lapi_send(msg_tag_t tag, void* data, int len, int p) -{ - int rc; - lapi_cntr_t org_cntr; - void *buf = tag.buf; - lapi_cntr_t *cntr = tag.cntr; - if(!buf)ERROR("armci_lapi_send: NULL tag(buf) error",0); - if(!cntr)ERROR("armci_lapi_send: NULL tag(cntr) error",0); - - rc=LAPI_Setcntr(lapi_handle, &org_cntr, 0); - if(rc) ERROR("armci_lapi_send:setcntr failed",rc); - rc=LAPI_Put(lapi_handle, (uint)p, (uint)len, buf, data, - cntr, &org_cntr, NULL); - if(rc) ERROR("armci_lapi_send:put failed",rc); - rc+=LAPI_Waitcntr(lapi_handle, &org_cntr, 1, NULL); - if(rc) ERROR("armci_lapi_send:waitcntr failed",rc); -} - -/* subroutine versions of macros disabling and enabling interrupts */ -void intr_off_() -{ - INTR_OFF; -} - -void intr_on_() -{ - INTR_ON; -} - - -void print_counters_() -{ - int i; - printf("bufcntr: val =%d cntr=%d\n", buf_cntr.val, buf_cntr.cntr); - for(i=0; i< armci_nproc;i++){ - printf("cmpl_arr: val=%d cntr=%d oper=%d\n",cmpl_arr[i].val, - cmpl_arr[i].cntr, cmpl_arr[i].oper); - } - fflush(stdout); -} - -#ifdef LAPI_RDMA -/* LAPI Put RDMA */ -void armci_client_direct_send(int p, void *src_buf, void *dst_buf, - int len, void** contextptr, int nbtag, - ARMCI_MEMHDL_T *lochdl,ARMCI_MEMHDL_T *remhdl) { - - lapi_xfer_t xfer_struct; /* Data structure for the xfer call */ - lapi_rdma_tag_t lapi_rdma_tag; /* RDMA notification tag */ - uint src_offset, tgt_offset; - int val, rc; - - /* can be any number that fits in ushort */ - lapi_rdma_tag = 22; - - /* CHECK: offset problem. what if client and server attached (shmat) at - diff address */ - src_offset = (char *)src_buf- (char *)lochdl->start; - tgt_offset = (char *)dst_buf - (char *)remhdl->start; - -#if DEBUG_ - printf("%d: Doing LAPI_Xfer (RDMA Put): dst=%d srchdl_start=%p remhdl_start=%p (bytes=%ld src_off=%d tgt_off=%d)\n", armci_me, p, lochdl->start, remhdl->start, len, src_offset, tgt_offset); fflush(stdout); -#endif - - bzero(&xfer_struct, sizeof(xfer_struct)); - xfer_struct.HwXfer.Xfer_type = LAPI_RDMA_XFER; - xfer_struct.HwXfer.tgt = p; - /*xfer_struct.HwXfer.op = LAPI_RDMA_PUT|LAPI_RCNTR_UPDATE;*/ - xfer_struct.HwXfer.op = LAPI_RDMA_PUT; - xfer_struct.HwXfer.rdma_tag = lapi_rdma_tag; - xfer_struct.HwXfer.remote_cxt = lapi_remote_cxt[p]; - xfer_struct.HwXfer.src_pvo = lochdl->pvo; - xfer_struct.HwXfer.tgt_pvo = remhdl->pvo; - xfer_struct.HwXfer.src_offset = src_offset; - xfer_struct.HwXfer.tgt_offset = tgt_offset; - xfer_struct.HwXfer.len = (ulong) (len); - xfer_struct.HwXfer.shdlr = (scompl_hndlr_t *) NULL; - xfer_struct.HwXfer.sinfo = (void *) NULL; - xfer_struct.HwXfer.org_cntr = &(ack_cntr->cntr); - - /* Initiate RDMA Xfer */ - if((rc = LAPI_Xfer(lapi_handle, &xfer_struct)) != LAPI_SUCCESS) { - LAPI_Msg_string(rc, err_msg_buf); - fprintf(stderr, "LAPI ERROR: %s, rc = %d\n", err_msg_buf, rc); - armci_die("LAPI_Xfer (RDMA Put) failed", 0); - } - - /* wait for RDMA completion */ - rc = LAPI_Waitcntr(lapi_handle, &(ack_cntr->cntr),1,&val); - if(rc != LAPI_SUCCESS) { - LAPI_Msg_string(rc, err_msg_buf); - fprintf(stderr, "LAPI ERROR: %s, rc = %d\n", err_msg_buf, rc); - armci_die("LAPI_Waitcntr (RDMA Put) failed", 0); - } - - /* CHECK((LAPI_Fence(lapi_handle))); */ - -#if DEBUG_ - printf("%d: Completed LAPI_Xfer RDMA (Put): dst=%d\n", armci_me, p); -#endif -} - -/* LAPI Get RDMA */ -void armci_client_direct_get(int p, void *src_buf, void *dst_buf, - int len, void** cptr, int nbtag, - ARMCI_MEMHDL_T *lochdl, ARMCI_MEMHDL_T *remhdl) { - - lapi_xfer_t xfer_struct; /* Data structure for the xfer call */ - lapi_rdma_tag_t lapi_rdma_tag; /* RDMA notification tag */ - uint src_offset, tgt_offset; - int val, rc; - - /* can be any number that fits in ushort */ - lapi_rdma_tag = 21; - - /* CHECK: offset problem. what if client and server attached (shmat) at - diff address */ - src_offset = (char *)dst_buf- (char *)lochdl->start; - tgt_offset = (char *)src_buf - (char *)remhdl->start; - -#if DEBUG_ - printf("%d: Doing LAPI_Xfer (RDMA Get): dst=%d srchdl_start=%p remhdl_start=%p (bytes=%ld src_off=%d tgt_off=%d)\n", armci_me, p, lochdl->start, remhdl->start, len, src_offset, tgt_offset); fflush(stdout); -#endif - bzero(&xfer_struct, sizeof(xfer_struct)); - xfer_struct.HwXfer.Xfer_type = LAPI_RDMA_XFER; - xfer_struct.HwXfer.tgt = p; - /*xfer_struct.HwXfer.op = LAPI_RDMA_GET|LAPI_RCNTR_UPDATE;*/ - xfer_struct.HwXfer.op = LAPI_RDMA_GET; - xfer_struct.HwXfer.rdma_tag = lapi_rdma_tag; - xfer_struct.HwXfer.remote_cxt = lapi_remote_cxt[p]; - xfer_struct.HwXfer.src_pvo = lochdl->pvo; - xfer_struct.HwXfer.tgt_pvo = remhdl->pvo; - xfer_struct.HwXfer.src_offset = src_offset; - xfer_struct.HwXfer.tgt_offset = tgt_offset; - xfer_struct.HwXfer.len = (ulong) (len); - xfer_struct.HwXfer.shdlr = (scompl_hndlr_t *) NULL; - xfer_struct.HwXfer.sinfo = (void *) NULL; - xfer_struct.HwXfer.org_cntr = &(get_cntr->cntr); - - /* Initiate RDMA Xfer */ - if((rc = LAPI_Xfer(lapi_handle, &xfer_struct)) != LAPI_SUCCESS) { - LAPI_Msg_string(rc, err_msg_buf); - fprintf(stderr, "LAPI ERROR: %s, rc = %d\n", err_msg_buf, rc); - armci_die("LAPI_Xfer (RDMA Get) failed", 0); - } - - /* wait for RDMA completion */ - rc = LAPI_Waitcntr(lapi_handle, &(get_cntr->cntr),1,&val); - if(rc != LAPI_SUCCESS) { - LAPI_Msg_string(rc, err_msg_buf); - fprintf(stderr, "LAPI ERROR: %s, rc = %d\n", err_msg_buf, rc); - armci_die("LAPI_Waitcntr (RDMA Get) failed", 0); - } - -#if DEBUG_ - printf("%d: Completed LAPI_Xfer (RDMA Get): dst=%d\n", armci_me, p); -#endif - -} - -int armci_pin_contig_hndl(void *ptr, int bytes, ARMCI_MEMHDL_T *memhdl) -{ - - lapi_get_pvo_t util_pvo; /* For call to obtain PVO */ - int rc; - - /* translate and pin the buffer to the adapter */ - util_pvo.Util_type = LAPI_XLATE_ADDRESS; - util_pvo.length = bytes; - util_pvo.usr_pvo = 0; - util_pvo.address = ptr; - util_pvo.operation = LAPI_RDMA_ACQUIRE; - /*bzero(ptr, bytes);*/ /* CHECK: Is touching the entire shmem sgement feasible */ - if((rc=LAPI_Util(lapi_handle, (lapi_util_t *) &util_pvo)) != LAPI_SUCCESS) { - return 0; - } - - memhdl->pvo = util_pvo.usr_pvo; - memhdl->start = ptr; - -#if DEBUG_ - printf("\n%d:armci_pin_contig_hndl(): memhdl(pvo)=%ld ptr=%p bytes=%ld\n", - armci_me, (long)memhdl->pvo, ptr, bytes);fflush(stdout); -#endif - - return 1; -} - -void armci_network_client_deregister_memory(ARMCI_MEMHDL_T *mh) -{ - fprintf(stderr," armci_network_client_deregister_memory not available\n"); - fflush(stderr); -} - - -void armci_network_server_deregister_memory(ARMCI_MEMHDL_T *mh) -{ - fprintf(stderr," armci_network_client_deregister_memory not available\n"); - fflush(stderr); -} -#endif /* LAPI_RDMA */ - -#ifdef AIX - -#define LOCKED 1 -void armci_lapi_lock(int *lock) -{ - atomic_p word_addr = (atomic_p)lock; - int spin = 1; - - - while(1){ - - if(_check_lock(word_addr, 0, LOCKED) == FALSE ) - break; /* we got the lock */ - - if(spin){ - armci_waitsome(1); - spin = 0; - }else{ - - /* yield processor to another thread */ - /* cannot yield w/o affecting thread priority - better sleep */ - /* yield(); */ - - /* call usleep to notify scheduler */ - (void)usleep(5); - } - } -} - - -void armci_lapi_unlock(int *lock) -{ - atomic_p word_addr = (atomic_p)lock; - - if(_check_lock(word_addr, LOCKED, 0) == TRUE ) - armci_die("somebody else unlocked",0); -} -#endif - -#ifdef LAPI2 -#include "lapi2.c" -#endif - diff --git a/armci/src/devices/lapi/lapi2.c b/armci/src/devices/lapi/lapi2.c deleted file mode 100644 index b99c5e7b0..000000000 --- a/armci/src/devices/lapi/lapi2.c +++ /dev/null @@ -1,413 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Id: lapi2.c,v 1.18.2.3 2007-07-02 05:24:34 d3p687 Exp $ */ -#define DEBUG 0 -#define DSCR_SIZE 4096*8 /*given that bufsize=30000*8,conservative,indeed*/ - -#define LAPI_CLEAR_CNTR(ocmpl_) if((ocmpl_)->val) {\ -int _val_;\ - if(LAPI_Waitcntr(lapi_handle,&((ocmpl_)->cntr), ((ocmpl_)->val), &_val_))\ - armci_die("LAPI_Waitcntr failed",-1);\ - if(_val_ != 0) armci_die("CLEAR_COUNTER: nonzero in file ", _val_);\ - (ocmpl_)->val = 0; \ -} - - -/*\ create lapi vector descriptors from the buffer -\*/ -static void lapi2_create_vec_info(lapi_vec_t **srcv, lapi_vec_t **dstv, - int iovnum,int iovlen,char *bufptr) -{ -int dsize = iovnum*sizeof(void *); -int dlen = iovlen*sizeof(int); -int offset=0; - - *srcv = (lapi_vec_t *)(bufptr+offset); offset+=sizeof(lapi_vec_t); - *dstv = (lapi_vec_t *)(bufptr+offset); offset+=sizeof(lapi_vec_t); - - - (*srcv)->info= (void **)(bufptr+offset); offset+=dsize; - (*dstv)->info= (void **)(bufptr+offset); offset+=dsize; - if(dlen!=0){ - (*srcv)->len = (unsigned long *)(bufptr+offset); offset+=dlen; - (*dstv)->len = (unsigned long *)(bufptr+offset); offset+=dlen; - } - else { - (*srcv)->len = (*dstv)->len = NULL; - } -} - - -/*\ 2D strided get/put using lapi vector/strided transfer -\*/ -void armcill_op2D(int op,void *src_ptr,int src_stride,void *dst_ptr, - int dst_stride,int count, int bytes, int p, - lapi_cntr_t *ocntr,char *bufptr) -{ -lapi_vec_t *src, *dst; -int reqid,rc; -int offset=0; - - if(DEBUG){ - printf("\n%d:in put2d p=%d bytes=%d\n",armci_me,p,bytes);fflush(stdout); - } - - /*lapi2_create_vec_info(&src,&dst,3,0,bufptr);*/ - src = (lapi_vec_t *)(bufptr+offset); offset+=sizeof(lapi_vec_t); - dst = (lapi_vec_t *)(bufptr+offset); offset+=sizeof(lapi_vec_t); - src->info = (void **)(bufptr+offset); offset+=3*sizeof(void *); - dst->info = (void **)(bufptr+offset); offset+=3*sizeof(void *); - - - src->vec_type = dst->vec_type = LAPI_GEN_STRIDED_XFER; - src->num_vecs = (uint)count; dst->num_vecs= (uint)count; - - src->len = NULL; dst->len = NULL; - src->info[0] = src_ptr; dst->info[0] = dst_ptr; - src->info[1] = (void*)bytes; dst->info[1] = (void*)bytes; - src->info[2] = (void*)src_stride; dst->info[2] = (void*)dst_stride; - - if(op==GET) - rc = LAPI_Getv(lapi_handle, (uint)p, src, dst,NULL,ocntr); - else - rc = LAPI_Putv(lapi_handle,(uint)p,dst,src,NULL,ocntr,&cmpl_arr[p].cntr); - - if(rc) armci_die2("LAPI_op2D failed",rc,op); - - if(DEBUG)printf("\n%d: put completed \n",armci_me); -} - - -/*\ ND strided get/put packed and sent as vectors -\*/ -void armcill_opND(int op,void *src_ptr, int src_stride_arr[],void* dst_ptr, - int dst_stride_arr[],int count[], int stride_levels, - int proc, lapi_cmpl_t *ocmpl,char *bufptr) -{ -char *dst=(char*)dst_ptr; -char *src=(char*)src_ptr; -char *dst1; -char *src1; -int i,j,k,num_xmit=0,lastiovlength,iovlength,n=0,max_iovec,totalsize=0; -int total_of_2D=1; -int index[MAX_STRIDE_LEVEL], unit[MAX_STRIDE_LEVEL]; -int rc,vecind; -lapi_vec_t *srcv, *dstv; -lapi_cntr_t *ocntr=&(ocmpl->cntr); -int offset=0; - - if(DEBUG){ - printf("\n%d:in getND count[0] is %d and strarr[0] is%d maxiov=%d\n", - armci_me,count[0],dst_stride_arr[0],max_iovec); - fflush(stdout); - } - - index[2] = 0; unit[2] = 1; - if(stride_levels>1){ - total_of_2D = count[2]; - for(j=3; j<=stride_levels; j++) { - index[j] = 0; unit[j] = unit[j-1] * count[j-1]; - total_of_2D *= count[j]; - } - } - - max_iovec=(DSCR_SIZE-2*sizeof(lapi_vec_t))/(2*(sizeof(int)+sizeof(void*))); - - /*compute number of loops and the size of last iovector based of buf size*/ - num_xmit = total_of_2D*count[1]/max_iovec; - lastiovlength = (total_of_2D*count[1])%max_iovec; - if(num_xmit == 0) num_xmit = 1; - else if(lastiovlength!=0)num_xmit++; - - /*set the current iov length*/ - k=0;vecind=0; - if(lastiovlength!=0 && k==(num_xmit-1))iovlength=lastiovlength; - else iovlength=max_iovec; - - /*create the lapi_vec_t from the buffer*/ - /*lapi2_create_vec_info(&srcv,&dstv,iovlength,iovlength,bufptr);*/ - srcv = (lapi_vec_t *)(bufptr+offset); offset+=sizeof(lapi_vec_t); - dstv = (lapi_vec_t *)(bufptr+offset); offset+=sizeof(lapi_vec_t); - srcv->info= (void **)(bufptr+offset); offset+=iovlength*sizeof(void*); - dstv->info= (void **)(bufptr+offset); offset+=iovlength*sizeof(void*); - srcv->len = (unsigned long *)(bufptr+offset);offset+=iovlength*sizeof(unsigned long); - dstv->len = (unsigned long *)(bufptr+offset);offset+=iovlength*sizeof(unsigned long); - - - srcv->vec_type = dstv->vec_type = LAPI_GEN_IOVECTOR; - srcv->num_vecs = (uint)iovlength; dstv->num_vecs= (uint)iovlength; - - for(i=0; i= count[j]) index[j] = 0; - } - dst1=dst; - src1=src; - for(j=0;jval+=1; - if(op==GET){ - rc = LAPI_Getv(lapi_handle,(uint)proc,srcv,dstv,NULL,ocntr); - } - else { - UPDATE_FENCE_STATE(proc,PUT,1); - rc = LAPI_Putv(lapi_handle,(uint)proc,dstv,srcv,NULL,ocntr, - &cmpl_arr[proc].cntr); - } - if(rc) armci_die2("LAPI_opND failed",rc,op); - vecind = 0; totalsize=0; k++; - if(lastiovlength!=0 && k==(num_xmit-1))iovlength=lastiovlength; - else iovlength=max_iovec; - srcv->num_vecs = (uint)iovlength; dstv->num_vecs= (uint)iovlength; - } - - dstv->info[vecind] = dst1; - dstv->len[vecind] = count[0]; - srcv->info[vecind] = src1; - srcv->len[vecind] = count[0]; - totalsize+=count[0]; - dst1+=dst_stride_arr[0]; - src1+=src_stride_arr[0]; - } - if(vecind==iovlength){ - LAPI_CLEAR_CNTR((ocmpl)); - ocmpl->val+=1; - if(op==GET){ - rc = LAPI_Getv(lapi_handle,(uint)proc,srcv,dstv,NULL,ocntr); - } - else { - UPDATE_FENCE_STATE(proc,PUT,1); - rc = LAPI_Putv(lapi_handle,(uint)proc,dstv,srcv,NULL,ocntr, - &cmpl_arr[proc].cntr); - } - if(rc) armci_die2("LAPI_opND failed",rc,op); - vecind = 0; totalsize=0; k++; - if(lastiovlength!=0 && k==(num_xmit-1))iovlength=lastiovlength; - else iovlength=max_iovec; - srcv->num_vecs = (uint)iovlength; dstv->num_vecs= (uint)iovlength; - } - } - if(DEBUG)printf("\n%d: get completed \n",armci_me); -} - - - -void lapi_op_2d(int op, uint proc, void *src_ptr, void *dst_ptr,uint bytes, - int count, int src_stride, int dst_stride,lapi_cmpl_t* o_cmpl) -{ -int i,rc; - if(op==PUT)UPDATE_FENCE_STATE(proc, PUT, count); - o_cmpl->val+=count; - for(i=0;icntr),&cmpl_arr[proc].cntr); - else - rc=LAPI_Get(lapi_handle,proc,bytes,(src_ptr),(dst_ptr),NULL, - &(o_cmpl->cntr)); - if(rc)ARMCI_Error("LAPI_put failed",0); - src_ptr = (void*) ((unsigned long)src_ptr+src_stride); - dst_ptr = (void*) ((unsigned long)dst_ptr+dst_stride); - } -} - - -/*\This function is designed as follows. - * CONTIG code breaks ND into 1D chunks a does Lapi_Put on each chunk. - * STRIDED code uses strided option in the LAPI_PutV call - * VECTOR code packs multi-strided/vector data as vectors as transmits. - * ____________________________________ - * | type small/medium large | - * |------------------------------------ - * | 1D CONTIG CONTIG| - * | 2D STRIDED CONTIG| - * | >2D VECTOR CONTIG| - * |-----------------------------------| - * this code uses orig counter from nb_handle for non-blk call - * completion counter should always be same for non-blk and blk code to be - * able to do ordering/fence. -\*/ -void armci_network_strided(int op, void* scale, int proc,void *src_ptr, - int src_stride_arr[], void* dst_ptr, int dst_stride_arr[], - int count[], int stride_levels, armci_ihdl_t nb_handle) -{ -int rc=0; -lapi_cmpl_t *o_cmpl; -int total_of_2D,i,j; -char *src = (char*)src_ptr, *dst=(char*)dst_ptr; -char *bufptr; -int index[MAX_STRIDE_LEVEL], unit[MAX_STRIDE_LEVEL]; -int dsize=3*sizeof(void*); - /*pick a counter, default for blocking, from descriptor for non-blocking*/ - if(nb_handle){ - INIT_COUNTER((nb_handle->cmpl_info),0); - o_cmpl = &(nb_handle->cmpl_info); - } - else{ - if(op==GET) -#if 0 - o_cmpl = &get_cntr; - else - o_cmpl = &ack_cntr; -#else - /* multithreaded lapi uses array of counters (one per thread) */ - o_cmpl = get_cntr; /* same as &(get_cntr[0]) */ - else - o_cmpl = ack_cntr; /* same as &(ack_cntr[0]) */ -#endif - } - /*CONTIG protocol: used for 1D(contiguous) or if stride is very large in - a multi strided case*/ - if(stride_levels==0 || count[0]>LONG_PUT_THRESHOLD){ - /*set bufid in nb_handle, in this case, no buffer used, hence NB_NONE*/ - if(nb_handle) - armci_set_nbhandle_bufid(nb_handle,NULL,NB_NONE); - switch (stride_levels) { - case 0: /* 1D op */ - lapi_op_2d(op, (uint)proc, src_ptr, dst_ptr, count[0], 1, - 0,0,o_cmpl); - break; - case 1: /* 2D op */ - lapi_op_2d(op, (uint)proc, src_ptr,dst_ptr, (uint)count[0], count[1], - src_stride_arr[0], dst_stride_arr[0], o_cmpl); - break; - default: /* N-dimensional */ - { - index[2] = 0; unit[2] = 1; total_of_2D = count[2]; - for(j=3; j<=stride_levels; j++) { - index[j] = 0; unit[j] = unit[j-1] * count[j-1]; - total_of_2D *= count[j]; - } - for(i=0; i= count[j]) index[j] = 0; - } - lapi_op_2d(op, (uint)proc, src, dst,(uint)count[0], count[1], - src_stride_arr[0], dst_stride_arr[0],o_cmpl); - } - } - } - } - else{ /* greated than 1D small/med stride */ - - if(stride_levels==1){ /*small/med 2D, use lapi STRIDED */ - bufptr = GET_SEND_BUFFER(2*(sizeof(lapi_vec_t)+dsize),op,proc); - if(nb_handle){ - /*update info in the buf_info_t data-structure*/ - SET_BUF_TAG(bufptr,nb_handle->tag,0); - /*set the buffer id in nb_handle*/ - armci_set_nbhandle_bufid(nb_handle,bufptr,0); - } - if(op==PUT)UPDATE_FENCE_STATE(proc, PUT, 1); - - /*we use the counter in the buffer*/ - o_cmpl = (BUF_TO_EVBUF(bufptr)); - - armcill_op2D(op,src_ptr,src_stride_arr[0],dst_ptr,dst_stride_arr[0], - count[1],count[0],proc,&(o_cmpl->cntr),bufptr); - } - else { /*small/med >2D, use lapi VECTOR*/ - bufptr = GET_SEND_BUFFER(DSCR_SIZE,op,proc); - if(nb_handle){ - /*update info in the buf_info_t data-structure*/ - SET_BUF_TAG(bufptr,nb_handle->tag,0); - /*set the buffer id in nb_handle*/ - armci_set_nbhandle_bufid(nb_handle,bufptr,0); - } - /*we use the counter in the buffer*/ - o_cmpl = (BUF_TO_EVBUF(bufptr)); - - /*val set to 0 because of the way opND is writted, to be modified*/ - o_cmpl->val=0; - - armcill_opND(op,src_ptr,src_stride_arr,dst_ptr, dst_stride_arr,count, - stride_levels,proc,o_cmpl,bufptr); - } - - /* - for blocking cases, we can free cmpldescr buffer and wait for op - to complete. - */ - if(!nb_handle){ - /*for now, we manually clear the counter here for blocking calls. - for later, this has to be done in FREE_SEND_BUFFER.*/ - LAPI_CLEAR_CNTR(o_cmpl); - FREE_SEND_BUFFER(bufptr); - } - - } -} - - -void armci_send_strided_data_bypass(int proc, request_header_t *msginfo, - void *bufptr, int msg_buflen, - void *src_ptr, int *loc_stride_arr, - void *dst_ptr, int *rem_stride_arr, - int *pcount, int stride_levels) -{ -lapi_cntr_t c; -int count= pcount[1],bytes=pcount[0],rc; -int src_stride= loc_stride_arr[0]; -int dst_stride= rem_stride_arr[0]; -lapi_vec_t *src, *dst; -int offset=0; -int p=msginfo->from; - - - if(stride_levels!=1)armci_die("armci_send_strided_data_bypass wrong stride",stride_levels); - - LAPI_Setcntr(lapi_handle,&c,0); - - src = (lapi_vec_t *)((unsigned long)bufptr+offset); offset+=sizeof(lapi_vec_t); - dst = (lapi_vec_t *)((unsigned long)bufptr+offset); offset+=sizeof(lapi_vec_t); - src->info = (void **)((unsigned long)bufptr+offset); offset+=3*sizeof(void *); - dst->info = (void **)((unsigned long)bufptr+offset); offset+=3*sizeof(void *); - - - src->vec_type = dst->vec_type = LAPI_GEN_STRIDED_XFER; - src->num_vecs = (uint)count; dst->num_vecs= (uint)count; - - src->len = NULL; dst->len = NULL; - src->info[0] = src_ptr; dst->info[0] = dst_ptr; - src->info[1] = (void*)bytes; dst->info[1] = (void*)bytes; - src->info[2] = (void*)src_stride; dst->info[2] = (void*)dst_stride; - - rc = LAPI_Putv(lapi_handle,(uint)p,dst,src,msginfo->tag.cntr,&c,NULL); - if(rc)armci_die("armci_send_strided_data_bypass failed",rc); - - if(DEBUG_){ - printf("%dserv: did putv to %d cntr =%p\n",armci_me,p,msginfo->tag.cntr); fflush(stdout); - } - LAPI_Waitcntr(lapi_handle, &c,1,NULL); -} - - - -/*\ client receives strided data from server -\*/ -void armci_rcv_strided_data_bypass_both(int proc, request_header_t* msginfo, - void *ptr, int count[], int strides) -{ -lapi_cmpl_t *pcntr=BUF_TO_EVBUF(msginfo); - if(DEBUG_){ - printf("%d: expecting data from %d cntr =%p v=%d\n",armci_me,proc,&pcntr->cntr,pcntr->val); - fflush(stdout); - } - CLEAR_COUNTER((*pcntr)); - if(DEBUG_){ - printf("%d: got data from %d\n",armci_me,proc); fflush(stdout); - } -} - diff --git a/armci/src/devices/lapi/lapidefs.h b/armci/src/devices/lapi/lapidefs.h deleted file mode 100644 index 231fda07f..000000000 --- a/armci/src/devices/lapi/lapidefs.h +++ /dev/null @@ -1,139 +0,0 @@ -#ifndef LAPI_DEFS_H -#define LAPI_DEFS_H - -#include - -#ifdef LAPI_ERR_BAD_NINTH_PARM -#define LAPI2 -#endif -#define COMPLETE_HANDLE _armci_buf_complete_nb_request -#define TEST_HANDLE _armci_buf_test_nb_request - -#define NB_CMPL_T lapi_cmpl_t - -extern lapi_handle_t lapi_handle; -extern int lapi_max_uhdr_data_sz; /* max data payload in AM header */ - -typedef struct{ - lapi_cntr_t cntr; /* counter to trace completion of stores */ - int val; /* number of pending LAPI store ops */ - int oper; /* code for last ARMCI store operation */ -}lapi_cmpl_t; - - -typedef struct{ /* generalized pointer to buffer */ - void *cntr; - void *buf; -}gp_buf_t; - -typedef struct{ - void *buf; - lapi_cntr_t *cntr; -}msg_tag_t; - -#ifdef LAPI_RDMA -typedef struct region_memhdl{ - void *start; - lapi_user_pvo_t pvo; -} region_memhdl_t; - -#define HAS_RDMA_GET -#define REGIONS_REQUIRE_MEMHDL -#define ARMCI_MEMHDL_T region_memhdl_t - -extern void armci_client_direct_send(int p, void *src_buf, void *dst_buf, - int len, void** contextptr, int nbtag, - ARMCI_MEMHDL_T *lochdl, - ARMCI_MEMHDL_T *remhdl); /* LAPI RDMA */ -#endif /* LAPI_RDMA */ -extern lapi_cmpl_t *cmpl_arr; /* completion state array, dim=NPROC */ -extern lapi_cmpl_t *ack_cntr; /* ACK counter used in handshaking protocols - between origin and target */ -extern lapi_cmpl_t buf_cntr; /* AM data buffer counter */ -extern lapi_cmpl_t *get_cntr; /* lapi_get counter */ -extern lapi_cmpl_t hdr_cntr; /* AM header buffer counter */ -extern int intr_status; - -extern void armci_init_lapi(void); /* initialize LAPI data structures*/ -extern void armci_term_lapi(void); /* destroy LAPI data structures */ -extern void armci_lapi_send(msg_tag_t, void*, int, int); /* LAPI send */ - - -#define BUF_EXTRA_FIELD_T lapi_cmpl_t -#define EXTRA_MSG_BUFLEN_DBL (sizeof(lapi_cmpl_t)>>3) -#define MAX_CHUNKS_SHORT_GET 9 -#define SHORT_ACC_THRESHOLD (6 * lapi_max_uhdr_data_sz) -#define SHORT_PUT_THRESHOLD (6 * lapi_max_uhdr_data_sz) - -#define LONG_PUT_THRESHOLD 128 -#define LONG_GET_THRESHOLD 4000 -#define LONG_GET_THRESHOLD_STRIDED LONG_GET_THRESHOLD -#define PACKPUT 8 - -#define MSG_BUFLEN_DBL 30000 - -#define INTR_ON if(intr_status==1)LAPI_Senv(lapi_handle, INTERRUPT_SET, 1) -#define INTR_OFF {\ - LAPI_Qenv(lapi_handle, INTERRUPT_SET, &intr_status);\ - LAPI_Senv(lapi_handle, INTERRUPT_SET, 0);\ -} - - -/**** macros to control LAPI modes and ordering of operations ****/ -#define TEST_COUNTER(counter,_ret_) if((counter).val){\ - int _val__;\ - if(LAPI_Getcntr(lapi_handle,&(counter).cntr,&_val__))\ - armci_die("LAPI_Getcntr failed",-1);\ - if(_val__ != (counter).val) *(_ret_)=1;\ -}else *(_ret_)=1; - -#define WAIT_COUNTER(counter) if((counter).val)\ - for(;;){\ - int _val__;\ - if(LAPI_Getcntr(lapi_handle,&(counter).cntr,&_val__))\ - armci_die("LAPI_Getcntr failed",-1);\ - if(_val__ == (counter).val) break;\ - LAPI_Probe(lapi_handle);\ -} - -#define CLEAR_COUNTER(counter) if((counter).val) {\ -int _val_;\ - if(LAPI_Waitcntr(lapi_handle,&(counter).cntr, (counter).val, &_val_))\ - armci_die("LAPI_Waitcntr failed",-1);\ - if(_val_ != 0) armci_die("CLEAR_COUNTER: nonzero in file ", _val_);\ - (counter).val = 0; \ -} - - -#define INIT_COUNTER(counter,_val) {\ - int _rc = LAPI_Setcntr(lapi_handle, &(counter).cntr, 0);\ - if(_rc)armci_die("INIT_COUNTER:setcntr failed ", _rc);\ - (counter).val = (_val);\ -} - - -#define SET_COUNTER(counter, value) (counter).val += (value) - -#define INIT_SEND_BUF(_cntr,_snd,_rcv) INIT_COUNTER(_cntr,1) -#define CLEAR_SEND_BUF_FIELD(_cntr, _s, _r,_t,_o) CLEAR_COUNTER(_cntr) -#define TEST_SEND_BUF_FIELD(_cntr, _s, _r,_t,_o,_ret) TEST_COUNTER(_cntr,(_ret)) -#define FIRST_INIT_SEND_BUF INIT_COUNTER -#define SET_BUF_TAG _armci_buf_set_tag -#define INIT_SENDBUF_INFO(_hdl,_buf,_op,_proc) \ - _armci_buf_set_tag(_buf,(_hdl)->tag,0) - -#define FENCE_NODE(p) CLEAR_COUNTER(cmpl_arr[(p)]) - -#define UPDATE_FENCE_STATE(p, opcode, nissued)\ -{/* if((opcode)==0)armci_die("op code 0 - buffer overwritten?",(p));*/\ - cmpl_arr[(p)].val += (nissued);\ - cmpl_arr[(p)].oper = (opcode);\ -} - -#define PENDING_OPER(p) cmpl_arr[(p)].oper - - -#define WAIT_FOR_GETS CLEAR_COUNTER(get_cntr[ARMCI_THREAD_IDX]) -#define WAIT_FOR_PUTS CLEAR_COUNTER(ack_cntr[ARMCI_THREAD_IDX]) - -#endif diff --git a/armci/src/devices/mpi-spawn/mpi2_client.c b/armci/src/devices/mpi-spawn/mpi2_client.c index 15f6a75bb..3ed6384bf 100644 --- a/armci/src/devices/mpi-spawn/mpi2_client.c +++ b/armci/src/devices/mpi-spawn/mpi2_client.c @@ -431,9 +431,6 @@ static void armci_mpi2_spawn() size_arr = (int*) malloc(armci_nserver * sizeof(int)); info_arr = (MPI_Info*) malloc(armci_nserver * sizeof(MPI_Info)); hostname_arr = (char**) malloc(armci_nserver * sizeof(char*)); -#ifdef SPAWN_CRAY_XT - nid_arr = (char**) malloc(armci_nserver * sizeof(char*));; -#endif for(i=0; i -#include -#include -#include -#include -#endif - #ifdef OPENIB #include #endif @@ -51,12 +42,6 @@ typedef struct { } msg_tag_t; typedef struct { -#ifdef MELLANOX - VAPI_sr_desc_t sdscr; - VAPI_sg_lst_entry_t ssg_entry; - VAPI_rr_desc_t rdscr; - VAPI_sg_lst_entry_t rsg_entry; -#endif #ifdef OPENIB struct ibv_send_wr sdscr; struct ibv_sge ssg_entry; @@ -66,11 +51,6 @@ typedef struct { } armci_vapi_field_t; typedef struct { -#ifdef MELLANOX - VAPI_lkey_t lkey; - VAPI_rkey_t rkey; - VAPI_mr_hndl_t memhndl; -#endif #ifdef OPENIB uint32_t rkey; uint32_t lkey; @@ -83,11 +63,6 @@ extern char * armci_vapi_client_mem_alloc(int); typedef struct { int tag; int issg; -#ifdef MELLANOX - VAPI_sr_desc_t sdescr; - VAPI_rr_desc_t rdescr; - VAPI_sg_lst_entry_t sg_entry[56]; /*ff:this has to be malloced*/ -#endif #ifdef OPENIB struct ibv_send_wr sdescr; struct ibv_recv_wr rdescr; @@ -101,10 +76,6 @@ typedef struct { typedef struct { int tag; int issg; -#ifdef MELLANOX - VAPI_rr_desc_t descr; - VAPI_sg_lst_entry_t sg_entry[56]; /*ff:this has to be malloced*/ -#endif #ifdef OPENIB struct ibv_recv_wr descr; struct ibv_send_wr sg_entry[56]; /*ff:this has to be malloced*/ @@ -189,9 +160,7 @@ void armci_vapi_set_mark_buf_send_complete(int); #define BUF_EXTRA_FIELD_T armci_vapi_field_t #define GET_SEND_BUFFER _armci_buf_get #define FREE_SEND_BUFFER _armci_buf_release -#ifdef MELLANOX -#define INIT_SEND_BUF(_field,_snd,_rcv) _snd=1;_rcv=1;memset(&((_field).sdscr),0,sizeof(VAPI_sr_desc_t));(_field).sdscr.id=avail+1;armci_vapi_set_mark_buf_send_complete(avail+1) -#endif + #ifdef OPENIB #define INIT_SEND_BUF(_field,_snd,_rcv) _snd=1;_rcv=1;memset(&((_field).sdscr),0,sizeof(struct ibv_send_wr));(_field).sdscr.wr_id=avail+1;armci_vapi_set_mark_buf_send_complete(avail+1) #endif @@ -227,10 +196,7 @@ void armci_vapi_set_mark_buf_send_complete(int); #endif #define ARMCI_POST_SCATTER 1000000001 #define ARMCI_VAPI_CLEAR 0 -#ifdef MELLANOX -#define VAPI_SGGET_MIN_COLUMN 720 -#define VAPI_SGPUT_MIN_COLUMN 720 -#endif + #ifdef OPENIB /* #define VAPI_SGGET_MIN_COLUMN 2147483648 */ #define VAPI_SGPUT_MIN_COLUMN 2147483648 diff --git a/armci/src/devices/openib/cbuf.h b/armci/src/devices/openib/cbuf.h index e2330b348..5f6a8ddf6 100644 --- a/armci/src/devices/openib/cbuf.h +++ b/armci/src/devices/openib/cbuf.h @@ -34,11 +34,7 @@ #include #include -#ifdef _IA64_ -#define CBUF_FLAG_TYPE uint64_t -#else #define CBUF_FLAG_TYPE uint32_t -#endif #if (defined(RDMA_FAST_PATH) || defined(ADAPTIVE_RDMA_FAST_PATH)) diff --git a/armci/src/devices/portals/armci_portals.h b/armci/src/devices/portals/armci_portals.h deleted file mode 100644 index 7571838bd..000000000 --- a/armci/src/devices/portals/armci_portals.h +++ /dev/null @@ -1,112 +0,0 @@ - -#ifndef PORTALS_H -#define PORTALS_H - -/* portals header file */ - -#ifdef CRAY_XT - -#include -#include - -#else - -#include -#include P3_NAL -#include -#include - -#endif - -/*this should match regions.c*/ -#define PORTALS_MEM_REGIONS 24 - -#define MAX_OUT_ORG 16 -#define MAX_OUT 1 -#define HAS_RDMA_GET - -typedef enum op { - ARMCI_PORTALS_PUT, - ARMCI_PORTALS_NBPUT, - ARMCI_PORTALS_GET, - ARMCI_PORTALS_NBGET, - ARMCI_PORTALS_ACC -} armci_portals_optype; - -/* array of memory segments and corresponding memory descriptors */ -typedef struct md_table{ - ptl_md_t md; /* make this a ptr instead of struct */ - void * start; - void * end; - int id; - int bytes; - ptl_match_bits_t mb; -} md_table_entry_t; - - -typedef struct desc{ - int active; - int tag; - int dest_id; - armci_portals_optype type; - ptl_md_t mem_dsc; - ptl_md_t mem_dsc_save; - ptl_handle_md_t mem_dsc_hndl; -}comp_desc; - -typedef struct region_memhdl{ - ptl_match_bits_t match_bits; - int regid; - int islocal; - ptl_size_t offset; - comp_desc cdesc; -} region_memhdl_t; - -#define NB_CMPL_T comp_desc* -#define REGIONS_REQUIRE_MEMHDL -#define ARMCI_MEMHDL_T region_memhdl_t - -#define ARMCI_NB_WAIT(_cntr) if(_cntr){\ - int rc;\ - if(nb_handle->tag==_cntr->tag)\ - rc = armci_client_complete(NULL,nb_handle->proc,nb_handle->tag,_cntr);\ -} - - - -/* structure of computing process */ -typedef struct { - int armci_rank; /* if different from portals_rank */ - int rank; /* my rank*/ - int size; /* size of the group */ - ptl_handle_me_t me_h[64]; - - ptl_handle_eq_t eq_h; - ptl_handle_ni_t ni_h; - ptl_pt_index_t ptl; - int outstanding_puts; - int outstanding_gets; - int outstanding_accs; - void * buffers; /* ptr to head of buffer */ - int num_match_entries; - cnos_nidpid_map_t *ptl_pe_procid_map; - ptl_process_id_t ptl_my_procid; -} armci_portals_proc_t; - - -extern void print_mem_desc_table(void); -extern int armci_init_portals(void); -extern void armci_fini_portals(void); -extern int armci_post_descriptor(ptl_md_t *md); -extern int armci_prepost_descriptor(void* start, long bytes); -extern ptl_size_t armci_get_offset(ptl_md_t md, void *ptr,int proc); -extern int armci_get_md(void * start, int bytes , ptl_md_t * md, ptl_match_bits_t * mb); -extern int armci_portals_put(ptl_handle_md_t md_h,ptl_process_id_t dest_id,int bytes,int mb,int local_offset, int remote_offset,int ack ); -extern int armci_portals_get(ptl_handle_md_t md_h,ptl_process_id_t dest_id,int bytes,int mb,int local_offset, int remote_offset); -extern comp_desc * get_free_comp_desc(int,int *); -extern int armci_client_direct_send(int proc,void *src, void* dst, int bytes, void **cmpl_info, int tag, ARMCI_MEMHDL_T *lochdl, ARMCI_MEMHDL_T *remhdl); -extern int armci_portals_direct_get(void *src, void *dst, int bytes, int proc, int nbtag, NB_CMPL_T *cmpl_info); -extern void comp_desc_init(); -extern int armci_client_complete(ptl_event_kind_t *evt,int proc_id, int nb_tag ,comp_desc * cdesc); - -#endif /* PORTALS_H */ diff --git a/armci/src/devices/portals/portals.c b/armci/src/devices/portals/portals.c deleted file mode 100644 index ea58002ed..000000000 --- a/armci/src/devices/portals/portals.c +++ /dev/null @@ -1,1053 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* preliminary implementation on top of portals */ - /*there are 3 kinds of ARMCI memory: PARMCI_Malloc, PARMCI_Malloc_local, user - * allocated memory. For PARMCI_Malloc, we use region specific md that - * comes from completion descriptor. - * For PARMCI_Malloc_local, we use the MD from the lochdl - * For user allocated memory, we use another list of MD's - * which binds the user memory. We never keep track of non-armci allocated - * memory. - */ -#if HAVE_STDIO_H -# include -#endif -#if HAVE_STDLIB_H -# include -#endif -#if HAVE_STRING_H -# include -#endif -#if HAVE_FLOAT_H -# include -#endif -#include "armcip.h" -#include "message.h" -#if HAVE_STDINT_H -# include -#endif - -#define DEBUG_COMM 0 -#define DEBUG_INIT 0 -#ifndef PMI_SUCCESS -#define PMI_SUCCESS 0 -#endif - - -#ifdef CRAY_XT -#include "locks.h" -typedef struct { - int off; - int desc; -} cnos_mutex_t; - -static cnos_mutex_t *_mutex_array; -#endif - - -/*global variables and data structures */ -armci_portals_proc_t _armci_portals_proc_struct; -armci_portals_proc_t *portals = &_armci_portals_proc_struct; -comp_desc *_region_compdesc_array[PORTALS_MEM_REGIONS+1]; -int ptl_initialized = 0; -int free_desc_index[PORTALS_MEM_REGIONS+1]; -FILE *utcp_lib_out; -FILE* utcp_api_out; -ptl_ni_limits_t armci_ptl_nilimits; - -int armci_init_portals(void) -{ -int num_interface; -int rc; -int npes,i; -comp_desc *armci_comp_desc; -ptl_process_id_t match_id; - ARMCI_PR_DBG("enter",0); - - if (PtlInit(&num_interface) != PTL_OK) { - fprintf(stderr, "PtlInit() failed\n"); - exit(1); - } - portals->ptl = 37; /* our own ptl number */ - for(i=0;i<=PORTALS_MEM_REGIONS;i++){ - free_desc_index[i]=0; - } - - rc=PtlNIInit(IFACE_FROM_BRIDGE_AND_NALID(PTL_BRIDGE_UK,PTL_IFACE_SS), PTL_PID_ANY, NULL, &armci_ptl_nilimits, &(portals->ni_h)); - switch(rc) { - case PTL_OK: - case PTL_IFACE_DUP: - break; - default: - printf( "PtlNIInit() failed %d error=%s\n",rc,ptl_err_str[rc]); - armci_die("NIInit Failed",0); - } - if(DEBUG_INIT || DEBUG_COMM) - PtlNIDebug(portals->ni_h,PTL_DEBUG_ALL); - - PtlGetId(portals->ni_h,&portals->ptl_my_procid); - if(DEBUG_INIT){ - printf("%d:the rank is %d, size is %d\n",armci_me, - portals->ptl_my_procid,portals->size); - } - - if((rc=PMI_CNOS_Get_nidpid_map(&portals->ptl_pe_procid_map))!=PMI_SUCCESS){ - printf("Getting proc map failed (npes=%d)\n", armci_nproc); - } - - /* Allocate one shared event queue for all operations - * TODO tune size. - */ - rc = PtlEQAlloc(portals->ni_h,1024,NULL, &(portals->eq_h)); - if (rc != PTL_OK) { - printf("%d:PtlEQAlloc() failed: %s (%d)\n", - portals->ptl_my_procid, ptl_err_str[rc], rc); - armci_die("EQ Alloc failed",rc); - } - - ptl_initialized = 1; - portals->num_match_entries = 0; - -#ifndef CRAY_XT - utcp_lib_out = stdout; - utcp_api_out = stdout; -#endif - - /*now prepare for use of local memory*/ - armci_comp_desc = (comp_desc *)malloc(sizeof(comp_desc)*MAX_OUT); - for(i=0; i< MAX_OUT;i++){ - ptl_md_t *md_ptr; - ptl_handle_md_t *md_h; - armci_comp_desc[i].active=0; - md_ptr = &armci_comp_desc[i].mem_dsc; - md_h = &armci_comp_desc[i].mem_dsc_hndl; - md_ptr->eq_handle = portals->eq_h; - md_ptr->max_size =0; - md_ptr->threshold = 2;/*PTL_MD_THRESH_INF;*/ - md_ptr->options = PTL_MD_OP_GET | PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE; - } - _region_compdesc_array[PORTALS_MEM_REGIONS]=armci_comp_desc; - ARMCI_PR_DBG("exit",0); - return 0; -} - - - -void armci_fini_portals() -{ - ARMCI_PR_DBG("enter",0); - if(DEBUG_INIT){ - printf("ENTERING ARMCI_FINI_PORTALS\n");fflush(stdout); - } - PtlNIFini(portals->ni_h); - /*PtlFini();*/ - if(DEBUG_INIT){ - printf("LEAVING ARMCI_FINI_PORTALS\n");fflush(stdout); - } - ARMCI_PR_DBG("exit",0); -} - - - -void armci_serv_register_req(void *start,long bytes, ARMCI_MEMHDL_T *reg_mem) -{ -int rc; -void * context; -ptl_md_t *md_ptr; -ptl_match_bits_t *mb; -ptl_process_id_t match_id; -ptl_handle_md_t *md_h; -ptl_match_bits_t ignbits = 0xFFFFFFFFFFFFFFF0; - - ARMCI_PR_DBG("enter",reg_mem->regid); - if(DEBUG_COMM){ - printf("%d:armci_serv_register_req:size of mem_hndl is %d\n", - armci_me,sizeof(region_memhdl_t)); - printf("\n%d:armci_serv_register_req start=%p bytes=%d\n", - armci_me,start,bytes);fflush(stdout); - } - - md_ptr = ®_mem->cdesc.mem_dsc; - mb = ®_mem->match_bits; - md_h = ®_mem->cdesc.mem_dsc_hndl; - context = NULL; - - md_ptr->start = start; - md_ptr->length = bytes; - md_ptr->threshold = PTL_MD_THRESH_INF; - md_ptr->options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_MANAGE_REMOTE; - md_ptr->user_ptr = context; - /*eq_hdl is null for the attaches done for a remote proc*/ - /*md_ptr->eq_handle = portals->eq_h;*/ - md_ptr->eq_handle = PTL_EQ_NONE; - md_ptr->max_size =0; - *mb = 0x0000000000000000; - *mb = (*mb+reg_mem->regid); - - match_id.nid = PTL_NID_ANY; - match_id.pid = PTL_PID_ANY; - - rc = PtlMEAttach(portals->ni_h,portals->ptl,match_id,*mb,ignbits, - PTL_RETAIN,PTL_INS_AFTER, - &(portals->me_h[portals->num_match_entries])); - - if (rc != PTL_OK) { - printf("%d:PtlMEAttach: %s\n", portals->ptl_my_procid, ptl_err_str[rc]); - armci_die("portals attach error2",rc); - } - - rc = PtlMDAttach(portals->me_h[portals->num_match_entries],*md_ptr,PTL_RETAIN,md_h); - - if (rc != PTL_OK) { - printf("%d:PtlMDAttach: %s\n", portals->ptl_my_procid, ptl_err_str[rc]); - armci_die("portals attach error1",rc); - } - - portals->num_match_entries++; - ARMCI_PR_DBG("exit",reg_mem->regid); - -} - -int armci_pin_contig_hndl(void *start,int bytes, ARMCI_MEMHDL_T *reg_mem) -{ -int rc,i; -void * context; -ptl_md_t *md_ptr; -ptl_process_id_t match_id; -ptl_handle_md_t *md_h; -comp_desc *armci_comp_desc; - - ARMCI_PR_DBG("enter",reg_mem->regid); - /*first create comp_desc arr for this region if it is not local*/ - if(!reg_mem->islocal){ - armci_comp_desc = (comp_desc *)malloc(sizeof(comp_desc)*MAX_OUT); - for(i=0; i< MAX_OUT;i++){ - armci_comp_desc[i].active=0; - md_ptr = &armci_comp_desc[i].mem_dsc; - md_h = &armci_comp_desc[i].mem_dsc_hndl; - context = NULL; - md_ptr->start = start; - md_ptr->length = bytes; - md_ptr->threshold = 2;/*PTL_MD_THRESH_INF;*/ - md_ptr->options = PTL_MD_OP_GET | PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE; - /*md_ptr->options = PTL_MD_EVENT_START_DISABLE;*/ - - md_ptr->user_ptr = context; - md_ptr->eq_handle = portals->eq_h; - md_ptr->max_size =0; -#ifdef DO_MD_UPDATE - rc = PtlMDBind(portals->ni_h,*md_ptr, PTL_RETAIN, md_h); - if (rc != PTL_OK){ - printf("%d:PtlMDBind: %s\n", portals->ptl_my_procid, ptl_err_str[rc]); - armci_die("ptlmdbind failed",0); - } -#endif - } - _region_compdesc_array[reg_mem->regid]=armci_comp_desc; - ARMCI_PR_DBG("exit",0); - return 1; - } - else { - md_ptr = ®_mem->cdesc.mem_dsc; - md_h = ®_mem->cdesc.mem_dsc_hndl; - context = NULL; - md_ptr->start = start; - md_ptr->length = bytes; - md_ptr->threshold = 2;/*PTL_MD_THRESH_INF;*/ - md_ptr->options = PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE; - - md_ptr->user_ptr = context; - md_ptr->eq_handle = portals->eq_h; - md_ptr->max_size =0; -#if DO_MD_UPDATE - rc = PtlMDBind(portals->ni_h,*md_ptr, PTL_RETAIN, md_h); - if (rc != PTL_OK){ - printf("%d:PtlMDBind: %s\n", portals->ptl_my_procid, ptl_err_str[rc]); - armci_die("ptlmdbind failed",0); - } -#endif - ARMCI_PR_DBG("exit",1); - return 1; - } -} - - -int armci_client_complete(ptl_event_kind_t *evt,int proc_id, int nb_tag, - comp_desc *cdesc) -{ -int rc; -ptl_event_t ev_t; -ptl_event_t *ev=&ev_t; -comp_desc *temp_comp = NULL; -int loop=1; -int temp_proc; - ARMCI_PR_DBG("enter",0); - if(DEBUG_COMM){ - printf("\n%d:enter:client_complete active=%d tag=%d %d\n",armci_me, - cdesc->active,cdesc->tag,nb_tag);fflush(stdout); - } - while(cdesc->active!=0){ - ev->type=0; - if((rc = PtlEQWait(portals->eq_h, ev)) != PTL_OK){ - printf("%d:PtlEQWait(): %d %s\n", portals->ptl_my_procid,rc, - ptl_err_str[rc]); - armci_die("EQWait problem",rc); - } - if (ev->ni_fail_type != PTL_NI_OK) { - printf("%d:NI sent %d in event.\n", - portals->ptl_my_procid, ev->ni_fail_type); - armci_die("event failure problem",0); - } - if(DEBUG_COMM){ - printf("\n%d:armci_client_complete:done waiting type=%d\n",armci_me, - ev->type); - fflush(stdout); - } - if (ev->type == PTL_EVENT_SEND_END){ - if(DEBUG_COMM){ - printf("\n%d:armci_client_complete:event send end\n",armci_me); - fflush(stdout); - } - temp_comp = (comp_desc *)ev->md.user_ptr; -#ifdef PUT_LOCAL_ONLY_COMPLETION - if(temp_comp->type==ARMCI_PORTALS_PUT || temp_comp->type=ARMCI_PORTALS_NBPUT){ - temp_comp->active=0; - temp_comp->tag=-1; - } - else -#else - temp_comp->active++; -#endif - continue; - } - - if (ev->type == PTL_EVENT_REPLY_END){ - if(DEBUG_COMM){ - printf("\n%d:client_send_complete:reply end\n",armci_me); - fflush(stdout); - } - temp_comp = (comp_desc *)ev->md.user_ptr; - temp_comp->active = 0; /*this was a get request, so we are done*/ - temp_comp->tag=-1; - continue; - } - if (ev->type == PTL_EVENT_ACK){ - if(DEBUG_COMM){ - printf("\n%d:client_send_complete:event ack\n",armci_me); - fflush(stdout); - } - temp_comp = (comp_desc *)ev->md.user_ptr; - temp_comp->active=0; - temp_comp->tag=-1; - armci_update_fence_array(temp_comp->dest_id,0); - portals->outstanding_puts--; - } - } - if(DEBUG_COMM){ - printf("\n%d:exit:client_complete active=%d tag=%d %d\n",armci_me, - cdesc->active,cdesc->tag,nb_tag);fflush(stdout); - } - ARMCI_PR_DBG("exit",0); - return rc; -} - - -comp_desc * get_free_comp_desc(int region_id, int * comp_id) -{ -comp_desc * c; -int rc = PTL_OK; - ARMCI_PR_DBG("enter",region_id); - c = &(_region_compdesc_array[region_id][free_desc_index[region_id]]); - if(c->active!=0 && c->tag>0)armci_client_complete(NULL,c->dest_id,c->tag,c); -#ifdef PUT_LOCAL_ONLY_COMPLETION - if(region_idstart, md->length); - fflush(stdout); - -} - -void armci_client_direct_get(int proc, void *src_buf, void *dst_buf, int bytes, - void** cptr,int tag,ARMCI_MEMHDL_T *lochdl, - ARMCI_MEMHDL_T *remhdl) -{ -int clus = armci_clus_id(proc); -int rc, i; -ptl_size_t offset_local = 0, offset_remote=0; -ptl_match_bits_t mb = remhdl->regid; -ptl_md_t *md_remote,md, *md_local; -ptl_md_t * md_ptr; -ptl_handle_md_t *md_hdl_local; -comp_desc *cdesc; -ptl_process_id_t dest_proc; -int c_info; -int lproc,rproc,user_memory=0; - - ARMCI_PR_DBG("enter",remhdl->regid); - - if(DEBUG_COMM){ - printf("%d:armci_client_direct_get:src_buf %p dstbuf %p loc_hd is %p\n" - "rem_hndl is %p, BYTES = %d\n",armci_me,src_buf,dst_buf, - lochdl,remhdl,bytes); - fflush(stdout); - } - - /*first process information*/ - dest_proc.nid = portals->ptl_pe_procid_map[proc].nid; - dest_proc.pid = portals->ptl_pe_procid_map[proc].pid; - md_remote =&remhdl->cdesc.mem_dsc; - - /*updating md to send*/ - if(lochdl == NULL){ /*this is user memory (stack/heap/whatever) */ - user_memory=1; - cdesc = get_free_comp_desc(PORTALS_MEM_REGIONS,&c_info); - md_local = &cdesc->mem_dsc; - md_hdl_local = &cdesc->mem_dsc_hndl; - md_local->length=bytes; - md_local->start=dst_buf; - } - else { - if(lochdl->islocal){ /*PARMCI_Malloc_local memory*/ - user_memory=1; -#if 1 - cdesc = get_free_comp_desc(PORTALS_MEM_REGIONS,&c_info); - md_local = &cdesc->mem_dsc; - md_hdl_local = &cdesc->mem_dsc_hndl; - md_local->length=bytes; - md_local->start=dst_buf; -#else - cdesc=&lochdl->cdesc; - md_local = &lochdl->cdesc.mem_dsc; - md_hdl_local = &lochdl->cdesc.mem_dsc_hndl; -#endif - } - else{ - /*we need to pass region id to get corresponding md*/ - cdesc = get_free_comp_desc(lochdl->regid,&c_info); - md_local = &cdesc->mem_dsc; - md_hdl_local = &cdesc->mem_dsc_hndl; - } - } - - /*compute the local and remote offsets*/ - offset_local = (char*)dst_buf - (char *)md_local->start; - offset_remote = (char*)src_buf - (char *)md_remote->start; - if(DEBUG_COMM){ - printf("\n%d:offr=%d offl=%d %p %p\n",armci_me,offset_remote,offset_local,md_local->start,md_remote->start); - } - /*printf("\n%d:get offr=%d ptrr=%p offl=%d ptrl=%p\n",armci_me,offset_remote,md_remote->start,offset_local,md_local->start);fflush(stdout);*/ - - if(tag) *((comp_desc **)cptr) = cdesc; - /*if(tag){printf("\n%d:get tag=%d c_info=%d %p",armci_me,tag,c_info,cdesc);fflush(stdout);}*/ - if (tag){ - cdesc->tag = tag; - cdesc->dest_id = proc; - cdesc->type = ARMCI_PORTALS_NBGET; - } - else{ - cdesc->tag = 0; - cdesc->dest_id = proc; - cdesc->type = ARMCI_PORTALS_GET; - } - cdesc->active = 1; - md_local->user_ptr = (void *)cdesc; - md_local->options = PTL_MD_OP_GET | PTL_MD_EVENT_START_DISABLE; -#if DO_MD_UPDATE - if(user_memory==0){ - do{ - rc = PtlMDUpdate(*md_hdl_local,NULL,md_local,portals->eq_h); - printf("\n%d:trying to update\n",armci_me);fflush(stdout); - } while (rc == PTL_MD_NO_UPDATE); - if (rc != PTL_OK){ - printf("%d:PtlMDUpdate: %s\n", portals->rank, ptl_err_str[rc]); - armci_die("ptlmdbind failed",0); - } - } - else{ -#endif - rc = PtlMDBind(portals->ni_h,*md_local, PTL_UNLINK, md_hdl_local); - if (rc != PTL_OK){ - fprintf(stderr, "%d:PtlMDBind: %s\n", portals->rank, ptl_err_str[rc]); - armci_die("ptlmdbind failed",0); - } -#if DO_MD_UPDATE - } -#endif - - rc = PtlGetRegion(*md_hdl_local,offset_local,bytes,dest_proc, - portals->ptl, - 0, - mb, - offset_remote); - if (rc != PTL_OK){ - printf("%d:PtlGetRegion: %s\n", portals->rank,ptl_err_str[rc]); - armci_die("PtlGetRegion failed",0); - } - - if(DEBUG_COMM){ - printf("\n%d:issued get to %d %d\n",armci_me,proc,c_info);fflush(stdout); - } - - if(!tag){ - armci_client_complete(NULL,proc,0,cdesc); /* check this later */ - } - /*printf("\n%d:issued get to %d %d\n",armci_me,proc,c_info);fflush(stdout);*/ - ARMCI_PR_DBG("exit",remhdl->regid); -} - - -void armci_client_nb_get(int proc, void *src_buf, int *src_stride_arr, - void *dst_buf, int *dst_stride_arr, int bytes, - void** cptr,int tag,ARMCI_MEMHDL_T *lochdl, - ARMCI_MEMHDL_T *remhdl) -{ -} - - -int armci_client_direct_send(int proc,void *src, void* dst, int bytes, - void **cptr, int tag, ARMCI_MEMHDL_T *lochdl, - ARMCI_MEMHDL_T *remhdl ) -{ -int clus = armci_clus_id(proc); -int rc, i; -ptl_size_t offset_local = 0, offset_remote = 0; -ptl_match_bits_t mb = remhdl->regid; -ptl_md_t *md_remote,md, *md_local; -ptl_md_t * md_ptr; -ptl_match_bits_t * mb_ptr; -ptl_handle_md_t *md_hdl_local; -comp_desc *cdesc; -ptl_process_id_t dest_proc; -int c_info; -int lproc,rproc,user_memory=0; - - ARMCI_PR_DBG("enter",remhdl->regid); - dest_proc.nid = portals->ptl_pe_procid_map[proc].nid; - dest_proc.pid = portals->ptl_pe_procid_map[proc].pid; - md_remote =&remhdl->cdesc.mem_dsc; - - if(lochdl == NULL){ /*this is user memory*/ - user_memory=1; - cdesc = get_free_comp_desc(PORTALS_MEM_REGIONS,&c_info); - md_local = &cdesc->mem_dsc; - md_hdl_local = &cdesc->mem_dsc_hndl; - md_local->length=bytes; - md_local->start=src; - } - else { - if(lochdl->islocal){ /*PARMCI_Malloc_local memory*/ - user_memory=1; -#if 1 - cdesc = get_free_comp_desc(PORTALS_MEM_REGIONS,&c_info); - md_local = &cdesc->mem_dsc; - md_hdl_local = &cdesc->mem_dsc_hndl; - md_local->length=bytes; - md_local->start=src; -#else - cdesc=&lochdl->cdesc; - md_local = &lochdl->cdesc.mem_dsc; - md_hdl_local = &lochdl->cdesc.mem_dsc_hndl; -#endif - } - else{ - /*we need to pass region id to get corresponding md*/ - cdesc = get_free_comp_desc(lochdl->regid,&c_info); - md_local = &cdesc->mem_dsc; - md_hdl_local = &cdesc->mem_dsc_hndl; - - } - } - - offset_local = (char *)src - (char *)md_local->start; - offset_remote =(char *)dst - (char *)md_remote->start; - if(DEBUG_COMM){ - printf("\n%d:offr=%d offl=%d\n",armci_me,offset_remote,offset_local); - } - /*printf("\n%d:offr=%d ptrr=%p offl=%d ptrl=%p\n",armci_me,offset_remote,md_remote->start,offset_local,md_local->start);fflush(stdout);*/ - - if(tag) *((comp_desc **)cptr) = cdesc; /*TOED*/ - /* - if(tag){printf("\n%d:put tag=%d c_info=%d %p",armci_me,tag,c_info,cdesc);fflush(stdout);} - */ - if (tag){ - cdesc->tag = tag; - cdesc->dest_id = proc; - cdesc->type = ARMCI_PORTALS_NBPUT; - } - else{ - cdesc->tag = 0; - cdesc->dest_id = proc; - cdesc->type = ARMCI_PORTALS_PUT; - } -#ifdef PUT_LOCAL_COMPLETION_ONLY - cdesc->active = 2; -#else - cdesc->active = 1; -#endif - - md_local->user_ptr = (void *)cdesc; - md_local->options = PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE; -#if DO_MD_UPDATE - if(user_memory==0){ - do{ - rc = PtlMDUpdate(*md_hdl_local,NULL,md_local,portals->eq_h); - } while (rc == PTL_MD_NO_UPDATE); - if (rc != PTL_OK){ - printf("%d:PtlMDUpdate: %s\n", portals->rank, ptl_err_str[rc]); - armci_die("ptlmdupdate failed",0); - } - } - else{ -#endif - rc = PtlMDBind(portals->ni_h,*md_local, PTL_UNLINK, md_hdl_local); - if (rc != PTL_OK){ - fprintf(stderr, "%d:PtlMDBind: %s\n", portals->rank, ptl_err_str[rc]); - armci_die("ptlmdbind failed",0); - } -#if DO_MD_UPDATE - } -#endif - - rc = PtlPutRegion(*md_hdl_local,offset_local,bytes, -#ifdef PUT_LOCAL_COMPLETION_ONLY - PTL_NOACK_REQ, -#else - PTL_ACK_REQ, -#endif - dest_proc, - portals->ptl, - 0, mb,offset_remote, 0); - if (rc != PTL_OK){ - fprintf(stderr, "%d:PtlPutRegion: %s\n", portals->rank,ptl_err_str[rc]); - armci_die("PtlPutRegion failed",0); - } - if(DEBUG_COMM){ - printf("\n%d:issued put to %d\n",armci_me,proc);fflush(stdout); - } - - armci_update_fence_array(proc, 1); - if(!tag){ - armci_client_complete(NULL,proc,0,cdesc); /* check this later */ - } - else - portals->outstanding_puts++; - ARMCI_PR_DBG("exit",remhdl->regid); - return rc; -} - -void armci_client_nb_send(int proc, void *src_buf, int *src_stride_arr, - void *dst_buf, int *dst_stride_arr, int bytes, - void** cptr,int tag,ARMCI_MEMHDL_T *lochdl, - ARMCI_MEMHDL_T *remhdl) -{ -} -#if 1 -/*using non-blocking for multiple 1ds inside a 2d*/ -void armci_network_strided(int op, void* scale, int proc,void *src_ptr, - int src_stride_arr[], void* dst_ptr, int dst_stride_arr[], - int count[], int stride_levels, armci_ihdl_t nb_handle) -{ -int i, j,tag=0; -long idxs,idxd; /* index offset of current block position to ptr */ -int n1dim; /* number of 1 dim block */ -int bvalues[MAX_STRIDE_LEVEL], bunit[MAX_STRIDE_LEVEL]; -int bvalued[MAX_STRIDE_LEVEL]; -int bytes = count[0]; -void *loc, *rem; -void *sptr,*dptr; -ARMCI_MEMHDL_T *loc_memhdl=NULL,*rem_memhdl=NULL; -NB_CMPL_T cptr; -int armci_region_both_found_hndl(void *loc, void *rem, int size, int node, - ARMCI_MEMHDL_T **loc_memhdl,ARMCI_MEMHDL_T **rem_memhdl); - if(nb_handle)tag=nb_handle->tag; - if(op==GET){ - loc = dst_ptr; - rem = src_ptr; - } - else { - loc = src_ptr; - rem = dst_ptr; - } - armci_region_both_found_hndl(loc,rem,bytes,armci_clus_id(proc), - &loc_memhdl,&rem_memhdl); - /* number of n-element of the first dimension */ - n1dim = 1; - for(i=1; i<=stride_levels; i++) - n1dim *= count[i]; - /* calculate the destination indices */ - bvalues[0] = 0; bvalues[1] = 0; bunit[0] = 1; - bvalued[0] = 0; bvalued[1] = 0; bunit[1] = 1; - for(i=2; i<=stride_levels; i++) { - bvalues[i] = bvalued[i] = 0; - bunit[i] = bunit[i-1] * count[i-1]; - } - if(ARMCI_ACC(op)){ - /*lock here*/ - printf("\nSHOULD NOT DO NETWORK_STRIDED FOR ACCS \n",armci_me); - fflush(stdout); - armci_die("network_strided called for acc",proc); - } - for(i=0; i (count[j]-1)) bvalues[j] = 0; - if(bvalued[j] > (count[j]-1)) bvalued[j] = 0; - } - - sptr = ((char *)src_ptr)+idxs; - dptr = ((char *)dst_ptr)+idxd; - if(op==GET){ - armci_client_direct_get(proc,sptr,dptr,bytes,&cptr,tag,loc_memhdl, - rem_memhdl); - } - else if(op==PUT){ - armci_client_direct_send(proc,sptr,dptr,bytes,&cptr,tag,loc_memhdl, - rem_memhdl); - } - else if(ARMCI_ACC(op)){ - armci_client_direct_get(proc,sptr,dptr,bytes,&cptr,tag,loc_memhdl, - rem_memhdl); - /*DO ACC*/ - armci_client_direct_send(proc,sptr,dptr,bytes,&cptr,tag,loc_memhdl, - rem_memhdl); - } - else - armci_die("in network_strided unknown opcode",op); - } - if(ARMCI_ACC(op)){ - /*unlock here*/ - } - if(nb_handle){ - nb_handle->tag=tag; - nb_handle->cmpl_info=cptr; - } - else{ - armci_client_complete(NULL,proc,tag,cptr); /* check this later */ - } -} -#else /*using blocking for multiple 1ds inside a 2d*/ -void armci_network_strided(int op, void* scale, int proc,void *src_ptr, - int src_stride_arr[], void* dst_ptr, int dst_stride_arr[], - int count[], int stride_levels, armci_ihdl_t nb_handle) -{ - int i, j; - long idxs,idxd; /* index offset of current block position to ptr */ - int n1dim; /* number of 1 dim block */ - int bvalues[MAX_STRIDE_LEVEL], bunit[MAX_STRIDE_LEVEL]; - int bvalued[MAX_STRIDE_LEVEL]; - int bytes = count[0]; - void *loc, *rem; - void *sptr,*dptr; -#if 0 - ARMCI_MEMHDL_T *loc_memhdl=NULL,*rem_memhdl=NULL; - int armci_region_both_found_hndl(void *loc, void *rem, int size, int node, - ARMCI_MEMHDL_T **loc_memhdl,ARMCI_MEMHDL_T **rem_memhdl); -#endif - if(op==GET){ - loc = dst_ptr; - rem = src_ptr; - } - else { - loc = src_ptr; - rem = dst_ptr; - } -#if 0 - armci_region_both_found_hndl(loc,rem,count[0],armci_clus_id(proc), - &loc_memhdl,&rem_memhdl); -#endif - /* number of n-element of the first dimension */ - n1dim = 1; - for(i=1; i<=stride_levels; i++) - n1dim *= count[i]; - - /* calculate the destination indices */ - bvalues[0] = 0; bvalues[1] = 0; bunit[0] = 1; - bvalued[0] = 0; bvalued[1] = 0; bunit[1] = 1; - for(i=2; i<=stride_levels; i++) { - bvalues[i] = bvalued[i] = 0; - bunit[i] = bunit[i-1] * count[i-1]; - } - - for(i=0; i (count[j]-1)) bvalues[j] = 0; - if(bvalued[j] > (count[j]-1)) bvalued[j] = 0; - } - - sptr = ((char *)src_ptr)+idxs; - dptr = ((char *)dst_ptr)+idxd; - if((i<(n1dim-1)) || nb_handle==NULL){ - if(op==GET) - PARMCI_Get(sptr,dptr,bytes,proc); - else if(op==PUT) - PARMCI_Put(sptr,dptr,bytes,proc); - else if(ARMCI_ACC(op)) - PARMCI_AccS(op,scale,sptr,NULL,dptr,NULL,count,1,proc); - else - armci_die("in network_strided unknown opcode",op); - } - } - if(nb_handle!=NULL){ - if(op==GET) - PARMCI_NbGet(sptr,dptr,bytes,proc,(armci_hdl_t *)nb_handle); - else if(op==PUT) - PARMCI_NbPut(sptr,dptr,bytes,proc,(armci_hdl_t *)nb_handle); - else if(ARMCI_ACC(op)) - PARMCI_NbAccS(op,scale,sptr,NULL,dptr,NULL,count,1,proc,(armci_hdl_t *)nb_handle); - else - armci_die("in network_strided unknown opcode",op); - } -} -#endif - -int armci_client_direct_getput(int proc,void *getinto, void *putfrom, void* dst, - int bytes, void **cptr, int tag, ARMCI_MEMHDL_T *lochdl, - ARMCI_MEMHDL_T *remhdl ) - -{ -int clus = armci_clus_id(proc); -int rc, i; -ptl_size_t offset_get = 0, offset_put=0, offset_remote = 0; -ptl_match_bits_t mb = 100; -ptl_md_t *md_remote,md, *md_local_put, *md_local_get; -ptl_md_t * md_ptr; -ptl_match_bits_t * mb_ptr; -ptl_handle_md_t *md_hdl_local_put,*md_hdl_local_get; -comp_desc *cdesc; -ptl_process_id_t dest_proc; -int c_info; -int lproc,rproc; - printf("\n%d:****************getput*********\n",armci_me); - dest_proc.nid = portals->ptl_pe_procid_map[proc].nid; - dest_proc.pid = portals->ptl_pe_procid_map[proc].pid; - md_remote =&remhdl->cdesc.mem_dsc; - - cdesc = get_free_comp_desc(PORTALS_MEM_REGIONS,&c_info); - md_local_get = &cdesc->mem_dsc; - md_hdl_local_get = &cdesc->mem_dsc_hndl; - md_local_get->length=bytes; - md_local_get->start=getinto; - offset_get = (char *)getinto - (char *)md_local_get->start; - offset_remote =(char *)dst - (char *)md_remote->start; - cdesc->tag = 999999; - cdesc->dest_id = proc; - cdesc->type = ARMCI_PORTALS_PUT; - cdesc->active = 0; - md_local_get->user_ptr = (void *)cdesc; - md_local_get->options = PTL_MD_OP_GET | PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE | PTL_MD_EVENT_END_DISABLE; - rc = PtlMDBind(portals->ni_h,*md_local_get, PTL_UNLINK, md_hdl_local_get); - if (rc != PTL_OK){ - fprintf(stderr, "%d:PtlMDBind: %s\n", portals->rank, ptl_err_str[rc]); - armci_die("ptlmdbind failed",0); - } - - cdesc = get_free_comp_desc(PORTALS_MEM_REGIONS,&c_info); - md_local_put = &cdesc->mem_dsc; - md_hdl_local_put = &cdesc->mem_dsc_hndl; - md_local_put->length=bytes; - md_local_put->start=putfrom; - offset_put = (char *)putfrom - (char *)md_local_put->start; - cdesc->tag = 999999; - cdesc->dest_id = proc; - cdesc->type = ARMCI_PORTALS_GET; - cdesc->active = 0; - md_local_put->user_ptr = (void *)cdesc; - md_local_put->options = PTL_MD_OP_PUT | PTL_MD_EVENT_START_DISABLE; - rc = PtlMDBind(portals->ni_h,*md_local_put, PTL_UNLINK, md_hdl_local_put); - if (rc != PTL_OK){ - fprintf(stderr, "%d:PtlMDBind: %s\n", portals->rank, ptl_err_str[rc]); - armci_die("ptlmdbind failed",0); - } - - rc = PtlGetPutRegion(*md_hdl_local_get,offset_get,*md_hdl_local_put, - offset_put,bytes,dest_proc, portals->ptl,0,mb,offset_remote, - 0); - if (rc != PTL_OK){ - printf("%d:PtlPutRegion: %s\n", portals->rank,ptl_err_str[rc]); - armci_die("PtlPutRegion failed",0); - } - if(DEBUG_COMM){ - printf("\n%d:issued getput to %d\n",armci_me,proc);fflush(stdout); - } - - armci_client_complete(NULL,proc,0,cdesc); /* check this later */ - return rc; -} - -void armci_network_client_deregister_memory(ARMCI_MEMHDL_T *mh) -{ -} - - -void armci_network_server_deregister_memory(ARMCI_MEMHDL_T *mh) -{ -} - -#ifdef CRAY_XT_ -static int num_locks=0; -static long **all_locks; -#define ARMCI_PORTALS_MAX_LOCKS 16 -typedef struct { - ptl_handle_md_t mem_dsc_h; - ptl_handle_me_t me_lock_h; - region_memhdl_t armci_portal_lock_memhdl; -} armci_lock_struct; -armci_lock_struct armci_portals_lock_st; -void armcill_allocate_locks(int num) -{ -ptl_md_t *md_ptr; -ptl_match_bits_t *mb; -ptl_process_id_t match_id; -ptl_handle_md_t *md_h; -int ace_any=1; -int rc; -long *my_locks; -int elems; -armci_lock_struct *armci_portals_lock=&armci_portals_lock_st; - - num_locks = num; - if(DEBUG_COMM){ - printf("%d:armci_allocate_locks num=%d\n", - armci_me,num_locks); - } - if(MAX_LOCKSni_h, ace_any, - (ptl_process_id_t){PTL_NID_ANY, PTL_PID_ANY}, - PTL_PID_ANY, 128); - if (rc != PTL_OK) { - printf("%d: PtlACEntry() failed: %s\n", - armci_me, ptl_err_str[rc]); - armci_die("PtlACEntry failed",0); - } -#endif - armci_portals_lock->armci_portal_lock_memhdl.cdesc.mem_dsc.start =&my_locks; - armci_portals_lock->armci_portal_lock_memhdl.cdesc.mem_dsc.length = - sizeof(my_locks); - armci_portals_lock->armci_portal_lock_memhdl.cdesc.mem_dsc.threshold = - PTL_MD_THRESH_INF; - armci_portals_lock->armci_portal_lock_memhdl.cdesc.mem_dsc.options = - PTL_MD_OP_PUT | PTL_MD_OP_GET | - PTL_MD_MANAGE_REMOTE | PTL_MD_TRUNCATE | - PTL_MD_EVENT_START_DISABLE; - armci_portals_lock->armci_portal_lock_memhdl.cdesc.mem_dsc.max_size = 0; - armci_portals_lock->armci_portal_lock_memhdl.cdesc.mem_dsc.user_ptr = NULL; - armci_portals_lock->armci_portal_lock_memhdl.cdesc.mem_dsc.eq_handle = - PTL_EQ_NONE; - match_id.nid = PTL_NID_ANY; - match_id.pid = PTL_PID_ANY; - - /* Lockmaster needs a match entry for clients to access lock value. - */ - rc = PtlMEAttach(portals->ni_h, portals->ptl, - match_id, /* source address */ - 100, /* expected match bits */ - 0, /* ignore bits to mask */ - PTL_RETAIN, /* unlink when md is unlinked */ - PTL_INS_AFTER, - &armci_portals_lock->me_lock_h); - if (rc != PTL_OK){ - printf("%d: PtlMEAttach(): %s\n", - armci_me, ptl_err_str[rc]); - armci_die("PtlMEAttach in init_locks failed",0); - } - rc = PtlMDAttach(armci_portals_lock->me_lock_h, - armci_portals_lock->armci_portal_lock_memhdl.cdesc.mem_dsc, - PTL_RETAIN, - &armci_portals_lock->mem_dsc_h); - if (rc != PTL_OK) { - printf("%d: PtlMDAttach(): %s\n", - armci_me, ptl_err_str[rc]); - armci_die("PtlMDAttach in init_locks failed",0); - } -} - -void armcill_lock(int mutex, int proc) -{ -long getinto=0,putfrom=1; -armci_lock_struct *armci_portal_lock=&armci_portals_lock_st; -region_memhdl_t *rem_lock_hdl=&armci_portal_lock->armci_portal_lock_memhdl; -printf("\n%d:in lock before\n",armci_me);fflush(stdout); - do{ - armci_client_direct_getput(proc,&getinto,&putfrom,(all_locks[proc]+mutex), - sizeof(long), NULL, 0, NULL,rem_lock_hdl); - }while(getinto!=0); -printf("\n%d:in lock after\n",armci_me);fflush(stdout); -} - - -/*\ unlock specified mutex on node where process proc is running -\*/ -void armcill_unlock(int mutex, int proc) -{ -long getinto=0,putfrom=0; -armci_lock_struct *armci_portal_lock=&armci_portals_lock_st; -region_memhdl_t *rem_lock_hdl=&armci_portal_lock->armci_portal_lock_memhdl; - armci_client_direct_getput(proc,&getinto,&putfrom,(all_locks[proc]+mutex), - sizeof(long), NULL, 0, NULL,rem_lock_hdl); -} - -int armci_portals_rmw_(int op, int *ploc, int *prem, int extra, int proc) -{ - return(0); -} -#endif - -void armci_portals_shmalloc_allocate_mem(int num_lks) -{ - void **ptr_arr; - void *ptr; - armci_size_t bytes = 128; - int i; - - ptr_arr = (void**)malloc(armci_nproc*sizeof(void*)); - if(!ptr_arr) armci_die("armci_shmalloc_get_offsets: malloc failed", 0); - bzero((char*)ptr_arr,armci_nproc*sizeof(void*)); - - PARMCI_Malloc(ptr_arr,bytes); - - return; -} diff --git a/armci/src/devices/portals/portals.h b/armci/src/devices/portals/portals.h deleted file mode 100644 index f071a58aa..000000000 --- a/armci/src/devices/portals/portals.h +++ /dev/null @@ -1,100 +0,0 @@ - -#ifndef PORTALS_H -#define PORTALS_H - -/* portals header file */ - -#ifdef CRAY_XT - -#include -#include - -#else - -#include -#include P3_NAL -#include -#include - -#endif - -#define MAX_OUT 1000 -#define MAX_ENT 64 -#define MAX_PREPOST 1 -#define HAS_RDMA_GET - -typedef enum op { - ARMCI_PORTALS_PUT, - ARMCI_PORTALS_NBPUT, - ARMCI_PORTALS_GET, - ARMCI_PORTALS_NBGET, - ARMCI_PORTALS_ACC -} armci_portals_optype; - -/* array of memory segments and corresponding memory descriptors */ -typedef struct md_table{ - ptl_md_t md; /* make this a ptr instead of struct */ - void * start; - void * end; - int id; - int bytes; - ptl_match_bits_t mb; -} md_table_entry_t; - - -typedef struct desc{ - int active; - unsigned int tag; - int dest_id; - armci_portals_optype type; -}comp_desc; - -typedef struct region_memhdl{ - ptl_match_bits_t match_bits; - ptl_size_t offset; - ptl_md_t mem_dsc; - ptl_handle_md_t mem_dsc_hndl; -} region_memhdl_t; - -#define NB_CMPL_T int -#define REGIONS_REQUIRE_MEMHDL -#define ARMCI_MEMHDL_T region_memhdl_t - -/* structure of computing process */ -typedef struct { - int armci_rank; /* if different from portals_rank */ - int rank; /* my rank*/ - int size; /* size of the group */ - ptl_handle_me_t me_h[64]; - ptl_handle_md_t md_h[64]; - ptl_handle_me_t mh; - ptl_handle_md_t mdh; - - ptl_handle_eq_t eq_h; - ptl_handle_ni_t ni_h; - ptl_pt_index_t ptl; - int outstanding_puts; - int outstanding_gets; - int outstanding_accs; - void * buffers; /* ptr to head of buffer */ - int num_match_entries; -} armci_portals_proc_t; - - -extern void print_mem_desc_table(void); -extern int armci_init_portals(void); -extern void armci_fini_portals(void); -extern int armci_post_descriptor(ptl_md_t *md); -extern int armci_prepost_descriptor(void* start, long bytes); -extern ptl_size_t armci_get_offset(ptl_md_t md, void *ptr,int proc); -extern int armci_get_md(void * start, int bytes , ptl_md_t * md, ptl_match_bits_t * mb); -extern int armci_portals_put(ptl_handle_md_t md_h,ptl_process_id_t dest_id,int bytes,int mb,int local_offset, int remote_offset,int ack ); -extern int armci_portals_get(ptl_handle_md_t md_h,ptl_process_id_t dest_id,int bytes,int mb,int local_offset, int remote_offset); -extern comp_desc * get_free_comp_desc(int * c_info); -extern int armci_client_direct_send(int proc,void *src, void* dst, int bytes, NB_CMPL_T *cmpl_info, int tag, ARMCI_MEMHDL_T *lochdl, ARMCI_MEMHDL_T *remhdl); -extern int armci_portals_direct_get(void *src, void *dst, int bytes, int proc, int nbtag, NB_CMPL_T *cmpl_info); -extern int armci_portals_complete(int nbtag, NB_CMPL_T *cmpl_info); -extern void comp_desc_init(); -extern int armci_client_complete(ptl_event_kind_t *evt,int proc_id, int nb_tag ,comp_desc * cdesc,int b_tag); - -#endif /* PORTALS_H */ diff --git a/armci/src/devices/sockets/sockets.c b/armci/src/devices/sockets/sockets.c index b98aabe71..e18656042 100644 --- a/armci/src/devices/sockets/sockets.c +++ b/armci/src/devices/sockets/sockets.c @@ -55,38 +55,7 @@ #include "armcip.h" #include "sockets.h" -/* JAD 2010-05-06 Code these days is safe to use socklen_t so long as it uses - * it throughout. */ -#if 0 -# ifdef AIX -# include -# if HAVE_SYS_SELECT_H -# include -# endif -# ifdef _AIXVERSION_430 typedef socklen_t soclen_t; -# else -typedef size_t soclen_t; -# endif -# elif defined(XLCLINUX) -typedef socklen_t soclen_t; -# else -typedef int soclen_t; -# endif -#else -# ifdef NEC -typedef int soclen_t; -# else -typedef socklen_t soclen_t; -# endif -#endif - - -#ifdef CRAY -# if HAVE_MEMORY_H -# include -# endif -#endif /* portability of socklen_t definition is iffy - we need to avoid it !! #if defined(LINUX) && ( defined(_SOCKETBITS_H) || defined(__BITS_SOCKET_H)) @@ -801,7 +770,7 @@ int armci_ListenAndAccept(int sock) return msgsock; } -#if !defined(SGI) && !defined(WIN32) +#if !defined(WIN32) struct hostent *gethostbyname(); #endif diff --git a/armci/src/ft/armci_chkpt.c b/armci/src/ft/armci_chkpt.c index ea6d05384..210eedeff 100644 --- a/armci/src/ft/armci_chkpt.c +++ b/armci/src/ft/armci_chkpt.c @@ -208,12 +208,7 @@ static void armci_protect_pages(unsigned long startpagenum,unsigned long numpage * called inside main(int argc, char **argv), I guess ... */ void armci_init_checkpoint2() { - printf("%d:in armci init checkpoint2\n",armci_me);fflush(stdout); -#ifdef __ia64 - /* get backing store bottom */ - asm("mov %0=ar.bsp": "=r"(armci_ckpt_bspBottom)); - printf("%d: armci_ckpt_bspBottom=%p\n", armci_me, armci_ckpt_bspBottom); -#endif + printf("%d:in armci init checkpoint2\n",armci_me);fflush(stdout); } /*\ ----------CORE FUNCTIONS ----------- @@ -227,11 +222,6 @@ int armci_init_checkpoint(int spare) #ifdef CHECKPOINT2 printf("%d:in armci init checkpoint\n",armci_me);fflush(stdout); -#ifdef __ia64 - /* get backing store bottom */ - asm("mov %0=ar.bsp": "=r"(armci_ckpt_bspBottom)); - printf("%d: armci_ckpt_bspBottom=%p\n", armci_me, armci_ckpt_bspBottom); -#endif #endif mypagesize = getpagesize(); if(checkpointing_initialized)return(0); @@ -487,44 +477,6 @@ void what_is_going_on() { printf("what_is_going_on(): a=%p\n", &a); } -/* - In IA64, there is a seperate stack called register stack engine (RSE, - contains 96 registers) to manage across functions calls (e.g. these - registers stores the function return address, etc.. In order to save this - info, flush the stack registers to backing store and save backing store. - NOTE: backing store is a cache to register stack. - */ -#if defined(__ia64) -static void armci_ckpt_write_backstore(int rid) -{ - char *bspTop; /* in IA64 only, back store pointer (bsp) */ - off_t ofs; - - /* flush the register stack */ - asm("flushrs"); - - /* getting back store pointer (bsp). BSP is similar to stack pointer, - * which points to the top of backing store */ - asm("mov %0=ar.bsp": "=r"(bspTop)); - printf("BSP Pointer (ar.bsp) = %p\n", bspTop); - - armci_storage_record[rid].bsp_mon.ptr = armci_ckpt_bspBottom; - armci_storage_record[rid].bsp_mon.bytes=((unsigned long)(bspTop) - (unsigned long)(armci_ckpt_bspBottom)); - - ofs=CURR_FILE_POS(rid); - UPDATE_FILE_POS(rid, armci_storage_record[rid].bsp_mon.bytes); - armci_storage_record[rid].bsp_mon.fileoffset = ofs; - - printf("%d: Save Backing store: %p to %p (bytes=%ld: off=%ld)\n\n",armci_me, armci_ckpt_bspBottom, armci_ckpt_bspBottom+armci_storage_record[rid].bsp_mon.bytes, armci_storage_record[rid].bsp_mon.bytes, ofs);fflush(stdout); - - armci_storage_write_ptr(armci_storage_record[rid].fileinfo.fd, - armci_storage_record[rid].bsp_mon.ptr, - armci_storage_record[rid].bsp_mon.bytes, - armci_storage_record[rid].bsp_mon.fileoffset); - -} -#endif - static void armci_ckpt_write_stack(int rid) { int dummy_first=ARMCI_STACK_VERIFY; @@ -546,12 +498,7 @@ static void armci_ckpt_write_stack(int rid) printf("%d: Save stack: %p to %p (bytes=%ld : off=%ld)\n\n",armci_me, top, top+armci_storage_record[rid].stack_mon.bytes, armci_storage_record[rid].stack_mon.bytes, ofs);fflush(stdout); armci_storage_write_ptr(armci_storage_record[rid].fileinfo.fd,top, armci_storage_record[rid].stack_mon.bytes, - armci_storage_record[rid].stack_mon.fileoffset); - -#if defined(__ia64) - /* In IA64, write Backing Store, as it is the cache for Stack Registers */ - armci_ckpt_write_backstore(rid); -#endif + armci_storage_record[rid].stack_mon.fileoffset); } static void armci_ckpt_write_heap(int rid) @@ -664,16 +611,6 @@ int armci_icheckpoint(int rid) if(armci_storage_record[rid].ckpt_stack || armci_storage_record[rid].ckpt_heap) { -#if defined(__ia64) - { - char *tmp_bsp; - /* flush the register stack */ - asm("flushrs"); - /* get the top of backing store */ - asm("mov %0=ar.bsp": "=r"(tmp_bsp)); - printf("tmp: ar.bsp = %p\n", tmp_bsp); - } -#endif if((armci_recovering=setjmp(armci_storage_record[rid].jmp))==0){ /* 1. file offsets */ @@ -724,117 +661,18 @@ int armci_icheckpoint(int rid) return(rc); } -/** - * Recover Backing Store. - */ -#if defined(__ia64) -static void armci_recover_backstore(int rid) -{ - off_t offset = armci_storage_record[rid].bsp_mon.fileoffset; - size_t size = armci_storage_record[rid].bsp_mon.bytes; - char *bspTop = (char*)((unsigned long)(armci_storage_record[rid].bsp_mon.ptr) + size); - char *bsp; - - asm("flushrs"); - asm("mov %0=ar.bsp": "=r"(bsp)); - - /* CHECK: expand the backing store so that the current backing store is - replaced with saved backing store (CHECK: register stack can be as - large as 96 registers, so 96*8 bytes) */ - if( (unsigned long)bsp < (unsigned long)(bspTop + 96*8 + EST_OFFSET) ) { - armci_recover_backstore(rid); - } - else{ - printf("%d: armci_recover_backstore(): size=%ld offset=%ld backing store: %p to %p\n", armci_me, size, offset, bspTop, armci_storage_record[rid].bsp_mon.ptr); - armci_storage_read_ptr(armci_storage_record[rid].fileinfo.fd, bspTop, size, offset); - - printf("%d: armci_recover_backstore(): rid=%d\n", armci_me, rid); - - /* CHECK: Is there a way to verify backing store recovery - (similar to stack) */ - } - /** - * CHECK: Do nothing here. Recursive function in action. - */ -} -#endif - - -/** - * Recover stack: restore a saved stack by overwriting the current stack - * of this process . The idea of restoring the stack is, we are going to - * replace the contents of current stack, so that longjmp is legitimate. - */ -#if 0 -static void armci_recover_stack(int rid) -{ - off_t offset = sizeof(jmp_buf)+4*sizeof(int); - size_t size = armci_storage_record[rid].stack_mon.bytes; - char *stacktop = (char*)((unsigned long)(armci_storage_record[rid].stack_mon.ptr) - size); - int dummy; - printf("check=%p %p; rid=%d\n", &dummy, &offset, rid); - - /* CHECK: check whether current stack frame is above the old (saved) - stack. If so, the recover the stack, else call thus recursively - until the current stack is above the old stack */ - if( (unsigned long)&dummy >= (unsigned long)(stacktop-EST_OFFSET) ) { - armci_recover_stack(rid); - } - else { - printf("%d: armci_recover_stack(): size=%ld offset=%ld stack: %p to %p\n", armci_me, size, offset, stacktop, armci_storage_record[rid].stack_mon.ptr); - armci_storage_read_ptr(armci_storage_record[rid].fileinfo.fd, stacktop, size, offset); - - { /* verify stack recovery */ - int dummy = *((int*)(stacktop+EST_OFFSET)); - if(dummy != ARMCI_STACK_VERIFY) { - printf("WARNING: armci_recover_stack FAILED: %d", dummy); - armci_die("armci_recover_stack FAILED", dummy); - } - else if(DEBUG_) - printf("%d: armci_recover_stack SUCCESS (%d)\n", armci_me, dummy); - } - -#ifdef __ia64 - /* recover the backing store (BSP) */ - armci_recover_backstore(rid); -#endif - - } - /** - * CHECK: Do nothing here...recursive function in action here.. - */ -} -#endif - static void armci_recover_memory(int rid) { int dummy; off_t ofs; size_t stacksize = armci_storage_record[rid].stack_mon.bytes; char *stacktop = (char*)((unsigned long)(armci_storage_record[rid].stack_mon.ptr) - stacksize); - -#ifdef __ia64 - size_t bspsize = armci_storage_record[rid].bsp_mon.bytes; - char *bspTop = (char*)((unsigned long)(armci_storage_record[rid].bsp_mon.ptr) + bspsize); - char *bsp; -#endif printf("armci_recover_stack(): check=%p ; rid=%d\n", &dummy, rid); /* call recursively until current stack is above saved stack */ if( (unsigned long)&dummy >= (unsigned long)(stacktop-EST_OFFSET) ) armci_recover_memory(rid); -#ifdef __ia64 - asm("flushrs"); - asm("mov %0=ar.bsp": "=r"(bsp)); - - printf("armci_recover_bsp(): check=%p ; rid=%d\n", &dummy, rid); - /* similarly, call recursively until current backing store expands - (register stack can be as large as 96 registers) */ - if( (unsigned long)bsp < (unsigned long)(bspTop + 97*8) ) - armci_recover_memory(rid); -#endif - /* ------------------ recover stack segment ------------------- */ printf("%d: armci_recover_stack(): fp=%p size=%ld off=%ld stack: %p to %p\n", armci_me, armci_storage_record[rid].fileinfo.fd, stacksize, armci_storage_record[rid].stack_mon.fileoffset, stacktop, armci_storage_record[rid].stack_mon.ptr); armci_storage_read_ptr(armci_storage_record[rid].fileinfo.fd, stacktop, stacksize, armci_storage_record[rid].stack_mon.fileoffset); @@ -847,23 +685,6 @@ static void armci_recover_memory(int rid) else if(DEBUG_) printf("%d: armci_recover_stack SUCCESS (%d)\n", armci_me, dummy); } - - /* -------- recover register stack (RSE) segment (IA64 only) -------- */ -#ifdef __ia64 - { - size_t bspsize = armci_storage_record[rid].bsp_mon.bytes; - char *bspTop = (char*)((unsigned long)(armci_storage_record[rid].bsp_mon.ptr) + bspsize); - - bsp = (char*)armci_storage_record[rid].bsp_mon.ptr; /* CHECK: */ - - printf("%d: armci_recover_backstore(): size=%ld off=%ld backing store: %p to %p\n", armci_me, bspsize, armci_storage_record[rid].bsp_mon.fileoffset, armci_storage_record[rid].bsp_mon.ptr, bspTop); - armci_storage_read_ptr(armci_storage_record[rid].fileinfo.fd, bsp, bspsize, armci_storage_record[rid].bsp_mon.fileoffset); - printf("%d: armci_recover_backstore(): rid=%d\n", armci_me, rid); - - /* CHECK: Is there a way to verify backing store recovery - (similar to stack) */ - } -#endif ofs=0; /* jmp_buf is the first one to be stored in ckpt file, so ofs=0 */ printf("%d: armci_recover jmp_buf(): size=%ld off=%ld (%p to %p)\n", armci_me, sizeof(jmp_buf), ofs, &armci_storage_record[rid].jmp, (char*)(&armci_storage_record[rid].jmp)+sizeof(jmp_buf)); @@ -935,64 +756,6 @@ int armci_irecover(int rid,int iamreplacement) return 1; } -#if 0 -static int tmpStack[TMP_STACK_SIZE]; -int armci_irecover_OLD(int rid,int iamreplacement) -{ - int rc; - jmp_buf jmp; - - /* Save "rid" and "iamreplacement" in a global variable as we are going - to replace the contents of the current stack. */ - RID = rid; /* CHECK: save rid in a file or somewhere instead of - * global variable*/ - tmp_iamreplacement = iamreplacement; - -#if 0 - /* create a temporary stack */ - rc = _setjmp(jmp); - - if (rc == 0) { - /* Goto a temporary stack as we still running on the original stack. To - do this, update Stack Pointer (SP) to be in a temp stack area. */ - jmp->__jmpbuf[JB_SP] = ((long)((char *)(tmpStack + TMP_STACK_SIZE) - EXTRA_STACK_SPACE) & ~0xf); - printf("%d: temporary stack starts @ %p\n", armci_me, jmp->__jmpbuf[JB_SP]); - - /* CHECK: make this TMP_STACK_SIZE dynamic, by measuring the size of - the stack from file */ - - /* - * Jump back ... - * But with new 'jmp' - */ - _longjmp(jmp, 1); - } - else -#endif - { - - /** - * Now we are on temporary stack. So it is safe to recover stack. - */ - armci_recover_stack(RID); - - /** - * go to the restored stack by calling longjmp(). Read jmpbuf from file - */ - if(tmp_iamreplacement){ /* CHECK: what is iamreplacement */ - rc=armci_storage_read_ptr(armci_storage_record[RID].fileinfo.fd,&armci_storage_record[RID].jmp,sizeof(jmp_buf),4*sizeof(int)); - } - armci_msg_group_barrier(&armci_storage_record[RID].group); - printf("%d: restoring original stack starts @ %p\n", armci_me, armci_storage_record[RID].jmp->__jmpbuf[JB_SP]); - longjmp(armci_storage_record[RID].jmp,1);/*goto the restored stack*/ - } - - /*we should never come here things are hosed */ - armci_die2("recovery hosed",RID,iamreplacement); - return(1); -} -#endif - void armci_icheckpoint_finalize(int rid) { int i; @@ -1008,10 +771,8 @@ void armci_icheckpoint_finalize(int rid) } /* - TODO: - 1. organize all the $ifdef __ia64's properly..They are scattered all - over and it is difficult to track down and potentially buggy - 2. checkpoint shared memory and mmap regions - 3. I/O file open/close, signals and other system specific stuff ??? - 4. memory leaks due to malloc()....free'em + TODO + 1. checkpoint shared memory and mmap regions + 2. I/O file open/close, signals and other system specific stuff ? + 3. memory leaks due to malloc() - free them */ diff --git a/armci/src/ft/armci_chkpt.h b/armci/src/ft/armci_chkpt.h index f004c956d..dbc51977d 100644 --- a/armci/src/ft/armci_chkpt.h +++ b/armci/src/ft/armci_chkpt.h @@ -35,9 +35,7 @@ typedef struct{ int tmp; /*for jmp_buf alignment*/ jmp_buf jmp; /*the jmp buffer for setjmp and longjmp*/ int ckpt_heap,ckpt_stack; -#ifdef __ia64 - armci_monitor_address_t bsp_mon; /*registerStack(backingStorePtr) monitor*/ -#endif + armci_monitor_address_t stack_mon,heap_mon; armci_monitor_address_t *user_addr; int user_addr_count; diff --git a/armci/src/include/armci.h b/armci/src/include/armci.h index fddfefe56..2522f9706 100644 --- a/armci/src/include/armci.h +++ b/armci/src/include/armci.h @@ -246,15 +246,6 @@ extern int armci_domain_my_id(armci_domain_t domain); extern int armci_domain_count(armci_domain_t domain); extern int armci_domain_same_id(armci_domain_t domain, int proc); - -/* PVM group - * On CrayT3E: the default group is the global group which is (char *)NULL - * It is the only working group. - * On Workstations: the default group is "mp_working_group". User can set - * the group name by calling the ARMCI_PVM_init (defined - * in message.c) and passing the group name to the library. - */ - extern char *mp_group_name; /*********************stuff for non-blocking API******************************/ diff --git a/armci/src/include/armci_shmem.h b/armci/src/include/armci_shmem.h index d8165b8e9..5db18fff9 100644 --- a/armci/src/include/armci_shmem.h +++ b/armci/src/include/armci_shmem.h @@ -7,19 +7,8 @@ extern char* Attach_Shared_Region(long idlist[], long size, long offset); extern void Free_Shmem_Ptr(long id, long size, char* addr); extern long armci_shmem_reg_size(int i, long id); extern char* armci_shmem_reg_ptr(int i); -#ifdef MULTI_CTX -extern void armci_nattach_preallocate_info(int* segments, int *segsize); -#endif - -#ifdef HITACHI -#define FIELD_NUM 0x1 -#endif -#if defined(QUADRICS) && defined(MULTI_CTX) -#define POST_ALLOC_CHECK(temp,size) armci_checkMapped(temp,size); -#else #define POST_ALLOC_CHECK(temp,size) ; -#endif #define MAX_REGIONS 64 diff --git a/armci/src/include/armcip.h b/armci/src/include/armcip.h index 0109dd98f..46057a7be 100644 --- a/armci/src/include/armcip.h +++ b/armci/src/include/armcip.h @@ -14,29 +14,6 @@ printf("\n%d:%s:%d:%s:%s:%d",armci_me,__FILE__,__LINE__,FUNCTION_NAME,__ARMCI_ST,__ARMCI_NU)*/ #define ARMCI_PR_DBG(__ARMCI_ST,__ARMCI_NU) -#ifdef QUADRICS -#include -#ifdef QSNETLIBS_VERSION_CODE -#ifndef DECOSF -# define ELAN_ACC -# define PENDING_OPER(x) ARMCI_ACC_INT -#endif - -# if QSNETLIBS_VERSION_CODE > QSNETLIBS_VERSION(1,5,0) -# define LIBELAN_ATOMICS -# endif - -#endif -extern void armci_elan_fence(int p); -#endif - -/* we got problems on IA64/Linux64 with Elan if inlining is used */ -#if defined(__GNUC__) && !defined(QUADRICS) -# define INLINE inline -#else -# define INLINE -#endif - #if HAVE_UNISTD_H # include #elif HAVE_WINDOWS_H @@ -44,7 +21,7 @@ extern void armci_elan_fence(int p); # define sleep(x) Sleep(100*(x)) #endif -#if (defined(SYSV) || defined(WIN32)|| defined(MMAP)) && !defined(NO_SHM) && !defined(HITACHI) && !defined(CATAMOUNT) +#if (defined(SYSV) || defined(WIN32)|| defined(MMAP)) && !defined(NO_SHM) #define CLUSTER #ifdef SERVER_THREAD @@ -89,7 +66,7 @@ void *ptr; } armci_flag_t; -#if defined(LAPI) || defined(PTHREADS) || defined(POSIX_THREADS) +#if defined(PTHREADS) || defined(POSIX_THREADS) # include typedef pthread_t thread_id_t; # define THREAD_ID_SELF pthread_self @@ -110,15 +87,10 @@ extern thread_id_t armci_serv_tid; # define SERVER_CONTEXT (armci_me<0) #endif -#if defined(LAPI) || defined(CLUSTER) || defined(CRAY) || defined(CRAY_XT)\ - || defined(CRAY_SHMEM) || defined(BGML) || defined(DCMF) +#if defined(CLUSTER) # include "request.h" #endif -#ifdef ARMCIX -#include "armcix.h" -#endif - /* ------------------------ ARMCI threads support ------------------------- */ #define ARMCI_THREADS_LIMIT 32 @@ -138,7 +110,7 @@ extern armci_user_threads_t armci_user_threads; extern void armci_init_threads(); extern void armci_finalize_threads(); extern int armci_thread_idx(); -extern INLINE int armci_register_thread(thread_id_t id); +extern int armci_register_thread(thread_id_t id); #define ARMCI_THREAD_IDX armci_thread_idx() /* needs to be optimized */ @@ -169,11 +141,6 @@ extern INLINE int armci_register_thread(thread_id_t id); # endif #endif -#if defined(CRAY_XT) || defined(CRAY_T3E) || defined(FUJITSU)\ - || defined(HITACHI) || (defined(QUADRICS) && !defined(ELAN_ACC)) -#define ACC_COPY -#endif - #ifndef FATR # if defined(WIN32) && !defined(__MINGW32__) # define FATR __stdcall @@ -194,18 +161,14 @@ extern INLINE int armci_register_thread(thread_id_t id); # define RESERVED_BUFLEN ((sizeof(request_header_t)>>3)+3*MAX_STRIDE_LEVEL +\ EXTRA_MSG_BUFLEN_DBL) #endif - -#if defined(HITACHI) -# define BUFSIZE ((0x50000) * sizeof(double)) -#else - /* packing algorithm for double complex numbers requires even number */ + +/* packing algorithm for double complex numbers requires even number */ # ifdef MSG_BUFLEN_DBL # define BUFSIZE_DBL (MSG_BUFLEN_DBL - RESERVED_BUFLEN) # else # define BUFSIZE_DBL 32768 # endif # define BUFSIZE (BUFSIZE_DBL * sizeof(double)) -#endif /* note opcodes must be lower than ARMCI_ACC_OFF !!! */ #define PUT 1 @@ -221,14 +184,10 @@ extern INLINE int armci_register_thread(thread_id_t id); extern int armci_me, armci_nproc; extern int _armci_initialized; -#ifdef HITACHI - extern int sr8k_server_ready; - extern double *armci_internal_buffer; -#else + #if !defined(THREAD_SAFE) extern double armci_internal_buffer[BUFSIZE_DBL]; #endif -#endif extern void armci_shmem_init(); extern void armci_krmalloc_init_localmem(); @@ -310,31 +269,17 @@ extern void armci_finalize_fence(); # define SAMECLUSNODE(p)\ ( ((p) <= armci_clus_last) && ((p) >= armci_clus_first) ) -#elif defined(__crayx1) -# define SAMECLUSNODE(p) 1 -#elif defined(ARMCIX) -# define SAMECLUSNODE(p) 0 #else # define SAMECLUSNODE(p) ((p)==armci_me) #endif -#if defined(LAPI) || defined(ELAN_ACC) -# define ORDER(op,proc)\ - if( proc == armci_me || ( ARMCI_ACC(op) && ARMCI_ACC(PENDING_OPER(proc))) );\ - else FENCE_NODE(proc) -# define UPDATE_FENCE_INFO(proc_) -#elif defined(CLUSTER) && !defined(QUADRICS) && !defined(HITACHI)\ - && !defined(CRAY_SHMEM) && !defined(PORTALS) +#if defined(CLUSTER) # define ORDER(op_,proc_)\ if(!SAMECLUSNODE(proc_) && op_ != GET )FENCE_ARR(proc_)=1 # define UPDATE_FENCE_INFO(proc_) if(!SAMECLUSNODE(proc_))FENCE_ARR(proc_)=1 #else -# if defined(GM) && defined(ACK_FENCE) -# define ORDER(op,proc) -# else -# define ORDER(op,proc) if(proc != armci_me) FENCE_NODE(proc) -# endif +# define ORDER(op,proc) if(proc != armci_me) FENCE_NODE(proc) # define UPDATE_FENCE_INFO(proc_) #endif @@ -349,7 +294,7 @@ typedef struct { * to establish socket communication like on the networks of workstations * SP node names must be distinct within first HOSTNAME_LEN characters \*/ -#if defined(LAPI) && defined(AIX) +#if defined(AIX) # define HOSTNAME_TRUNCATE # define HOSTNAME_LEN 12 #else @@ -443,9 +388,6 @@ extern void armci_global_region_exchange(void *, long); /* -------------------- ARMCI Groups ---------------------- */ /* data structure that caches a group's attribute */ -#ifdef BGML -#define PCLASS 3 -#endif #ifdef MSG_COMMS_MPI typedef int ARMCI_Datatype; @@ -505,12 +447,4 @@ extern void armci_icheckpoint_finalize(int rid); #endif /* ifdef ENABLE_CHECKPOINT */ /* -------------------------------------------------------- */ -#ifdef BGML -#define ARMCI_CRITICAL_SECTION_ENTER() BGML_CriticalSection_enter(); -#define ARMCI_CRITICAL_SECTION_EXIT() BGML_CriticalSection_exit(); -#else -#define ARMCI_CRITICAL_SECTION_ENTER() -#define ARMCI_CRITICAL_SECTION_EXIT() -#endif - #endif diff --git a/armci/src/include/atomic_ops_ia64.h b/armci/src/include/atomic_ops_ia64.h deleted file mode 100644 index c4ffecc29..000000000 --- a/armci/src/include/atomic_ops_ia64.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2003 Marcel Moolenaar - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD: src/lib/libkse/arch/ia64/include/atomic_ops.h,v 1.2.2.1 2009/06/09 15:09:10 des Exp $ - */ - -#ifndef _ATOMIC_OPS_H_ -#define _ATOMIC_OPS_H_ - -static inline void -atomic_swap_int(volatile int *dst, int val, int *res) -{ - __asm("xchg4 %0=[%2],%1" : "=r"(*res) : "r"(val), "r"(dst)); -} - -static inline void -atomic_swap_long(volatile long *dst, long val, long *res) -{ - __asm("xchg8 %0=[%2],%1" : "=r"(*res) : "r"(val), "r"(dst)); -} - -#define atomic_swap_ptr(d,v,r) \ - atomic_swap_long((volatile long *)d, (long)v, (long *)r) - -#endif /* _ATOMIC_OPS_H_ */ diff --git a/armci/src/include/copy.h b/armci/src/include/copy.h index 357b2a370..f48464de3 100644 --- a/armci/src/include/copy.h +++ b/armci/src/include/copy.h @@ -8,69 +8,20 @@ #if HAVE_STRING_H # include #endif -#ifdef DECOSF -#include -#endif -#if 1 || defined(HITACHI) || defined(CRAY_T3E) || defined(CRAY_XT) || defined(BGML) # define MEMCPY -#endif -#if defined(LINUX64) && defined(SGIALTIX) && defined(MSG_COMMS_MPI) -/* fastbcopy from Wayne Vieira and Gerardo Cisneros */ -#define MEMCPY -#define armci_copy(src, dst, len) _fastbcopy(src, dst, len) -#define memcpy(dst, src, len) _fastbcopy(src, dst, len) -#define bcopy(src, dst, len) _fastbcopy(src, dst, len) -#endif #ifndef EXTERN # define EXTERN extern #endif - -#ifdef NEC -# define memcpy1 _VEC_memcpy -# define armci_copy1(src,dst,n) _VEC_memcpy((dst),(src),(n)) - EXTERN long long _armci_vec_sync_flag; -#endif - -#if defined(SGI) || defined(FUJITSU) || defined(HPUX) || defined(SOLARIS) || defined (DECOSF) || defined(__ia64__) || defined(__crayx1) -# define PTR_ALIGN -#endif - -#if defined(NB_NONCONT) && !defined(CRAY_SHMEM) && !defined(QUADRICS) && !defined(PORTALS) -#error NB_NONCONT is only available on CRAY_SHMEM,QUADRICS and PORTALS -#endif - -#if defined(SHMEM_HANDLE_SUPPORTED) && !defined(CRAY_SHMEM) -#error SHMEM_HANDLE_SUPPORTED should not be defined on a non CRAY_SHMEM network -#endif #if defined(MEMCPY) && !defined(armci_copy) -#if defined(BGML) -#define armci_copy(src, dst, n) BGLML_memcpy((dst), (src), (n)) -#else -# define armci_copy(src,dst,n) memcpy((dst), (src), (n)) -#endif -#endif - -#ifdef NEC -# define MEM_FENCE {mpisx_clear_cache(); _armci_vec_sync_flag=1;mpisx_syncset0_long(&_armci_vec_sync_flag);} -#endif - -#ifdef DECOSF -# define MEM_FENCE asm ("mb") +# define armci_copy(src,dst,n) memcpy((dst), (src), (n)) #endif #if defined(NEED_MEM_SYNC) # ifdef AIX # define MEM_FENCE {int _dummy=1; _clear_lock((int *)&_dummy,0); } -# elif defined(__ia64) -# if defined(__GNUC__) && !defined (__INTEL_COMPILER) -# define MEM_FENCE __asm__ __volatile__ ("mf" ::: "memory"); -# else /* Intel Compiler */ - extern void _armci_ia64_mb(); -# define MEM_FENCE _armci_ia64_mb(); -# endif # elif defined(LINUX) && defined(__GNUC__) && defined(__ppc__) # define MEM_FENCE \ __asm__ __volatile__ ("isync" : : : "memory"); @@ -129,77 +80,6 @@ }\ } -#if defined(FUJITSU) - -# define armci_put2D(p, bytes,count,src_ptr,src_stride,dst_ptr,dst_stride)\ - CopyPatchTo(src_ptr, src_stride, dst_ptr, dst_stride, count,bytes, p) - -# define armci_get2D(p, bytes, count, src_ptr,src_stride,dst_ptr,dst_stride)\ - CopyPatchFrom(src_ptr, src_stride, dst_ptr, dst_stride,count,bytes,p) - -#elif defined(HITACHI) || defined(_ELAN_PUTGET_H) && !defined(NB_NONCONT) - -#if defined(QUADRICS) -#if 0 -# define WAIT_FOR_PUTS elan_putWaitAll(elan_base->state,200) -# define WAIT_FOR_GETS elan_getWaitAll(elan_base->state,200) -#else -# define WAIT_FOR_PUTS armcill_wait_put() -# define WAIT_FOR_GETS armcill_wait_get() - extern void armcill_wait_put(); - extern void armcill_wait_get(); -#endif -#endif - - extern void armcill_put2D(int proc, int bytes, int count, - void* src_ptr,int src_stride, void* dst_ptr,int dst_stride); - extern void armcill_get2D(int proc, int bytes, int count, - void* src_ptr,int src_stride, void* dst_ptr,int dst_stride); -# define armci_put2D armcill_put2D -# define armci_get2D armcill_get2D - -#elif defined(NB_NONCONT) - - extern void armcill_wait_put(); - extern void armcill_wait_get(); -# define WAIT_FOR_PUTS armcill_wait_put() -# define WAIT_FOR_GETS armcill_wait_get() - - extern void armcill_put2D(int proc, int bytes, int count, - void* src_ptr,int src_stride, void* dst_ptr,int dst_stride); - extern void armcill_get2D(int proc, int bytes, int count, - void* src_ptr,int src_stride, void* dst_ptr,int dst_stride); -# define armci_put2D armcill_put2D -# define armci_get2D armcill_get2D - -# if defined(QUADRICS) - -# define armcill_nb_put(_dst, _src, _sz, _proc, _hdl)\ - _hdl = elan_put(elan_base->state,_src,_dst,(size_t)_sz,_proc) -# define armcill_nb_get(_dst, _src, _sz, _proc, _hdl)\ - _hdl = elan_get(elan_base->state,_src,_dst,(size_t)_sz,_proc) -# define armcill_nb_wait(_hdl)\ - elan_wait(_hdl,100) - -# elif defined(CRAY_SHMEM) - -# define armcill_nb_wait(_hdl)\ - shmem_wait_nb(_hdl) -/*VT:this should be ifdef'ed based on if shmem_handle is defined or not*/ -# if defined (CRAY_XT) -# define armcill_nb_put(_dst, _src, _sz, _proc, _hdl)\ - shmem_putmem(_dst, _src, (size_t)_sz, _proc) -# define armcill_nb_get(_dst, _src, _sz, _proc, _hdl)\ - shmem_getmem(_dst, _src, (size_t)_sz, _proc) -# else -# define armcill_nb_put(_dst, _src, _sz, _proc, _hdl)\ - _hdl = shmem_putmem_nb(_dst, _src, (size_t)_sz, _proc, &(_hdl)) -# define armcill_nb_get(_dst, _src, _sz, _proc, _hdl)\ - _hdl = shmem_getmem_nb(_dst, _src, (size_t)_sz, _proc, &(_hdl)) -# endif -# endif - -#else # define armci_put2D(proc,bytes,count,src_ptr,src_stride,dst_ptr,dst_stride){\ int _j;\ char *ps=src_ptr, *pd=dst_ptr;\ @@ -220,64 +100,12 @@ pd += dst_stride;\ }\ } -#endif - -/* macros to ensure ordering of consecutive puts or gets following puts */ -#if defined(LAPI) -# include "lapidefs.h" +#define FENCE_NODE(p) +#define UPDATE_FENCE_STATE(p, op, nissued) -#elif defined(_CRAYMPP) || defined(QUADRICS) || defined(__crayx1)\ - || defined(CRAY_SHMEM) || defined(PORTALS) -#if defined(CRAY) || defined(CRAY_XT) -# include -#else -# include -#ifndef ptrdiff_t -# include -#endif -# include -#endif -# ifdef ELAN_ACC -# define FENCE_NODE(p) {\ - if(((p)armci_clus_last))armci_elan_fence(p);} -# define UPDATE_FENCE_STATE(p, op, nissued) -# else - int cmpl_proc; -# ifdef DECOSF -# define FENCE_NODE(p) if(cmpl_proc == (p)){\ - if(((p)armci_clus_last))shmem_quiet();\ - else asm ("mb"); } -# else -# define FENCE_NODE(p) if(cmpl_proc == (p)){\ - if(((p)armci_clus_last))shmem_quiet(); } -# endif -# define UPDATE_FENCE_STATE(p, op, nissued) if((op)==PUT) cmpl_proc=(p); -# endif -#else -# if defined(GM) && defined(ACK_FENCE) - extern void armci_gm_fence(int p); -# define FENCE_NODE(p) armci_gm_fence(p) -# elif defined(BGML) -# include "bgmldefs.h" -# define FENCE_NODE(p) BGML_WaitProc(p) -# elif defined(ARMCIX) -# define FENCE_NODE(p) ARMCIX_Fence(p) -# else -# define FENCE_NODE(p) -# endif -# define UPDATE_FENCE_STATE(p, op, nissued) - -#endif - - -#ifdef NEC -# define THRESH 1 -# define THRESH1D 1 -#else # define THRESH 32 # define THRESH1D 512 -#endif #define ALIGN_SIZE sizeof(double) /********* interface to C 1D and 2D memory copy functions ***********/ @@ -329,10 +157,10 @@ void c_dcopy13_(const int* const restrict rows, const double* const restrict buf, int* const restrict cur); -#if defined(AIX) || defined(BGML) +#if defined(AIX) # define DCOPY2D c_dcopy2d_u_ # define DCOPY1D c_dcopy1d_u_ -#elif defined(LINUX) || defined(__crayx1) || defined(HPUX64) || defined(DECOSF) || defined(CRAY) || defined(WIN32) || defined(HITACHI) +#elif defined(LINUX) || defined(WIN32) # define DCOPY2D c_dcopy2d_n_ # define DCOPY1D c_dcopy1d_n_ #else @@ -346,156 +174,8 @@ void c_dcopy13_(const int* const restrict rows, /***************************** 1-Dimensional copy ************************/ -#if defined(QUADRICS) -# include - -# if defined(_ELAN_PUTGET_H) -# define qsw_put(src,dst,n,proc) \ - elan_wait(elan_put(elan_base->state,src,dst,n,proc),elan_base->waitType) -# define qsw_get(src,dst,n,proc) \ - elan_wait(elan_get(elan_base->state,src,dst,n,proc),elan_base->waitType) -/* -# define ARMCI_NB_PUT(src,dst,n,proc,phandle)\ - *(phandle)=elan_put(elan_base->state,src,dst,n,proc) -*/ -#ifdef DOELAN4 -extern void armci_elan_put_with_tracknotify(char *src,char *dst,int n,int proc, ELAN_EVENT **phandle); -# define ARMCI_NB_PUT(src,dst,n,proc,phandle)\ - armci_elan_put_with_tracknotify(src,dst,n,proc,phandle) -#endif - -# define ARMCI_NB_GET(src,dst,n,proc,phandle)\ - *(phandle)=elan_get(elan_base->state,src,dst,n,proc) -# define ARMCI_NB_WAIT(handle) if(handle)elan_wait(handle,elan_base->waitType) -# define ARMCI_NB_TEST(handle,_succ) (*(_succ))= (handle)? !elan_poll(handle,1L): 1 -# else -# define qsw_put(src,dst,n,proc) shmem_putmem((dst),(src),(int)(n),(proc)) -# define qsw_get(src,dst,n,proc) shmem_getmem((dst),(src),(int)(n),(proc)) -# endif - -# define armci_put(src,dst,n,proc)\ - if(((proc)<=armci_clus_last) && ((proc>= armci_clus_first))){\ - armci_copy(src,dst,n);\ - } else { qsw_put(src,dst,n,proc);} -# define armci_get(src,dst,n,proc) \ - if(((proc)<=armci_clus_last) && ((proc>= armci_clus_first))){\ - armci_copy(src,dst,n);\ - } else { qsw_get((src),(dst),(int)(n),(proc));} - -#elif defined(CRAY_T3E) || defined(CRAY_SHMEM) -# define armci_copy_disabled(src,dst,n)\ - if((n)<256 || n%sizeof(long) ) memcpy((dst),(src),(n));\ - else {\ - shmem_put((long*)(dst),(long*)(src),(int)(n)/sizeof(long),armci_me);\ - shmem_quiet(); } - -# define armci_put(src,dst,n,proc) \ - shmem_put32((void *)(dst),(void *)(src),(int)(n)/4,(proc));\ - shmem_quiet() - -# define armci_get(src,dst,n,proc) \ - shmem_get32((void *)(dst),(void *)(src),(int)(n)/4,(proc));\ - shmem_quiet() - -#elif defined(HITACHI) - - extern void armcill_put(void *src, void *dst, int bytes, int proc); - extern void armcill_get(void *src, void *dst, int bytes, int proc); - -# define armci_put(src,dst,n,proc) \ - if(((proc)<=armci_clus_last) && ((proc>= armci_clus_first))){\ - armci_copy(src,dst,n);\ - } else { armcill_put((src), (dst),(n),(proc));} - -# define armci_get(src,dst,n,proc)\ - if(((proc)<=armci_clus_last) && ((proc>= armci_clus_first))){\ - armci_copy(src,dst,n);\ - } else { armcill_get((src), (dst),(n),(proc));} - -#elif defined(FUJITSU) - -# include "fujitsu-vpp.h" -# ifndef __sparc -# define armci_copy(src,dst,n) _MmCopy((char*)(dst), (char*)(src), (n)) -# endif -# define armci_put CopyTo -# define armci_get CopyFrom - -#elif defined(LAPI) - -# include - extern lapi_handle_t lapi_handle; - -# define armci_put(src,dst,n,proc)\ - if(proc==armci_me){\ - armci_copy(src,dst,n);\ - } else {\ - if(LAPI_Put(lapi_handle, (uint)proc, (uint)n, (dst), (src),\ - NULL,&(ack_cntr[ARMCI_THREAD_IDX].cntr),&cmpl_arr[proc].cntr))\ - ARMCI_Error("LAPI_put failed",0); else;} - - /**** this copy is nonblocking and requires fence to complete!!! ****/ -# define armci_get(src,dst,n,proc) \ - if(proc==armci_me){\ - armci_copy(src,dst,n);\ - } else {\ - if(LAPI_Get(lapi_handle, (uint)proc, (uint)n, (src), (dst), \ - NULL, &(get_cntr[ARMCI_THREAD_IDX].cntr)))\ - ARMCI_Error("LAPI_Get failed",0);else;} - -# define ARMCI_NB_PUT(src,dst,n,proc,cmplt)\ - {if(LAPI_Setcntr(lapi_handle, &((cmplt)->cntr), 0))\ - ARMCI_Error("LAPI_Setcntr in NB_PUT failed",0);\ - (cmplt)->val=1;\ - if(LAPI_Put(lapi_handle, (uint)proc, (uint)n, (dst), (src),\ - NULL, &((cmplt)->cntr), &cmpl_arr[proc].cntr))\ - ARMCI_Error("LAPI_put failed",0); else;} - -# define ARMCI_NB_GET(src,dst,n,proc,cmplt)\ - {if(LAPI_Setcntr(lapi_handle, &((cmplt)->cntr), 0))\ - ARMCI_Error("LAPI_Setcntr in NB_GET failed",0);\ - (cmplt)->val=1;\ - if(LAPI_Get(lapi_handle, (uint)proc, (uint)n, (src), (dst), \ - NULL, &((cmplt)->cntr)))\ - ARMCI_Error("LAPI_Get NB_GET failed",0);else;} - -# define ARMCI_NB_WAIT(cmplt) CLEAR_COUNTER((cmplt)) -# define ARMCI_NB_TEST(cmplt,_succ) TEST_COUNTER((cmplt),(_succ)) - -#elif defined(PORTALS) -# define armci_put(src,dst,n,proc) \ - if(((proc)<=armci_clus_last) && ((proc>= armci_clus_first))){\ - armci_copy(src,dst,n);\ - } else { PARMCI_Put((src), (dst),(n),(proc));} - -# define armci_get(src,dst,n,proc)\ - if(((proc)<=armci_clus_last) && ((proc>= armci_clus_first))){\ - armci_copy(src,dst,n);\ - } else { PARMCI_Get((src), (dst),(n),(proc));} - -#if 0 -# define ARMCI_NB_PUT(src,dst,n,proc,cmplt)\ - nb_handle->tag=GET_NEXT_NBTAG();armci_portals_put((proc),(src),\ - (dst),(n),cmplt,nb_handle->tag) -# define ARMCI_NB_GET(src,dst,n,proc,cmplt)\ - nb_handle->tag=GET_NEXT_NBTAG();armci_portals_get((proc),(src),\ - (dst),(n),cmplt,nb_handle->tag) -#endif - -#elif defined(BGML) -#define armci_get(src, dst, n, p) PARMCI_Get(src, dst, n, p) -#define armci_put(src, dst, n, p) PARMCI_Put(src, dst, n, p) - -#elif defined(ARMCIX) -#define armci_get(src, dst, n, p) PARMCI_Get(src, dst, n, p) -#define armci_put(src, dst, n, p) PARMCI_Put(src, dst, n, p) -#define ARMCI_NB_WAIT(cmplt) ARMCIX_Wait(&(cmplt)) -#else - -# define armci_get(src,dst,n,p) armci_copy((src),(dst),(n)) -# define armci_put(src,dst,n,p) armci_copy((src),(dst),(n)) - -#endif +#define armci_get(src,dst,n,p) armci_copy((src),(dst),(n)) +#define armci_put(src,dst,n,p) armci_copy((src),(dst),(n)) #ifndef MEM_FENCE # define MEM_FENCE {} diff --git a/armci/src/include/fujitsu-vpp.h b/armci/src/include/fujitsu-vpp.h deleted file mode 100644 index 2ecd1c3aa..000000000 --- a/armci/src/include/fujitsu-vpp.h +++ /dev/null @@ -1,153 +0,0 @@ -/* This file "fujitsu-vpp.h" #defines common set of macros used in - * the Global Array and TCGMSG-MPI ports to Fujitsu VX and VPP systems. - * - * Author: Jarek Nieplocha - * Organization: Pacific Northwest National Laboratory, Richland, WA 99552 - * Date: 08.21.1997 - * History: 11.27.1997 added optimized 2-D array copy macros - * 03.10.2000 Both strided operations made optional - * - * Notes: - * 1. This file contains calls to an undisclosed Fujitsu MPlib library and - * is NOT intended for public distribution. - */ - -#ifndef _FUJITSU_VPP_H -#define _FUJITSU_VPP_H - -#ifdef __sparc - /* the AP3000 version is limited */ -# include "/opt/FSUNaprun/include/vpp_mplib.h" -# define VPP_NbWrite VPP_Write -#else - /* specify full path to MPlib include file (here is my account on Fecit VX)*/ -# include "/home/jniep/include/mplib.h" -#endif - -#define TOTAL_NUM_SEM 128 /* number of MPlib semaphores per node */ -#define NUM_SEM 16 /* we will use only that+1 many semaphores */ -#define SEM_BASE (TOTAL_NUM_SEM-NUM_SEM-2) -#define NXTV_SEM SEM_BASE+NUM_SEM+1 /*nxtval server semaphore in tcgmsg-mpi */ - -#define MPLIB_TERMINATE {\ - fflush(stdout);sleep(1);\ - fprintf(stderr,"%d: MPlib call failed: %s in line %d compiled on %s\n",\ - VPP_MyPe(),__FILE__,__LINE__,__DATE__);\ - fprintf(stdout,"%d: MPlib call failed: %s in line %d compiled on %s\n",\ - VPP_MyPe(),__FILE__,__LINE__,__DATE__);\ - fflush(stdout); sleep(1);\ - VPP_Abort();\ -} - -#define NATIVE_LOCK0(proc,mtx) if(VPP_SemWait((PROC)(proc),(mtx)))MPLIB_TERMINATE -#define NATIVE_UNLOCK0(proc,mtx) if(VPP_SemPost((PROC)(proc),(mtx)))MPLIB_TERMINATE - -#define NAT_LOCK(proc) if(VPP_SemWait((PROC)(proc),SEM_BASE))MPLIB_TERMINATE -#define NAT_UNLOCK(proc) if(VPP_SemPost((PROC)(proc),SEM_BASE))MPLIB_TERMINATE - -#define NATIVE_BARRIER VPP_Barrier -#define NATIVE_BARRIER__() MPI_Barrier(MPI_COMM_WORLD) - - -/**************************** MEMORY COPY macros ************************/ -/*copy n words from remote memory (proc) at (src) to local memory (dst) */ -#define CopyElemFrom(src,dst,n,proc) \ - if(VPP_Read((PROC)(proc),(ADDRP) (src), (ADDRP)(dst), n*sizeof(long)))\ - MPLIB_TERMINATE - -/*copy n words to remote memory (proc) at (dst) from local memory (src) */ -#define CopyElemTo(src,dst,n,proc) \ - if(VPP_Write((PROC)(proc),(ADDRP)(dst),(ADDRP) (src),n*sizeof(long)))\ - MPLIB_TERMINATE - -/*copy n bytes from remote memory (proc) at (src) to local memory (dst) */ -#define CopyFrom(src,dst,n,proc) \ - if(VPP_Read((PROC)(proc),(ADDRP) (src), (ADDRP)(dst), n))\ - MPLIB_TERMINATE - -/*copy n bytes o remote memory (proc) at (dst) from local memory (src) */ -#define CopyTo(src,dst,n,proc) \ - if(VPP_Write((PROC)(proc),(ADDRP)(dst),(ADDRP) (src),n ))\ - MPLIB_TERMINATE - -/* Memory copy for 2-dimensional array patches between local and remote memory: - * we use nonblocking read/write operations followed by a blocking read/write - * of 1 byte to force completion of outstanding nonblocking operations. - * This works because of the "in-order" rule for remote memory operations. - * This code is using byte- rather than word- interface. - */ - -#ifdef VPP_STRIDED_WRITE -# define CopyPatchTo(src, ld_src, dst, ld_dst, blocks, bytes, proc){\ - if(VPP_WriteBothStrided((PROC)(proc), (ADDRP)dst, (ADDRP)src,\ - bytes, ld_dst, ld_src, bytes*blocks))MPLIB_TERMINATE; } -#else -# define CopyPatchTo(src, ld_src, dst, ld_dst, blocks, bytes, proc){\ - int _iii, _stat=0, _bytes2copy=1;\ - char *ps=(char*)src, *pd=(char*)dst;\ - if((blocks)>1)for (_iii=0;_iii<(blocks);_iii++){\ - _stat += VPP_NbWrite((PROC)(proc),(ADDRP)pd,(ADDRP)ps,(bytes));\ - ps += (ld_src);\ - pd += (ld_dst);\ - }else _bytes2copy=(bytes);\ - _stat += VPP_Write((PROC)(proc),(ADDRP)(dst),(ADDRP)(src),_bytes2copy);\ - if(_stat)MPLIB_TERMINATE; } -#endif - -#ifdef VPP_STRIDED_READ -# define CopyPatchFrom(src, ld_src, dst, ld_dst, blocks, _bytes, proc){\ - if(VPP_ReadBothStrided((PROC)(proc), (ADDRP)src, (ADDRP)dst,\ - _bytes, ld_src, ld_dst, _bytes*blocks))MPLIB_TERMINATE;} -#else -# define CopyPatchFrom(src, ld_src, dst, ld_dst, blocks, _bytes, proc){\ - int _iii, _stat=0, _bytes2copy=1;\ - char *ps=(char*)src, *pd=(char*)dst;\ - if((blocks)>1)for (_iii=0;_iii<(blocks);_iii++){\ - _stat += VPP_Read((PROC)(proc),(ADDRP)ps,(ADDRP)pd,(_bytes));\ - ps += (ld_src);\ - pd += (ld_dst);\ - }else _bytes2copy=(_bytes);\ - _stat += VPP_Read((PROC)(proc),(ADDRP)(src),(ADDRP)(dst),_bytes2copy);\ - if(_stat)MPLIB_TERMINATE; } -#endif - -#if 0 -#define MAX_IDS 10 -static DRWD *_id[MAX_IDS]; - -#define CopyPatchFromXX(src, ld_src, dst, ld_dst, blocks, _bytes, proc){\ - int _iii, _stat=0, _idx;\ - char *ps=(char*)src, *pd=(char*)dst;\ -printf("lds=%d ldd=%d bl=%d bytes=%d proc=%d\n",ld_src,ld_dst,blocks, _bytes, proc); fflush(stdout);\ - if((blocks)>1){\ - for (_iii=0; _iii<(blocks); _iii++){\ - _idx = _iii%MAX_IDS;\ - if(_iii >= MAX_IDS)_stat += VPP_ReadDone(_id[_idx]);\ - _id[_idx]=VPP_StartRead((PROC)(proc),(ADDRP)ps,(ADDRP)pd,(_bytes));\ - ps += (ld_src);\ - pd += (ld_dst);\ - }\ - for (_idx=0; _idx < (blocks) && _idx < MAX_IDS; _idx++){\ - _stat += VPP_ReadDone(_id[_idx]);\ - }\ - }else _stat = VPP_Read((PROC)(proc),(ADDRP)(src),(ADDRP)(dst),(_bytes));\ - if(_stat)MPLIB_TERMINATE;\ -} - -#define CopyPatchFromYY(src, ld_src, dst, ld_dst, blocks, _bytes, proc){\ - int _iii, _stat=0, _bytes2copy=1;\ - char *ps=(char*)src, *pd=(char*)dst;\ - if((blocks)>1) { for (_iii=0;_iii<(blocks);_iii++){\ - _id[0]= VPP_StartRead((PROC)(proc),(ADDRP)ps,(ADDRP)pd,(_bytes));\ - ps += (ld_src);\ - pd += (ld_dst);\ - }\ - _stat += VPP_ReadDone(_id[0]);\ - }else _bytes2copy=(_bytes);\ - _stat += VPP_Read((PROC)(proc),(ADDRP)(src),(ADDRP)(dst),_bytes2copy);\ - if(_stat)MPLIB_TERMINATE;\ -} -#endif - - -#endif diff --git a/armci/src/include/kr_malloc.h b/armci/src/include/kr_malloc.h index 08e7b97f3..2015dd627 100644 --- a/armci/src/include/kr_malloc.h +++ b/armci/src/include/kr_malloc.h @@ -1,13 +1,7 @@ #ifndef KR_MALLOC_H /* K&R malloc */ #define KR_MALLOC_H -#ifdef CRAY #define LOG_ALIGN 6 -#elif defined(KSR) -#define LOG_ALIGN 7 -#else -#define LOG_ALIGN 6 -#endif #define ALIGNMENT (1 << LOG_ALIGN) diff --git a/armci/src/include/locks.h b/armci/src/include/locks.h index 4e2c5985f..792d523bb 100644 --- a/armci/src/include/locks.h +++ b/armci/src/include/locks.h @@ -8,7 +8,7 @@ #define MAX_LOCKS 1024 #define NUM_LOCKS MAX_LOCKS -#if !(defined(PMUTEX) || defined(PSPIN) || defined(CYGNUS) || defined(CRAY_XT)) +#if !(defined(PMUTEX) || defined(PSPIN) || defined(CYGNUS) ) # include "spinlock.h" #endif @@ -16,15 +16,13 @@ # error cannot run #endif -#if (defined(SPINLOCK) || defined(PMUTEX) || defined(PSPIN) || defined(HITACHI)) && !(defined(BGML) || defined(DCMF)) +#if (defined(SPINLOCK) || defined(PMUTEX) || defined(PSPIN)) # include "armci_shmem.h" typedef struct { long off; long idlist[SHMIDLEN]; } lockset_t; extern lockset_t lockid; -#elif defined(BGML) || defined(DCMF) -typedef int lockset_t; #endif #if defined(PMUTEX) @@ -47,49 +45,11 @@ extern PAD_LOCK_T *_armci_int_mutexes; # define PAD_LOCK_T LOCK_T extern PAD_LOCK_T *_armci_int_mutexes; -#elif defined(SPINLOCK) && defined(SGIALTIX) -# define NAT_LOCK(x,p) armci_acquire_spinlock((LOCK_T*)( ((PAD_LOCK_T*)(((void**)_armci_int_mutexes)[p]))+x )) -# define NAT_UNLOCK(x,p) armci_release_spinlock((LOCK_T*)( ((PAD_LOCK_T*)(((void**)_armci_int_mutexes)[p]))+x )) -extern PAD_LOCK_T *_armci_int_mutexes; - #elif defined(SPINLOCK) # define NAT_LOCK(x,p) armci_acquire_spinlock((LOCK_T*)(_armci_int_mutexes+(x))) # define NAT_UNLOCK(x,p) armci_release_spinlock((LOCK_T*)(_armci_int_mutexes+(x))) extern PAD_LOCK_T *_armci_int_mutexes; -#elif defined(HITACHI) -extern void armcill_lock(int mutex, int proc); -extern void armcill_unlock(int mutex, int proc); -# define LOCK_T int -# define PAD_LOCK_T LOCK_T -# define NAT_LOCK(x,p) armcill_lock((x),(p)) -# define NAT_UNLOCK(x,p) armcill_unlock((x),(p)) -extern PAD_LOCK_T *_armci_int_mutexes; - -#elif defined(SGI) -# define SGI_SPINS 100 -# include -typedef struct { - int id; - ulock_t * lock_array[NUM_LOCKS]; -}lockset_t; -extern lockset_t lockset; -# define NAT_LOCK(x,p) (void) uswsetlock(lockset.lock_array[(x)],SGI_SPINS) -# define NAT_UNLOCK(x,p) (void) usunsetlock(lockset.lock_array[(x)]) - -#elif defined(CONVEX) -# include -typedef struct{ - unsigned state; - unsigned pad[15]; -} lock_t; -typedef int lockset_t; -extern lock_t *lock_array; -extern void setlock(unsigned * volatile lp); -extern void unsetlock(unsigned * volatile lp); -# define NAT_LOCK(x,p) (void) setlock(&lock_array[x].state) -# define NAT_UNLOCK(x,p) (void) unsetlock(&lock_array[(x)].state) - #elif defined(WIN32) typedef int lockset_t; extern void setlock(int); @@ -97,52 +57,11 @@ extern void unsetlock(int); # define NAT_LOCK(x,p) setlock(x) # define NAT_UNLOCK(x,p) unsetlock(x) -#elif defined(CRAY_YMP) && !defined(__crayx1) -# include -typedef int lockset_t; -extern lock_t cri_l[NUM_LOCKS]; -# pragma _CRI common cri_l -# define NAT_LOCK(x,p) t_lock(cri_l+(x)) -# define NAT_UNLOCK(x,p) t_unlock(cri_l+(x)) - -#elif defined(CRAY_T3E) || defined(__crayx1) || defined(CATAMOUNT) || defined(CRAY_SHMEM) || defined(PORTALS) -# include -# if defined(CRAY) || defined(CRAY_XT) -# include -# endif -# if defined(DECOSF) || defined(LINUX64) || defined(__crayx1) || defined(CATAMOUNT) -# define _INT_MIN_64 (LONG_MAX-1) -# endif -# undef NUM_LOCKS -# define NUM_LOCKS 4 -static long armci_lock_var[4]={0,0,0,0}; -typedef int lockset_t; -# define INVALID (long)(_INT_MIN_64 +1) -# define NAT_LOCK(x,p) while( shmem_swap(armci_lock_var+(x),INVALID,(p)) ) -# define NAT_UNLOCK(x,p) shmem_swap(armci_lock_var+(x), 0, (p)) - -#elif defined(SYSV) && defined(LAPI) && defined(AIX) -int **_armci_int_mutexes; -# define NAT_LOCK(x,p) armci_lapi_lock(_armci_int_mutexes[armci_master]+(x)) -# define NAT_UNLOCK(x,p) armci_lapi_unlock(_armci_int_mutexes[armci_master]+(x)) -typedef int lockset_t; - #elif defined(CYGNUS) typedef int lockset_t; # define NAT_LOCK(x,p) armci_die("does not run in parallel",0) # define NAT_UNLOCK(x,p) armci_die("does not run in parallel",0) -#elif defined(LAPI) && !defined (LINUX) -# include -typedef int lockset_t; -extern pthread_mutex_t _armci_mutex_thread; -# define NAT_LOCK(x,p) pthread_mutex_lock(&_armci_mutex_thread) -# define NAT_UNLOCK(x,p) pthread_mutex_unlock(&_armci_mutex_thread) - -#elif defined(FUJITSU) -typedef int lockset_t; -# include "fujitsu-vpp.h" - #elif defined(SYSV) || defined(MACX) # include "semaphores.h" # undef NUM_LOCKS @@ -163,12 +82,7 @@ extern void CreateInitLocks(int num, lockset_t *id); extern void InitLocks(int num , lockset_t id); extern void DeleteLocks(lockset_t id); -#ifdef FUJITSU -# define NATIVE_LOCK(x,p) if(armci_nproc>1) { NAT_LOCK(p); } -# define NATIVE_UNLOCK(x,p) if(armci_nproc>1) { NAT_UNLOCK(p); } -#else -# define NATIVE_LOCK(x,p) if(armci_nproc>1) { NAT_LOCK(x,p); } -# define NATIVE_UNLOCK(x,p) if(armci_nproc>1) { NAT_UNLOCK(x,p); } -#endif +#define NATIVE_LOCK(x,p) if(armci_nproc>1) { NAT_LOCK(x,p); } +#define NATIVE_UNLOCK(x,p) if(armci_nproc>1) { NAT_UNLOCK(x,p); } #endif /* _ARMCI_LOCKS_H_ */ diff --git a/armci/src/include/memlock.h b/armci/src/include/memlock.h index 024ee138a..46cfe8a4e 100644 --- a/armci/src/include/memlock.h +++ b/armci/src/include/memlock.h @@ -21,9 +21,7 @@ typedef struct { extern void** memlock_table_array; extern int *armci_use_memlock_table; -#if defined(LAPI ) || defined(FUJITSU) || defined(PTHREADS) || \ - defined(QUADRICS) || defined(HITACHI) || \ - defined(CYGWIN) || defined(__crayx1) || defined(NEC) || \ +#if defined(PTHREADS) || defined(CYGWIN) || \ (defined(LINUX64) && defined(__GNUC__) && defined(__alpha__)) # define ARMCI_LOCKMEM armci_lockmem_ # define ARMCI_UNLOCKMEM armci_unlockmem_ diff --git a/armci/src/include/request.h b/armci/src/include/request.h index c6e3cc380..5b866d6ea 100644 --- a/armci/src/include/request.h +++ b/armci/src/include/request.h @@ -17,32 +17,14 @@ extern void _armci_buf_test_nb_request(int bufid,unsigned int tag, int *retcode) extern void _armci_buf_set_tag(void *bufptr,unsigned int tag,short int protocol); extern void _armci_buf_clear_all(); -extern INLINE char *_armci_buf_get_clear_busy(int size, int operation, int to); -extern INLINE void _armci_buf_set_busy(void *buf, int state); -extern INLINE void _armci_buf_set_busy_idx(int tbl_idx, int state); -extern INLINE int _armci_buf_cmpld(int bufid); -extern INLINE void _armci_buf_set_cmpld(void *buf, int state); -extern INLINE void _armci_buf_set_cmpld_idx(int idx, int state); - -#ifdef LAPI -# include "lapidefs.h" -#elif PORTALS -# include "armci_portals.h" - typedef long msg_tag_t; -#elif defined(GM) -# include "myrinet.h" -#elif defined(DOELAN4) -# include "elandefs.h" -#elif defined(QUADRICS) -# include - typedef void* msg_tag_t; -# ifdef _ELAN_PUTGET_H -# define NB_CMPL_T ELAN_EVENT* -# endif -#elif defined(VIA) -# include "via.h" - typedef void* msg_tag_t; -#elif defined(VAPI) +extern char *_armci_buf_get_clear_busy(int size, int operation, int to); +extern void _armci_buf_set_busy(void *buf, int state); +extern void _armci_buf_set_busy_idx(int tbl_idx, int state); +extern int _armci_buf_cmpld(int bufid); +extern void _armci_buf_set_cmpld(void *buf, int state); +extern void _armci_buf_set_cmpld_idx(int idx, int state); + +#if defined(VAPI) # include "armci-vapi.h" #elif defined(SOCKETS) # include "sockets.h" @@ -50,20 +32,6 @@ extern INLINE void _armci_buf_set_cmpld_idx(int idx, int state); typedef unsigned short msg_id_t; # define DTAG_ ((1<<(sizeof(msg_id_t)*8))-1) # define NB_SOCKETS_ /* define NB_SOCKETS to allow non-blocking path */ -#elif defined(HITACHI) -# include "sr8k.h" -#elif defined(BGML) -# include "bgml.h" -# include "bgmldefs.h" -# define NB_CMPL_T BG1S_t - typedef long msg_tag_t; -#elif defined(ARMCIX) -# ifndef ARMCIX_OPAQUE_SIZE -# define ARMCIX_OPAQUE_SIZE 8 -# endif - typedef char armcix_opaque_t [ARMCIX_OPAQUE_SIZE]; -# define NB_CMPL_T armcix_opaque_t - typedef long msg_tag_t; #elif defined(MPI_SPAWN) || defined(MPI_MT) # include "mpi2.h" typedef long msg_tag_t; @@ -92,15 +60,9 @@ typedef struct{ short int agg_flag; int op; int proc; -#ifdef PORTALS - int flag; -#endif #ifdef NB_CMPL_T NB_CMPL_T cmpl_info; #endif -#ifdef BGML - unsigned count; -#endif } armci_ireq_t; /*\ the internal request structure for non-blocking api. \*/ @@ -110,19 +72,10 @@ extern void set_nbhandle(armci_ihdl_t *nbh, armci_hdl_t *nb_handle, int op, int proc); typedef struct { -#if 0 - int to:16; /* message recipient */ - int from:16; /* message sender */ -#else - short int to; /* message recipient */ - short int from; /* message sender */ -#endif +short int to; /* message recipient */ +short int from; /* message sender */ unsigned int operation:8; /* operation code */ -#if defined(DOELAN4) -unsigned int format:2; /* data format used */ -unsigned int dowait:1; /* indicates if should wait for data */ -unsigned int inbuf:1; /* data is in one of the buffers */ -#elif defined(CLIENT_BUF_BYPASS) || defined(LAPI2) +#if defined(CLIENT_BUF_BYPASS) unsigned int format:2; /* data format used */ unsigned int pinned:1; /* indicates if sender memory was pinned */ unsigned int bypass:1; /* indicate if bypass protocol used */ @@ -187,7 +140,7 @@ extern BUF_INFO_T *_armci_buf_to_bufinfo(void *buf); #define BUF_TO_BUFINFO _armci_buf_to_bufinfo void armci_complete_req_buf(BUF_INFO_T *info, void *buffer); -extern INLINE BUF_INFO_T *_armci_id_to_bufinfo(int bufid); +extern BUF_INFO_T *_armci_id_to_bufinfo(int bufid); #if 0 && defined(DATA_SERVER) && defined(SOCKETS) #define MAX_BUFS 1 @@ -214,37 +167,18 @@ typedef struct { #ifndef MSG_BUFLEN_DBL -# if defined(HITACHI) -# define MSG_BUFLEN_DBL 0x50000 -# else # define MSG_BUFLEN_DBL 50000 -# endif #endif #define MSG_BUFLEN sizeof(double)*MSG_BUFLEN_DBL extern char* MessageRcvBuffer; extern char* MessageSndBuffer; -#ifdef LAPI -# define GET_SEND_BUFFER_(_size)(MessageSndBuffer+sizeof(lapi_cmpl_t));\ - CLEAR_COUNTER(*((lapi_cmpl_t*)MessageSndBuffer));\ - SET_COUNTER(*((lapi_cmpl_t*)MessageSndBuffer),1); -# define GET_SEND_BUFFER _armci_buf_get -# define GA_SEND_REPLY armci_lapi_send -#else # ifdef SOCKETS # define GA_SEND_REPLY(tag, buf, len, p) armci_sock_send(p,buf,len) # else # define GA_SEND_REPLY(tag, buf, len, p) # endif -#endif - -#ifdef QUADRICS_ -# define GET_SEND_BUFFER(_size,_op,_to) MessageSndBuffer;\ - while(((request_header_t*)MessageSndBuffer)->tag)\ - armci_util_spin(100, MessageSndBuffer) -# define FREE_SEND_BUFFER(_ptr) ((request_header_t*)MessageSndBuffer)->tag = (void*)0 -#endif #ifndef GET_SEND_BUFFER # define GET_SEND_BUFFER(_size,_op,_to) MessageSndBuffer @@ -263,7 +197,7 @@ typedef struct { } buf_arg_t; /*includes for SERVER_LOCK*/ -#if defined(SERVER_THREAD) && !defined(VIA) +#if defined(SERVER_THREAD) extern void armci_rem_lock(int mutex, int proc, int *ticket); extern void armci_rem_unlock(int mutex, int proc, int ticket); extern void armci_unlock_waiting_process(msg_tag_t tag,int proc, int ticket); @@ -321,7 +255,6 @@ extern void armci_send_data(request_header_t* msginfo, void *data); extern int armci_server_unlock_mutex(int mutex, int p, int tkt, msg_tag_t* tag); extern void armci_rcv_vector_data(int p, request_header_t* msginfo, armci_giov_t dr[], int len); -#if !defined(LAPI) extern void armci_wait_for_server(); extern void armci_start_server(); extern void armci_transport_cleanup(); @@ -336,7 +269,7 @@ extern void armci_client_connect_to_servers(); extern void armci_data_server(void *mesg); extern void armci_server_initial_connection(); extern void armci_call_data_server(); -#endif + #ifdef SOCKETS extern void armci_ReadStridedFromDirect(int proc, request_header_t* msginfo, void *ptr, int strides, int stride_arr[], int count[]); @@ -351,10 +284,7 @@ extern void armci_server_goodbye(request_header_t* msginfo); extern void armci_serv_quit(); extern void armci_server_goodbye(request_header_t* msginfo); #endif -#ifdef HITACHI -extern void armci_server_goodbye(request_header_t* msginfo); -extern void armci_serv_quit(); -#endif + extern void armci_server_ipc(request_header_t* msginfo, void* descr, void* buffer, int buflen); diff --git a/armci/src/include/semaphores.h b/armci/src/include/semaphores.h index 379f95466..4648bd176 100644 --- a/armci/src/include/semaphores.h +++ b/armci/src/include/semaphores.h @@ -29,12 +29,6 @@ union semun { # endif #endif -/* on HPUX 10.2 SEMMSL is much bigger than realistically we can allocate */ -#ifdef HPUX -#undef SEMMSL -#define SEMMSL 64 -#endif - extern struct sembuf sops; extern int semaphoreID; int semop(); diff --git a/armci/src/include/spinlock.h b/armci/src/include/spinlock.h index 9999e6e0c..771fb1ca0 100644 --- a/armci/src/include/spinlock.h +++ b/armci/src/include/spinlock.h @@ -54,45 +54,6 @@ static int testandset(void *spinlock) { } # define TESTANDSET testandset -#elif defined(HPUX) && defined(__ia64) /* HPUX on IA64, non gcc */ -# if DEBUG_SPINLOCK -# warning SPINLOCK: HPUX ia64 -# endif -# define SPINLOCK -typedef unsigned int slock_t; -# include -# define TESTANDSET(lock) _Asm_xchg(_SZ_W, lock, 1, _LDHINT_NONE) -# define RELEASE_SPINLOCK(lock) (*((volatile LOCK_T *) (lock)) = 0) - -#elif defined(__ia64) -# if DEBUG_SPINLOCK -# warning SPINLOCK: ia64 -# endif -# define SPINLOCK -# include "atomic_ops_ia64.h" -static int testandset(void *spinlock) { - int val=1; - int res; - atomic_swap_int(spinlock, val, &res); - return res; -} -# define TESTANDSET testandset - -#elif defined(DECOSF) -# if DEBUG_SPINLOCK -# warning SPINLOCK: DECOSF -# endif -# error "no implementation" - -#elif defined(SGI) -# if DEBUG_SPINLOCK -# warning SPINLOCK: SGI -# endif -# include -# define SPINLOCK -# define TESTANDSET(x) __lock_test_and_set((x), 1) -# define RELEASE_SPINLOCK __lock_release - /*#elif defined(AIX)*/ #elif HAVE_SYS_ATOMIC_OP_H # if DEBUG_SPINLOCK @@ -103,41 +64,8 @@ static int testandset(void *spinlock) { # define TESTANDSET(x) (_check_lock((x), 0, 1)==TRUE) # define RELEASE_SPINLOCK(x) _clear_lock((x),0) -#elif defined(SOLARIS) -# if DEBUG_SPINLOCK -# warning SPINLOCK: SOLARIS -# endif -# include -# include -# define SPINLOCK -# define TESTANDSET(x) (!_lock_try((x))) -# define RELEASE_SPINLOCK _lock_clear - #elif defined(MACX) -#elif defined(HPUX__) -# if DEBUG_SPINLOCK -# warning SPINLOCK: HPUX__ -# endif -extern int _acquire_lock(); -extern void _release_lock(); -# define SPINLOCK -# define TESTANDSET(x) (!_acquire_lock((x))) -# define RELEASE_SPINLOCK _release_lock - -#elif defined(NEC) -# if DEBUG_SPINLOCK -# warning SPINLOCK: NEC -# endif -extern ullong ts1am_2me(); -# define LOCK_T ullong -# define _LKWD (1ULL << 63) -# define SPINLOCK -# define TESTANDSET(x) ((_LKWD & ts1am_2me(_LKWD, 0xffULL, (ullong)(x)))) -# define MEMORY_BARRIER mpisx_clear_cache -extern void mpisx_clear_cache(); -# define RELEASE_SPINLOCK(x) ts1am_2me(0ULL, 0xffULL, (ullong)x); - #endif #ifdef SPINLOCK @@ -179,23 +107,19 @@ static inline void armci_init_spinlock(LOCK_T *mutex) static inline void armci_acquire_spinlock(LOCK_T *mutex) { -#if defined(BGML) || defined(DCMF) - return; -#else int loop=0, maxloop =10; while (TESTANDSET(mutex)){ loop++; if(loop==maxloop){ -# if DEBUG_ +#if DEBUG_ extern int armci_me; printf("%d:spinlock sleeping\n",armci_me); fflush(stdout); -# endif +#endif usleep(1); loop=0; } } -#endif } #ifdef RELEASE_SPINLOCK @@ -207,23 +131,22 @@ static inline void armci_acquire_spinlock(LOCK_T *mutex) #else static inline void armci_release_spinlock(LOCK_T *mutex) { -#if defined(BGML) || defined(DCMF) - return; -#else -# ifdef MEMORY_BARRIER - MEMORY_BARRIER (); -# endif +#ifdef MEMORY_BARRIER + MEMORY_BARRIER(); +#endif + #if OPENPA OPA_store_int(mutex, 0); #else *mutex =0; #endif -# ifdef MEMORY_BARRIER + +#ifdef MEMORY_BARRIER MEMORY_BARRIER (); -# endif -# if (defined(MACX)||defined(LINUX)) && defined(__GNUC__) && defined(__ppc__) +#endif + +#if (defined(MACX)||defined(LINUX)) && defined(__GNUC__) && defined(__ppc__) __asm__ __volatile__ ("isync" : : : "memory"); -# endif #endif } #endif /* RELEASE_SPINLOCK */ diff --git a/armci/src/include/utils.h b/armci/src/include/utils.h index 9f9049fd8..453f97b97 100644 --- a/armci/src/include/utils.h +++ b/armci/src/include/utils.h @@ -32,13 +32,7 @@ # define THREAD_UNLOCK(x) pthread_mutex_unlock(&x) #else -#ifndef INLINE -# define INLINE # include "spinlock.h" -# undef INLINE -#else -# include "spinlock.h" -#endif typedef LOCK_T thread_lock_t; # define THREAD_LOCK_INIT(x) armci_init_spinlock(&x) diff --git a/armci/src/locks/locks.c b/armci/src/locks/locks.c index 49cf9d1e5..65482b1d3 100644 --- a/armci/src/locks/locks.c +++ b/armci/src/locks/locks.c @@ -23,36 +23,9 @@ extern void armci_die(char*,int); void **ptr_arr; -#ifdef SGIALTIX - -void CreateInitLocks(int num_locks, lockset_t *plockid) -{ -int locks_per_proc, size; - - /* locks per process in the SMP node */ - locks_per_proc = num_locks/armci_clus_info[armci_clus_me].nslave + 1; - locks_per_proc = num_locks; /* this is am altix hack and no clue why this is works */ - size=locks_per_proc*sizeof(PAD_LOCK_T); - ptr_arr = (void**)malloc(armci_nproc*sizeof(void*)); - PARMCI_Malloc(ptr_arr, size); - _armci_int_mutexes = (PAD_LOCK_T*) ptr_arr; - bzero((char*)ptr_arr[armci_me],size); -} - -void DeleteLocks(lockset_t lockid) { - ptr_arr = (void**)_armci_int_mutexes; - PARMCI_Free(ptr_arr[armci_me]); - _armci_int_mutexes = (PAD_LOCK_T*)0; -} - -#else - void CreateInitLocks(int num_locks, lockset_t *plockid) { int locks_per_proc, size; -#ifdef BGML - fprintf(stderr,"createinitlocks\n"); -#endif ptr_arr = (void**)malloc(armci_nproc*sizeof(void*)); locks_per_proc = (num_locks*armci_nclus)/armci_nproc + 1; size=locks_per_proc*sizeof(PAD_LOCK_T); @@ -102,156 +75,6 @@ void DeleteLocks(lockset_t lockid) { _armci_int_mutexes = (PAD_LOCK_T*)0; } -#endif - - -/********************* all SGI systems ****************/ -#elif defined(SGI) -#define FILE_LEN 200 -lockset_t lockset; -static char arena_name[FILE_LEN]; -usptr_t *arena_ptr; -static int avail =0; - -extern char *getenv(const char *); - -void CreateInitLocks(int num_locks, lockset_t *lockid) -{ -int i; -char *tmp; - - if(num_locks > NUM_LOCKS) armci_die("To many locks requested", num_locks); - lockset.id = (int)getpid(); - if (!(tmp = getenv("ARENA_DIR"))) tmp = "/tmp"; - sprintf(arena_name,"%s/armci_arena%d.%ld", tmp,armci_clus_me,lockset.id); - - (void) usconfig(CONF_ARENATYPE, US_GENERAL); - (void) usconfig(CONF_INITUSERS, (unsigned int) - armci_clus_info[armci_clus_me].nslave+1); /* +1 for server */ - arena_ptr = usinit(arena_name); - if(!arena_ptr) armci_die("Failed to Create Arena", 0); - - for(i=0; i -#include -#include -#include -#include -#include - -#define FILE_LEN 200 -lock_t *lock_array; -static char file_name[FILE_LEN]; -static int fd=-1; -static unsigned shmem_size=-1; - - -void CreateInitLocks(int num_locks, lockset_t *lockid) -{ -int i; - - if(num_locks > NUM_LOCKS) armci_die("To many locks requested", num_locks); - *lockid = (int)getpid(); - sprintf(file_name,"/tmp/ga.locks.%ld", *lockid); - if ( (fd = open(file_name, O_RDWR|O_CREAT, 0666)) < 0 ) - armci_die("CreateInitLocks: failed to open temporary file",0); - - shmem_size = (NUM_LOCKS)*sizeof(lock_t); - lock_array = (lock_t*) mmap((caddr_t) 0, shmem_size, - PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|CNX_MAP_SEMAPHORE|MAP_SHARED, fd, 0); - - if(((unsigned)lock_array)%16)armci_die("CreateInitLocks: not aligned",0); - for (i=0; i NUM_LOCKS) armci_die("To many locks requested", num_locks); - sprintf(file_name,"/tmp/ga.locks.%ld", lockid); - if ( (fd = open(file_name, O_RDWR|O_CREAT, 0666)) < 0 ) - armci_die("InitLocks: failed to open temporary file",0); - - shmem_size = (NUM_LOCKS)*sizeof(lock_t); - lock_array = (lock_t*) mmap((caddr_t) 0, shmem_size, - PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|CNX_MAP_SEMAPHORE|MAP_SHARED, fd, 0); - if(((unsigned)lock_array)%16)armci_die("InitLocks: not aligned",0); -} - - -void DeleteLocks(lockset_t lockid) -{ - lock_array = 0; - (void)unlink(file_name); /*ignore armci_die code: file might be already gone*/ - (void)munmap((char *) shmem_size, 0); -} - - -void setlock(unsigned * volatile lp) -{ -volatile unsigned flag; - - flag = fetch_and_inc32(lp); - while(flag){ - flag = fetch32(lp); - } -} - - -void unsetlock(unsigned * volatile lp) -{ - (void)fetch_and_clear32(lp); -} - #elif defined(WIN32) /****************************** Windows NT ********************************/ @@ -314,30 +137,6 @@ void unsetlock(int mutex) if(ReleaseMutex(mutex_arr[mutex])==FALSE)armci_die("unsetlock: failed",mutex); } - -#elif defined(CRAY_YMP) - -lock_t cri_l[NUM_LOCKS]; -#pragma _CRI common cri_l - -void CreateInitLocks(int num_locks, lockset_t *lockid) -{ - int i; - if(num_locks > NUM_LOCKS) armci_die("To many locks requested", num_locks); - - for(i=0;i> LOG_CALGN) << LOG_CALGN) -#ifdef CRAY_T3E -#pragma _CRI cache_align table -#endif -static memlock_t table[MAX_SLOTS]; -#if defined(SGIALTIX) || (defined(CRAY_SHMEM) && defined(CRAY_XT)) -#define MAX_SEGS 512 -armci_memoffset_t armci_memoffset_table[MAX_SEGS]; -static short int seg_count=0; -static short int new_seg=0; -#endif +static memlock_t table[MAX_SLOTS]; /*\ simple locking scheme that ignores addresses \*/ void armci_lockmem_(void *pstart, void *pend, int proc) { -#ifdef BGML - bgml_lockmem(pstart, pend, proc); -#else - -#if defined(CLUSTER) && !defined(SGIALTIX) +#if defined(CLUSTER) int lock = (proc-armci_clus_info[armci_clus_id(proc)].master)%NUM_LOCKS; #else int lock = 0; @@ -76,38 +58,21 @@ void armci_lockmem_(void *pstart, void *pend, int proc) } NATIVE_LOCK(lock,proc); -# ifdef LAPI - { - extern int kevin_ok; - kevin_ok=0; - } -# endif + if(DEBUG_){ printf("%d: armci_lockmem_ done\n",armci_me); fflush(stdout); } -#endif } void armci_unlockmem_(int proc) { -#ifdef BGML - bgml_unlockmem(proc); -#else - -#if defined(CLUSTER) && !defined(SGIALTIX) +#if defined(CLUSTER) int lock = (proc-armci_clus_info[armci_clus_id(proc)].master)%NUM_LOCKS; #else int lock = 0; #endif NATIVE_UNLOCK(lock,proc); -# ifdef LAPI - { - extern int kevin_ok; - kevin_ok=1; - } -# endif -#endif } @@ -124,10 +89,6 @@ int i=factor*100000; armci_dummy_work = armci_dummy_work + 1./(double)i; } } - -#ifdef SGIALTIX -#include -#endif /*\ acquire exclusive LOCK to MEMORY area owned by process "proc" * . only one area can be locked at a time by the calling process @@ -135,14 +96,11 @@ int i=factor*100000; \*/ void armci_lockmem(void *start, void *end, int proc) { -#ifdef ARMCIX - ARMCIX_Lockmem (start, end, proc); -#else register void* pstart, *pend; register int slot, avail=0; int turn=0, conflict=0; memlock_t *memlock_table; -#if defined(CLUSTER) && !defined(SGIALTIX) +#if defined(CLUSTER) int lock = (proc-armci_clus_info[armci_clus_id(proc)].master)%NUM_LOCKS; #else int lock = 0; @@ -155,7 +113,6 @@ void armci_lockmem(void *start, void *end, int proc) return; } -# ifndef SGIALTIX /* when processes are attached to a shmem region at different addresses, * addresses written to memlock table must be adjusted to the node master */ @@ -163,7 +120,6 @@ void armci_lockmem(void *start, void *end, int proc) start = armci_mem_offset + (char*)start; end = armci_mem_offset + (char*)end; } -# endif #endif if(DEBUG_){ @@ -183,46 +139,6 @@ void armci_lockmem(void *start, void *end, int proc) pend =end; #endif -#ifdef CRAY_SHMEM - { /* adjust according the remote process raw address */ - long bytes = (long) ((char*)pend-(char*)pstart); - extern void* armci_shmalloc_remote_addr(void *ptr, int proc); - pstart = armci_shmalloc_remote_addr(pstart, proc); - pend = (char*)pstart + bytes; - } -#endif -#ifdef SGIALTIX - if (proc == armci_me) { - pstart = shmem_ptr(pstart,armci_me); - pend = shmem_ptr(pend,armci_me); - } - /* In SGI Altix processes are attached to a shmem region at different - addresses. Addresses written to memlock table must be adjusted to - the node master - */ - if(ARMCI_Uses_shm()){ - int i, seg_id=-1; - size_t tile_size,offset; - void *start_addr, *end_addr; - for(i=0; i=start_addr && pend<=end_addr) {seg_id=i; break;}*/ - if(pstart >= start_addr && pstart <= end_addr) {seg_id=i; break;} - } - if(seg_id==-1) armci_die("armci_lockmem: Invalid segment", seg_id); - - offset = armci_memoffset_table[seg_id].mem_offset; - pstart = ((char*)pstart + offset); - pend = ((char*)pend + offset); - } -#endif - while(1){ NATIVE_LOCK(lock,proc); @@ -270,7 +186,6 @@ void armci_lockmem(void *start, void *end, int proc) NATIVE_UNLOCK(lock,proc); locked_slot = avail; -#endif /* ! ARMCIX */ } @@ -278,10 +193,6 @@ void armci_lockmem(void *start, void *end, int proc) \*/ void armci_unlockmem(int proc) { -#ifdef ARMCIX - ARMCIX_Unlockmem (proc); -#else - void *null[2] = {NULL,NULL}; memlock_t *memlock_table; @@ -301,7 +212,6 @@ void armci_unlockmem(int proc) memlock_table = (memlock_t*)memlock_table_array[proc]; armci_put(null,&memlock_table[locked_slot].start,2*sizeof(void*),proc); -#endif /* ! ARMCIX */ } @@ -340,103 +250,3 @@ void armci_set_mem_offset(void *ptr) } } } - -#ifdef SGIALTIX -/* SGI Altix Stuff */ -static void armci_altix_gettilesize(void *ptr, void **ptr_arr, - size_t *tile_size) { - int i; - size_t diff=0; - for(i=0; i0) diff = (size_t)((char*)ptr_arr[i]-(char*)ptr_arr[i-1]); - if(i>1 && diff!=*tile_size) - armci_die("armci_memoffset_table_newentry:Inconsistent tile size", - armci_me); - *tile_size = diff; - } -} - -void armci_memoffset_table_newentry(void *ptr, size_t seg_size) { - - void **ptr_arr; - void *master_addr = NULL; - size_t tile_size=0, offset=0; - - if(!ptr) armci_die("armci_memoffset_table_newentry : null ptr",0); - - if(seg_count >= MAX_SEGS) /* CHECK: make it dynamic */ - armci_die("armci_altix_allocate: Increase MAX_SEGS > 512", armci_me); - - if(armci_me == armci_master) master_addr = shmem_ptr(ptr, armci_me); - armci_msg_brdcst(&master_addr, sizeof(void*), armci_master); - - ptr_arr = (void**)malloc(armci_nproc*sizeof(void*)); - armci_altix_gettilesize(ptr, ptr_arr, &tile_size); - offset = (size_t)((char*)master_addr - (char*)ptr_arr[armci_master]); - - /* enter in memoffset table */ - armci_memoffset_table[seg_count].seg_addr = ptr_arr[armci_master]; - armci_memoffset_table[seg_count].seg_size = seg_size; - armci_memoffset_table[seg_count].tile_size = tile_size; - armci_memoffset_table[seg_count].mem_offset = offset; - -#if DEBUG_ - printf("%d: addr=%p seg_size=%ld tile_size=%ld offset=%ld\n", armci_me, - ptr_arr[armci_master], seg_size, tile_size, offset); -#endif - - ++seg_count; - free(ptr_arr); -} -#endif - -#if defined(CRAY_SHMEM) && defined(CRAY_XT) -/* CRAY-CRAY_XT stuff */ -static void armci_cray_gettilesize(void *ptr, void **ptr_arr, - size_t *tile_size) { - int i; - size_t diff=0; - for(i=0; i0) diff = (size_t)((char*)ptr_arr[i]-(char*)ptr_arr[i-1]); - if(i>1 && diff!=*tile_size) - armci_die("armci_memoffset_table_newentry:Inconsistent tile size", - armci_me); - *tile_size = diff; - } -} - -void armci_memoffset_table_newentry(void *ptr, size_t seg_size) { - - void **ptr_arr; - void *master_addr = NULL; - size_t tile_size=0, offset=0; - - if(!ptr) armci_die("armci_memoffset_table_newentry : null ptr",0); - - if(seg_count >= MAX_SEGS) /* CHECK: make it dynamic */ - armci_die("armci_cary_allocate: Increase MAX_SEGS > 512", armci_me); - - if(armci_me == armci_master) master_addr = ptr; - armci_msg_brdcst(&master_addr, sizeof(void*), armci_master); - - ptr_arr = (void**)malloc(armci_nproc*sizeof(void*)); - armci_cray_gettilesize(ptr, ptr_arr, &tile_size); - offset = (size_t)((char*)master_addr - (char*)ptr_arr[armci_master]); - - /* enter in memoffset table */ - armci_memoffset_table[seg_count].seg_addr = ptr_arr[armci_master]; - armci_memoffset_table[seg_count].seg_size = seg_size; - armci_memoffset_table[seg_count].tile_size = tile_size; - armci_memoffset_table[seg_count].mem_offset = offset; - -#if DEBUG_ - printf("%d: addr=%p seg_size=%ld tile_size=%ld offset=%ld\n", armci_me, - ptr_arr[armci_master], seg_size, tile_size, offset); -#endif - - ++seg_count; - free(ptr_arr); -} -#endif diff --git a/armci/src/locks/mutex.c b/armci/src/locks/mutex.c index 363e11281..ad1de34f2 100644 --- a/armci/src/locks/mutex.c +++ b/armci/src/locks/mutex.c @@ -14,15 +14,7 @@ #define MAX_LOCKS 32768 #define SPINMAX 1000 -#if defined(LAPI) || defined(GM) -# define SERVER_LOCK -#endif - double _dummy_work_=0.; -#ifdef LAPI /*fix to if cmpl handler for a pending unlock runs after destroy*/ -int mymutexcount; -double _dummy_server_work_=0.; -#endif static int num_mutexes=0, *tickets; typedef struct { @@ -109,9 +101,6 @@ int rc,p, totcount; } num_mutexes= totcount; -#ifdef LAPI - mymutexcount = num; -#endif PARMCI_Barrier(); if(DEBUG) @@ -139,25 +128,10 @@ void armci_serv_mutex_close() int PARMCI_Destroy_mutexes() { -#ifdef LAPI /*fix to if cmpl handler for a pending unlock runs after destroy*/ - int proc, mutex, i,factor=0; -#endif if(num_mutexes==0)armci_die("armci_destroy_mutexes: not created",0); if(armci_nproc == 1) return(0); armci_msg_barrier(); - -#ifdef LAPI /*fix to if cmpl handler for a pending unlock runs after destroy*/ - for(proc=0;proc= (MAX_BUFS+MAX_SMALL_BUFS)) armci_die2("_armci_id_to_bufinfo: bad id",bufid,MAX_BUFS); @@ -489,7 +489,7 @@ void _armci_buf_release_index(int tbl_idx) { /*\ release buffer when it becomes free \*/ -INLINE void _armci_buf_release(void *buf) +void _armci_buf_release(void *buf) { _armci_buf_release_index(_armci_buf_to_index(buf)); } @@ -551,7 +551,7 @@ char *_armci_buf_get(int size, int operation, int to) THREAD_UNLOCK(armci_user_threads.buf_lock); /* try network complete */ -#if defined(SOCKETS) || defined(MELLANOX) +#if defined(SOCKETS) tbl_idx = armci_test_network_complete(); #else /* all network should eventually use armci_test_network_complete */ tbl_idx = small ? _armci_buf_state->smavail : _armci_buf_state->avail; @@ -1700,36 +1700,36 @@ BUF_INFO_T *_armci_tag_to_bufinfo(msg_tag_t tag) { /* inline primitives for buffer state management */ -INLINE char *_armci_buf_get_clear_busy(int size, int operation, int to) { +char *_armci_buf_get_clear_busy(int size, int operation, int to) { char *buf = _armci_buf_get(size, operation, to); _armci_buf_set_busy(buf, 0); return buf; } -INLINE void _armci_buf_set_busy(void *buf, int state) { +void _armci_buf_set_busy(void *buf, int state) { _armci_buf_state->table[_armci_buf_to_index(buf)].busy = state; } -INLINE void _armci_buf_set_busy_idx(int idx, int state) { +void _armci_buf_set_busy_idx(int idx, int state) { _armci_buf_state->table[idx].busy = state; } #if 0 -INLINE int _armci_buf_cmpld(void *buf) { +int _armci_buf_cmpld(void *buf) { return _armci_buf_state->table[_armci_buf_to_index(buf)].cmpl; } #else -INLINE int _armci_buf_cmpld(int bufid) { +int _armci_buf_cmpld(int bufid) { return _armci_buf_state->table[bufid].cmpl; } #endif -INLINE void _armci_buf_set_cmpld(void *buf, int state) { +void _armci_buf_set_cmpld(void *buf, int state) { _armci_buf_state->table[_armci_buf_to_index(buf)].cmpl = state; } -INLINE void _armci_buf_set_cmpld_idx(int idx, int state) { +void _armci_buf_set_cmpld_idx(int idx, int state) { _armci_buf_state->table[idx].cmpl = state; } diff --git a/armci/src/memory/kr_malloc.c b/armci/src/memory/kr_malloc.c index d0b5d879a..17bf13c0d 100644 --- a/armci/src/memory/kr_malloc.c +++ b/armci/src/memory/kr_malloc.c @@ -150,7 +150,7 @@ char *kr_malloc(size_t nbytes, context_t *ctx) { size_t nunits; char *return_ptr; -#if !((defined(SUN) || defined(SOLARIS)) && !defined(SHMMAX_SEARCH_NO_FORK)) +#if !defined(SHMMAX_SEARCH_NO_FORK) if(ctx->ctx_type == KR_CTX_SHMEM) return kr_malloc_shmem(nbytes,ctx); #endif @@ -225,7 +225,7 @@ char *kr_malloc(size_t nbytes, context_t *ctx) { void kr_free(char *ap, context_t *ctx) { Header *bp, *p, **up; -#if !((defined(SUN) || defined(SOLARIS)) && !defined(SHMMAX_SEARCH_NO_FORK)) +#if !defined(SHMMAX_SEARCH_NO_FORK) if(ctx->ctx_type == KR_CTX_SHMEM) { kr_free_shmem(ap,ctx); return; } #endif diff --git a/armci/src/memory/memory.c b/armci/src/memory/memory.c index 27932247d..dbc5eb978 100644 --- a/armci/src/memory/memory.c +++ b/armci/src/memory/memory.c @@ -18,30 +18,15 @@ #define USE_SHMEM_ #define SHM_UNIT 1024 -#if defined(CRAY_SHMEM) -extern void armci_shmalloc_exchange_address(void **ptr_arr); -extern void armci_shmalloc_exchange_offsets(context_t *); -# if defined(CRAY_XT) -# include -# ifdef CATAMOUNT -# include -# endif -# endif -#endif - static context_t ctx_localmem; -#if defined(PORTALS_WITHREG) || defined(PORTALS) || defined(ALLOW_PIN) +#if defined(ALLOW_PIN) static context_t ctx_mlocalmem; #endif -#if defined(SYSV) || defined(WIN32) || defined(MMAP) || defined(HITACHI) +#if defined(SYSV) || defined(WIN32) || defined(MMAP) #include "armci_shmem.h" -#if !defined(USE_SHMEM) && (defined(HITACHI) || defined(MULTI_CTX)) -# define USE_SHMEM -#endif - -#if !(defined(LAPI)||defined(QUADRICS)||defined(SERVER_THREAD)) ||\ +#if !defined(SERVER_THREAD) ||\ defined(USE_SHMEM) #define RMA_NEEDS_SHMEM #endif @@ -50,119 +35,12 @@ static context_t ctx_mlocalmem; extern int _armci_server_started; #endif -/**************************************************************************** - * Memory Allocator called by kr_malloc on SGI Altix to get more core from OS - */ -#ifdef SGIALTIX - -#include -#if HAVE_UNISTD_H -# include -#endif - -#define DEF_UNITS (64) -#define MAX_SEGS 512 - -#define _SHMMAX_ALTIX 32*1024 /* 32 MB */ -#define _SHMMAX_ALTIX_GRP 512*1024 /* 512 MB */ - -static context_t altix_ctx_shmem; -static context_t altix_ctx_shmem_grp; -static size_t altix_pagesize; -extern void armci_memoffset_table_newentry(void *ptr, size_t seg_size); - -void *armci_altix_allocate(size_t bytes) -{ - void *ptr, *sptr; - ARMCI_PR_DBG("enter",0); - sptr=ptr= shmalloc(bytes); - if(sptr == NULL) armci_die("armci_altix_allocate: shmalloc failed\n", - armci_me); - armci_memoffset_table_newentry(ptr, bytes); -#if 0 - if(ptr){ /* touch each page to establish ownership */ - int i; - for(i=0; i< bytes/altix_pagesize; i++){ - *(double*)ptr=0.; - ((char*)ptr) += altix_pagesize; - } - } -#endif - ARMCI_PR_DBG("exit",0); - return sptr; -} - -void armci_altix_shm_init() -{ - ARMCI_PR_DBG("enter",0); - altix_pagesize = getpagesize(); - kr_malloc_init(SHM_UNIT, _SHMMAX_ALTIX, 0, - armci_altix_allocate, 0, &altix_ctx_shmem); - kr_malloc_init(SHM_UNIT, _SHMMAX_ALTIX_GRP, _SHMMAX_ALTIX_GRP, - armci_altix_allocate, 0, &altix_ctx_shmem_grp); - /* allocate a huge segment for groups. When kr_malloc() is called for - the first time for this altix_ctx_shmem_grp context with some minimal - size of 8 bytes, a huge segment of size (SHM_UNIT*_SHMMAX_ALTIX_GRP) - will be created */ - { - void *ptr; - ptr=kr_malloc((size_t)8, &altix_ctx_shmem_grp); - if(ptr==NULL) - armci_die("armci_altix_shm_init(): malloc failed", armci_me); - } - ARMCI_PR_DBG("exit",0); -} - -void armci_altix_shm_malloc(void *ptr_arr[], armci_size_t bytes) -{ - long size=bytes; - void *ptr; - int i; - ARMCI_PR_DBG("enter",0); - armci_msg_lgop(&size,1,"max"); - ptr=kr_malloc((size_t)size, &altix_ctx_shmem); - bzero(ptr_arr,(armci_nproc)*sizeof(void*)); - ptr_arr[armci_me] = ptr; - if(size!=0 && ptr==NULL) - armci_die("armci_altix_shm_malloc(): malloc failed", armci_me); - for(i=0; i< armci_nproc; i++) if(i!=armci_me) ptr_arr[i]=shmem_ptr(ptr,i); - ARMCI_PR_DBG("exit",0); -} - -#ifdef MSG_COMMS_MPI -void armci_altix_shm_malloc_group(void *ptr_arr[], armci_size_t bytes, - ARMCI_Group *group) { - long size=bytes; - void *ptr; - int i,grp_me, grp_nproc; - armci_grp_attr_t *grp_attr=ARMCI_Group_getattr(group); - ARMCI_PR_DBG("enter",0); - - ARMCI_Group_size(group, &grp_nproc); - ARMCI_Group_rank(group, &grp_me); - armci_msg_group_lgop(&size,1,"max",group); - ptr=kr_malloc((size_t)size, &altix_ctx_shmem_grp); - if(size!=0 && ptr==NULL) - armci_die("armci_altix_shm_malloc_group(): malloc failed for groups. Increase _SHMMAX_ALTIX_GRP", armci_me); - bzero(ptr_arr,(grp_nproc)*sizeof(void*)); - ptr_arr[grp_me] = ptr; - for(i=0; i< grp_nproc; i++) if(i!=grp_me) ptr_arr[i]=shmem_ptr(ptr,ARMCI_Absolute_id(group, i)); - ARMCI_PR_DBG("exit",0); -} -#endif - -#endif /* end ifdef SGIALTIX */ -/* ------------------ End Altix memory allocator ----------------- */ - void kr_check_local() { #if 0 kr_malloc_print_stats(&ctx_localmem); #endif kr_malloc_verify(&ctx_localmem); -#if defined(PORTALS_WITHREG) -kr_malloc_verify(&ctx_mlocalmem); -#endif } void armci_print_ptr(void **ptr_arr, int bytes, int size, void* myptr, int off) @@ -269,9 +147,7 @@ void armci_shmem_malloc(void *ptr_arr[], armci_size_t bytes) armci_me,myptr, *(void**)myptr,size); fflush(stdout); } } -# ifdef HITACHI - armci_register_shmem(myptr,size,idlist+1,idlist[0],ptr_ref_arr[armci_clus_me]); -# endif + # if defined(DATA_SERVER) /* get server reference address for every cluster node to perform @@ -565,9 +441,6 @@ void armci_shmem_malloc_group(void *ptr_arr[], armci_size_t bytes, armci_me,myptr, *(void**)myptr,size); fflush(stdout); } } -# ifdef HITACHI - armci_register_shmem_grp(myptr,size,idlist+1,idlist[0],ptr_ref_arr[armci_clus_me],group); -# endif # if defined(DATA_SERVER) @@ -705,16 +578,10 @@ void armci_shmem_memctl(armci_meminfo_t *meminfo) { #ifdef ALLOW_PIN void *reg_malloc(size_t size) { -#ifdef PORTALS -char *ptr; -extern void *shmalloc(size_t); + char *ptr; ARMCI_PR_DBG("enter",0); ptr = malloc(size); -#else -char *ptr; - ARMCI_PR_DBG("enter",0); - ptr = malloc(size); -#endif + armci_region_register_loc(ptr,size); ARMCI_PR_DBG("exit",0); return(ptr); @@ -728,28 +595,9 @@ void armci_krmalloc_init_localmem() { kr_malloc_init(0, 0, 0, reg_malloc, 0, &ctx_localmem); kr_malloc_init(0, 0, 0, malloc, 0, &ctx_mlocalmem); ctx_mlocalmem.ctx_type = KR_CTX_LOCALMEM; -#elif defined(CRAY_SHMEM) && defined(CRAY_XT) -# ifdef CATAMOUNT - int units_avail = (cnos_shmem_size() - 1024 * 1024) / SHM_UNIT; -# else - extern size_t get_xt_heapsize(); - int units_avail = (get_xt_heapsize() - 1024 * 1024) / SHM_UNIT; -# endif - - if(DEBUG_) - { - fprintf(stderr,"%d:krmalloc_init_localmem: symheap=%llu,units(%d)=%d\n", - armci_me, SHM_UNIT*units_avail, SHM_UNIT, units_avail); - } - kr_malloc_init(SHM_UNIT, units_avail, units_avail, shmalloc, 0, - &ctx_localmem); - armci_shmalloc_exchange_offsets(&ctx_localmem); #else - kr_malloc_init(0, 0, 0, malloc, 0, &ctx_localmem); - #endif - ctx_localmem.ctx_type = KR_CTX_LOCALMEM; } @@ -757,27 +605,14 @@ void armci_krmalloc_init_localmem() { * Local Memory Allocation and Free */ void *PARMCI_Malloc_local(armci_size_t bytes) { -#if defined(PORTALS) - void *rptr; -#endif ARMCI_PR_DBG("enter",0); -#if defined(PORTALS) - rptr=kr_malloc((size_t)bytes, &ctx_mlocalmem); - ARMCI_PR_DBG("exit",0); - return rptr; -#else ARMCI_PR_DBG("exit",0); return (void *)kr_malloc((size_t)bytes, &ctx_localmem); -#endif } int PARMCI_Free_local(void *ptr) { ARMCI_PR_DBG("enter",0); -#if defined(PORTALS) - kr_free((char *)ptr, &ctx_mlocalmem); -#else kr_free((char *)ptr, &ctx_localmem); -#endif ARMCI_PR_DBG("exit",0); return 0; } @@ -884,11 +719,7 @@ int PARMCI_Malloc(void *ptr_arr[], armci_size_t bytes) } # endif -# ifdef SGIALTIX - if( ARMCI_Uses_shm() ) armci_altix_shm_malloc(ptr_arr,bytes); -# else if( ARMCI_Uses_shm() ) armci_shmem_malloc(ptr_arr,bytes); -# endif else { /* on distributed-memory systems just malloc & collect all addresses */ ptr = kr_malloc(bytes, &ctx_localmem); @@ -896,14 +727,10 @@ int PARMCI_Malloc(void *ptr_arr[], armci_size_t bytes) bzero((char*)ptr_arr,armci_nproc*sizeof(void*)); ptr_arr[armci_me] = ptr; - -# if defined(CRAY_SHMEM) - armci_shmalloc_exchange_address(ptr_arr); -# else /* now combine individual addresses into a single array */ armci_exchange_address(ptr_arr, armci_nproc); -# endif + # ifdef ALLOW_PIN armci_global_region_exchange(ptr, (long) bytes); # endif @@ -931,7 +758,6 @@ int PARMCI_Free(void *ptr) ARMCI_PR_DBG("enter",0); if(!ptr)return 1; -#ifndef SGIALTIX # ifdef REGION_ALLOC kr_free(ptr, &ctx_region_shmem); # else @@ -956,15 +782,10 @@ int PARMCI_Free(void *ptr) # endif kr_free(ptr, &ctx_localmem); # endif /* REGION_ALLOC */ -#else - /* Altix */ - if( ARMCI_Uses_shm() ) kr_free(ptr, &altix_ctx_shmem); - else kr_free(ptr, &ctx_localmem); -#endif - ptr = NULL; + ptr = NULL; ARMCI_PR_DBG("exit",0); - return 0; + return 0; } int PARMCI_Free_memdev(void *ptr) @@ -977,7 +798,7 @@ int ARMCI_Uses_shm() { int uses=0; -#if (defined(SYSV) || defined(WIN32) || defined(MMAP) ||defined(HITACHI)) && !defined(NO_SHM) +#if (defined(SYSV) || defined(WIN32) || defined(MMAP)) && !defined(NO_SHM) # ifdef RMA_NEEDS_SHMEM if(armci_nproc >1) uses= 1; /* always unless serial mode */ # else @@ -999,7 +820,7 @@ int ARMCI_Uses_shm_grp(ARMCI_Group *group) ARMCI_Group_size(group, &grp_nproc); ARMCI_Group_rank(group, &grp_me); -#if (defined(SYSV) || defined(WIN32) || defined(MMAP) ||defined(HITACHI)) && !defined(NO_SHM) +#if (defined(SYSV) || defined(WIN32) || defined(MMAP)) && !defined(NO_SHM) # ifdef RMA_NEEDS_SHMEM if(grp_nproc >1) uses= 1; /* always unless serial mode */ # else @@ -1048,11 +869,7 @@ int ARMCI_Malloc_group(void *ptr_arr[], armci_size_t bytes, #endif if( ARMCI_Uses_shm_grp(group) ) { -# ifdef SGIALTIX - armci_altix_shm_malloc_group(ptr_arr,bytes,group); -# else - armci_shmem_malloc_group(ptr_arr,bytes,group); -# endif + armci_shmem_malloc_group(ptr_arr,bytes,group); } else { /* on distributed-memory systems just malloc & collect all addresses */ @@ -1063,11 +880,7 @@ int ARMCI_Malloc_group(void *ptr_arr[], armci_size_t bytes, ptr_arr[grp_me] = ptr; /* now combine individual addresses into a single array */ -#if defined(CRAY_SHMEM) - armci_shmalloc_exchange_address_grp(ptr_arr, group); -#else armci_exchange_address_grp(ptr_arr, grp_nproc, group); -#endif # ifdef ALLOW_PIN # if 0 @@ -1112,7 +925,6 @@ int ARMCI_Free_group(void *ptr, ARMCI_Group *group) grp_clus_me = grp_attr->grp_clus_me; grp_master = grp_attr->grp_clus_info[grp_clus_me].master; -#ifndef SGIALTIX #ifdef REGION_ALLOC kr_free(ptr, &ctx_region_shmem); #else @@ -1136,12 +948,6 @@ int ARMCI_Free_group(void *ptr, ARMCI_Group *group) # endif kr_free(ptr, &ctx_localmem); #endif /* ifdef REGION_ALLOC */ -#else /* SGI Altix */ - if(ARMCI_Uses_shm_grp(group)) - kr_free(ptr, &altix_ctx_shmem_grp); - else kr_free(ptr, &ctx_localmem); - -#endif /* SGIALTIX */ ptr = NULL; ARMCI_PR_DBG("exit",0); diff --git a/armci/src/memory/shmalloc.c b/armci/src/memory/shmalloc.c index 295671314..c0fe8e537 100644 --- a/armci/src/memory/shmalloc.c +++ b/armci/src/memory/shmalloc.c @@ -92,52 +92,3 @@ void* armci_shmalloc_remote_addr(void *ptr, int proc) return (void*)((char*)ptr - offset_arr[proc]); } -#if defined(CRAY_XT) - -#define XT_SYMMETRIC_HEAP_SIZE ((size_t)1024)*1024*1024; /* 1 GB is default */ -size_t get_xt_heapsize() -{ - - char *uval = getenv("XT_SYMMETRIC_HEAP_SIZE"); - char *token = NULL; - char *unit_str = NULL; - size_t scale=1, size; - - if(uval != NULL) - { - if((unit_str=strchr(uval, 'K')) != NULL) - { - scale = 1024; - token = strtok(uval, "K"); - } - else if((unit_str=strchr(uval, 'M')) != NULL) - { - scale = ((size_t)1024)*1024; - token = strtok(uval, "M"); - } - else if((unit_str=strchr(uval, 'G')) != NULL) - { - scale = ((size_t)1024)*1024*1024; - token = strtok(uval, "G"); - } - else - { - scale = 1; - token = uval; - } - - size = (size_t)atol(token); - size *= scale; /* in bytes */ - - if(size < 1024*1024) - armci_die("get_xt_heapsize(): Symmetric heapsize should be > 1MB",0); - } - else - { - size = XT_SYMMETRIC_HEAP_SIZE; - } - - return size; -} -#endif /* defined CRAY_XT */ - diff --git a/armci/src/memory/shmem.c b/armci/src/memory/shmem.c index 7497f9232..532b590c5 100644 --- a/armci/src/memory/shmem.c +++ b/armci/src/memory/shmem.c @@ -73,25 +73,6 @@ #include "message.h" #include "armcip.h" -#ifdef ALLOC_MUNMAP -#if HAVE_SYS_MMAN_H -# include -#endif -#if HAVE_UNISTD_H -# include -#endif -static size_t pagesize=0; -static int logpagesize=0; -/* allow only that big shared memory segment (in MB)- incresed from 128 11/02 */ -#define MAX_ALLOC_MUNMAP 128 -#define MAX_ALLOC_MUNMAP_ 368 -static long max_alloc_munmap=MAX_ALLOC_MUNMAP; -#endif - -#if defined(SUN) - extern char *shmat(); -#endif - #define SHM_UNIT (1024) @@ -102,41 +83,24 @@ static long max_alloc_munmap=MAX_ALLOC_MUNMAP; * case b) search w/o forking until success (less accurate) */ -/* under Myrinet GM, we cannot fork */ -#if defined(GM) || defined(VAPI) +#if defined(VAPI) || defined(SOLARIS) # define SHMMAX_SEARCH_NO_FORK #endif -#if defined(LAPI) || defined(AIX) || defined(SHMMAX_SEARCH_NO_FORK) +#if defined(AIX) || defined(SHMMAX_SEARCH_NO_FORK) # define NO_SHMMAX_SEARCH #endif -/* on some platforms with tiny shmmax can try to glue multiple regions */ -#if (defined(SUN) || defined(SOLARIS)) && !defined(SHMMAX_SEARCH_NO_FORK) -# define MULTIPLE_REGIONS -#endif - /* Limits for the largest shmem segment are in Kilobytes to avoid passing * Gigavalues to kr_malloc - * the limit for the KSR is lower than SHMMAX in sys/param.h because - * shmat would fail -- SHMMAX cannot be trusted (a bug) */ #define _SHMMAX 4*1024 #if defined(SUN)||defined(SOLARIS) # undef _SHMMAX # define _SHMMAX (1024) /* memory in KB */ -#elif defined(SGI64) || defined(AIX) || defined(CONVEX) -# undef _SHMMAX -# define _SHMMAX ((unsigned long)512*1024) -#elif defined(SGI) && !defined(SGI64) -# undef _SHMMAX -# define _SHMMAX ((unsigned long)128*1024) -#elif defined(KSR) +#elif defined(AIX) # undef _SHMMAX # define _SHMMAX ((unsigned long)512*1024) -#elif defined(HPUX) -# undef _SHMMAX -# define _SHMMAX ((unsigned long)64*1024) #elif defined(__FreeBSD__) # undef _SHMMAX # define _SHMMAX ((unsigned long)3*1024) @@ -173,60 +137,10 @@ static int id_search_no_fork=0; #ifdef LINUX #define CLEANUP_CMD(command) sprintf(command,"/usr/bin/ipcrm shm %d",id); -#elif defined(SOLARIS) -#define CLEANUP_CMD(command) sprintf(command,"/bin/ipcrm -m %d",id); -#elif defined(SGI) -#define CLEANUP_CMD(command) sprintf(command,"/usr/sbin/ipcrm -m %d",id); #else #define CLEANUP_CMD(command) sprintf(command,"/usr/bin/ipcrm -m %d",id); #endif - -#ifdef ALLOC_MUNMAP -#ifdef QUADRICS -# include -# include - static char *armci_elan_starting_address = (char*)0; - -# ifdef __ia64__ -# define ALLOC_MUNMAP_ALIGN 1024*1024 -# else -# define ALLOC_MUNMAP_ALIGN 64*1024 -# endif - -# define ALGN_MALLOC(s,a) elan_allocMain(elan_base->state, (a), (s)) -#else -# define ALGN_MALLOC(s,a) malloc((s)) -#endif - -static char* alloc_munmap(size_t size) -{ -char *tmp; -unsigned long iptr; -size_t bytes = size+pagesize-1; - - if(armci_elan_starting_address){ - tmp = armci_elan_starting_address; - armci_elan_starting_address += size; -# ifdef ALLOC_MUNMAP_ALIGN - armci_elan_starting_address += ALLOC_MUNMAP_ALIGN; -# endif - if(DEBUG_) {printf("%d: address for shm attachment is %p size=%ld\n", - armci_me,tmp,(long)size); fflush(stdout); } - } else { - tmp = ALGN_MALLOC(bytes, getpagesize()); - if(tmp){ - iptr = (unsigned long)tmp + pagesize-1; - iptr >>= logpagesize; iptr <<= logpagesize; - if(DEBUG_) printf("%d:unmap ptr=%p->%p size=%d pagesize=%d\n",armci_me, - tmp,(char*)iptr,(int)size,pagesize); - tmp = (char*)iptr; - if(munmap(tmp, size) == -1) armci_die("munmap failed",0); - if(DEBUG_){printf("%d: unmap OK\n",armci_me); fflush(stdout);} - }else armci_die("alloc_munmap: malloc failed",(int)size); - } - return tmp; -} #endif /*\ A wrapper to shmget. Just to be sure that ID is not 0. @@ -310,11 +224,7 @@ static int armci_shmalloc_try(long size) */ #define PAGE (16*65536L) #define LBOUND 1048576L -#if defined(MULTI_CTX) && defined(QUADRICS) -#define UBOUND 256*LBOUND -#else #define UBOUND 512*LBOUND -#endif #define ARMCI_STRINGIFY(str) #str #define ARMCI_CONCAT(str) strL @@ -420,24 +330,6 @@ long lower_bound=_SHMMAX*SHM_UNIT; return (int)( lower_bound>>20); /* return shmmax in mb */ } #endif - - -#ifdef MULTI_CTX -void armci_nattach_preallocate_info(int* segments, int *segsize) -{ - int x; - char *uval; - uval = getenv("LIBELAN_NATTACH"); - if(uval != NULL){ - sscanf(uval,"%d",&x); - if(x<2 || x>8) armci_die("Error in LIBELAN_NATTACH <8, >1 ",(int)x); - }else - armci_die("Inconsistent configuration: ARMCI needs LIBELAN_NATTACH",0); - *segments =x; - *segsize = (int) (SHM_UNIT * MinShmem); - -} -#endif /* Create shared region to store kr_malloc context in shared memory */ void armci_krmalloc_init_ctxshmem() { @@ -490,59 +382,6 @@ void armci_krmalloc_init_ctxshmem() { void armci_shmem_init() { - -#ifdef ALLOC_MUNMAP - -#if defined(QUADRICS) -# if (defined(__ia64__) || defined(__alpha)) && !defined(DECOSF) - - /* this is to determine size of Elan Main memory allocator for munmap */ - long x; - char *uval; - uval = getenv("LIBELAN_ALLOC_SIZE"); - if(uval != NULL){ - sscanf(uval,"%ld",&x); - if((x>80000000) && (x< 4*1024*1024*1024L)){ - max_alloc_munmap = (x>>20) - 72; - if(DEBUG_){ - printf("%d: max_alloc_munmap is %ld\n",armci_me,max_alloc_munmap); - fflush(stdout); - } - } - } - - /* an alternative approach is to use MMAP area where we get - the address from the Elan environment variable in qsnetlibs 1.4+ */ - uval = getenv("LIBELAN3_MMAPBASE"); - if(uval != NULL){ - sscanf(uval,"%p",&armci_elan_starting_address); - } - -# endif -# if defined(__ia64__) - /* need aligment on 1MB boundary rather than the actual pagesize */ - pagesize = 1024*1024; - logpagesize = 20; -# else - /* determine log2(pagesize) needed for address alignment */ - int tp=512; - logpagesize = 9; - pagesize = getpagesize(); - if(tp>pagesize)armci_die("armci_shmem_init:pagesize",pagesize); - - while(tpmax_alloc_munmap && !armci_elan_starting_address) x=max_alloc_munmap; -# else - x = 10; /* mb */ -# endif -# endif - if(DEBUG_){ printf("%d:shmem_init: %d mbytes max segment size\n",armci_me,x);fflush(stdout);} @@ -647,264 +475,8 @@ static long occup_blocks=0; */ -#if defined(MULTIPLE_REGIONS) -/********************************* MULTIPLE_REGIONS *******************/ -/* allocate contiguous shmem -- glue pieces together -- works on SUN - * SUN max shmem segment is only 1MB so we might need several to satisfy request - */ - - -/* SHM_OP is an operator to calculate shmem address to attach - * might be + or - depending on the system - */ -#if defined(DECOSF) || defined(LINUX) -#define SHM_OP + -#else -#define SHM_OP - -#endif - -static int prev_alloc_regions=0; - - -unsigned long armci_max_region() -{ - /* we assume that at least two regions can be glued */ - return MinShmem*2; -} - -/*\ - * assembles the list of shmem id for the block -\*/ -int find_regions(char *addrp, long* idlist, int *first) -{ -int reg, nreg, freg=-1, min_reg, max_reg; - - /* find the region where addrp belongs */ - for(reg = 0; reg < alloc_regions-1; reg++){ - if(region_list[reg].addr < region_list[reg+1].addr){ - min_reg = reg; max_reg = reg+1; - }else{ - min_reg = reg+1; max_reg = reg; - } - if(region_list[min_reg].addr <= addrp && - region_list[max_reg].addr > addrp){ - freg = min_reg; - break; - } - } - /* if not found yet, it must be the last region */ - if(freg < 0) freg=alloc_regions-1; - - if( alloc_regions == prev_alloc_regions){ - /* no new regions were allocated this time - just get the id */ - idlist[0] = 1; - idlist[1] = region_list[freg].id; - }else{ - /* get ids of the allocated regions */ - idlist[0] = alloc_regions - prev_alloc_regions; - if(idlist[0] < 0)armci_die("armci find_regions error ",0); - for(reg =prev_alloc_regions,nreg=1; reg =MAX_REGIONS) - armci_die("Attach_Shared_Region: too many regions ",0L); - - /* first time needs to initialize region_list structure */ - if(!alloc_regions){ - for(reg=0;reg(b)? (b): (a)) -char *temp = (char*)0, *pref_addr=(char*)0, *ftemp; -int id, newreg, i; -size_t sz; - - if(DEBUG1){ - printf("%d:Shmem allocate: size %ld bytes\n",armci_me,size); - fflush(stdout); - } - - newreg = (size+(SHM_UNIT*MinShmem)-1)/(SHM_UNIT*MinShmem); - - if( (alloc_regions + newreg)> MAX_REGIONS) - armci_die("allocate: to many regions already allocated ",(long)newreg); - - prev_alloc_regions = alloc_regions; - - if(DEBUG_)fprintf(stderr, "in allocate size=%ld\n",size); - -#ifdef ALLOC_MUNMAP - pref_addr = alloc_munmap((size_t) size); -#else - pref_addr = (char*)0; /* first time let the OS choose address */ -#endif - - /* allocate shmem in as many segments as neccesary */ - for(i =0; i< newreg; i++){ - long szl; - szl =(i==newreg-1)?size-i*MinShmem*SHM_UNIT: min(size,SHM_UNIT*MinShmem); - sz = (size_t) szl; - - if ( (int)(id = armci_shmget(sz,"MULTIPLE_REGIONarmci_allocate")) < 0){ - fprintf(stderr,"%d:id=%d size=%d MAX=%ld\n",armci_me,id,szl,MinShmem); - alloc_regions++; - shmem_errmsg(size); - armci_die("allocate: failed to create shared region ",id); - } - - /* make sure the next shmem region will be adjacent to previous one */ - if(temp) pref_addr= temp SHM_OP (MinShmem*SHM_UNIT); - - if(DEBUG_)printf("calling shmat:id=%d adr=%p sz=%ld\n",id,pref_addr,szl); - - if ( (long)(temp = (char*)shmat(id, pref_addr, 0)) == -1L){ - char command[64]; - CLEANUP_CMD(command); - if(system(command) == -1) - printf("Please clean shared memory (id=%d): see man ipcrm\n",id); - if(pref_addr){ - printf("ARMCI shared memory allocator was unable to obtain from "); - printf("the operating system multiple segments adjacent to "); - printf("each other in order to combine them into a one large "); - printf("segment together\n"); - shmem_errmsg(size); - armci_die("allocate: failed to attach to shared region", 0L); - } - } - POST_ALLOC_CHECK(temp,MinShmem*SHM_UNIT); - - region_list[alloc_regions].addr = temp; - region_list[alloc_regions].id = id; - region_list[alloc_regions].attached=1; - - if(DEBUG_) fprintf(stderr," allocate:attach: id=%d addr=%p \n",id, temp); - alloc_regions++; - if(i==0)ftemp = temp; - } - return (void*)(min(ftemp,temp)); -} - -/************************** END of MULTIPLE_REGIONS *******************/ - -#else /* Now, the machines where shm segments are not glued together */ +/* Now, the machines where shm segments are not glued together */ static int last_allocated=-1; @@ -997,10 +569,6 @@ char *Attach_Shared_Region(id, size, offset) { int reg, found, shmflag=0; static char *temp; - -#if defined(SGI_N32) && defined(SHM_SGI_ANYADDR) - shmflag= SHM_SGI_ANYADDR; -#endif if(alloc_regions>=MAX_REGIONS) armci_die("Attach_Shared_Region: to many regions ",0); @@ -1047,11 +615,7 @@ static char *temp; /* attach if not attached yet */ if(!region_list[reg].attached){ -# ifdef ALLOC_MUNMAP - char *pref_addr = alloc_munmap((size_t) (size)); -# else - char *pref_addr = (char*)0; -# endif + char *pref_addr = (char*)0; if ( (long) (temp = shmat((int) *id, pref_addr, shmflag)) == -1L){ fprintf(stderr,"%d:attach error:id=%ld off=%ld seg=%ld\n",armci_me,*id,offset,MinShmem); shmem_errmsg((size_t)MinShmem*1024); @@ -1092,14 +656,7 @@ void *armci_allocate(long size) char * temp; int id,shmflag=0; size_t sz = (size_t)size; -#ifdef ALLOC_MUNMAP - char *pref_addr = alloc_munmap((size_t) (MinShmem*SHM_UNIT)); -#else - char *pref_addr = (char*)0; -#endif -#if defined(SGI_N32) && defined(SHM_SGI_ANYADDR) - shmflag= SHM_SGI_ANYADDR; -#endif +char *pref_addr = (char*)0; if(DEBUG1){ printf("%d:allocate: Shmem allocate size %ld bytes\n",armci_me,size); @@ -1136,7 +693,7 @@ size_t sz = (size_t)size; printf("%d:allocate:attach:id=%d paddr=%p size=%ld\n",armci_me,id,temp,size); fflush(stdout); } -#if !defined(AIX) && !defined(HPUX64) +#if !defined(AIX) /* delete segment id so that OS cleans it when all attached processes are gone */ if(shmctl( id, IPC_RMID, (struct shmid_ds *)NULL)) fprintf(stderr,"failed to remove shm id=%d\n",id); @@ -1163,7 +720,6 @@ size_t sz = (size_t)size; return (void*) (temp); } -#endif /******************** common code for the two versions *********************/ @@ -1259,8 +815,3 @@ int code=0; } } - - -#else - what are doing here ? -#endif diff --git a/armci/src/memory/shmlimit.c b/armci/src/memory/shmlimit.c index c49dfd6e6..a2e82adf7 100644 --- a/armci/src/memory/shmlimit.c +++ b/armci/src/memory/shmlimit.c @@ -37,22 +37,11 @@ #define DEBUG_ 0 -#if defined(DECOSF) || defined(SOLARIS64) || defined(HPUX) -#define PIPE_AFTER_FORK_BUG -#endif - void (*armci_sig_chld_orig)(); static int status=0; int armci_shmlimit_caught_sigchld=0; -#if defined(SUN) && !defined(SOLARIS) -static void SigChldHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else static void SigChldHandler(sig) -#endif int sig; { #ifdef DISABLED @@ -75,15 +64,6 @@ static void RestoreSigChld() armci_die("Restore_SigChld: error from restoring signal SIGChld",0); } - -#ifdef SOLARIS -static int child_finished() -{ - return armci_shmlimit_caught_sigchld; -} -#endif - - int armci_child_shmem_init() { pid_t pid; @@ -129,10 +109,6 @@ int armci_child_shmem_init() if(val < 0 || (size_t)val < sizeof(int)) armci_die("armci shmem_test: read failed",val); -#ifdef SOLARIS - while(!child_finished()); -#endif - again: rc = wait (&status); if(rc == -1 && errno == EINTR) goto again; diff --git a/armci/src/memory/winshmem.c b/armci/src/memory/winshmem.c index 03c8a7f26..51f0b520d 100644 --- a/armci/src/memory/winshmem.c +++ b/armci/src/memory/winshmem.c @@ -41,33 +41,6 @@ # include # include # define GETPID _getpid -#elif defined(NEC) -# if HAVE_UNISTD_H -# include -# endif -# include - typedef void* HANDLE; - typedef void* LPVOID; -# define GETPID getpid -#elif defined(HITACHI) -# if HAVE_UNISTD_H -# include -# endif -# define PAGE_SIZE 0x1000 -# define ROUND_UP_PAGE(size) ((size + (PAGE_SIZE-1)) & ~(PAGE_SIZE-1)) -# if HAVE_STRINGS_H -# include -# endif -# if HAVE_STDLIB_H -# include -# endif -# include -# include - typedef long HANDLE; - typedef char* LPVOID; -# define GETPID getpid - static long cb_key=1961; - static long _hitachi_reg_size; #elif defined(MMAP) # if HAVE_FCNTL_H # include @@ -249,8 +222,6 @@ int reg; # if defined(WIN32) UnmapViewOfFile(region_list[reg].addr); CloseHandle(region_list[reg].id); -# elif defined(NEC) - (int)dp_xmfree(region_list[reg].addr); # else munmap(region_list[reg].addr, region_list[reg].size); SET_MAPNAME(reg); @@ -274,54 +245,6 @@ char *armci_get_core_from_map_file(int exists, long size) { LPVOID ptr; -#if defined(HITACHI) - - Cb_object_t oid; - int desc; - - region_list[alloc_regions].addr = (char*)0; - if(exists){ - int rc,nsize=_hitachi_reg_size; - if(size < MinShmem*SHM_UNIT) size = MinShmem*SHM_UNIT; - nsize = ROUND_UP_PAGE(nsize); - - if((rc=combuf_object_get(region_list[alloc_regions].id,(Cb_size_t)nsize,0, &oid)) - != COMBUF_SUCCESS) armci_die("attaching combufget fail",0); - if((rc=combuf_map(oid, 0, (Cb_size_t)nsize, COMBUF_COMMON_USE, &ptr)) - != COMBUF_SUCCESS) armci_die("combuf map failed",0); - - }else{ - int rc; - size = ROUND_UP_PAGE(size); - - if((rc=combuf_object_get(cb_key,(Cb_size_t)size,COMBUF_OBJECT_CREATE,&oid)) - != COMBUF_SUCCESS) armci_die("creat combufget fail",0); - if((rc=combuf_map(oid, 0, (Cb_size_t)size, COMBUF_COMMON_USE, &ptr)) - != COMBUF_SUCCESS) armci_die("combuf map failed",0); - - /* make the region suitable for communication */ - if(combuf_create_field(oid, ptr, (Cb_size_t)size, FIELD_NUM, 0, 0, &desc) - != COMBUF_SUCCESS) armci_die("create field failed",0); - - region_list[alloc_regions].id = cb_key; - _hitachi_reg_size=size; - cb_key++; /* increment for next combuf create call */ - - } - -#elif defined(NEC) - - region_list[alloc_regions].addr = (char*)0; - if(exists) - ptr = dp_xmatt(parent_pid, region_list[alloc_regions].id, (void*)0); - else { - ptr = dp_xmalloc((void*)0, (long long) size); - region_list[alloc_regions].id = ptr; - } - - if(ptr == (void*)-1) return ((char*)0); - -#else HANDLE h_shm_map; SET_MAPNAME(alloc_regions); region_list[alloc_regions].addr = (char*)0; @@ -353,7 +276,7 @@ char *armci_get_core_from_map_file(int exists, long size) CloseHandle(h_shm_map); h_shm_map = INVALID_HANDLE_VALUE; } -#elif defined(MMAP)&&!defined(HITACHI) && !defined(MACX) +#elif defined(MMAP) && !defined(MACX) if(exists){ if(size < MinShmem*SHM_UNIT) size = MinShmem*SHM_UNIT; @@ -417,8 +340,6 @@ char *armci_get_core_from_map_file(int exists, long size) /* save file handle in the array to close it in the future */ region_list[alloc_regions].id = h_shm_map; -#endif - if(DEBUG0){printf("%d: got ptr=%p bytes=%ld mmap\n",armci_me,ptr,size); fflush(stdout); } region_list[alloc_regions].addr = (char*)ptr; region_list[alloc_regions].size = size; @@ -498,24 +419,10 @@ char* Create_Shared_Region(long idlist[], long size, long *offset) /* idlist[0] = alloc_regions; This is set in find_regions() */ idlist[1] = parent_pid; -#if defined(HITACHI) || defined(NEC) - idlist[2] = (long) region_list[reg].id; -#if defined(HITACHI) - idlist[SHMIDLEN-2]=_hitachi_reg_size; -#endif -#endif if(DEBUG)printf("%d:created %p %ld id=%ld id[0]=%ld\n",armci_me,temp, size,idlist[2],idlist[0]); return (temp); } -#ifdef HITACHI -void server_reset_memory_variables() -{ - alloc_regions=0; - parent_pid=-1; -} -#endif - char *Attach_Shared_Region(long id[], long size, long offset) { /*int found=0;*/ @@ -535,13 +442,6 @@ char *Attach_Shared_Region(long id[], long size, long offset) /* find out if a new shmem region was allocated */ if(alloc_regions < id[0]+1){ -#if defined(HITACHI) || defined(NEC) -#if defined(HITACHI) - _hitachi_reg_size=id[SHMIDLEN-2]; -# endif - - region_list[alloc_regions].id = (HANDLE) id[2]; -# endif if(DEBUG)printf("alloc_regions=%d size=%ld\n",alloc_regions,size); temp = armci_get_core_from_map_file(1,size); if(temp != NULL)alloc_regions++; diff --git a/armci/src/progress/fence.c b/armci/src/progress/fence.c index 4b61a7995..c3057e962 100644 --- a/armci/src/progress/fence.c +++ b/armci/src/progress/fence.c @@ -8,9 +8,7 @@ #if HAVE_STDIO_H # include #endif -#if defined(PVM) -# include -#elif defined(TCGMSG) +#if defined(TCGMSG) # include static void tcg_synch(long type) { @@ -18,8 +16,6 @@ static void tcg_synch(long type) SYNCH_(&atype); } -#elif defined(BGML) -# include "bgml.h" #else # include #endif @@ -28,7 +24,7 @@ char *_armci_fence_arr; void armci_init_fence() { -#if defined (DATA_SERVER) || defined(PORTALS) +#if defined (DATA_SERVER) #if defined(THREAD_SAFE) _armci_fence_arr = calloc(armci_nproc*armci_user_threads.max,1); #else @@ -41,26 +37,15 @@ void armci_init_fence() void armci_finalize_fence() { -#if defined (DATA_SERVER) || defined(PORTALS) +#if defined (DATA_SERVER) free(_armci_fence_arr); _armci_fence_arr = NULL; #endif } -#ifdef PORTALS -void armci_update_fence_array(int proc, int inc) -{ - if (inc) - FENCE_ARR(proc)++; - else - FENCE_ARR(proc)--; -} -#endif - - void PARMCI_Fence(int proc) { -#if defined(DATA_SERVER) && !(defined(GM) && defined(ACK_FENCE)) +#if defined(DATA_SERVER) if(FENCE_ARR(proc) && (armci_nclus >1)){ int cluster = armci_clus_id(proc); @@ -71,11 +56,6 @@ void PARMCI_Fence(int proc) bzero(&FENCE_ARR(master), armci_clus_info[cluster].nslave); } -#elif defined(ARMCIX) - ARMCIX_Fence (proc); -#elif defined(BGML) - BGML_WaitProc(proc); - MEM_FENCE; #else FENCE_NODE(proc); MEM_FENCE; @@ -90,11 +70,7 @@ void PARMCI_GroupFence(ARMCI_Group *group) void PARMCI_AllFence() { -#if defined(ARMCIX) - ARMCIX_AllFence (); -#elif defined(BGML) - BGML_WaitAll(); -#elif defined(LAPI) || defined(CLUSTER) +#if defined(CLUSTER) int p; for(p = 0;p < armci_nproc; p++) { @@ -106,21 +82,15 @@ void PARMCI_AllFence() void PARMCI_Barrier() { - if (armci_nproc==1) - return; -#if defined(BGML) - BGML_WaitAll(); - bgml_barrier(3); -#else + if (armci_nproc==1) return; PARMCI_AllFence(); -# ifdef MSG_COMMS_MPI +#ifdef MSG_COMMS_MPI MPI_Barrier(ARMCI_COMM_WORLD); -# else +#else { long type=ARMCI_TAG; tcg_synch(type); } -# endif #endif MEM_FENCE; } diff --git a/armci/src/progress/wait.c b/armci/src/progress/wait.c index d17f76881..46b0f0909 100644 --- a/armci/src/progress/wait.c +++ b/armci/src/progress/wait.c @@ -10,12 +10,6 @@ int PARMCI_Wait(armci_hdl_t* usr_hdl) int success=0; int direct = SAMECLUSNODE(nb_handle->proc); -#ifdef BGML - assert(nb_handle->cmpl_info); - BGML_Wait(&(nb_handle->count)); - return(success); -#else - if(direct) { return(success); } @@ -35,7 +29,7 @@ int PARMCI_Wait(armci_hdl_t* usr_hdl) ARMCI_NB_WAIT(nb_handle->cmpl_info); return(success); } -#if defined(LAPI) || defined(ALLOW_PIN) || defined(ARMCIX) +#if defined(ALLOW_PIN) if(nb_handle->tag!=0 && nb_handle->bufid==NB_NONE){ ARMCI_NB_WAIT(nb_handle->cmpl_info); return(success); @@ -47,7 +41,6 @@ int PARMCI_Wait(armci_hdl_t* usr_hdl) COMPLETE_HANDLE(nb_handle->bufid,nb_handle->tag,(&success)); #endif } -#endif return(success); } @@ -68,9 +61,6 @@ armci_hdl_t *armci_set_implicit_handle (int op, int proc) PARMCI_Wait(&armci_nb_handle[i]); nbh = (armci_ihdl_t)&armci_nb_handle[i]; -#ifdef BGML - nbh->count=0; -#endif nbh->tag = GET_NEXT_NBTAG(); nbh->op = op; nbh->proc = proc; @@ -83,11 +73,6 @@ armci_hdl_t *armci_set_implicit_handle (int op, int proc) /* wait for all non-blocking operations to finish */ int PARMCI_WaitAll (void) { -#ifdef BGML - BGML_WaitAll(); -#elif ARMCIX - ARMCIX_WaitAll (); -#else int i; if(impcount) { for(i=0; icount; -#else int direct=SAMECLUSNODE(nb_handle->proc); if(direct)return(success); if(nb_handle) { @@ -144,18 +119,11 @@ int PARMCI_Test(armci_hdl_t *usr_hdl) ARMCI_NB_TEST(nb_handle->cmpl_info,&success); return(success); } -#ifdef LAPI - if(nb_handle->tag!=0 && nb_handle->bufid==NB_NONE){ - ARMCI_NB_TEST(nb_handle->cmpl_info,&success); - return(success); - } -#endif #endif #ifdef TEST_HANDLE TEST_HANDLE(nb_handle->bufid,nb_handle->tag,(&success)); #endif } -#endif return(success); } diff --git a/armci/src/util/armci_cpp b/armci/src/util/armci_cpp index 355659ae6..3535ebaa3 100755 --- a/armci/src/util/armci_cpp +++ b/armci/src/util/armci_cpp @@ -15,7 +15,7 @@ # # An example invocation of armci_cpp is as follows: # -# armci_cpp __ia64 LINUX64 LINUX SYSV PTHREADS DATA_SERVER \ +# armci_cpp LINUX64 LINUX SYSV PTHREADS DATA_SERVER \ # SERVER_THREAD _REENTRANT VAPI ALLOW_PIN PEND_BUFS REF_THREAD_SAFE \ # MPI OPENIB # diff --git a/armci/src/util/threads.c b/armci/src/util/threads.c index ff43db694..2e3cd56ba 100644 --- a/armci/src/util/threads.c +++ b/armci/src/util/threads.c @@ -82,7 +82,7 @@ void armci_finalize_threads() /* calling armci_thread_idx for every function that accesses thread-private data * might be expensive -- needs optiomization */ -INLINE int armci_thread_idx() +int armci_thread_idx() { int i, n = ARMCI_MIN(armci_user_threads.avail, armci_user_threads.max); thread_id_t id = THREAD_ID_SELF(); @@ -96,7 +96,7 @@ INLINE int armci_thread_idx() return armci_register_thread(id); } -INLINE int armci_register_thread(thread_id_t id) +int armci_register_thread(thread_id_t id) { int i; diff --git a/armci/src/xfer/rmw.c b/armci/src/xfer/rmw.c index d8cf10cd6..78143878f 100644 --- a/armci/src/xfer/rmw.c +++ b/armci/src/xfer/rmw.c @@ -12,26 +12,13 @@ # include "atomics-i386.h" #endif - -/* enable use of newer interfaces in SHMEM */ -#ifndef CRAY -#ifndef LIBELAN_ATOMICS -/* manpages for shmem_fadd exist on the T3E but library code does not */ -#define SHMEM_FADD -#endif -#endif - - /* global scope to prevent compiler optimization of volatile code */ int _a_temp; long _a_ltemp; -/* JAD -- DCMF implements its own rmw - there were linking errors with missing atomic_fetch_and_add for DCMF */ -#if !ARMCIX void armci_generic_rmw(int op, void *ploc, void *prem, int extra, int proc) { -#if defined(CLUSTER) && !defined(SGIALTIX) +#if defined(CLUSTER) int lock = (proc-armci_clus_info[armci_clus_id(proc)].master)%NUM_LOCKS; #else int lock = 0; @@ -42,7 +29,7 @@ void armci_generic_rmw(int op, void *ploc, void *prem, int extra, int proc) switch (op) { case ARMCI_FETCH_AND_ADD: #if (defined(__i386__) || defined(__x86_64__)) && !defined(NO_I386ASM) -#if (defined(__GNUC__) || defined(__INTEL_COMPILER__) ||defined(__PGIC__)) && !defined(PORTALS) && !defined(NO_I386ASM) +#if (defined(__GNUC__) || defined(__INTEL_COMPILER__) ||defined(__PGIC__)) && !defined(NO_I386ASM) if(SERVER_CONTEXT || armci_nclus == 1){ /* *(int*)ploc = __sync_fetch_and_add((int*)prem, extra); */ atomic_fetch_and_add(prem, ploc, extra, sizeof(int)); @@ -62,7 +49,7 @@ void armci_generic_rmw(int op, void *ploc, void *prem, int extra, int proc) armci_put(&_a_ltemp,prem,sizeof(long),proc); break; case ARMCI_SWAP: -#if (defined(__i386__) || defined(__x86_64__)) && !defined(PORTALS) && !defined(NO_I386ASM) +#if (defined(__i386__) || defined(__x86_64__)) && !defined(NO_I386ASM) if(SERVER_CONTEXT || armci_nclus==1){ atomic_exchange(ploc, prem, sizeof(int)); } @@ -87,40 +74,11 @@ void armci_generic_rmw(int op, void *ploc, void *prem, int extra, int proc) PARMCI_Fence(proc); NATIVE_UNLOCK(lock,proc); } -#endif /* ARMCIX */ int PARMCI_Rmw(int op, void *ploc, void *prem, int extra, int proc) { -#ifdef LAPI64 - extern int LAPI_Rmw64(lapi_handle_t hndl, RMW_ops_t op, uint tgt, - long long *tgt_var, - long long *in_val, long long *prev_tgt_val, lapi_cntr_t *org_cntr); - long long llval, *pllarg = (long long*)ploc, lltmp; -/* enable RMWBROKEN if RMW fails for long datatype */ -#define RMWBROKEN_ -#endif - -#ifdef LAPI - int ival, rc, opcode=SWAP, *parg=ploc; - lapi_cntr_t req_id; -#elif defined(_CRAYMPP) || defined(QUADRICS) || defined(CRAY_SHMEM) - int ival; - long lval; -#endif - -#if defined(LAPI64) && defined(RMWBROKEN) -/* hack for rmw64 BROKEN: we operate on least significant part of long */ -if(op==ARMCI_FETCH_AND_ADD_LONG || op==ARMCI_SWAP_LONG){ - ploc[0]=0; - ploc[1]=0; - ploc++; - parg ++; prem++; -} -#endif - -#if defined(CLUSTER) && !defined(LAPI) && !defined(QUADRICS) &&!defined(CYGWIN)\ - && !defined(HITACHI) && !defined(CRAY_SHMEM) && !defined(PORTALS) +#if defined(CLUSTER) && !defined(CYGWIN) if(!SAMECLUSNODE(proc)){ armci_rem_rmw(op, ploc, prem, extra, proc); return 0; @@ -130,163 +88,15 @@ if(op==ARMCI_FETCH_AND_ADD_LONG || op==ARMCI_SWAP_LONG){ #ifdef REGION_ALLOC if(SAMECLUSNODE(proc)) (void)armci_region_fixup(proc,&prem); #endif -#ifdef BGML - BGML_Op oper; - BGML_Dt dt; - void *temp; - long ltemp; - switch(op) - { - - case ARMCI_FETCH_AND_ADD: - case ARMCI_FETCH_AND_ADD_LONG: - dt=BGML_SIGNED_INT; - temp=(int *)&extra; - oper=BGML_SUM; - break; -#if 0 - case ARMCI_FETCH_AND_ADD_LONG: - fprintf(stderr,"adding int to longs....\n"); - dt=BGML_SIGNED_LONG; - ltemp=(long)extra; - temp=<emp; - oper=BGML_SUM; - break; -#endif - case ARMCI_SWAP: - case ARMCI_SWAP_LONG: - dt=BGML_SIGNED_INT; - oper=BGML_NOOP; - temp=(int *)ploc; - break; -#if 0 - case ARMCI_SWAP_LONG: - fprintf(stderr,"long armci_swap\n"); - dt=BGML_SIGNED_LONG; - oper=BGML_NOOP; - temp=(long *)ploc; - break; -#endif - default: - ARMCI_Error("Invalid operation for RMW", op); - } - - /* int PARMCI_Rmw(int op, int *ploc, int *prem, int extra, int proc) */ - /* assumes ploc will change - dstbuf=prem, input=temp(extra), output=ploc - val=ploc, arr[0]=prem, 1=extra */ - - int me=armci_msg_me(); - BG1S_t request; - unsigned done=1; - BGML_Callback_t cb_wait={wait_callback, &done}; - BG1S_rmw(&request, proc, 0, prem, temp, ploc, oper, dt, &cb_wait, 1); - BGML_Wait(&done); -#elif ARMCIX - ARMCIX_Rmw(op, ploc, prem, extra, proc); -#else switch (op) { -# if defined(QUADRICS) || defined(_CRAYMPP) || defined(CRAY_SHMEM) - case ARMCI_FETCH_AND_ADD: -#ifdef SHMEM_FADD - /* printf(" calling intfdd arg %x %ld \n", prem, *prem); */ - *(int*) ploc = shmem_int_fadd(prem, extra, proc); -#elif defined(LIBELAN_ATOMICS) - *(int*) ploc = elan_int_fadd(prem, extra, proc); -#else - while ( (ival = shmem_int_swap(prem, INT_MAX, proc) ) == INT_MAX); - (void) shmem_int_swap(prem, ival +extra, proc); - *(int*) ploc = ival; -#endif - break; - case ARMCI_FETCH_AND_ADD_LONG: -#ifdef SHMEM_FADD - *(long*) ploc = shmem_long_fadd( (long*)prem, (long) extra, proc); -#elif defined(LIBELAN_ATOMICS) - *(long*) ploc = elan_long_fadd( (long*)prem, (long) extra, proc); -#else - while ((lval=shmem_long_swap((long*)prem,LONG_MAX,proc)) == LONG_MAX); - (void) shmem_long_swap((long*)prem, (lval + extra), proc); - *(long*)ploc = lval; -#endif - break; - case ARMCI_SWAP: -#ifdef LIBELAN_ATOMICS - *(int*)ploc = elan_int_swap((int*)prem, *(int*)ploc, proc); -#else - *(int*)ploc = shmem_int_swap((int*)prem, *(int*)ploc, proc); -#endif - break; - case ARMCI_SWAP_LONG: -#ifdef LIBELAN_ATOMICS - *(long*)ploc = elan_long_swap((long*)prem, *(long*)ploc, proc); -#else - *(long*)ploc = shmem_long_swap((long*)prem, *(long*)ploc, proc); -#endif - break; -# elif defined(LAPI) -# if defined(LAPI64) && !defined(RMWBROKEN) - case ARMCI_FETCH_AND_ADD_LONG: - opcode = FETCH_AND_ADD; - lltmp = (long long)extra; - pllarg = &lltmp; - case ARMCI_SWAP_LONG: -#if 0 - printf("before opcode=%d rem=%ld, loc=(%ld,%ld) extra=%ld\n", - opcode,*prem,*(long*)ploc,llval, lltmp); - rc= sizeof(long); - PARMCI_Get(prem, &llval, rc, proc); - printf("%d:rem val before %ld\n",armci_me, llval); fflush(stdout); -#endif - if( rc = LAPI_Setcntr(lapi_handle,&req_id,0)) - armci_die("rmw setcntr failed",rc); - if( rc = LAPI_Rmw64(lapi_handle, opcode, proc, (long long*)prem, - pllarg, &llval, &req_id)) armci_die("rmw failed",rc); - if( rc = LAPI_Waitcntr(lapi_handle, &req_id, 1, NULL)) - armci_die("rmw wait failed",rc); - - *(long*)ploc = (long)llval; -#if 0 - rc= sizeof(long); - PARMCI_Get(prem, &lltmp, rc, proc); - printf("%d:after rmw remote val from rmw=%ld and get=%ld extra=%d\n", - armci_me,llval, lltmp,extra); -#endif - break; -# endif - /************** here sizeof(long)= sizeof(int) **************/ - case ARMCI_FETCH_AND_ADD: -# if !defined(LAPI64) || defined(RMWBROKEN) - case ARMCI_FETCH_AND_ADD_LONG: -# endif - opcode = FETCH_AND_ADD; - parg = &extra; - case ARMCI_SWAP: -# if !defined(LAPI64) || defined(RMWBROKEN) - case ARMCI_SWAP_LONG: -# endif - /* Within SMPs LAPI_Rmw needs target's address. */ - if(SAMECLUSNODE(proc)) proc=armci_me; - - if( rc = LAPI_Setcntr(lapi_handle,&req_id,0)) - armci_die("rmw setcntr failed",rc); - if( rc = LAPI_Rmw(lapi_handle, opcode, proc, prem, - parg, &ival, &req_id)) armci_die("rmw failed",rc); - if( rc = LAPI_Waitcntr(lapi_handle, &req_id, 1, NULL)) - armci_die("rmw wait failed",rc); - * (int *)ploc = ival; - break; -# else case ARMCI_FETCH_AND_ADD: case ARMCI_FETCH_AND_ADD_LONG: case ARMCI_SWAP: case ARMCI_SWAP_LONG: armci_generic_rmw(op, ploc, prem, extra, proc); break; -# endif default: armci_die("rmw: operation not supported",op); } -#endif /*bgml*/ return 0; } diff --git a/armci/src/xfer/strided.c b/armci/src/xfer/strided.c index 397be0204..668a94aa3 100644 --- a/armci/src/xfer/strided.c +++ b/armci/src/xfer/strided.c @@ -27,17 +27,9 @@ #if defined(CLIENT_BUF_BYPASS) #define CAN_REQUEST_DIRECTLY _armci_bypass #else -# if defined(HITACHI) -# define CAN_REQUEST_DIRECTLY 0 -# else # define CAN_REQUEST_DIRECTLY 1 -# endif #endif -#if defined(BGML) || defined(ARMCIX) -#define PREPROCESS_STRIDED(tmp_count) -#define POSTPROCESS_STRIDED(tmp_count) -#else #define BIGINT 2147483647 #define PREPROCESS_STRIDED(tmp_count) { \ tmp_count=0; \ @@ -54,7 +46,6 @@ } \ } #define POSTPROCESS_STRIDED(tmp_count) if(tmp_count)seg_count[1]=tmp_count -#endif #define SERVER_GET 1 #define SERVER_NBGET 2 @@ -139,21 +130,7 @@ int armci_iwork[MAX_STRIDE_LEVEL]; static void armci_copy_2D(int op, int proc, void *src_ptr, void *dst_ptr, int bytes, int count, int src_stride, int dst_stride) { -#ifdef LAPI - int armci_th_idx = ARMCI_THREAD_IDX; -#endif - -#ifdef LAPI2__ -# define COUNT 1 -#else -# define COUNT count -#endif - -#ifdef __crayx1 - int shmem = 1; -#else int shmem = SAMECLUSNODE(proc); -#endif if(shmem) { @@ -166,23 +143,7 @@ static void armci_copy_2D(int op, int proc, void *src_ptr, void *dst_ptr, }else { if(bytes < THRESH){ /* low-latency copy for small data segments */ -#if defined(__crayx1) - if( !(bytes%sizeof(float)) ) { - float *ps=(float*)src_ptr; - float *pd=(float*)dst_ptr; - long fsstride = src_stride/sizeof(float); - long fdstride = dst_stride/sizeof(float); - int j; - - for (j = 0; j < count; j++){ - int i; -#pragma _CRI concurrent - for(i=0;i=1 && count[0]<=LONG_PUT_THRESHOLD))) -# elif defined(DOELAN4) && !defined(NB_NONCONT) - /*if(!ARMCI_ACC(op) && !SAMECLUSNODE(proc) && nb_handle && stride_levels<2)*/ - if(!ARMCI_ACC(op) && !SAMECLUSNODE(proc) && stride_levels<2) -# else - if(!SAMECLUSNODE(proc)) -# endif - armci_network_strided(op,scale,proc,src_ptr,src_stride_arr,dst_ptr, - dst_stride_arr,count,stride_levels,nb_handle); - else -# endif switch (stride_levels) { case 0: /* 1D copy */ @@ -574,22 +481,6 @@ void armci_acc_1D(int op, void *scale, int proc, void *src, void *dst, int bytes } } - - /* deal with non-blocking loads and stores */ -#if defined(LAPI) || defined(_ELAN_PUTGET_H) || defined(NB_NONCONT) -# if defined(LAPI) - if(!nb_handle) -# endif - { - if(!(SAMECLUSNODE(proc))){ - if(op == GET){ - WAIT_FOR_GETS; /* wait for data arrival */ - }else { - WAIT_FOR_PUTS; /* data must be copied out*/ - } - } - } -#endif /* if(proc!=armci_me) INTR_ON;*/ @@ -636,17 +527,10 @@ static int _armci_puts(void *src_ptr, if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4; if(proc<0)return FAIL5; -#ifdef __crayx1 - if(!stride_levels) { - memcpy(dst_ptr, src_ptr,count[0]); - return 0; - } -#endif - PREPROCESS_STRIDED(tmp_count); -# if (!defined(QUADRICS) || defined(PACKPUT)) +# if defined(PACKPUT) direct=SAMECLUSNODE(proc); -# endif /*(!QUADRICS||!PACKPUT)&&!PORTALS*/ +# endif /*PACKPUT*/ if(put_flag) dassert(1,nbh==NULL); @@ -674,72 +558,7 @@ static int _armci_puts(void *src_ptr, nbh->bufid=NB_NONE; } } - -#ifdef BGML - if(nbh) { - nbh->count = 1; - BGML_Callback_t cb_wait={wait_callback, &nbh->count}; - BG1S_MemputS (&nbh->cmpl_info, proc, - src_ptr, src_stride_arr, - dst_ptr, dst_stride_arr, - seg_count, stride_levels, - 0, &cb_wait, 1); - } - else if(!stride_levels) { - unsigned temp_count=1; - BGML_Callback_t cb_wait={wait_callback, &temp_count}; - BG1S_t request; - BGML_CriticalSection_enter(); - BG1S_Memput(&request, proc, src_ptr, 0, dst_ptr, count[0], &cb_wait, 1); - /*BGML_Wait(&count);*/ - while (temp_count) BGML_Messager_advance(); - BGML_CriticalSection_exit(); - } - else { - armci_hdl_t nb_handle; - ARMCI_INIT_HANDLE(&nb_handle); - PARMCI_NbPutS(src_ptr, src_stride_arr, dst_ptr, dst_stride_arr, count, - stride_levels, proc, &nb_handle); - PARMCI_Wait(&nb_handle); - } - if(put_flag) { /*=>!nbh*/ - PARMCI_Fence(proc); - PARMCI_Put(&put_flag->val,put_flag->ptr,sizeof(int),proc); - } -#elif ARMCIX - if(nbh) - ARMCIX_NbPutS (src_ptr, src_stride_arr, dst_ptr, dst_stride_arr, count, stride_levels, proc, nbh); - else if(!stride_levels) { - ARMCIX_Put(src_ptr, dst_ptr, count[0], proc); - } - else { - ARMCIX_PutS (src_ptr, src_stride_arr, dst_ptr, dst_stride_arr, count, stride_levels, proc); - } - if(put_flag) { /*=>!nbh*/ - PARMCI_Fence(proc); - PARMCI_Put(&put_flag->val,put_flag->ptr,sizeof(int),proc); - } -#else /*BGML*/ - - /* use direct protocol for remote access when performance is better */ -# if defined(LAPI) || defined(DOELAN4) - if(!direct) { - switch(stride_levels) { - case 0: -# ifndef LAPI_RDMA - direct =1; -# endif - break; - case 1: if((count[1]LONG_PUT_THRESHOLD) direct =1; break; - default: if(count[0]> LONG_PUT_THRESHOLD )direct=1; break; - } - } -# endif /*LAPI||DOELAN4*/ -# ifdef PORTALS - if(stride_levels) direct=1; -# endif -# if !defined(LAPI2) || defined(LAPI_RDMA) if(!direct){ # ifdef ALLOW_PIN /*if we can pin, we do*/ if(!stride_levels && @@ -817,9 +636,7 @@ static int _armci_puts(void *src_ptr, # endif /*VAPI*/ # endif /*ALLOW_PIN*/ } -#endif /* !LAPI2||LAPI_RDMA */ -# ifndef LAPI2 if(!direct){ if(nbh) { DO_FENCE(proc,SERVER_PUT); } else { DO_FENCE(proc,SERVER_NBPUT); } @@ -851,14 +668,10 @@ static int _armci_puts(void *src_ptr, } } else -# endif /*!LAPI*/ { if(!nbh && stride_levels == 0) { armci_copy_2D(PUT, proc, src_ptr, dst_ptr, count[0], 1, count[0], count[0]); -# if defined(LAPI) || defined(_ELAN_PUTGET_H) - if(proc != armci_me) { WAIT_FOR_PUTS; } -# endif /*LAPI||_ELAN_PUTGET_H*/ } else { rc = armci_op_strided( PUT, NULL, proc, src_ptr, src_stride_arr, @@ -870,7 +683,6 @@ static int _armci_puts(void *src_ptr, PARMCI_Put(&put_flag->val,put_flag->ptr,sizeof(int),proc); } } -#endif /*BGML*/ POSTPROCESS_STRIDED(tmp_count); if(rc) return FAIL6; else return 0; @@ -978,80 +790,22 @@ static int _armci_accs( int optype, void *scale, } PREPROCESS_STRIDED(tmp_count); -#ifdef BGML - armci_ihdl_t inbh; - armci_hdl_t tmp_hdl; - if(nbh) inbh = nbh; - else { - ARMCI_INIT_HANDLE(&tmp_hdl); - inbh = (armci_ihdl_t)&tmp_hdl; - } - inbh->count=1; - BGML_Callback_t cb_wait={wait_callback, &inbh->count}; - - BGML_Op oper1=BGML_PROD; - BGML_Op oper2=BGML_SUM; - BGML_Dt dt; - switch(optype) { - case ARMCI_ACC_INT: - case ARMCI_ACC_LNG: - dt=BGML_SIGNED_INT; - break; -#if 0 - case ARMCI_ACC_LNG: - dt=BGML_SIGNED_LONG; - break; -#endif - case ARMCI_ACC_DBL: - dt=BGML_DOUBLE; - break; - case ARMCI_ACC_CPL: - dt=BGML_SINGLE_COMPLEX; - break; - case ARMCI_ACC_DCP: - dt=BGML_DOUBLE_COMPLEX; - break; - case ARMCI_ACC_FLT: - dt=BGML_FLOAT; - break; - default: - assert(0); - } - - BG1S_AccumulateS (&inbh->cmpl_info, proc, - src_ptr, src_stride_arr, - dst_ptr, dst_stride_arr, - seg_count, stride_levels, - scale, 0, - dt, oper1, oper2, - &cb_wait, 1); - - if(!nbh) PARMCI_Wait(&tmp_hdl); -#elif ARMCIX - if(!nbh) - ARMCIX_AccS (optype, scale, src_ptr, src_stride_arr, dst_ptr, - dst_stride_arr, count, stride_levels, proc); - else - ARMCIX_NbAccS (optype, scale, src_ptr, src_stride_arr, dst_ptr, - dst_stride_arr, count, stride_levels, proc, nbh); -#else direct=SAMECLUSNODE(proc); -# if defined(ACC_COPY) && !defined(ACC_SMP) +#if defined(ACC_COPY) if(armci_me != proc) direct=0; -# endif /*ACC_COPY && !ACC_SMP*/ - - if(direct) +#endif /*ACC_COPY*/ + + if(direct) { rc = armci_op_strided(optype,scale, proc, src_ptr, src_stride_arr,dst_ptr, dst_stride_arr, count, stride_levels,1,NULL); - else{ + } else { if(nbh) { DO_FENCE(proc,SERVER_NBPUT); } else { DO_FENCE(proc,SERVER_PUT); } rc = armci_pack_strided(optype,scale,proc,src_ptr, src_stride_arr,dst_ptr, dst_stride_arr,count,stride_levels,NULL,-1,-1,-1,nbh); } -#endif /*BGML*/ POSTPROCESS_STRIDED(tmp_count); if(rc) return FAIL6; else return 0; @@ -1096,12 +850,8 @@ int PARMCI_Put_flag(void *src, void* dst,int bytes,int *f,int v,int proc) { int PARMCI_Get(void *src, void* dst, int bytes, int proc) { int rc=0; -#ifdef __crayx1 - memcpy(dst,src,bytes); -#else rc = PARMCI_GetS(src, NULL, dst, NULL, &bytes, 0, proc); -#endif - + dassert(1,rc==0); return rc; } @@ -1341,22 +1091,7 @@ int PARMCI_NbGetS( void *src_ptr, /* pointer to 1st segment at source*/ if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4; if(proc<0)return FAIL5; -#ifdef BGML - armci_ihdl_t nbh; - set_nbhandle(&nbh, usr_hdl, PUT, proc); - nbh->count=1; - BGML_Callback_t cb_wait={wait_callback, &nbh->count}; - - BG1S_MemgetS (&nbh->cmpl_info, proc, - src_ptr, src_stride_arr, - dst_ptr, dst_stride_arr, - seg_count, stride_levels, - 0, &cb_wait, 1); -#else - -#if !defined(QUADRICS) direct=SAMECLUSNODE(proc); -#endif PREPROCESS_STRIDED(tmp_count); /* aggregate get */ @@ -1384,17 +1119,6 @@ int PARMCI_NbGetS( void *src_ptr, /* pointer to 1st segment at source*/ nb_handle = (armci_ihdl_t)armci_set_implicit_handle(GET, proc); } -#ifdef LAPI_RDMA - if(stride_levels == 0 || count[0] > LONG_GET_THRESHOLD) - direct=0; -#endif - -#ifdef PORTALS - if(stride_levels) - direct=1; -#endif - -#if !defined(LAPI2) || defined(LAPI_RDMA) if(!direct){ # ifdef ALLOW_PIN if(!stride_levels && @@ -1406,54 +1130,30 @@ int PARMCI_NbGetS( void *src_ptr, /* pointer to 1st segment at source*/ } # endif } -#endif /*!LAPI||LAPI_RDMA */ -#ifndef LAPI2 if(!direct){ DO_FENCE(proc,SERVER_NBGET); #if defined(DATA_SERVER) && (defined(SOCKETS) || defined(CLIENT_BUF_BYPASS) ) /* for larger strided or 1D reqests buffering can be avoided to send data * we can try to bypass the packetization step and send request directly */ - /* JAD 4/17/18 - * This code was never executed, shown by gcc -Werror=type-limits - * 'comparison is always false due to limited range of data type'. - * count[0] is an int, LONG_GET_THRESHOLD is 2147483648 (int max). - * So this is always false. */ -#if 0 - if(CAN_REQUEST_DIRECTLY && ((count[0]> LONG_GET_THRESHOLD) || - (stride_levels && count[0]>LONG_GET_THRESHOLD_STRIDED) ) ) { - - int nobuf =1; /* tells the sending routine not to buffer */ - rc = armci_rem_strided(GET, NULL, proc,src_ptr,src_stride_arr,dst_ptr, - dst_stride_arr, count, stride_levels, - (ext_header_t*)0,nobuf,nb_handle); - if(rc) goto DefaultPath; /* attempt to avoid buffering failed */ - - }else - DefaultPath: /* standard buffered path */ -#endif -#endif -#ifdef ARMCIX - rc = ARMCIX_NbGetS (src_ptr, src_stride_arr, dst_ptr, dst_stride_arr, count, stride_levels, proc, nb_handle); -#else rc = armci_pack_strided(GET, NULL, proc, src_ptr, src_stride_arr, dst_ptr,dst_stride_arr,count,stride_levels, NULL,-1,-1,-1,nb_handle); -#endif - }else + } else #else /* avoid LAPI_GetV */ - if(stride_levels==1 && count[0]>320 && !direct) + if(stride_levels==1 && count[0]>320 && !direct) { ARMCI_REM_GET(proc,src_ptr,src_stride_arr,dst_ptr, dst_stride_arr, count, stride_levels, nb_handle); - else + } else #endif + { rc = armci_op_strided(GET, NULL, proc, src_ptr, src_stride_arr, dst_ptr, dst_stride_arr,count, stride_levels,0,nb_handle); + } POSTPROCESS_STRIDED(tmp_count); -#endif /*bgml*/ if(rc) return FAIL6; else return 0; @@ -1478,7 +1178,7 @@ int PARMCI_NbAccS( int optype, /* operation */ } -#if !defined(ACC_COPY)&&!defined(CRAY_YMP)&&!defined(CYGNUS)&&!defined(CYGWIN) &&!defined(BGML)&&!defined(DCMF) +#if !defined(ACC_COPY)&&!defined(CYGNUS)&&!defined(CYGWIN) # define REMOTE_OP #endif @@ -1510,9 +1210,6 @@ int PARMCI_NbGet(void *src, void* dst, int bytes, int proc,armci_hdl_t* uhandle) static void _armci_op_value(int op, void *src, void *dst, int proc, int bytes, armci_hdl_t *usr_hdl) { int rc=0,pv=0; -#ifdef LAPI - int armci_th_idx = ARMCI_THREAD_IDX; -#endif armci_ihdl_t nbh = (armci_ihdl_t)usr_hdl; if(!nbh) { @@ -1533,50 +1230,18 @@ static void _armci_op_value(int op, void *src, void *dst, int proc, nbh->bufid=NB_NONE; } } -#if defined(REMOTE_OP) && !defined(QUADRICS) +#if defined(REMOTE_OP) rc = armci_rem_strided(op, NULL, proc, src, NULL, dst, NULL, &bytes, 0, NULL, 0, nbh); if(rc) armci_die("ARMCI_Value: armci_rem_strided incomplete", FAIL6); #else if(op==PUT) { UPDATE_FENCE_STATE(proc, PUT, 1); -# ifdef LAPI - SET_COUNTER(ack_cntr[armci_th_idx], 1); -# endif -# if defined(BGML) || defined(ARMCIX) - if(usr_hdl) PARMCI_NbPut(src,dst,bytes,proc,usr_hdl); - else PARMCI_Put(src,dst,bytes,proc); -# else armci_put(src, dst, bytes, proc); -# endif } else { -# ifdef LAPI - SET_COUNTER(get_cntr[armci_th_idx], 1); -# endif -# if defined(BGML) || defined(ARMCIX) - if(usr_hdl) PARMCI_NbGet(src,dst,bytes,proc,usr_hdl); - else PARMCI_Get(src,dst,bytes,proc); -# else armci_get(src, dst, bytes, proc); -# endif } - - /* deal with non-blocking loads and stores */ -# if defined(LAPI) || defined(_ELAN_PUTGET_H) -# ifdef LAPI - if(!nbh) -# endif - { - if(proc != armci_me){ - if(op == GET){ - WAIT_FOR_GETS; /* wait for data arrival */ - }else { - WAIT_FOR_PUTS; /* data must be copied out*/ - } - } - } -# endif #endif } diff --git a/armci/src/xfer/vector.c b/armci/src/xfer/vector.c index f7654811c..cfadfd0c7 100644 --- a/armci/src/xfer/vector.c +++ b/armci/src/xfer/vector.c @@ -239,9 +239,6 @@ int armci_copy_vector(int op, /* operation code */ ) { int i,s,shmem= SAMECLUSNODE(proc); -#ifdef LAPI - int armci_th_idx = ARMCI_THREAD_IDX; -#endif if(shmem ){ /* local/shared memory copy */ @@ -263,40 +260,22 @@ int armci_copy_vector(int op, /* operation code */ for(i = 0; i< len; i++){ -#ifdef QUADRICS - armcill_putv(proc, darr[i].bytes, darr[i].ptr_array_len, - darr[i].src_ptr_array, darr[i].dst_ptr_array); -#else -# ifdef LAPI - SET_COUNTER(ack_cntr[armci_th_idx],darr[i].ptr_array_len); -# endif UPDATE_FENCE_STATE(proc, PUT, darr[i].ptr_array_len); for( s=0; s< darr[i].ptr_array_len; s++){ armci_put(darr[i].src_ptr_array[s],darr[i].dst_ptr_array[s], darr[i].bytes, proc); } -#endif } break; case GET: for(i = 0; i< len; i++){ - -#ifdef QUADRICS - armcill_getv(proc, darr[i].bytes, darr[i].ptr_array_len, - darr[i].src_ptr_array, darr[i].dst_ptr_array); -#else -# ifdef LAPI - SET_COUNTER(get_cntr[armci_th_idx],darr[i].ptr_array_len); -# endif - for( s=0; s< darr[i].ptr_array_len; s++){ armci_get(darr[i].src_ptr_array[s],darr[i].dst_ptr_array[s], darr[i].bytes,proc); } -#endif } break; @@ -305,14 +284,6 @@ int armci_copy_vector(int op, /* operation code */ } } -#ifdef LAPI - if(!shmem){ - - if(op == GET) CLEAR_COUNTER(get_cntr[armci_th_idx]); /* wait for data arrival */ - if(op == PUT) CLEAR_COUNTER(ack_cntr[armci_th_idx]); /* data must be copied out*/ - } -#endif - return 0; } @@ -367,44 +338,27 @@ int PARMCI_PutV( armci_giov_t darr[], /* descriptor array */ if(proc<0 || proc >= armci_nproc)return FAIL5; ORDER(PUT,proc); /* ensure ordering */ -#ifndef QUADRICS direct=SAMECLUSNODE(proc); -#endif /* use direct protocol for remote access when performance is better */ -# if defined(LAPI) || defined(PORTALS) -# if defined(PORTALS) - direct=1; -# else - if(!direct) - if(len <5 || darr[0].ptr_array_len <5) direct=1; -# endif -# endif - -#ifdef BGML - armci_hdl_t nb_handle; - ARMCI_INIT_HANDLE(&nb_handle); - PARMCI_NbPutV(darr, len, proc, &nb_handle); - PARMCI_Wait(&nb_handle); -#elif ARMCIX - ARMCIX_PutV (darr, len, proc); -#else - if(direct) + if (direct) { rc = armci_copy_vector(PUT, darr, len, proc); - else{ -#if defined(DATA_SERVER) && defined(SOCKETS) && defined(USE_SOCKET_VECTOR_API) - /*500 is very conservative, the number here should be modified to be - based on the size of send/recv buffer*/ - if(totvec<500) + } else { +#if defined(DATA_SERVER) && defined(SOCKETS) && defined(USE_SOCKET_VECTOR_API) + /* 500 is very conservative, the number here should be modified to be + based on the size of send/recv buffer */ + if(totvec<500) { rc = armci_rem_vector(PUT, NULL, darr, len, proc, 1,NULL); - else -#endif + } else +#endif + { rc = armci_pack_vector(PUT, NULL, darr, len, proc,NULL); + } } -#endif - - if(rc) return FAIL6; - else return 0; + if (rc) { + return FAIL6; + } + return 0; } @@ -432,44 +386,27 @@ int PARMCI_GetV( armci_giov_t darr[], /* descriptor array */ if(proc<0 || proc >= armci_nproc)return FAIL5; ORDER(GET,proc); /* ensure ordering */ -#ifndef QUADRICS direct=SAMECLUSNODE(proc); -#endif /* use direct protocol for remote access when performance is better */ -# if defined(LAPI) || defined(PORTALS) -# if defined(PORTALS) - direct=1; -# else - if(!direct) - if(len <5 || darr[0].ptr_array_len <8) direct=1; -# endif -# endif - -#ifdef BGML - armci_hdl_t nb_handle; - ARMCI_INIT_HANDLE(&nb_handle); - PARMCI_NbGetV(darr, len, proc, &nb_handle); - PARMCI_Wait(&nb_handle); -#elif ARMCIX - ARMCIX_GetV (darr, len, proc); -#else - if(direct) + if (direct) { rc = armci_copy_vector(GET, darr, len, proc); - else{ -#if defined(DATA_SERVER) && defined(SOCKETS) && defined(USE_SOCKET_VECTOR_API) - /*500 is very conservative, the number here should be modified to be - based on the size of send/recv buffer*/ - if(totvec<500) + } else { +#if defined(DATA_SERVER) && defined(SOCKETS) && defined(USE_SOCKET_VECTOR_API) + /* 500 is very conservative, the number here should be modified to be + based on the size of send/recv buffer*/ + if (totvec<500) { rc = armci_rem_vector(GET, NULL, darr, len, proc,1,NULL); - else -#endif - rc = armci_pack_vector(GET, NULL, darr, len, proc,NULL); - } - + } else #endif + { + rc = armci_pack_vector(GET, NULL, darr, len, proc,NULL); + } + } - if(rc) return FAIL6; - else return 0; + if(rc) { + return FAIL6; + } + return 0; } @@ -495,28 +432,21 @@ int PARMCI_AccV( int op, /* oeration code */ ORDER(op,proc); /* ensure ordering */ direct=SAMECLUSNODE(proc); -#ifdef BGML - armci_hdl_t nb_handle; - ARMCI_INIT_HANDLE(&nb_handle); - PARMCI_NbAccV(op, scale, darr, len, proc, &nb_handle); - PARMCI_Wait(&nb_handle); -#elif ARMCIX - ARMCIX_AccV (op, scale, darr, len, proc); -#else - -# if defined(ACC_COPY) && !defined(ACC_SMP) + +# if defined(ACC_COPY) if(armci_me != proc) direct=0; # endif - if(direct) + if (direct) { rc = armci_acc_vector( op, scale, darr, len, proc); - else + } else { rc = armci_pack_vector(op, scale, darr, len, proc,NULL); + } -#endif - - if(rc) return FAIL6; - else return 0; + if (rc) { + return FAIL6; + } + return 0; } /*****************************************************************************/ @@ -547,9 +477,7 @@ int PARMCI_NbPutV( armci_giov_t darr[], /* descriptor array */ if(proc<0 || proc >= armci_nproc)return FAIL5; -#ifndef QUADRICS direct=SAMECLUSNODE(proc); -#endif /* aggregate put */ if(nb_handle && nb_handle->agg_flag == SET) { @@ -574,37 +502,19 @@ int PARMCI_NbPutV( armci_giov_t darr[], /* descriptor array */ nb_handle = (armci_ihdl_t)armci_set_implicit_handle(PUT, proc); } -# if defined(PORTALS) - direct=1; -# endif - - if(direct){ -#ifdef BGML - nb_handle->count = 0; -#endif + if (direct) { rc = armci_copy_vector(PUT, darr, len, proc); - } - else{ -#ifdef BGML - nb_handle->count = 1; - BGML_Callback_t cb_wait={wait_callback, &nb_handle->count}; - BGML_giov_t *array=(BGML_giov_t *)darr; - BG1S_MemputV(&nb_handle->cmpl_info, proc, len, - (BGML_giov_t *)darr, 0, &cb_wait, 1); -#elif ARMCIX - - ARMCIX_NbPutV (darr, len, proc, nb_handle); - -#else -#if defined(DATA_SERVER) && defined(SOCKETS) && defined(USE_SOCKET_VECTOR_API) - /*500 is very conservative, the number here should be modified to be - based on the size of send/recv buffer*/ - if(totvec<500) + } else{ +#if defined(DATA_SERVER) && defined(SOCKETS) && defined(USE_SOCKET_VECTOR_API) + /* 500 is very conservative, the number here should be modified to be + based on the size of send/recv buffer */ + if (totvec<500) { rc = armci_rem_vector(PUT, NULL, darr, len, proc, 1,nb_handle); - else -#endif + } else +#endif + { rc = armci_pack_vector(PUT, NULL, darr, len, proc,nb_handle); -#endif /* BGML */ + } } if(rc) return FAIL6; @@ -635,13 +545,7 @@ int PARMCI_NbGetV( armci_giov_t darr[], /* descriptor array */ if(proc<0 || proc >= armci_nproc)return FAIL5; -#ifndef QUADRICS direct=SAMECLUSNODE(proc); -#endif - -#if defined(PORTALS) - direct=1; -#endif /* aggregate get */ if(nb_handle && nb_handle->agg_flag == SET) { @@ -662,32 +566,19 @@ int PARMCI_NbGetV( armci_giov_t darr[], /* descriptor array */ nb_handle = (armci_ihdl_t)armci_set_implicit_handle(GET, proc); } - if(direct){ -#ifdef BGML - nb_handle->count = 0; -#endif + if (direct) { rc = armci_copy_vector(GET, darr, len, proc); - } - else{ -#ifdef BGML - nb_handle->count = 1; - BGML_Callback_t cb_wait={wait_callback, &nb_handle->count}; - BG1S_MemgetV(&nb_handle->cmpl_info, proc, len, - (BGML_giov_t *)darr, 0, &cb_wait, 1); -#elif ARMCIX - - ARMCIX_NbGetV (darr, len, proc, nb_handle); - -#else -#if defined(DATA_SERVER) && defined(SOCKETS) && defined(USE_SOCKET_VECTOR_API) - /*500 is very conservative, the number here should be modified to be - based on the size of send/recv buffer*/ - if(totvec<500) + } else{ +#if defined(DATA_SERVER) && defined(SOCKETS) && defined(USE_SOCKET_VECTOR_API) + /* 500 is very conservative, the number here should be modified to be + based on the size of send/recv buffer */ + if (totvec<500) { rc = armci_rem_vector(GET, NULL, darr, len, proc,1,nb_handle); - else -#endif - rc = armci_pack_vector(GET, NULL, darr, len, proc,nb_handle); -#endif /* BGML */ + } else +#endif + { + rc = armci_pack_vector(GET, NULL, darr, len, proc,nb_handle); + } } if(rc) return FAIL6; @@ -716,64 +607,10 @@ int PARMCI_NbAccV( int op, /* oeration code */ if(proc<0 || proc >= armci_nproc)return FAIL5; -#ifdef BGML - if(nb_handle){ - nb_handle->tag = GET_NEXT_NBTAG(); - nb_handle->op = op; - nb_handle->proc= proc; - nb_handle->bufid=NB_NONE; - } - else - nb_handle = (armci_ihdl_t)armci_set_implicit_handle(op, proc); - - BGML_Dt dt; - switch(op) - { - case ARMCI_ACC_INT: - dt=BGML_SIGNED_INT; - break; - case ARMCI_ACC_LNG: -#if 1 - dt=BGML_SIGNED_LONG; -#else - dt=BGML_SIGNED_INT; -#endif - break; - case ARMCI_ACC_DBL: - dt=BGML_DOUBLE; - break; - case ARMCI_ACC_CPL: - dt=BGML_SINGLE_COMPLEX; - break; - case ARMCI_ACC_FLT: - dt=BGML_FLOAT; - break; - case ARMCI_ACC_DCP: - dt=BGML_DOUBLE_COMPLEX; - break; - default: - armci_die("Unsupported data operation\n",0); - } - - nb_handle->count = 1; - BGML_Callback_t cb_wait={wait_callback, &nb_handle->count}; - BG1S_AccumulateV(&nb_handle->cmpl_info, - proc, - len, - (BGML_giov_t *)darr, - scale, - 0, - dt, - BGML_PROD, - BGML_SUM, - &cb_wait, - 1); -#else - /* ORDER(op,proc); ensure ordering */ UPDATE_FENCE_INFO(proc); direct=SAMECLUSNODE(proc); - + if(nb_handle){ nb_handle->tag = GET_NEXT_NBTAG(); nb_handle->op = op; @@ -783,19 +620,19 @@ int PARMCI_NbAccV( int op, /* oeration code */ else nb_handle = (armci_ihdl_t)armci_set_implicit_handle(op, proc); -# if defined(ACC_COPY) && !defined(ACC_SMP) +# if defined(ACC_COPY) if(armci_me != proc) direct=0; # endif - if(direct){ + if (direct) { rc = armci_acc_vector( op, scale, darr, len, proc); - } - else{ + } else { rc = armci_pack_vector(op, scale, darr, len, proc,nb_handle); } -#endif /* BGML */ - if(rc) return FAIL6; - else return 0; + if (rc) { + return FAIL6; + } + return 0; } /*****************************************************************************/ diff --git a/armci/tcgmsg/README b/armci/tcgmsg/README deleted file mode 100644 index 4e1300f37..000000000 --- a/armci/tcgmsg/README +++ /dev/null @@ -1,256 +0,0 @@ -TCGMSG migration path (January 2013) ------------------------------------- - -TCGMSG (ipcv4.0 and ipcv5.0) have become the MPI-free runtime for ARMCI. It is -now C-only i.e. no fortran code. The TCGMSG fortran interface is preserved for -backwards compatibility within the Global Arrays distribution. - -This was a minimal effort migration. MPI is the preferred runtime. - -TCGMSG Send/receive subroutines ... version 4.04 (January 1994) ---------------------------------------------------------------- - -:author: Robert J. Harrison -:address: P.O. Box 999 Richland WA 99352 K1-90 -:tel: 509-375-2037 -:fax: 509-375-6631 -:email: rj_harrison@pnl.gov - -Summary -------- - -TCGMSG is a toolkit for writing portable parallel programs using a message -passing model. Supported are a variety of common UNIX workstations, -mini-super and super computers and hetrogenous networks of the same, along -with true parallel computers such as the Touchstone Delta, the Intel iPSC, the -Kendall Square Research KSR-1, and the Alliant FX/2800 MPP system. -Applications port between all of these environments without modification to -the parallel constructs. This toolkit is available without charge, along with -a set of example 'chemistry' applications. We are using this toolkit in our -production codes, but cannot guarantee support or accept any liability for its -use. - -Release Notes (4.04) --------------------- - - 1) Several bug fixes for SUN and Intel - - 2) Fixed, but slow, KSR port ... bug KSR to write quality software - instead of hacking - - 3) Paragon port - - 4) DEC Alpha port (thanks to Nico Sanna, Rome) - - 5) Implemented bitwise OR operation for IGOP - - 6) Fixed UNIX version so that an executable invoked on its own (i.e., - without using the parallel command) functions correctly as a single - process ... this is very useful for debugging (thanks to J. Nieplocha). - - 7) Probe now formally supported on all platforms including entry in - ipcv4.0/README and the interactive test program - - -Release Notes (4.03) --------------------- - - 1) Tuning of shared memory communication mechanism. - - 2) Fixed bugs causing non-conservation of energy in MD example. - - -Release Notes (4.02) --------------------- - - 0) Note my new address. - - 1) New port to the KSR, done by KSR and integrated by me back - into the main release. This looks good, though the C - compiler will not compile the socket based code with any - optimisation and I have not done any extensive testing other - than verify the test code and examples. - - 2) Script (makep) to build a .p file for a UNIX workstation - network. It queries a list of machines to see if they are - up and then sorts them by their load. Finally, a .p file - is built to use the desired no. of machines using the least - loaded ones first. Have a look at the top of tcgmsg/makep - for more info. - - 3) A few minor bug fixes. Worst one was specific to SUN Fortran. - - 4) New function returns wall clock time in seconds as accurately - as possible (or rather as accurately as currently implemented). - - DOUBLE PRECISION FUNCTION TCGTIME() - double TCGTIME_() - - 5) Wrapper around nice for FORTRAN users only - - INTEGER FUNCTION NICE(INCR) - INTEGER INCR - - 6) MD and SCF examples extensively revised to more accurately reflect - actual applications. - - 7) Thanks to multiple users for input (Rick Kendall, Theresa Windus, - Mike Coolidge, Joe Golab, Rik Littlefield, ...) - - 8) Experiments with a function to probe the message queue for - available messages. - - INTEGER FUNCTION PROBE(TYPE, NODE) - INTEGER TYPE, NODE - - long PROBE_(long * type, long * node) - - 9) Are you aware of the message passing interface standardization - effort? Send mail to netlib@ornl.gov with the body - 'send index from mpi' to get more information. - - -Release Notes (4.0) -------------------- - - 1) Hopefully all (?) inconsistencies have now been removed. - In particular receive from anyone now checks the type on ALL - messages by peeking at data in the socket/buffer if necessary. - - 2) Numerous bug fixes, mostly to do with specific process - distributions and removing stupidities in socket I/O. - - 3) Much improved (2-10+) short message performance. Part of this - comes from SHORT messages being sent asynchronously as far as - buffering in the transport mechanism permits. This also makes - the UNIX world consistent with the iPSC world. However, the - maximum length/number of messages that can be sent asyncrhonously - is system and transport mechanism dependent and should not be - relied upon. - - 4) Updating of all existing ports to reflect latest O/S releases - (e.g. CRAY UNICOS 6.* and 7.*) - - 5) New ports including the Alliant Massively parallel system. - - 6) Simplified install procedure ... just type 'make all MACHINE=CRAY' - - 7) Broadcast and global operations have been tuned to both the - UNIX networked environment (to minimize network traffic) and - to the iPSC and DELTA. All exploit full pipe-lining and overlap - of operations on large vectors. The requirement to provide - workspace for the global operations has been eliminated. - - 8) Improved support for the IPSC and DELTA, including working demo - command for the examples. - - -Obtaining TCGMSG ----------------- - - a) anonymous ftp from ftp.tcg.anl.gov - - If you obtain the source by ftp please send e-mail to me so that - I can maintain a list of users for bugfixes etc. - - Using binary mode transfer the file pub/tcgmsg/tcgmsg.4.04.tar.Z - (or tcgmsg.4.04.tar if you don't have compress). - - b) e-mail (only if you can't ftp it please) - - Send a request to me at the above address. I will send you a - split uuencoded version of tcgmsg.4.04.tar.Z. - - -Installing TCGSMG ------------------ - - a) Existing ports - - Meaning of status field - - . - tested and fully functional - s - tested but O/S does not support shared memory -> sockets only - x - tested but O/S does not support XDR -> no data conversion - ? - not tested recently but is supposed to work - ! - known problems - - - Status Machine Description - ------ ------- ----------- - . SUN Sun workstation running Sun O/S 4.0 or above - . KSR Kendall Square Research KSR-1 - . DEC DecStation running ULTRIX - . DECOSF DEC Alpha running OSF (must use 8 byte FORTRAN integers) - . SGI Silicon Graphics workstation IRIX 4.0 - . ALLIANT Alliant FX/8/80/800/2800 Concentrix 2800 2.2 - ? ALLIANTMPP Alliant MPP system ALPHA VERSION - . ARDENT Stardent (formerly Ardent) Titan O/S 2.2 - . CONVEX Convex C220 running ConvexOS V8.1 - . IBM IBM R6000 with AIX 3.1 (xlf -qEXTNAME) - . IBMNOEXT IBM R6000 with AIX 3.1 - . HPUX Hewlett-Packard risc (HP-UX A.B8.05) (f77 +ppu) - . HPUXNOEXT Hewlett-Packard risc (HP-UX A.B8.05) - . IPSC Intel iPSC i860 hypercube - . DELTA Intel/DARPA Touchstone Delta - . PARAGON Intel Paragon running OSF - x APOLLO Apollo DN10000, DomainOS Release 10.3 (bsd4.3) - s CRAY Cray running UNICOS 6.1 or above - s NEXT NeXT Workstation (OS 2.1) (Absoft f77 -f -N9) - s NEXTNOEXT NeXT Workstation (OS 2.1) (Absoft f77 -f) - ? ENCORE Encore running UMAX-4.3 - ! SEQUENT Sequent (DYNIX V3.0.14) - - In the top-level TCGMSG directory type - - make all MACHINE= - - e.g. - - make all MACHINE=ALLIANT - - That's it, unless you're building the IPSC source. For the IPSC you also - need to log onto the cube manager and in the tcgmsg/ipcv4.0 directory type - 'make parallel'. See the section on testing below and the Makefile and - README in the ipcv4.0 directory for more detailed information. - - b) Ports to new UNIX machines - - See ipcv4.0/README for suggested procedure. - - c) Ports to new true message passing machines - - See the source for the iPSC. - - -Testing -------- - - 1) System test codes in ipcv4.0 - - Make installs default proc-group files using the local - machine only. See the ipcv4.0/README info on how to include - additional hosts/processes. - - a) Hello world ... try the following command. - - parallel hello - - b) Interactive system test program - - parallel test - - c) Non-interactive FORTRAN test program - - parallel testf - - 2) The codes in examples - - e.g. - - cd examples - demo scf - ... respond 15 for the number of basis functions - - See the README for more info on the examples and how to run them. - diff --git a/armci/tcgmsg/ipcv4.0/brdcst.c b/armci/tcgmsg/ipcv4.0/brdcst.c deleted file mode 100644 index 4433482a4..000000000 --- a/armci/tcgmsg/ipcv4.0/brdcst.c +++ /dev/null @@ -1,75 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/brdcst.c,v 1.6 2002-07-17 17:20:11 vinod Exp $ */ - -#include "sndrcv.h" - -#include -#include "sndrcvP.h" - -void BRDCST_(type, buf, lenbuf, originator) - long *type; - void *buf; - long *lenbuf; - long *originator; -/* - broadcast buffer to all other processes from process originator - ... all processes call this routine specifying the same - orginating process. - - Optimized for communicating clusters of processes ... broadcast - amoung cluster masters, and then amoung slaves in a cluster. -*/ -{ - long me = NODEID_(); - long master = SR_clus_info[SR_clus_id].masterid; - long nslave = SR_clus_info[SR_clus_id].nslave; - long slaveid = me - master; - long synch = 1; - long lenmes, from, up, left, right; - - /* Process zero is at the top of the broadcast tree */ - - if ((me == *originator) && (me != 0)) { - long zero = 0; - SND_(type, buf, lenbuf, &zero, &synch); - } - else if ((*originator != 0) && (me == 0)) { - RCV_(type, buf, lenbuf, &lenmes, originator, &from, &synch); - } - - if ((*originator != 0) && (SR_n_proc == 2)) return; /* Special case */ - - /* Broadcast amoung cluster masters */ - - if (me == master) { - up = (SR_clus_id-1)/2; - left = 2*SR_clus_id + 1; - right = 2*SR_clus_id + 2; - up = SR_clus_info[up].masterid; - left = (left < SR_n_clus) ? SR_clus_info[left].masterid : -1; - right = (right < SR_n_clus) ? SR_clus_info[right].masterid : -1; - - if (me != 0) - RCV_(type, buf, lenbuf, &lenmes, &up, &from, &synch); - if (left > 0) - SND_(type, buf, lenbuf, &left, &synch); - if (right > 0) - SND_(type, buf, lenbuf, &right, &synch); - } - - /* Broadcast amoung local slaves */ - - up = master + (slaveid-1)/2; - left = master + 2*slaveid + 1; - right = master + 2*slaveid + 2; - - if (me != master) - RCV_(type, buf, lenbuf, &lenmes, &up, &from, &synch); - if (left < (master+nslave)) - SND_(type, buf, lenbuf, &left, &synch); - if (right < (master+nslave)) - SND_(type, buf, lenbuf, &right, &synch); -} diff --git a/armci/tcgmsg/ipcv4.0/checkbyte.c b/armci/tcgmsg/ipcv4.0/checkbyte.c deleted file mode 100644 index 814703c11..000000000 --- a/armci/tcgmsg/ipcv4.0/checkbyte.c +++ /dev/null @@ -1,27 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/checkbyte.c,v 1.4 1995-02-24 02:17:11 d3h325 Exp $ */ - -unsigned char CheckByte(c, n) - unsigned char *c; - long n; -{ -/* - unsigned char sum = (char) 0; - while (n-- > 0) - sum = sum ^ *c++; - - return sum; -*/ - - unsigned int sum = 0; - unsigned int mask = 0xff; - - while (n-- > 0) - sum += (int) *c++; - - sum = (sum + (sum>>8) + (sum>>16) + (sum>>24)) & mask; - return (unsigned char) sum; -} diff --git a/armci/tcgmsg/ipcv4.0/cluster.c b/armci/tcgmsg/ipcv4.0/cluster.c deleted file mode 100644 index 2387c04e3..000000000 --- a/armci/tcgmsg/ipcv4.0/cluster.c +++ /dev/null @@ -1,195 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/cluster.c,v 1.11 2004-04-01 02:04:56 manoj Exp $ */ - -#include -#include - -#ifdef SEQUENT -#include -#else -#include -#endif - -#include "sndrcvP.h" -#include "defglobals.h" - -#if defined(ALLIANT) || defined(ENCORE) || defined(SEQUENT)|| defined(AIX) \ - || defined(CONVEX) || defined(ARDENT) || defined(ULTRIX) \ - || defined(NEXT) -extern char *strdup(); -extern char *strtok(); -#endif - -extern void Error(); - -void InitClusInfoNotParallel() -{ -int SR_n_clus = 0; - - SR_clus_info[SR_n_clus].user = "?"; - SR_clus_info[SR_n_clus].hostname = "?"; - SR_clus_info[SR_n_clus].nslave = 1; - SR_clus_info[SR_n_clus].image = "?"; - SR_clus_info[SR_n_clus].workdir = "?"; - SR_clus_info[SR_n_clus].masterid = 0; -} - -void InitClusInfo(procgrp, masterhostname) - char *procgrp, *masterhostname; -/* - Initialize the SR_clus_info structure, SR_n_clus and SR_n_proc - by parsing the PROCGRP info. - - The procgrp file consists of white space separated records. - user host nslave image workdir - - Masterhostname is the name of the host running the parallel command. - - This routine could do with some more error checking. - -*/ -{ - char *user, *host, *nslave, *image, *workdir; - char *white = " \t\n"; - char *tmp = strdup(procgrp); - int i; - - SR_n_clus = 0; - SR_n_proc = 0; - - if (!tmp) Error("InitClusInfo: no memory", 0L); - - while (1) { - user = strtok(tmp, white); - tmp = (char *) NULL; - if (user == (char *) NULL) - break; - host = strtok(tmp, white); - nslave = strtok(tmp, white); - image = strtok(tmp, white); - workdir = strtok(tmp, white); - if (workdir == (char *) NULL) - Error("InitClusInfo: error parsing PROCGRP, line=",SR_n_clus+1); - - if (SR_n_clus == MAX_CLUSTER) - Error("InitClusInfo: maximum no. of clusters exceeded", - (long) MAX_CLUSTER); - - if (atoi(nslave) > MAX_SLAVE) - Error("InitClusInfo: maximum no. of slaves per cluster exceeded", - (long) MAX_SLAVE); - - SR_clus_info[SR_n_clus].user = strdup(user); - SR_clus_info[SR_n_clus].hostname = strdup(host); - SR_clus_info[SR_n_clus].nslave = atoi(nslave); - SR_clus_info[SR_n_clus].image = strdup(image); - SR_clus_info[SR_n_clus].workdir = strdup(workdir); - SR_clus_info[SR_n_clus].masterid = SR_n_proc; - - if (!SR_clus_info[SR_n_clus].user || !SR_clus_info[SR_n_clus].hostname || - !SR_clus_info[SR_n_clus].image || !SR_clus_info[SR_n_clus].workdir) - Error("InitClusInfo: no memory 2 ", 0L); - - for (i=0; i -#include -#include - -#include "sndrcvP.h" -#include "sndrcv.h" -#include "signals.h" -#include "tcgsockets.h" - -#if defined(SHMEM) || defined(SYSV) -#include "sema.h" -#include "tcgshmem.h" -#endif - -extern jmp_buf SR_jmp_buf; /* Jumped to on soft error */ - -#include - -extern void exit(); -extern int SR_caught_sigint; - -void Error(string, integer) - char *string; - long integer; -{ - (void) signal(SIGCHLD, SIG_DFL); /* Death of children to be expected */ - (void) signal(SIGINT, SIG_IGN); - - (void) fflush(stdout); - if (SR_caught_sigint) { - (void) fprintf(stderr,"%3ld: interrupt(%d)\n",NODEID_(), SR_caught_sigint); - (void) fflush(stderr); - } - else { - (void) fprintf(stdout,"%3ld: %s %ld (%#lx).\n", NODEID_(), string, - integer,integer); - (void) fflush(stdout); - (void) fprintf(stderr,"%3ld: %s %ld (%#lx).\n", NODEID_(), string, - integer,integer); - if (errno != 0) - perror("system error message"); - if (DEBUG_) - PrintProcInfo(); - } - (void) fflush(stdout); - (void) fflush(stderr); - - /* Shut down the sockets and remove shared memory and semaphores to - propagate an error condition to anyone that is trying to communicate - with me */ - - ZapChildren(); /* send interrupt to children which should trap it - and call Error in the handler */ - -#if defined(SHMEM) || defined(SYSV) -# if (defined(SGI_N32) || defined(SGITFP)) -# define PARTIALSPIN -# else -# define NOSPIN -# endif -#endif - -#if defined(SHMEM) || defined(SYSV) -#if defined(NOSPIN) || defined(PARTIALSPIN) - (void) SemSetDestroyAll(); -#endif - (void) DeleteSharedRegion(SR_proc_info[NODEID_()].shmem_id); -#endif - ShutdownAll(); /* Close sockets for machines with static kernel */ - -/* abort(); */ - - if (SR_exit_on_error) - exit(1); - else { - SR_error = 1; - (void) longjmp(SR_jmp_buf, 1); /* For NXTVAL server */ - } -} - -void PARERR_(code) - long *code; -/* - Interface from fortran to c error routine -*/ -{ - Error("User detected error in FORTRAN", *code); -} diff --git a/armci/tcgmsg/ipcv4.0/evlog.c b/armci/tcgmsg/ipcv4.0/evlog.c deleted file mode 100644 index 64be6e477..000000000 --- a/armci/tcgmsg/ipcv4.0/evlog.c +++ /dev/null @@ -1,405 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/evlog.c,v 1.7 2003-09-18 18:27:43 edo Exp $ */ - -/* Event logging routine with key driven varargs interface */ - -#include - -#ifdef MACX -/* jn: hack around bug in /usr/include/varargs.h */ -#ifndef __PPC__ -# define __PPC__ -#endif -#ifndef _CALL_SYSV -# define _CALL_SYSV -#endif -#endif -#include - -extern long nodeid_(); - -#ifdef SEQUENT -#include -#else -#include -#endif - - -#if defined(ALLIANT) || defined(ENCORE) || defined(SEQUENT) || \ - defined(CONVEX) || defined(ARDENT) || defined(ULTRIX) || \ - defined(AIX) || defined(IPSC) || defined(NEXT) || defined(DECOSF) -extern char *strdup(); -#endif - -#if defined(ULTRIX) || defined(SGI) || defined(NEXT) || defined(KSR) || \ - defined(DECOSF) -extern void *malloc(); -#else -#include -#endif - -#include "evlog.h" - -static double walltime(); - -/* If sprintf returns an integer instead of a character pointer - then define the flag INTSPRINTF */ - -#if defined(SGI) || defined(AIX) || defined(IPSC) || defined(CRAY) || \ - defined(HPUX) || defined(ALLIANT) || defined(CONVEX) || defined(NEXT) || \ - defined(KSR) || defined(DECOSF) -#define INTSPRINTF -#endif - -void evlog(int farg_key, ...) -/* - The format of the argument list is as follows: - - evlog([(int) key, [values, ...]], ..., EVKEY_LAST_ARG) - - Arguments are read as keys with corresponding values. Recognised keys - are defined in evlog.h and are described in detail below. - - Logging is enabled/disabled by calling evlog with one of EVKEY_ENABLE - or EVKEY_DISABLE specified. Note that EVKEY_ENABLE must be the - first key specified for it to be recognized and that all keys - in the argument list after EVKEY_DISABLE are ignored. By default - events are logged in the file events. This can be overridden with - the key EVKEY_FILENAME, which takes the filename as its value. - - The model for logging events assumed by the post-analysis routines - assumes that upon logging an event: - - a) no state chage occurs (EVKEY_EVENT). The event is just recorded. - - b) the process changes state by pushing the event onto the state stack - (EVKEY_BEGIN). - - c) the process changes state by popping an event off the state stack - (EVKEY_END). If the event or state popped off the stack does not - match the specified event then the post-analysis may get confused - but this does not interfere with the actual logging. - - EVKEY_EVENT, EVKEY_BEGIN or EVKEY_END must be the first key specified other - than a possible EVKEY_ENABLE. - - Internally an event is stored as a large character string to simplify - post-analysis. Users specify data for storage in addition to - that which is automatically stored (only the time and process) with - key, value combinations (EVKEY_STR_INT, EVKEY_STR_DBL, EVKEY_STR). - Many such key-value combinations as required may be specified. - Since the internal data format uses colons ':', double quotation - marks '"' and carriage returns users should avoid these in their - string data. - - ---------------------------- - Sample calling sequence: - - evlog(EVKEY_ENABLE, EVKEY_FILENAME, "events.log", EVKEY_LAST_ARG); - - - evlog(EVKEY_EVENT, "Finished startup code", - EVKEY_STR, "Now do some real work", - EVKEY_LAST_ARG); - - evlog(EVKEY_BEGIN, "Get Matrix", EVKEY_LAST_ARG); - - - evlog(EVKEY_END, "Get matrix", - EVKEY_STR_INT, "Size of matrix", (int) N, - EVKEY_STR_DBL, "Norm of matrix", (double) matrix_norm, - EVKEY_LAST_ARG); - - evlog(EVKEY_BEGIN, "Transform matrix", - EVKEY_STR_DBL, "Recomputed norm", (double) matrix_norm, - EVKEY_LAST_ARG); - - - evlog(EVKEY_END, "Transform matrix", - EVKEY_STR_INT, "No. of iterations", (int) niters, - EVKEY_LAST_ARG); - - evlog(EVKEY_DUMP, EVKEY_DISABLE, EVKEY_LAST_ARG); - - evlog(EVKEY_EVENT, "Logging is disabled ... this should not print", - EVKEY_DUMP, EVKEY_LAST_ARG); - - ---------------------------- - - EVKEY_LAST_ARG - Terminates list ... takes no value and must be present - - EVKEY_EVENT, (char *) event - Simply log occurence of the event - - EVKEY_BEGIN, (char *) event - Push event onto process state stack - - EVKEY_END, (char *) event - Pop event off process state stack - - EVKEY_MSG_LEN, (int) length - Value is (int) mesage length SND/RCV only - - EVKEY_MSG_TO, (int) to - Value is (int) to process id SND/RCV only - - EVKEY_MSG_FROM, (int) from - Value is (int) from process SND/RCV only - - EVKEY_MSG_TYPE, (int) type - Value is (int) message type SND/RCV only - - EVKEY_STR_INT, (char *) string, (int) data - User data value pair - - EVKEY_STR_DBL, (char *) string, (double) data - User data value pair (char *), (double) - - EVKEY_STR, (char *) string - User data value (char *) - - EVKEY_ENABLE - Enable logging - - EVKEY_DISABLE - Disable logging - - EVKEY_DUMP - Dump out the current buffer to disk - - EVKEY_FILE, (char *) filename - Use specified file to capture events. Default is "events". -*/ -{ - static int logging=0; /* Boolean flag for login enabled/disabled */ - static int error=0; /* Boolean flag for error detected */ - static int ncall=0; /* Need to do stuff on first entry */ - static char *buffer; /* Logging buffer ... null terminated */ - static char *bufpt; /* Pointer to next free char in buffer */ - static int left; /* Amount of free space in buffer */ -#define BUFLEN 262144 /* Size allocated for buffer ... biggish */ -#define MAX_EV_LEN 1000 /* Assumed maximum size of single event record */ - static FILE *file; /* File where events will be dumped */ - static char *filename = "events"; /* Default name of events file */ - - va_list ap; /* For variable argument list */ - int key; /* Hold key being processed */ - int nchars; /* No. of chars printed by sprintf call */ - char *temp; /* Temporary copy of bufpt */ - char *string; /* Temporary */ - int integer; /* Temporary */ - double dbl; /* Temporary */ - int valid; /* Temporary */ - -#define ERROR_RETURN error = 1; return; - -#define DUMPBUF (void) fputs(buffer, file); \ - (void) fflush(file);\ - if(ferror(file)) {ERROR_RETURN} \ - bufpt = buffer; left = BUFLEN; -/* - sprintf is a disaster area!!!! -#ifdef INTSPRINTF -#define RECORD(A) nchars = (A); bufpt += nchars; left -= nchars -#else -#define RECORD(A) nchars = strlen(A); bufpt += nchars; left -= nchars -#endif -*/ -#define RECORD(A) {A; nchars = strlen(bufpt); bufpt += nchars; left -= nchars;} - - - /* If an error was detected on a previous call don't even try to - do anything */ - - if (error) {ERROR_RETURN} - - /* First time in need to allocate the buffer, open the file etc */ - - if (ncall == 0) { - ncall = 1; - if (!(bufpt = buffer = malloc((unsigned) BUFLEN))) {ERROR_RETURN} - left = BUFLEN; - - if (!(file = fopen(filename, "w"))) {ERROR_RETURN} - } - - /* Parse the arguments */ - - temp = bufpt; /* Save to check if anything has been logged */ - valid = 0; /* One of BEGIN, END or EVENT must preceed most keys */ - - va_start(ap, farg_key); - key = farg_key; - while (key != EVKEY_LAST_ARG) { - - if ( (!logging) && (key != EVKEY_ENABLE) ) - return; - - switch (key) { - - case EVKEY_ENABLE: - logging = 1; - break; - - case EVKEY_DISABLE: - logging = 0; - goto done; -/* break; */ - - case EVKEY_FILENAME: - if (!(filename = strdup(va_arg(ap, char *)))) - {ERROR_RETURN} - if (!(file = freopen(filename, "w", file))) {ERROR_RETURN} - break; - - case EVKEY_BEGIN: - valid = 1; - RECORD(sprintf(bufpt, ":BEGIN:%s", va_arg(ap, char *))); - RECORD(sprintf(bufpt, ":TIME:%.2f", walltime())); - break; - - case EVKEY_END: - valid = 1; - RECORD(sprintf(bufpt, ":END:%s", va_arg(ap, char *))); - RECORD(sprintf(bufpt, ":TIME:%.2f", walltime())); - break; - - case EVKEY_EVENT: - valid = 1; - RECORD(sprintf(bufpt, ":EVENT:%s", va_arg(ap, char *))); - RECORD(sprintf(bufpt, ":TIME:%.2f", walltime())); - break; - - case EVKEY_MSG_LEN: - if (!valid) {ERROR_RETURN} - RECORD(sprintf(bufpt, ":MSG_LEN:%d", va_arg(ap, int))); - break; - - case EVKEY_MSG_TO: - if (!valid) {ERROR_RETURN} - RECORD(sprintf(bufpt, ":MSG_TO:%d", va_arg(ap, int))); - break; - - case EVKEY_MSG_FROM: - if (!valid) {ERROR_RETURN} - RECORD(sprintf(bufpt, ":MSG_FROM:%d", va_arg(ap, int))); - break; - - case EVKEY_MSG_TYPE: - if (!valid) {ERROR_RETURN} - RECORD(sprintf(bufpt, ":MSG_TYPE:%d", va_arg(ap, int))); - break; - - case EVKEY_MSG_SYNC: - if (!valid) {ERROR_RETURN} - RECORD(sprintf(bufpt, ":MSG_SYNC:%d", va_arg(ap, int))); - break; - - case EVKEY_STR_INT: - if (!valid) {ERROR_RETURN} - string = va_arg(ap, char *); - integer = va_arg(ap, int); - RECORD(sprintf(bufpt, ":STR_INT:%s:%d", string, integer)); - break; - - case EVKEY_STR_DBL: - if (!valid) {ERROR_RETURN} - string = va_arg(ap, char *); - dbl = va_arg(ap, double); - RECORD(sprintf(bufpt, ":STR_DBL:%s:%g", string, dbl)); - break; - - case EVKEY_STR: - if (!valid) {ERROR_RETURN} - RECORD(sprintf(bufpt, ":STR:%s", va_arg(ap, char *))); - break; - - case EVKEY_DUMP: - {DUMPBUF} - if (temp != bufpt) { - RECORD(sprintf(bufpt, "\n")); - temp = bufpt; - } - break; - - default: - {DUMPBUF} - {ERROR_RETURN} - } - key = va_arg(ap, int); - } - - done: - va_end(ap); - - /* Put a linefeed on the end of the record if something is written */ - - if (temp != bufpt) { - RECORD(sprintf(bufpt, "\n")); - temp = bufpt; - } - - /* Should really check on every access to the buffer that there is - enough space ... however just assume a very large maximum size - for a single event log entry and check here */ - - if (left <= 0) - {ERROR_RETURN} - - if (left < MAX_EV_LEN) - {DUMPBUF} -} - -#include "sndrcv.h" - -static double walltime() -/* - return the wall time in seconds as a double -*/ -{ - return ((double) MTIME_()) * 0.01; -} - -/* -int main() -{ - int N = 19; - double matrix_norm = 99.1; - int niters = 5; - - evlog(EVKEY_ENABLE, EVKEY_FILENAME, "events.log", EVKEY_LAST_ARG); - - - evlog(EVKEY_EVENT, "Finished startup code", - EVKEY_STR, "Now do some real work", - EVKEY_LAST_ARG); - - evlog(EVKEY_BEGIN, "Get Matrix", EVKEY_LAST_ARG); - - - evlog(EVKEY_END, "Get matrix", - EVKEY_STR_INT, "Size of matrix", (int) N, - EVKEY_STR_DBL, "Norm of matrix", (double) matrix_norm, - EVKEY_LAST_ARG); - - evlog(EVKEY_BEGIN, "Transform matrix", - EVKEY_STR_DBL, "Recomputed norm", (double) matrix_norm, - EVKEY_LAST_ARG); - - - evlog(EVKEY_END, "Transform matrix", - EVKEY_STR_INT, "No. of iterations", (int) niters, - EVKEY_LAST_ARG); - - evlog(EVKEY_DUMP, EVKEY_LAST_ARG); - - evlog(EVKEY_EVENT, "Logging is disabled ... this should not print", - EVKEY_DUMP, EVKEY_LAST_ARG); - - return 0; -} -*/ diff --git a/armci/tcgmsg/ipcv4.0/evlog.h b/armci/tcgmsg/ipcv4.0/evlog.h deleted file mode 100644 index bf056e12a..000000000 --- a/armci/tcgmsg/ipcv4.0/evlog.h +++ /dev/null @@ -1,36 +0,0 @@ -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/evlog.h,v 1.5 2003-06-27 13:52:54 manoj Exp $ */ - -/* - Define EVENT and KEY values used when calling evlog. -*/ - -extern void evlog(int farg_key, ...); - -/* Values of keys in key value pairs */ - -#define EVKEY_LAST_ARG 0 /* Terminates list ... takes no value */ - -#define EVKEY_BEGIN 1 /* Push (char *) value onto state stack */ -#define EVKEY_END 2 /* Pop (char *) value off state stack */ -#define EVKEY_EVENT 3 /* Record (char *) value, no stack change */ - -#define EVKEY_MSG_LEN 4 /* Value is (int) mesage length SND/RCV only */ -#define EVKEY_MSG_TO 5 /* Value is (int) to process id SND/RCV only */ -#define EVKEY_MSG_FROM 6 /* Value is (int) from process SND/RCV only */ -#define EVKEY_MSG_TYPE 7 /* Value is (int) message type SND/RCV only */ -#define EVKEY_MSG_SYNC 8 /* Value is (int) message sync SND/RCV only */ - -#define EVKEY_STR_INT 9 /* User data value pair (char *), (int) */ -#define EVKEY_STR_DBL 10 /* User data value pair (char *), (double) */ -#define EVKEY_STR 11 /* User data value (char *) */ - -#define EVKEY_ENABLE 12 /* Enable logging ... takes no value */ -#define EVKEY_DISABLE 13 /* Disable logging ... takes no value */ - -#define EVKEY_DUMP 14 /* Dump out the current buffer to disk */ - -#define EVKEY_FILENAME 15 /* Set the name of the events file */ - -#define EVENT_SND "Snd" /* Predefined strings for internal events */ -#define EVENT_RCV "Rcv" -#define EVENT_PROCESS "Process" diff --git a/armci/tcgmsg/ipcv4.0/evon.c b/armci/tcgmsg/ipcv4.0/evon.c deleted file mode 100644 index 989a96e44..000000000 --- a/armci/tcgmsg/ipcv4.0/evon.c +++ /dev/null @@ -1,192 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/evon.c,v 1.4 1995-02-24 02:17:17 d3h325 Exp $ */ - -/* Crude FORTRAN interface to C event logging routines. - See evlog.c for more details. - - FORTRAN character variables are so unportable that guaranteeing - that U can parse a variable length argument list is next to impossible. - - This provides very basic event logging functionality. - - CALL EVON() - - enable logging. - - CALL EVOFF() - - disable logging. - - CALL EVBGIN("event description") - - push event onto state stack - - CALL EVEND("event description") - - pop event off state stack - - CALL EVENT("event description") - - log occurence of event that doesn't change state stack -*/ - -#include - -#ifdef IPSC -#define bcopy(a, b, n) memcpy((b), (a), (n)) -#endif - -#if 0 -#if defined(ULTRIX) || defined(SGI) || defined(NEXT) || defined(HPUX) || \ - defined(KSR) || defined(DECOSF) -extern void *malloc(); -#else -extern char *malloc(); -#endif -#endif - -#include "evlog.h" - -/* These to get portable FORTRAN interface ... these routines - will not be called from C which has the superior evlog interface */ - -#if (defined(AIX) || defined(NEXT) || defined(HPUX)) && !defined(EXTNAME) -#define evon_ evon -#define evoff_ evoff -#define evbgin_ evbgin -#define evend_ evend -#define event_ event -#endif - -#if (defined(CRAY) || defined(ARDENT)) -#define evon_ EVON -#define evoff_ EVOFF -#define evbgin_ EVBGIN -#define evend_ EVEND -#define event_ EVENT -#endif - -/* Define crap for handling FORTRAN character arguments */ - -#ifdef CRAY -#include -#endif -#ifdef ARDENT -struct char_desc { - char *string; - int len; -}; -#endif - -void evon_() -{ -#ifdef EVENTLOG - evlog(EVKEY_ENABLE, EVKEY_LAST_ARG); -#endif -} - -void evoff_() -{ -#ifdef EVENTLOG - evlog(EVKEY_DISABLE, EVKEY_LAST_ARG); -#endif -} - -#ifdef ARDENT -void evbgin_(arg) - struct char_desc *arg; -{ - char *string = arg->string; - int len = arg->len; -#endif -#ifdef CRAY -void evbgin_(arg) - _fcd arg; -{ - char *string = _fcdtocp(arg); - int len = _fcdlen(arg); -#endif -#if !defined(ARDENT) && !defined(CRAY) -void evbgin_(string, len) - char *string; - int len; -{ -#endif -#ifdef EVENTLOG - char *value = malloc( (unsigned) (len+1) ); - - if (value) { - (void) bcopy(string, value, len); - value[len] = '\0'; - evlog(EVKEY_BEGIN, value, EVKEY_LAST_ARG); - (void) free(value); - } -#endif -} - -#ifdef ARDENT -void evend_(arg) - struct char_desc *arg; -{ - char *string = arg->string; - int len = arg->len; -#endif -#ifdef CRAY -void evend_(arg) - _fcd arg; -{ - char *string = _fcdtocp(arg); - int len = _fcdlen(arg); -#endif -#if !defined(CRAY) && !defined(ARDENT) -void evend_(string, len) - char *string; - int len; -{ -#endif -#ifdef EVENTLOG - char *value = malloc( (unsigned) (len+1) ); - - if (value) { - (void) bcopy(string, value, len); - value[len] = '\0'; - evlog(EVKEY_END, value, EVKEY_LAST_ARG); - (void) free(value); - } -#endif -} - -#ifdef ARDENT -void event_(arg) - struct char_desc *arg; -{ - char *string = arg->string; - int len = arg->len; -#endif -#ifdef CRAY -void event_(arg) - _fcd arg; -{ - char *string = _fcdtocp(arg); - int len = _fcdlen(arg); -#endif -#if !defined(ARDENT) && !defined(CRAY) -void event_(string, len) - char *string; - int len; -{ -#endif -#ifdef EVENTLOG - char *value = malloc( (unsigned) (len+1) ); - - if (value) { - (void) bcopy(string, value, len); - value[len] = '\0'; - evlog(EVKEY_EVENT, value, EVKEY_LAST_ARG); - (void) free(value); - } -#endif -} diff --git a/armci/tcgmsg/ipcv4.0/globalop.c b/armci/tcgmsg/ipcv4.0/globalop.c deleted file mode 100644 index 6afae6884..000000000 --- a/armci/tcgmsg/ipcv4.0/globalop.c +++ /dev/null @@ -1,360 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/globalop.c,v 1.8 2004-04-01 02:04:56 manoj Exp $ */ -#include -#ifdef SEQUENT -#include -#else -#include -#endif -#include "sndrcv.h" -#include "msgtypesc.h" - -#define TCG_MAX(a,b) (((a) >= (b)) ? (a) : (b)) -#define TCG_MIN(a,b) (((a) <= (b)) ? (a) : (b)) -#define TCG_ABS(a) (((a) >= 0) ? (a) : (-(a))) - -extern void free(); - -#ifndef IPSC -#include "sndrcvP.h" - -#define GOP_BUF_SIZE 81920 - -/*\ reduce operation for int -\*/ -static void idoop(long n, char * op, long * x, long * work) -{ - if (strncmp(op,"+",1) == 0) { - while(n--) { - *x++ += *work++; - } - } - else if (strncmp(op,"*",1) == 0) { - while(n--) { - *x++ *= *work++; - } - } - else if (strncmp(op,"max",3) == 0) { - while(n--) { - *x = TCG_MAX(*x, *work); - x++; work++; - } - } - else if (strncmp(op,"min",3) == 0) { - while(n--) { - *x = TCG_MIN(*x, *work); - x++; work++; - } - } - else if (strncmp(op,"absmax",6) == 0) { - while(n--) { - register long x1 = TCG_ABS(*x), x2 = TCG_ABS(*work); - *x = TCG_MAX(x1, x2); - x++; work++; - } - } - else if (strncmp(op,"absmin",6) == 0) { - while(n--) { - register long x1 = TCG_ABS(*x), x2 = TCG_ABS(*work); - *x = TCG_MIN(x1, x2); - x++; work++; - } - } - else if (strncmp(op,"or",2) == 0) { - while(n--) { - *x |= *work; - x++; work++; - } - } - /* these are new */ - else if ((strncmp(op, "&&", 2) == 0) || (strncmp(op, "land", 4) == 0)) { - while(n--) { - *x = *x && *work; - x++; work++; - } - } - else if ((strncmp(op, "||", 2) == 0) || (strncmp(op, "lor", 3) == 0)) { - while(n--) { - *x = *x || *work; - x++; work++; - } - } - else if ((strncmp(op, "&", 1) == 0) || (strncmp(op, "band", 4) == 0)) { - while(n--) { - *x &= *work; - x++; work++; - } - } - else if ((strncmp(op, "|", 1) == 0) || (strncmp(op, "bor", 3) == 0)) { - while(n--) { - *x |= *work; - x++; work++; - } - } - else { - Error("idoop: unknown operation requested", n); - } -} - -static void ddoop(long n, char * op, double * x, double * work) -{ - if (strncmp(op,"+",1) == 0) { - while(n--) { - *x++ += *work++; - } - } - else if (strncmp(op,"*",1) == 0) { - while(n--) { - *x++ *= *work++; - } - } - else if (strncmp(op,"max",3) == 0) { - while(n--) { - *x = TCG_MAX(*x, *work); - x++; work++; - } - } - else if (strncmp(op,"min",3) == 0) { - while(n--) { - *x = TCG_MIN(*x, *work); - x++; work++; - } - } - else if (strncmp(op,"absmax",6) == 0) { - while(n--) { - register double x1 = TCG_ABS(*x), x2 = TCG_ABS(*work); - *x = TCG_MAX(x1, x2); - x++; work++; - } - } - else if (strncmp(op,"absmin",6) == 0) { - while(n--) { - register double x1 = TCG_ABS(*x), x2 = TCG_ABS(*work); - *x = TCG_MIN(x1, x2); - x++; work++; - } - } - else { - Error("ddoop: unknown operation requested", (long) n); - } -} - -/* - Global summation optimized for networks of clusters of processes. - - This routine is directly callable from C only. There is a - wrapper that makes fortran work (see bottom of this file). -*/ -void DGOP_(long * ptype, double * x, long * pn, char * op, int len) -{ - long me = NODEID_(); - long master = SR_clus_info[SR_clus_id].masterid; - long nslave = SR_clus_info[SR_clus_id].nslave; - long slaveid = me - master; - long synch = 1; - long type = (*ptype & MSGDBL) ? *ptype : *ptype + MSGDBL; - long nleft = *pn; - long buflen = TCG_MIN(nleft,GOP_BUF_SIZE); /* Try to get even sized buffers */ - long nbuf = (nleft-1) / buflen + 1; - long zero = 0; - double *tmp = x; - double *work; - long nb, ndo, lenmes, from, up, left, right; - - buflen = (nleft-1) / nbuf + 1; - if (!(work = (double *) malloc((unsigned) (buflen*sizeof(double))))) - Error("DGOP: failed to malloc workspace", nleft); - - /* This loop for pipelining and to avoid caller - having to provide workspace */ - - while (nleft) { - ndo = TCG_MIN(nleft, buflen); - nb = ndo * sizeof(double); - - /* Do summation amoung slaves in a cluster */ - - up = master + (slaveid-1)/2; - left = master + 2*slaveid + 1; - right = master + 2*slaveid + 2; - - if (left < (master+nslave)) { - RCV_(&type, (char *) work, &nb, &lenmes, &left, &from, &synch); - ddoop(ndo, op, x, work); - } - if (right < (master+nslave)) { - RCV_(&type, (char *) work, &nb, &lenmes, &right, &from, &synch); - ddoop(ndo, op, x, work); - } - if (me != master) - SND_(&type, (char *) x, &nb, &up, &synch); - - /* Do summation amoung masters */ - - if (me == master) { - up = (SR_clus_id-1)/2; - left = 2*SR_clus_id + 1; - right = 2*SR_clus_id + 2; - up = SR_clus_info[up].masterid; - left = (left < SR_n_clus) ? SR_clus_info[left].masterid : -1; - right = (right < SR_n_clus) ? SR_clus_info[right].masterid : -1; - - if (left > 0) { - RCV_(&type, (char *) work, &nb, &lenmes, &left, &from, &synch); - ddoop(ndo, op, x, work); - } - if (right > 0) { - RCV_(&type, (char *) work, &nb, &lenmes, &right, &from, &synch); - ddoop(ndo, op, x, work); - } - if (me != 0) - SND_(&type, (char *) x, &nb, &up, &synch); - } - nleft -= ndo; - x += ndo; - type += 13; /* Temporary hack for hippi switch */ - } - free((char *) work); - - /* Zero has the results ... broadcast them back */ - nb = *pn * sizeof(double); - BRDCST_(&type, (char *) tmp, &nb, &zero); -} - -/* - Global summation optimized for networks of clusters of processes. - - This routine is directly callable from C only. There is a - wrapper that makes fortran work (see the bottom of this file). -*/ -void IGOP_(long * ptype, long * x, long * pn, char * op, int len) -{ - long me = NODEID_(); - long master = SR_clus_info[SR_clus_id].masterid; - long nslave = SR_clus_info[SR_clus_id].nslave; - long slaveid = me - master; - long synch = 1; - long type = (*ptype & MSGINT) ? *ptype : *ptype + MSGINT; - long nleft = *pn; - long zero = 0; - long *tmp = x; - long *work; - long nb, ndo, lenmes, from, up, left, right; - - if (!(work = (long *) - malloc((unsigned) (TCG_MIN(nleft,GOP_BUF_SIZE)*sizeof(long))))) - Error("IGOP: failed to malloc workspace", nleft); - - /* This loop for pipelining and to avoid caller - having to provide workspace */ - - while (nleft) { - ndo = TCG_MIN(nleft, GOP_BUF_SIZE); - nb = ndo * sizeof(long); - /* Do summation amoung slaves in a cluster */ - - up = master + (slaveid-1)/2; - left = master + 2*slaveid + 1; - right = master + 2*slaveid + 2; - - if (left < (master+nslave)) { - RCV_(&type, (char *) work, &nb, &lenmes, &left, &from, &synch); - idoop(ndo, op, x, work); - } - if (right < (master+nslave)) { - RCV_(&type, (char *) work, &nb, &lenmes, &right, &from, &synch); - idoop(ndo, op, x, work); - } - if (me != master) - SND_(&type, (char *) x, &nb, &up, &synch); - - /* Do summation amoung masters */ - - if (me == master) { - up = (SR_clus_id-1)/2; - left = 2*SR_clus_id + 1; - right = 2*SR_clus_id + 2; - up = SR_clus_info[up].masterid; - left = (left < SR_n_clus) ? SR_clus_info[left].masterid : -1; - right = (right < SR_n_clus) ? SR_clus_info[right].masterid : -1; - - if (left > 0) { - RCV_(&type, (char *) work, &nb, &lenmes, &left, &from, &synch); - idoop(ndo, op, x, work); - } - if (right > 0) { - RCV_(&type, (char *) work, &nb, &lenmes, &right, &from, &synch); - idoop(ndo, op, x, work); - } - if (me != 0) - SND_(&type, (char *) x, &nb, &up, &synch); - } - nleft -= ndo; - x += ndo; - type += 13; /* Temporary hack for hippi switch */ - } - (void) free((char *) work); - - /* Zero has the results ... broadcast them back */ - nb = *pn * sizeof(long); - BRDCST_(&type, (char *) tmp, &nb, &zero); -} - -#endif - -/* Wrapper for fortran interface ... UGH ... note that - string comparisons above do NOT rely on NULL termination - of the operation character string */ - -#ifdef CRAY -#include -#endif -#ifdef ARDENT -struct char_desc { - char *string; - int len; -}; -#endif - -#if defined(CRAY) || defined(CRAY) -#ifdef ARDENT -void dgop_(long * ptype, double * x, long * pn, struct char_desc * arg) -{ - char *op = arg->string; - int len_op = arg->len; -#endif -#if defined(CRAY) -void dgop_(ptype, x, pn, arg) - long *ptype, *pn; - double *x; - _fcd arg; -{ - char *op = _fcdtocp(arg); - int len_op = _fcdlen(arg); -#endif - DGOP_(ptype, x, pn, op); -} -#endif -/* This crap to handle FORTRAN character strings */ - -#if defined(CRAY) || defined(CRAY) -#ifdef ARDENT -void igop_(long * ptype, long * x, long * pn, struct char_desc * arg) -{ - char *op = arg->string; - int len_op = arg->len; -#endif -#if defined(CRAY) -void igop_(long * wrap_ptype, long * x, long * wrap_pn, _fcd arg) -{ - long ptype, pn; - ptype = (long) *ptype; - char *op = _fcdtocp(arg); - int len_op = _fcdlen(arg); -#endif - IGOP_(ptype, x, pn, op); -} -#endif diff --git a/armci/tcgmsg/ipcv4.0/llog.c b/armci/tcgmsg/ipcv4.0/llog.c deleted file mode 100644 index 79e76e1bc..000000000 --- a/armci/tcgmsg/ipcv4.0/llog.c +++ /dev/null @@ -1,47 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/llog.c,v 1.6 2004-04-01 02:04:57 manoj Exp $ */ - -#include - -#include -#include -#include - -#include "sndrcv.h" - -#if (defined(SUN) && !defined(SOLARIS)) - extern char *sprintf(); -#endif -#ifndef SGI - extern time_t time(); -#endif - -extern void Error(); - -void LLOG_() -/* - close and open stdin and stdout to append to a local logfile - with the name log. in the current directory -*/ -{ - char name[12]; - time_t t; - - (void) sprintf(name, "log.%03ld",NODEID_()); - - (void) fflush(stdout); - (void) fflush(stderr); - - if (freopen(name, "a", stdout) == (FILE *) NULL) - Error("LLOG_: error re-opening stdout", (long) -1); - - if (freopen(name, "a", stderr) == (FILE *) NULL) - Error("LLOG_: error re-opening stderr", (long) -1); - - (void) time(&t); - (void) printf("\n\nLog file opened : %s\n\n",ctime(&t)); - (void) fflush(stdout); -} diff --git a/armci/tcgmsg/ipcv4.0/mdtob.c b/armci/tcgmsg/ipcv4.0/mdtob.c deleted file mode 100644 index f1a4898a5..000000000 --- a/armci/tcgmsg/ipcv4.0/mdtob.c +++ /dev/null @@ -1,26 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/mdtob.c,v 1.4 1995-02-24 02:17:23 d3h325 Exp $ */ - -#include "sndrcv.h" - -/* - These routines use C's knowledge of the sizes of data types - to generate a portable mechanism for FORTRAN to translate - between bytes, integers and doubles. Note that we assume that - FORTRAN integers are the same size as C longs. -*/ - -long MDTOB_(n) - long *n; -/* - Return the no. of bytes that n doubles occupy -*/ -{ - if (*n < 0) - Error("MDTOB_: negative argument",*n); - - return (long) (*n * sizeof(double)); -} diff --git a/armci/tcgmsg/ipcv4.0/mdtoi.c b/armci/tcgmsg/ipcv4.0/mdtoi.c deleted file mode 100644 index 595f6116f..000000000 --- a/armci/tcgmsg/ipcv4.0/mdtoi.c +++ /dev/null @@ -1,26 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/mdtoi.c,v 1.4 1995-02-24 02:17:24 d3h325 Exp $ */ - -#include "sndrcv.h" - -/* - These routines use C's knowledge of the sizes of data types - to generate a portable mechanism for FORTRAN to translate - between bytes, integers and doubles. Note that we assume that - FORTRAN integers are the same size as C longs. -*/ - -long MDTOI_(n) - long *n; -/* - Return the minimum no. of integers which will hold n doubles. -*/ -{ - if (*n < 0) - Error("MDTOI_: negative argument",*n); - - return (long) ( (MDTOB_(n) + sizeof(long) - 1) / sizeof(long) ); -} diff --git a/armci/tcgmsg/ipcv4.0/mitob.c b/armci/tcgmsg/ipcv4.0/mitob.c deleted file mode 100644 index 6473be5da..000000000 --- a/armci/tcgmsg/ipcv4.0/mitob.c +++ /dev/null @@ -1,26 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/mitob.c,v 1.4 1995-02-24 02:17:26 d3h325 Exp $ */ - -#include "sndrcv.h" - -/* - These routines use C's knowledge of the sizes of data types - to generate a portable mechanism for FORTRAN to translate - between bytes, integers and doubles. Note that we assume that - FORTRAN integers are the same size as C longs. -*/ - -long MITOB_(n) - long *n; -/* - Return the no. of bytes that n ints=longs occupy -*/ -{ - if (*n < 0) - Error("MITOB_: negative argument",*n); - - return (long) (*n * sizeof(long)); -} diff --git a/armci/tcgmsg/ipcv4.0/mitod.c b/armci/tcgmsg/ipcv4.0/mitod.c deleted file mode 100644 index 85a26b92a..000000000 --- a/armci/tcgmsg/ipcv4.0/mitod.c +++ /dev/null @@ -1,26 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/mitod.c,v 1.4 1995-02-24 02:17:27 d3h325 Exp $ */ - -#include "sndrcv.h" - -/* - These routines use C's knowledge of the sizes of data types - to generate a portable mechanism for FORTRAN to translate - between bytes, integers and doubles. Note that we assume that - FORTRAN integers are the same size as C longs. -*/ - -long MITOD_(n) - long *n; -/* - Return the minimum no. of doubles in which we can store n longs -*/ -{ - if (*n < 0) - Error("MITOD_: negative argument",*n); - - return (long) ( (MITOB_(n) + sizeof(double) - 1) / sizeof(double) ); -} diff --git a/armci/tcgmsg/ipcv4.0/mtime.c b/armci/tcgmsg/ipcv4.0/mtime.c deleted file mode 100644 index 873bac8b0..000000000 --- a/armci/tcgmsg/ipcv4.0/mtime.c +++ /dev/null @@ -1,134 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/mtime.c,v 1.4 1995-02-24 02:17:28 d3h325 Exp $ */ - -#include -#include "sndrcv.h" - -long MTIME_() -/* - return wall clock time in centiseconds -*/ -{ - return (long) (TCGTIME_()*100.0); -} - -#if !(defined(KSR) || defined(ALLIANT)) - -#include -#include - -static unsigned firstsec=0; /* Reference for timer */ -static unsigned firstusec=0; /* Reference for timer */ - -void MtimeReset() /* Sets timer reference */ -{ - struct timeval tp; - struct timezone tzp; - - (void) gettimeofday(&tp,&tzp); - - firstsec = tp.tv_sec; - firstusec = tp.tv_usec; -} - -double TCGTIME_() -/* - Return wall clock time in seconds as accurately as possible -*/ -{ - static int firstcall=1; - double low, high; - - struct timeval tp; - struct timezone tzp; - - if (firstcall) { - MtimeReset(); - firstcall = 0; - } - - (void) gettimeofday(&tp,&tzp); - - low = (double) (tp.tv_usec>>1) - (double) (firstusec>>1); - high = (double) (tp.tv_sec - firstsec); - - return high + 1.0e-6*(low+low); -} - -#endif - -#ifdef KSR -static double firsttime = 0; - -static double KSRTime() -{ - long time; -#pragma setregval (time, i12) - - /* Read timer */ - asm("finop; movb8_8 %x_all_timer,%i12"); - asm("finop; cxnop"); - asm("finop; cxnop"); - - return(time * 4.0e-7); -} - -double TCGTIME_() -/* - Return wall clock time in seconds as accurately as possible -*/ -{ - static int firstcall = 1; - - if (firstcall) { - firstcall = 0; - MtimeReset(); - } - - return KSRTime() - firsttime; -} - -void MtimeReset() /* Sets timer reference */ -{ - firsttime = KSRTime(); -} - -#endif - -#ifdef ALLIANT - -#include - -struct hrcval firsttime; - -void MtimeReset() -{ - hrcstamp(&firsttime); -} - -double TCGTIME_() -{ - double low, high; - struct hrcval current; - static int firstcall = 1; - - if (firstcall) { - firstcall = 0; - MtimeReset(); - } - - hrcstamp(¤t); - - /* Lose a bit but does this avoid the roll problem ? */ - - low = (double) (current.hv_low>>1) - (double) (firsttime.hv_low>>1); - - high = (double) (current.hv_high - firsttime.hv_high); - - return (high*4294967296e-6+ 2.0*low) * 0.997e-5; -} - -#endif diff --git a/armci/tcgmsg/ipcv4.0/niceftn.c b/armci/tcgmsg/ipcv4.0/niceftn.c deleted file mode 100644 index 30e15a1b8..000000000 --- a/armci/tcgmsg/ipcv4.0/niceftn.c +++ /dev/null @@ -1,22 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include "srftoc.h" -#ifndef IPSC -#include -#endif - -int NICEFTN_(ival) - int *ival; -/* - Wrapper around nice for FORTRAN users courtesy of Rick Kendall - ... C has the system interface -*/ -{ -#ifndef IPSC - return nice(*ival); -#else - return 0; -#endif -} diff --git a/armci/tcgmsg/ipcv4.0/nnodes.c b/armci/tcgmsg/ipcv4.0/nnodes.c deleted file mode 100644 index 136ca00f2..000000000 --- a/armci/tcgmsg/ipcv4.0/nnodes.c +++ /dev/null @@ -1,17 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/nnodes.c,v 1.4 1995-02-24 02:17:30 d3h325 Exp $ */ - -#include "sndrcv.h" -#include "sndrcvP.h" - -long NNODES_() -/* - return total no. of processes -*/ -{ - return SR_n_proc; -} - diff --git a/armci/tcgmsg/ipcv4.0/nodeid.c b/armci/tcgmsg/ipcv4.0/nodeid.c deleted file mode 100644 index 7bd4c7ea4..000000000 --- a/armci/tcgmsg/ipcv4.0/nodeid.c +++ /dev/null @@ -1,16 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/nodeid.c,v 1.4 1995-02-24 02:17:32 d3h325 Exp $ */ - -#include "sndrcv.h" -#include "sndrcvP.h" - -long NODEID_() -/* - return logical node no. of current process -*/ -{ - return SR_proc_id; -} diff --git a/armci/tcgmsg/ipcv4.0/nxtval.c b/armci/tcgmsg/ipcv4.0/nxtval.c deleted file mode 100644 index 6a85150db..000000000 --- a/armci/tcgmsg/ipcv4.0/nxtval.c +++ /dev/null @@ -1,162 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/nxtval.c,v 1.6 2004-04-01 02:04:57 manoj Exp $ */ - -#include -#include -#include -#include -#include "sndrcvP.h" -#include "sndrcv.h" - -jmp_buf SR_jmp_buf; /* Jumped to on soft error */ - -void NextValueServer() -/* - This runs as process SR_n_proc and provides load balancing service. - -*/ -{ - long cnt = 0; /* actual counter */ - long lencnt = sizeof cnt; /* length of cnt */ - long ndone = 0; /* no. finished for this loop */ - long ntermin = 0; /* no. terminated so far (pend) */ - long node = -1; /* select any node */ - long type = TYPE_NXTVAL; /* message type */ - long buf[2]; /* buffer to get values */ - long lenbuf = sizeof buf; /* length of buffer */ - long mproc; /* no. of processes running loop */ - long nval; /* no. of values requested */ - long done_list[MAX_PROCESS]; /* list of processes finished with this loop */ - long sync = 1; /* all info goes synchronously */ - long on=0; - long lenmes, nodefrom; - - SR_exit_on_error = FALSE; /* Want to return no matter what */ - - if (setjmp(SR_jmp_buf)) { /* Error should long jump to here */ -/* (void) printf("Error long jumped to NXTVAL ... returning.\n"); */ - SR_exit_on_error = TRUE; - return; - } - - SETDBG_(&on); - - while (1) { - - /* Wait for input from any node */ - - RCV_(&type, (char *) buf, &lenbuf, &lenmes, &node, &nodefrom, &sync); - - if (lenmes != lenbuf) { - Error("NextValueServer: lenmes != lenbuf", lenmes); - return; /* Never actually gets here as does long jump */ - } - - mproc = buf[0]; - nval = buf[1]; - if (DEBUG_) - (void) printf("NVS: from=%ld, mproc=%ld, ndone=%ld, ntermin=%ld\n", - nodefrom, mproc, ndone, ntermin); - - if (mproc == 0) { - - /* Sending process is about to terminate. Send reply and disable - sending to him. If all processes have finished return. - - Modified so that all processes block on waiting for message - from nxtval server before terminating. nxtval only lets - everyone go when all have registered termination. - This is so that processes do not close their sockets - while another process is doing a RCV from any node (which - results in an unavoidable error condition). */ - - if (++ntermin == NNODES_()) { - (void) signal(SIGCHLD, SIG_DFL); /* Will be dying naturally */ - for (node=0; node 0) { - - /* This is what we are here for */ - - SND_(&type, (char *) &cnt, &lencnt, &nodefrom, &sync); - cnt += nval; - } - else if (mproc < 0) { - - /* This process has finished the loop. Wait until all mproc - processes have finished before releasing it */ - - done_list[ndone++] = nodefrom; - - if (ndone == -mproc) { - while (ndone--) { - nodefrom = done_list[ndone]; - SND_(&type, (char *) &cnt, &lencnt, &nodefrom, &sync); - } - cnt = 0; - ndone = 0; - } - } - } -} - -long NXTVAL_(mproc) - long *mproc; -/* - Get next value of shared counter. - - mproc > 0 ... returns requested value - mproc < 0 ... server blocks until abs(mproc) processes are queued - and returns junk - mproc = 0 ... indicates to server that I am about to terminate - - this needs to be extended so that clusters of processes with - shared memory collectively get a bunch of values from the server - thus reducing the overhead of calling nextvalue. -*/ -{ - long server = NNODES_(); /* id of server process */ - long buf[2]; - long lenbuf = sizeof buf; - long type = TYPE_NXTVAL; - long lenmes, nodefrom; - long sync = 1; - long result=0; - - if (SR_parallel) { - buf[0] = *mproc; - buf[1] = 1; - - if (DEBUG_) { - (void) printf("%2ld: nxtval: mproc=%ld\n",NODEID_(), *mproc); - (void) fflush(stdout); - } - - SND_(&type, (char *) buf, &lenbuf, &server, &sync); - RCV_(&type, (char *) buf, &lenbuf, &lenmes, &server, &nodefrom, &sync); - result = buf[0]; - } - else { - /* Not running in parallel ... just do a simulation */ - static int count = 0; - if (*mproc == 1) - result = count++; - else if (*mproc == -1) { - count = 0; - result = 0; - } - else - Error("nxtval: sequential version with silly mproc ", (long) *mproc); - } - - return result; -} diff --git a/armci/tcgmsg/ipcv4.0/parallel.c b/armci/tcgmsg/ipcv4.0/parallel.c deleted file mode 100644 index 08e98dd76..000000000 --- a/armci/tcgmsg/ipcv4.0/parallel.c +++ /dev/null @@ -1,447 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/parallel.c,v 1.22 2005-02-22 18:47:02 manoj Exp $ */ - -#include -#ifdef SEQUENT -#include -#else -#include -#endif -#include -#include -#include -#include -#include -#if defined(SUN) || defined(ALLIANT) || defined(ENCORE) || defined(SEQUENT) || \ - defined(AIX) || defined(NEXT) || defined(LINUX) -#include -#endif - -#include "sndrcvP.h" -#include "cluster.h" -#include "sndrcv.h" -#include "signals.h" -#include "tcgsockets.h" - -extern char *getenv(); -#if defined(ULTRIX) || defined(SGI) || defined(NEXT) || defined(HPUX) || \ - defined(KSR) || defined(DECOSF) -extern void *malloc(); -#else -#include -#endif - -#if 0 -#if !(defined(SGI) || defined(LINUX)) -extern char *strdup(); -#endif -#endif - -extern void NextValueServer(); -extern void Error(); -extern int WaitAll(long nchild); - -#if (defined(SUN) && !defined(SOLARIS)) - extern char *sprintf(); -#endif - - -static char *ProcgrpFile(argc, argv) - int argc; - char **argv; -/* - Find the name of the procgrp file from - - 1) the first argument on the command line with .p appended - 2) as 1) but also prepending $HOME/pdir/ - 2) the translation of the environmental variable PROCGRP - 3) the file PROCGRP in the current directory -*/ -{ - char *tmp, *home; - int len; - struct stat buf; - - if (argc > 1) { - len = strlen(argv[1]); - tmp = malloc((unsigned) (len+3) ); - (void) strcpy(tmp, argv[1]); - (void) strcpy(tmp+len, ".p"); - - if (stat(tmp, &buf) == 0) /* try ./arg1.p */ - return tmp; - else - (void) free(tmp); - - if ( (home = getenv("HOME")) != (char *) NULL ) { - tmp = malloc((unsigned) (strlen(home) + len + 9)); - (void) strcpy(tmp, home); - (void) strcpy(tmp+strlen(home),"/pdir/"); - (void) strcpy(tmp+strlen(home)+6,argv[1]); - (void) strcpy(tmp+strlen(home)+6+len,".p"); - -(void) printf("tmp = %s\n",tmp); - - if (stat(tmp, &buf) == 0) /* try $HOME/pdir/arg1.p */ - return tmp; - else - (void) free(tmp); - } - } - - if ( (tmp = getenv("PROCGRP")) != (char *) NULL ) - if (stat(tmp, &buf) == 0) - return tmp; - - return strdup("PROCGRP"); -} - -static void SkipPastEOL(fp) - FILE *fp; -/* - Read past first newline character -*/ -{ - int test; - - while ( (char) (test = getc(fp)) != '\n') - if (test == EOF) - break; -} - -static char *GetProcgrp(filename, len_procgrp) - char *filename; - long *len_procgrp; -/* - Read the entire contents of the PROCGRP into a NULL terminated - character string. Be lazy and read the file twice, first to - count the number of characters (ftell cannot be beleived?). -*/ -{ - FILE *file; - char *tmp, *procgrp; - int status; - - if ( (file = fopen(filename,"r")) == (FILE *) NULL ) { - (void) fprintf(stderr,"Master: PROCGRP = %s\n",filename); - Error("Master: failed to open PROCGRP", (long) 0); - } - - *len_procgrp = 0; - while ( (status = getc(file)) != EOF) { - if (status == '#') - SkipPastEOL(file); - else - (*len_procgrp)++; - } - - (*len_procgrp)++; - - if ( (tmp = procgrp = malloc((unsigned) *len_procgrp)) == (char *) NULL ) - Error("GetProcgrp: failed in malloc", (long) *len_procgrp); - - (void) fseek(file, 0L, (int) 0); /* Seek to beginning of file */ - - while ( (status = getc(file)) != EOF) { - if (status == '#') - SkipPastEOL(file); - else - *tmp++ = (char) status; - } - - *tmp = '\0'; - - if ( (int) (tmp - procgrp + 1) != *len_procgrp ) - Error("GetProcgrp: screwup dimensioning procgrp", (long) *len_procgrp); - - (void) fclose(file); - - return procgrp; -} - -char *Canonical(name) - char *name; -/* - Use gethostbyname and return the canonicalized name. -*/ -{ - struct hostent *host; - - if ( (host = gethostbyname(name)) != (struct hostent *) NULL ) - return strdup(host->h_name); - else - return (char *) NULL; -} - -static long RemoteCreate(remote_hostname, remote_username, - remote_executable, argc, argv, - n_clus, n_proc, clus_id, proc_id) - char *remote_hostname; - char *remote_username; - char *remote_executable; - int argc; - char **argv; - long n_clus; - long n_proc; - long clus_id; - long proc_id; -/* - Using rsh create a process on remote_hostname running the - executable in the remote file remote_executable. Through - arguments pass it my hostname and the port number of a socket - to conenct on. Also propagate the arguments which this program - was invoked with. - - Listen for a connection to be established. The return value of - RemoteCreate is the filedescriptor of the socket connecting the - processes together. - - Rsh should ensure that the standard output of the remote - process is connected to the local standard output and that - local interrupts are propagated to the remote process. - */ -{ - char local_hostname[256], c_port[8]; - char c_n_clus[8], c_n_proc[8], c_clus_id[8], c_proc_id[8]; - char *argv2[256]; - int sock, port, i, pid; - char *tmp; - - /* Create and bind socket to wild card internet name */ - - CreateSocketAndBind(&sock, &port); - - /* create remote process using rsh passing master hostname and - port as arguments */ - - if (gethostname(local_hostname, 256) || strlen(local_hostname) == 0) - Error("RemoteCreate: gethostname failed", (long) 0); - - (void) sprintf(c_port, "%d", port); - (void) sprintf(c_n_clus, "%ld", n_clus); - (void) sprintf(c_n_proc, "%ld", n_proc); - (void) sprintf(c_clus_id, "%ld", clus_id); - (void) sprintf(c_proc_id, "%ld", proc_id); - - (void) printf(" Creating: host=%s, user=%s,\n\ - file=%s, port=%s\n", - remote_hostname, remote_username, remote_executable, - c_port); - - pid = fork(); - if (pid == 0) { - /* In child process */ - - sleep(1); /* So that parallel can make the sockets */ - -#ifndef SUN - if (proc_id != 0) /* Close all uneeded files */ - (void) fclose(stdin); -#ifdef SPARC64_GP - for (i=3; i<62; i++) -#else - for (i=3; i<64; i++) -#endif - (void) close(i); -#endif - - /* Overlay the desired executable */ - - if (strcmp(remote_hostname, local_hostname) != 0) { - argv2[0 ] = "rsh"; - argv2[1 ] = remote_hostname; - argv2[2 ] = "-l"; - argv2[3 ] = remote_username; - argv2[4 ] = "-n"; - argv2[5 ] = remote_executable; - argv2[6 ] = " "; - for (i=2; i 0) - SR_pids[SR_numchild++] = pid; - else - Error("RemoteCreate: failed forking process", (long) pid); - - /* accept one connection */ - - return ListenAndAccept(sock); -} - -int main(argc, argv) - int argc; - char **argv; -/* - This is the master process of the cluster network. - - a) read the procgrp file. This is found by trying in turn: - - 1) the first argument on the command line with .p appended - 2) the translation of the environmental variable PROCGRP - 3) the file PROCGRP in the current directory - - b) create the remote processes specified in this file, connecting - to them via sockets and pass them the entire contents of the - PROCGRP file in ascii - - c) Navigate messages to establish connections between the remote - processes - - d) wait for all the children to finish and exit with the appropriate - status -*/ -{ - char hostname[256]; /* Me */ - char *filename; /* The name of PROCGRP file */ - char *procgrp; /* The contents of PROCGRP */ - long len_procgrp; /* The length of PROCGRP */ - long i, j, node, type, lenbuf, status=0, sync=1; - - /* Initialize all the globals */ - - InitGlobal(); - - /* Set up handler for SIGINT and SIGCHLD */ - - TrapSigint(); - TrapSigchld(); - TrapSigterm(); - - /* on Solaris parallel gets SIGSEGV interrupted while polling in NxtVal */ -#ifdef SOLARIS - TrapSigsegv(); -#endif - - /* Generate a name for the PROCGRP file */ - - filename = ProcgrpFile(argc, argv); - if (DEBUG_) - (void) printf("PROCGRP = %s\n",filename); - - /* Read in the entire contents of the PROCGRP file */ - - procgrp = GetProcgrp(filename, &len_procgrp); - - /* Parse the procgrp info filling in the ClusterInfo structure and - computing the number of clusters */ - - if (gethostname(hostname, sizeof hostname) || strlen(hostname) == 0) - Error("parallel: gethostname failed?", (long) sizeof hostname); - - InitClusInfo(procgrp, hostname); - - if (DEBUG_) - PrintClusInfo(); - - /* I am the master process so I have the highest ids */ - - SR_proc_id = SR_n_proc; - - /* Now create the remote cluster master processes */ - - for (i=0; i= 0) { - SR_socks[SR_nsock] = SR_proc_info[i].sock; - SR_socks_proc[SR_nsock] = i; - SR_nsock++; - } - } - - /* Provide the next value service ... exit gracefully when get termination - message from everyone or detect error */ - - NextValueServer(); - - /* Now wait patiently for everything to finish, then close all - sockets and return */ - - status = WaitAll(SR_n_clus); - - if (SR_error) - status = 1; - - ShutdownAll(); - - return status; -} diff --git a/armci/tcgmsg/ipcv4.0/pbegin.c b/armci/tcgmsg/ipcv4.0/pbegin.c deleted file mode 100644 index 4c3abd3bc..000000000 --- a/armci/tcgmsg/ipcv4.0/pbegin.c +++ /dev/null @@ -1,591 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/pbegin.c,v 1.20 2005-02-22 18:47:02 manoj Exp $ */ - -#include -#include -#include -#ifdef SEQUENT -#include -#else -#include -#endif -#include -#include -#if defined(CONVEX) && defined(HPUX) -#include -#endif -#if defined(SUN) || defined(ALLIANT) || defined(ENCORE) || defined(SEQUENT) \ - || defined(CONVEX) || defined(AIX) || defined(NEXT) \ - || defined(LINUX) -#include -#endif - -#if defined(SHMEM) || defined(SYSV) -# if (defined(SGI_N32) || defined(SGITFP)) -# define PARTIALSPIN -# else -# define NOSPIN -# endif -#endif - -#if defined(SOLARIS) -/* See notes below on processor binding */ -/*#include */ -/*#include */ -#endif - -#include "cluster.h" -#include "sndrcv.h" -#include "sndrcvP.h" -#include "signals.h" -#include "tcgsockets.h" - -#if defined(SHMEM) || defined(SYSV) -#include "tcgshmem.h" -#include "sema.h" -#endif - -#ifdef EVENTLOG -#include "evlog.h" -#endif - -extern void exit(); -extern void InitClusInfoNotParallel(); -extern int WaitAll(long nchild); - -#if defined(ALLIANT) || defined(ENCORE) || defined(SEQUENT) || \ - defined(CONVEX) || defined(ARDENT) || defined(ULTRIX) || defined(AIX) || \ - defined(NEXT) || defined(DECOSF) -extern char *strdup(); -#endif - -#define max(A, B) ( (A) > (B) ? (A) : (B) ) -#define min(A, B) ( (A) < (B) ? (A) : (B) ) - -#if defined(ULTRIX) || defined(SGI) || defined(NEXT) || defined(HPUX) || \ - defined(KSR) || defined(DECOSF) -extern void *malloc(); -#else -#include -#endif - -#ifdef IPSC -#define bzero(A,N) memset((A), 0, (N)) -#endif - -static int SR_initialized=0; -long TCGREADY_() -{ - return (long)SR_initialized; -} - - -static void ConnectAll() -{ - long j, k, clus1, clus2, node1, node2, nslave1, nslave2; - - for (clus1=1; clus1 < SR_n_clus; clus1++) { - - node1 = SR_clus_info[clus1].masterid; - nslave1 = SR_clus_info[clus1].nslave; - - for (clus2=0; clus2 < clus1; clus2++) { - - node2 = SR_clus_info[clus2].masterid; - - RemoteConnect(node1, node2, SR_n_proc); /* connect masters */ - -#if defined(SHMEM) || defined(SYSV) - - nslave2 = SR_clus_info[clus2].nslave; - - for (j=1; j= argc ) - Error("pbegin: -master present but not other arguments", - (long) argc); - break; - } - - if ( (i+6) >= argc ) { - SR_parallel = FALSE; - InitClusInfoNotParallel(); - SR_n_clus=1; - return; - } - else - SR_parallel = TRUE; - - if (DEBUG_) { - (void) printf("pbegin: assign argument values\n"); - (void) fflush(stdout); - } - - masterhostname = strdup(argv[i+1]); - cport = strdup(argv[i+2]); - SR_n_clus = atoi(argv[i+3]); - SR_n_proc = atoi(argv[i+4]); - SR_clus_id = atoi(argv[i+5]); - SR_proc_id = atoi(argv[i+6]); - - /* Check out some of this info */ - - if ((SR_n_clus >= MAX_CLUSTER) || (SR_n_clus < 1)) - Error("pbegin: invalid no. of clusters", SR_n_clus); - if ((SR_n_proc >= MAX_PROCESS) || (SR_n_proc < 1)) - Error("pbegin: invalid no. of processes", SR_n_proc); - if ((SR_clus_id >= SR_n_clus) || (SR_clus_id < 0)) - Error("pbegin: invalid cluster id", SR_clus_id); - if ((SR_proc_id >= SR_n_proc) || (SR_proc_id < 0)) - Error("pbegin: invalid process id", SR_proc_id); - - /* Close all files we don't need. Process 0 keeps stdin/out/err. - All others only stdout/err. */ - - if (SR_clus_id != 0) - (void) fclose(stdin); -#ifdef SPARC64_GP - for (i=3; i<62; i++) -#else - for (i=3; i<64; i++) -#endif - (void) close((int) i); - - /* Connect to the master process which will have process id - equal to the number of processes */ - - if (DEBUG_) { - (void) printf("pbegin: %ld CreateSocketAndConnect\n",NODEID_()); - (void) fflush(stdout); - } - masterid = SR_n_proc; - SR_proc_info[SR_n_proc].sock = CreateSocketAndConnect(masterhostname, - cport); - - /* Now we have initialized this info we should be able to use the - standard interface routines rather than accessing the SR variables - directly */ - - /* Get the procgrp from the master process - - Note that byteordering and word length start to be an issue. */ - - if (DEBUG_) { - (void) printf("pbegin: %ld get len_pgrp\n",NODEID_()); - (void) fflush(stdout); - } - type = TYPE_BEGIN | MSGINT; - lenbuf = sizeof(long); - nodesel = masterid; - RCV_(&type, (char *) &len_pgrp, &lenbuf, &lenmes, &nodesel, &nodefrom, - &sync); - if (DEBUG_) { - (void) printf("len_pgrp = %ld\n",len_pgrp); (void) fflush(stdout); - } - if ( (procgrp = malloc((unsigned) len_pgrp)) == (char *) NULL ) - Error("pbegin: failed to allocate procgrp",len_pgrp); - - if (DEBUG_) { - (void) printf("pbegin: %ld get progcrp len=%ld\n",NODEID_(),len_pgrp); - (void) fflush(stdout); - } - type = TYPE_BEGIN | MSGCHR; - RCV_(&type, procgrp, &len_pgrp, &lenmes, &nodesel, &nodefrom, &sync); - if (DEBUG_) { - (void) printf("procgrp:\n%55s...\n",procgrp); (void) fflush(stdout); - (void) fflush(stdout); - } - - /* Parse the procgrp to fill out SR_clus_info ... it also again works out - SR_n_clus and SR_n_proc ... ugh */ - - InitClusInfo(procgrp, masterhostname); - - if (DEBUG_) { - PrintClusInfo(); - (void) fflush(stdout); - } - - /* Change to desired working directory ... forked processes - will inherit it */ - - if(chdir(SR_clus_info[SR_clus_id].workdir) != 0) - Error("pbegin: failed to switch to work directory", (long) -1); - - if (DEBUG_) { - printf("%2ld: pbegin: changed to working directory %s\n", - NODEID_(), SR_clus_info[SR_clus_id].workdir); - (void) fflush(stdout); - } - - /* If we have more than 1 process in this cluster we have to - create the shared memory and semaphores and fork the processes - partitioning out the resources */ - - SR_using_shmem = 0; -#if defined(SHMEM) || defined(SYSV) - me = NODEID_(); - nslave = SR_clus_info[SR_clus_id].nslave; - if (nslave > 1) { - SR_proc_info[me].shmem_size = nslave*SHMEM_BUF_SIZE + - (nslave+1)*sizeof(long); - SR_proc_info[me].shmem_size = - ((SR_proc_info[me].shmem_size - 1)/4096)*4096 + 4096; - if (DEBUG_) { - (void) printf("pbegin: %ld allocate shmem, nslave=%ld\n", - NODEID_(), nslave); - (void) fflush(stdout); - } - SR_using_shmem = 1; - SR_proc_info[me].shmem = CreateSharedRegion(&SR_proc_info[me].shmem_id, - &SR_proc_info[me].shmem_size); - if (DEBUG_) { - (void) printf("pbegin: %ld allocate sema, nslave=%ld\n", - NODEID_(), nslave); - (void) fflush(stdout); - } - - flags = (long *) (SR_proc_info[me].shmem + nslave*SHMEM_BUF_SIZE); - - (void) bzero(SR_proc_info[me].shmem, SR_proc_info[me].shmem_size); - - for (i=0; inodeto = -1; - flags[i] = FALSE; - } - -#if defined(NOSPIN) - SR_proc_info[me].semid = SemSetCreate((long) 3*nslave, (long) 0); -#else -#ifdef KSR_NATIVE - /* Bind myself to a processor */ - KSR_BindProcess(0); - if (DEBUG_) { - (void) printf("pbegin: bound master process\n"); - (void) fflush(stdout); - } -#endif -#endif - -#if defined(SOLARIS) - /* If there fewer processes than processors it appears beneficial - to bind processes to processors. It also appears useful to - leave the lowest numbered processors free (???). - BUT ... this code is not general enough since the configured - processors are not necessarily numbered consecutively and - we also need to add logic to determine the list of processors - that have not already been bound to a process. - - Need to also modify the code below for binding slaves and enable - the include of processor.h and procset.h */ - - /* printf("binding master process %d to processor %d\n", getpid(), 31-0); - if (processor_bind(P_PID, P_MYID, 31-0, (void *) NULL)) - printf("binding to %d failed\n", 31-0); */ -#endif - - - for (i=1; i 0) - SR_pids[SR_numchild++] = status; - } - - masterid = SR_clus_info[SR_clus_id].masterid; - - for (i=masterid; i<(masterid+nslave); i++) { - long slaveid = i - masterid; - SR_proc_info[i].slaveid = slaveid; - SR_proc_info[i].local = 1; - SR_proc_info[i].sock = -1; - SR_proc_info[i].shmem = SR_proc_info[masterid].shmem; - SR_proc_info[i].shmem_size = SR_proc_info[masterid].shmem_size; - SR_proc_info[i].shmem_id = SR_proc_info[masterid].shmem_id; -#ifndef KSR_NATIVE - SR_proc_info[i].header = (MessageHeader *) - (SR_proc_info[i].shmem + slaveid * SHMEM_BUF_SIZE); -/* SR_proc_info[i].header->nodeto = -1; */ - SR_proc_info[i].buffer = ((char *) SR_proc_info[i].header) + - sizeof(MessageHeader) + (sizeof(MessageHeader) % 8); - SR_proc_info[i].buflen = SHMEM_BUF_SIZE - sizeof(MessageHeader) - - (sizeof(MessageHeader) % 8); -#ifdef NOSPIN - SR_proc_info[i].semid = SR_proc_info[masterid].semid; - SR_proc_info[i].sem_pend = 3*slaveid; - SR_proc_info[i].sem_read = 3*slaveid + 1; - SR_proc_info[i].sem_written = 3*slaveid + 2; -#else - SR_proc_info[i].semid = -1; -#endif - SR_proc_info[i].buffer_full = flags + slaveid; -/* *SR_proc_info[i].buffer_full = FALSE;*/ -#endif - } - -#ifdef KSR_NATIVE - /* Map the data structures onto the shared memory */ - KSR_MapBufferSpace(masterid, nslave); - if (DEBUG_) { - (void) printf("pbegin: %2ld: Mapped buffer space\n", NODEID_()); - (void) fflush(stdout); - } -#else - /* Post read semaphore to make sends partially asynchronous */ - -#ifdef NOSPIN - SemPost(SR_proc_info[me].semid, SR_proc_info[me].sem_read); -#endif -#endif - -#ifdef KSR_NATIVE - /* Initialize the buffer space data structures */ - KSR_InitBufferSpace(); - if (DEBUG_) { - (void) printf("pbegin: %2ld: Initialized buffer space\n", NODEID_()); - (void) fflush(stdout); - } -#endif - - } - -#else - if (SR_clus_info[SR_clus_id].nslave != 1) - Error("pbegin: no shared memory on this host ... nslave=1 only", - SR_clus_info[SR_clus_id].nslave); -#endif - - /* Now have to connect everyone together */ - - ConnectAll(); - - /* If we are only using sockets we can block in select when waiting for a message */ - SR_nsock = 0; - for (i=0; i<(SR_n_proc+1); i++) { - if (SR_proc_info[i].sock >= 0) { - SR_socks[SR_nsock] = SR_proc_info[i].sock; - SR_socks_proc[SR_nsock] = i; - SR_nsock++; - } - } - /* Synchronize timers before returning to application - or logging any events */ - - (void) TCGTIME_(); - type = TYPE_CLOCK_SYNCH; - SYNCH_(&type); - MtimeReset(); - - /* If logging events make the file events. */ - -#ifdef EVENTLOG - if (eventfile=malloc((unsigned) 32)) { - (void) sprintf(eventfile, "events.%03ld", NODEID_()); - evlog(EVKEY_ENABLE, EVKEY_FILENAME, eventfile, - EVKEY_BEGIN, EVENT_PROCESS, - EVKEY_STR_INT, "Startup used (cs)", (int) (MTIME_()-start), - EVKEY_STR_INT, "No. of processes", (int) NNODES_(), - EVKEY_DISABLE, - EVKEY_LAST_ARG); - (void) free(eventfile); - SYNCH_(&type); - } -#endif - - if (DEBUG_) { - printf("pbegin: %2ld: Returning to application\n",NODEID_()); - fflush(stdout); - } -} - -void PEND_() -/* - Call this to tidy up after parallel section. - The cluster master is responsible for tidying up any shared - memory/semaphore resources. Everyone else can just quit. - - Woops ... everyone should return so that FORTRAN can tidy up - after itself. -*/ -{ - long me = NODEID_(); - long masterid = SR_clus_info[SR_clus_id].masterid; - long nslave = SR_clus_info[SR_clus_id].nslave; - long zero = 0; - long status; -#ifdef EVENTLOG - long start=MTIME_(); -#endif - - SR_initialized = 0; - if (!SR_parallel) return; - - (void) signal(SIGCHLD, SIG_DFL); /* Death of children now OK */ - (void) NXTVAL_(&zero); /* Send termination flag to nxtval server */ - - if (me != masterid) - status = 0; - else { - status = WaitAll(nslave-1); /* Wait for demise of children */ -#if defined(SHMEM) || defined(SYSV) - if (nslave > 1) { -#if defined(NOSPIN) - (void) SemSetDestroyAll(); /* Ex the semaphores and shmem */ -#endif - (void) DeleteSharedRegion(SR_proc_info[me].shmem_id); - } -#endif - } - - ShutdownAll(); /* Close sockets for machines with static kernel */ - - - /* If logging events log end of process and dump trace */ -#ifdef EVENTLOG - evlog(EVKEY_ENABLE, - EVKEY_END, EVENT_PROCESS, - EVKEY_STR_INT, "Time (cs) waiting to finish", (int) (MTIME_()-start), - EVKEY_DUMP, - EVKEY_LAST_ARG); -#endif - /* Return to calling program unless we had an error */ - - if (status) - exit((int) status); -} - - -void tcgi_alt_pbegin(int *argc, char **argv[]) -{ - tcgi_pbegin(*argc, *argv); -} - diff --git a/armci/tcgmsg/ipcv4.0/pbeginf.c b/armci/tcgmsg/ipcv4.0/pbeginf.c deleted file mode 100644 index 8434a254f..000000000 --- a/armci/tcgmsg/ipcv4.0/pbeginf.c +++ /dev/null @@ -1,32 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#if HAVE_STDIO_H -# include -#endif -#if HAVE_STRING_H -# include -#endif - -#include "srftoc.h" -#include "sndrcv.h" - - -/** - * Hewlett Packard Risc box and new SparcWorks F77 2.* compilers. - * Have to construct the argument list by calling FORTRAN. - */ -void PBEGINF_() -{ -} - - -/** - * Alternative entry for those senstive to FORTRAN making reference - * to 7 character external names - */ -void PBGINF_() -{ - PBEGINF_(); -} diff --git a/armci/tcgmsg/ipcv4.0/pfilecopy.c b/armci/tcgmsg/ipcv4.0/pfilecopy.c deleted file mode 100644 index 8c049d641..000000000 --- a/armci/tcgmsg/ipcv4.0/pfilecopy.c +++ /dev/null @@ -1,193 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/pfilecopy.c,v 1.9 2004-05-07 20:45:10 pollack Exp $ */ - -#include -#ifdef SEQUENT -#include -#else -#include -#endif -#include "sndrcv.h" -#include "msgtypesc.h" - -#if defined(ULTRIX) || defined(SGI) || defined(NEXT) || defined(HPUX) || \ - defined(KSR) || defined(DECOSF) -extern void *malloc(); -#else -extern void *malloc(); -#endif - -extern void free(); - -void tcgi_pfilecopy(type, node0, filename) - long *type, *node0; - char *filename; -/* - Process node0 has a file (assumed unopened) named fname. - This file will be copied to all other processes which must - simultaneously invoke pfilecopy. Since the processes may be - using the same directory one probably ought to make sure - that each process uses a different name in the call. - - e.g. - - on node 0 pfilecopy(99, 0, 'argosin') - on node 1 pfilecopy(99, 0, 'argosin_001') - on node 2 pfilecopy(99, 0, 'argosin_002') -*/ - -{ - char *buffer; - FILE *file; - long length, nread=32768, len_nread=sizeof(long); - long typenr = (*type & 32767) | MSGINT; /* Force user type integer */ - long typebuf =(*type & 32767) | MSGCHR; - - if (!(buffer = malloc((unsigned) nread))) - Error("pfilecopy: failed to allocate the I/O buffer",nread); - - if (*node0 == NODEID_()) { - - /* I have the original file ... open and check its size */ - - if ((file = fopen(filename,"r")) == (FILE *) NULL) { - (void) fprintf(stderr,"me=%ld, filename = %s.\n",NODEID_(),filename); - Error("pfilecopy: node0 failed to open original file", *node0); - } - - /* Quick sanity check on the length */ - - (void) fseek(file, 0L, (int) 2); /* Seek to end of file */ - length = ftell(file); /* Find the length of file */ - (void) fseek(file, 0L, (int) 0); /* Seek to beginning of file */ - if ( (length<0) || (length>1e12) ) - Error("pfilecopy: the file length is -ve or very big", length); - - /* Send the file in chunks of nread bytes */ - - while (nread) { - nread = fread(buffer, 1, (int) nread, file); - BRDCST_(&typenr, (char *) &nread, &len_nread, node0); - typenr++; - if (nread) { - BRDCST_(&typebuf, buffer, &nread, node0); - typebuf++; - } - } - } - else { - - /* Open the file for the duplicate */ - - if ((file = fopen(filename,"w+")) == (FILE *) NULL) { - (void) fprintf(stderr,"me=%ld, filename = %s.\n",NODEID_(),filename); - Error("pfilecopy: failed to open duplicate file", *node0); - } - - /* Receive data and write to file */ - - while (nread) { - BRDCST_(&typenr, (char *) &nread, &len_nread, node0); - typenr++; - if (nread) { - BRDCST_(&typebuf, buffer, &nread, node0); - typebuf++; - if (nread != (long)fwrite(buffer, 1, (int) nread, file)) - Error("pfilecopy: error data to duplicate file", nread); - } - } - } - - /* Tidy up the stuff we have been using */ - - (void) fflush(file); - (void) fclose(file); - (void) free(buffer); -} - -void PFILECOPY_(type, node0, filename) - long *type, *node0; - char *filename; -{ - tcgi_pfilecopy(type, node0, filename); -} - -#ifdef IPSC -#define bcopy(a, b, n) memcpy((b), (a), (n)) -#endif - -#ifdef CRAY -#include -#endif -#ifdef ARDENT -struct char_desc { - char *string; - int len; -}; -#endif - -/* This crap because FORTRAN has no standard for passing strings */ - -#ifdef ARDENT -void PFCOPY_(type, node0, arg) - long *type; - long *node0; - struct char_desc *arg; -{ - char *fname = arg->string; - int len = arg->len; -#endif -#ifdef CRAY -void PFCOPY_(type, node0, arg) - long *type; - long *node0; - _fcd arg; -{ - char *fname = _fcdtocp(arg); - int len = _fcdlen(arg); -#endif -#if !defined(ARDENT) && !defined(CRAY) -void PFCOPY_(type, node0, fname, len) - long *type; - long *node0; - char *fname; - int len; -{ -#endif - - /* Fortran wrapper around pfilecopy */ - - char *filename; - -#ifdef DEBUG - (void) printf("me=%d, type=%d, node0=%d, fname=%x, fname=%.8s, len=%d\n", - NODEID_(), *type, *node0, fname, fname, len); -#endif - - /* Strip trailing blanks off the file name */ - - while ((len > 0) && (fname[len-1] == ' ')) - len--; - if (len <= 0) - Error("pfcopy_: file name length is toast", (long) len); - - /* Generate a NULL terminated string */ - - filename = malloc( (unsigned) (len+1) ); - if (filename) { - (void) bcopy(fname, filename, len); - filename[len] = '\0'; - } - else - Error("PFCOPY_: failed to malloc space for filename", (long) len); - - /* Now call the C routine to do the work */ - - tcgi_pfilecopy(type, node0, filename); - - (void) free(filename); -} - diff --git a/armci/tcgmsg/ipcv4.0/sema.c b/armci/tcgmsg/ipcv4.0/sema.c deleted file mode 100644 index 147cefd3c..000000000 --- a/armci/tcgmsg/ipcv4.0/sema.c +++ /dev/null @@ -1,979 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/sema.c,v 1.17 2003-05-08 15:44:43 edo Exp $ */ - -/* - These routines simplify the interface to semaphores for use in mutual - exclusion and queuing. Hopefully I can also make this portable. - - An external routine Error is assumed which is called upon an error - and tidies up by calling SemSetDestroyAll. - - In most cases errors cause an internal hard failure (by calling Error). - - 1) make an array of n_sem semaphores, returning the id associated - with the entire set. All the semaphore values are initialized to value - which should be a positve integer (queuing) or 0 (synchronization). - The semaphores in the set are indexed from 0 to n_sem-1. - - long SemSetCreate(long n_sem, long value) - - 2) Decrement and test the value associated with the semaphore specified by - (sem_set_id, sem_num). In effect this: - - if (value >= 0) { - continue execution - } - else { - wait in queue for the semaphore - } - decrement value - - void SemWait(long sem_set_id, long sem_num) - - 3) Increment the value associated with the semaphore specified by - (sem_set_id, sem_num). If value <= 0 (i.e. there are processes - in the queue) this releases the next process. - - void SemPost(long sem_set_id, long sem_num) - - 4) Return the current value associated with the semaphore sepcified by - (sem_set_id, sem_num). - - long SemValue(long sem_set_id, long sem_num) - - 5) Destroy the set of semaphores. Any other processes that are accessing - or try to access the semaphore set should get an error. - On the SUN (all system V machines?) the semaphore sets should - be destroyed explicitly before the final process exits. - 0 is returned if OK. -1 implies an error. - - long SemSetDestroy(long sem_set_id) - - 6) Destroy all the semaphore sets that are known about. This is really - meant for an error routine to call to try and tidy up. Though all - applications could call it before the last process exits. - 0 is returned if OK. -1 implies an error. - - long SemSetDestroyAll() -*/ - -extern void Error(); - -#if defined(SYSV) && !defined SGIUS && !defined(SPPLOCKS) && !defined(MACX) - -/******************************************************************** - Most system V compatible machines - ********************************************************************/ - -/* - - The value used for our semaphore is equal to the value of the - System V semaphore (which is always positive) minus the no. of - processes in the queue. That is because our interface was modelled - after that of Alliant whose semaphore can take on negative values. -*/ - -#include -#include -#include - -#if !HAVE_UNION_SEMUN -union semun { - int val; /* value for SETVAL */ - struct semid_ds *buf; /* buffer for IPC_STAT, IPC_SET */ - unsigned short int *array; /* array for GETALL, SETALL */ - struct seminfo *__buf; /* buffer for IPC_INFO */ -}; -#endif - -/* this global structure maintains a list of allocated semaphore sets - which is used for SemSetDestroyAll */ - -#define MAX_SEM_SETS 20 -static int sem_set_id_list[MAX_SEM_SETS]; -static int num_sem_set = 0; - -#if defined(SGITFP) || defined(SGI64) || defined(KSR) || defined(SOLARIS) || defined (AIX) || defined(LINUX64) -# define MAX_N_SEM 512 -#else -# define MAX_N_SEM 40 -#endif - -void InitSemSetList() -/* Initialise sem_set_id_list */ -{ - int i; - - for (i=0; i= MAX_N_SEM) ) - Error("SemSetCreate: n_sem has invalid value", (long) n_sem); - - if (num_sem_set == 0) - InitSemSetList(); - else if (num_sem_set >= MAX_SEM_SETS) - Error("SemSetCreate: Exceeded man no. of semaphore sets", - (long) num_sem_set); - - /* Actually make the semaphore set */ - - if ( (semid = semget(IPC_PRIVATE, (int) n_sem, IPC_CREAT | 00600)) < 0) - Error("SemSetCreate: failed to create semaphore set", (long) semid); - - /* Put the semid in the first empty slot in sem_set_id_list */ - - for (i=0; i < MAX_SEM_SETS; i++) { - if (sem_set_id_list[i] == -1) { - sem_set_id_list[i] = semid; - break; - } - } - if (i == MAX_SEM_SETS) - Error("SemSetCreate: internal error puting semid in list", (long) i); - - num_sem_set++; - - /* Now set the value of all the semaphores */ - - arg.val = (int) value; - for (i=0; i -#include - -extern int errno; - -/* On the alliant semaphores are handed out one at a time rather than - in sets, so have to maintain sets manually */ - -#define MAX_SEM_SETS 20 -#define MAX_N_SEM 128 - -static struct sem_set_list_struct { - int id[MAX_N_SEM]; /* alliant semaphore id */ - int n_sem; /* no. of semaphores in set */ -} sem_set_list[MAX_SEM_SETS]; - -static int num_sem_set = 0; - - -void InitSemSetList() -/* Initialise sem_set_list */ -{ - int i, j; - - for (i=0; i= MAX_N_SEM) ) - Error("SemSetCreate: n_sem has invalid value", (long) n_sem); - - if (num_sem_set == 0) - InitSemSetList(); - else if (num_sem_set >= MAX_SEM_SETS) - Error("SemSetCreate: Exceeded man no. of semaphore sets", - (long) num_sem_set); - - /* Find first empty slot in sem_set_list */ - - for (i=0; i < MAX_SEM_SETS; i++) - if (sem_set_list[i].n_sem == 0) - break; - - if (i == MAX_SEM_SETS) - Error("SemSetCreate: internal error puting semid in list", (long) i); - - /* Actually make the semaphore set */ - - for (j=0; j -#include -#include -#include -#include - -#define MAX_SEM_SETS 20 -#define MAX_N_SEM 100 - -/* On the convex a semaphore is a structure but on the apollo - it is an array which does not need dereferencing. Use ADDR - to generate the address of a semaphore */ -#ifdef APOLLO -#define ADDR(x) x -#else -#define ADDR(x) &x -#endif - -extern char *mktemp(); - -struct sem_set_struct { - int n_sem; /* no. of semaphores in set */ - semaphore lock[MAX_N_SEM]; /* locks for changing value */ - semaphore wait[MAX_N_SEM]; /* locks for queing */ - int value[MAX_N_SEM]; /* values */ -}; - -static int num_sem_set = 0; -static struct sem_set_struct *sem_sets; -static int fd = -1; -static char template[] = "/tmp/SEMA.XXXXXX"; -static char *filename = (char *) NULL; - -void InitSemSets() -/* Initialise sem_sets and allocate associated shmem region */ -{ - int i, j; - unsigned size = sizeof(struct sem_set_struct) * MAX_SEM_SETS; - -#ifndef APOLLO - /* Generate scratch file to identify region ... mustn't do this - on the APOLLO */ - - filename = mktemp(template); - if ( (fd = open(filename, O_RDWR|O_CREAT, 0666)) < 0 ) - Error("InitSemSets: failed to open temporary file",0); -#endif - - sem_sets = (struct sem_set_struct *) mmap((caddr_t) 0, &size, - PROT_READ|PROT_WRITE, - MAP_ANON|MAP_HASSEMAPHORE|MAP_SHARED, fd, 0); - -#ifdef APOLLO - if (sem_sets == (struct sem_set_struct *) 0) - Error("InitSemSets: mmap failed", (long) -1); -#else - if (sem_sets == (struct sem_set_struct *) -1) - Error("InitSemSets: mmap failed", (long) -1); -#endif - - for (i=0; i= MAX_N_SEM) ) - Error("SemSetCreate: n_sem has invalid value",n_sem); - - if (num_sem_set == 0) - InitSemSets(); - else if (num_sem_set >= MAX_SEM_SETS) - Error("SemSetCreate: Exceeded man no. of semaphore sets", - num_sem_set); - - /* Initialize the values */ - - for (i=0; i= num_sem_set) ) - Error("SemWait: invalid sem_set_id",sem_set_id); - if ( (sem_num < 0) || (sem_num >= sem_sets[sem_set_id].n_sem) ) - Error("SemWait: invalid semaphore number in set",sem_num); - - while (1) { - - /* Get the lock around the whole semaphore */ - - (void) mset(ADDR(sem_sets[sem_set_id].lock[sem_num]), 1); - - /* If the value is positive fall thru, else wait */ - - if (sem_sets[sem_set_id].value[sem_num] > 0) - break; - else { - (void) mclear(ADDR(sem_sets[sem_set_id].lock[sem_num])); - (void) mset(ADDR(sem_sets[sem_set_id].wait[sem_num]), 1); - } - } - - /* Are ready to go ... decrement the value and release lock */ - - sem_sets[sem_set_id].value[sem_num]--; - (void) mclear(ADDR(sem_sets[sem_set_id].lock[sem_num])); - -} - -void SemPost(sem_set_id, sem_num) - long sem_set_id; - long sem_num; -{ - int i; - - if ( (sem_set_id < 0) || (sem_set_id >= num_sem_set) ) - Error("SemPost: invalid sem_set_id",sem_set_id); - if ( (sem_num < 0) || (sem_num >= sem_sets[sem_set_id].n_sem) ) - Error("SemPost: invalid semaphore number in set",sem_num); - - /* Get the lock around the whole semaphore */ - - (void) mset(ADDR(sem_sets[sem_set_id].lock[sem_num]), 1); - - /* Read and increment the value. If is now zero wake up - up the queue */ - - sem_sets[sem_set_id].value[sem_num]++; - i = sem_sets[sem_set_id].value[sem_num]; - - (void) mclear(ADDR(sem_sets[sem_set_id].lock[sem_num])); - if (i >= 0) - (void) mclear(ADDR(sem_sets[sem_set_id].wait[sem_num])); -} - -long SemValue(sem_set_id, sem_num) - long sem_set_id; - long sem_num; -{ - int i; - - if ( (sem_set_id < 0) || (sem_set_id >= num_sem_set) ) - Error("SemValue: invalid sem_set_id",sem_set_id); - if ( (sem_num < 0) || (sem_num >= sem_sets[sem_set_id].n_sem) ) - Error("SemValue: invalid semaphore number in set",sem_num); - - /* There seems no point in getting the lock just to read - the value and it seems more useful not to (e.g. debugging) */ - - i = sem_sets[sem_set_id].value[sem_num]; - - return (long) (i-1); -} - -long SemSetDestroy(sem_set_id) - long sem_set_id; -{ - - if ( (sem_set_id < 0) || (sem_set_id >= num_sem_set) ) - return -1; - - sem_sets[sem_set_id].n_sem = 0; - - return (long) 0; -} - -long SemSetDestroyAll() -{ - long i, status=0; - - for (i=0; i= 0) { - (void) close(fd); - fd = -1; - (void) unlink(filename); - } - - status += munmap((char *) sem_sets, 0); - - if (status) - status = -1; - - return status; -} - -#endif - - -#if defined(SGIUS) || defined(SPPLOCKS) - -/* - SGI fast US library semaphores ... aren't any faster - than system V semaphores ... implement using spin locks -*/ - -#include -#include -#define MAX_SEMA 512 -static volatile int *val; -#define NAME_LEN 200 - -#ifdef SGI -# include - static usptr_t *arena_ptr; - static ulock_t *locks[MAX_SEMA]; - static char arena_name[NAME_LEN]; -# define EIGHT 8 -# define LOCK ussetlock -# define UNLOCK usunsetlock -#define JUMP EIGHT - -#include "sndrcvP.h" - -extern char *getenv(const char *); - -long SemSetCreate(long n_sem, long value) -{ - int i; - char *tmp; - if (!(tmp = getenv("ARENA_DIR"))) tmp = "/tmp"; - - sprintf(arena_name,"%s/tcgmsg.arena.%ld",tmp, (long)getpid()); -#ifdef PRIVATE_ARENA - (void) usconfig(CONF_ARENATYPE, US_SHAREDONLY); -#endif - (void) usconfig(CONF_INITUSERS, (unsigned int)SR_clus_info[SR_clus_id].nslave ); -#ifdef SGI - (void) usconfig(CONF_INITSIZE, 1024*1024); -#endif - - if (!(arena_ptr = usinit(arena_name))) - Error("SemSetCreate: failed to create arena", 0L); - - /* Magic factors of EIGHT here to ensure that values are - in different cache lines to avoid aliasing -- good on SGI and Convex */ - - if (!(val = (int *) usmalloc(EIGHT*MAX_SEMA*sizeof(int), arena_ptr))) - Error("SemSetCreate: failed to get shmem", (long) (MAX_SEMA*sizeof(int))); - - for (i=0; i -#include -#include -#include -#include - -#define SIXTEEN 16 -#define JUMP SIXTEEN -typedef struct{ - int state; - int pad[15]; -} lock_t; - -static lock_t *locks; - -# define LOCK(x) set_lock(&x.state) -# define UNLOCK(x) unset_lock(&x.state) -# define INILOCK(x) init_lock(&x.state) - - -void init_lock(int * volatile ip) -{ - *ip = 1; -} - -void set_lock(int * volatile ip) -{ - while (1) { - while (!(*ip)); - if (__ldcws32(ip)) - break; - } -} - -void unset_lock(int *ip) -{ - *ip = 1; - asm("sync"); -} - -static int fd = -1; -static char template[] = "/tmp/SEMA.XXXXXX"; -static char *filename = (char *) NULL; -static unsigned shmem_size; - -long SemSetCreate(long n_sem, long value) -{ - int i; - shmem_size = SIXTEEN*MAX_SEMA*sizeof(int)+MAX_SEMA*sizeof(lock_t); - - if ( (n_sem <= 0) || (n_sem >= MAX_SEMA) ) - Error("SemSetCreate: n_sem has invalid value",n_sem); - - /* allocate shared memory for locks and semaphore val */ - filename = mktemp(template); - if ( (fd = open(filename, O_RDWR|O_CREAT, 0666)) < 0 ) - Error("SemSetCreate: failed to open temporary file",0); - val = (int *) mmap((caddr_t) 0, shmem_size, - PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|CNX_MAP_SEMAPHORE|MAP_SHARED, fd, 0); - locks = (lock_t*)( val + SIXTEEN*MAX_SEMA); - - /* initialize locks and semaphore values */ - for (i=0; i= MAX_SEMA) ) - Error("SemWait: invalid sem_num",sem_num); - - while (value<=0) { - LOCK(locks[sem_num]); - value = val[off]; - if (value>0) - val[off]--; - UNLOCK(locks[sem_num]); - if (value<=0) - Dummy(); - } -} - -void SemPost(long sem_set_id, long sem_num) -{ - int off = sem_num*JUMP; - if ( (sem_num < 0) || (sem_num >= MAX_SEMA) ) - Error("SemPost: invalid sem_num",sem_num); - - LOCK(locks[sem_num]); - val[off]++; - UNLOCK(locks[sem_num]); -} - -long SemValue(long sem_set_id, long sem_num) -{ - Error("SemValue: not implemented", sem_num); - return 1; -} - -long SemSetDestroy(long sem_set_id) -{ - return(SemSetDestroyAll()); -} - -#endif - - -#if defined(MACX) - - -#include -#include -#include -#include -#include - -#define MAX_SEMA 32 -static int fd = -1; -static char template[] = "/tmp/SEMA.XXXXXX"; -static char *filename = (char *) NULL; -static unsigned shmem_size; - -#if defined(NAMED_SEMAPHORES_SUPPORTED) - -static sem_t *sem_arr; - -long SemSetCreate(long n_sem, long value) -{ - int i; - shmem_size = MAX_SEMA*sizeof(sem_t); - - if ( (n_sem <= 0) || (n_sem >= MAX_SEMA) ) - Error("SemSetCreate: n_sem has invalid value",n_sem); - - /* allocate shared memory for locks and semaphore val */ - filename = mktemp(template); - if ( (fd = shm_open(filename, O_CREAT|O_RDWR, 0666)) < 0 ) - Error("SemSetCreate: failed to open temporary shm file",0); - sem_arr = (sem_t*) mmap((caddr_t)0, shmem_size, PROT_READ|PROT_WRITE, - MAP_ANON|MAP_HASSEMAPHORE|MAP_SHARED, fd, (off_t)0); - if(!sem_arr)Error("SemSetCreate: failed to mmap",0); - - /* initialize locks and semaphore values */ - for (i=0; i= MAX_SEMA) ) - Error("SemWait: invalid sem_num",sem_num); - if(sem_wait(sem_arr+sem_num)<0) - Error("SemWait: failed",sem_num); -} - -void SemPost(long sem_set_id, long sem_num) -{ - if ( (sem_num < 0) || (sem_num >= MAX_SEMA) ) - Error("SemPost: invalid sem_num",sem_num); - if(sem_post(sem_arr+sem_num)<0) - Error("SemPost: failed",sem_num); - -} - -long SemValue(long sem_set_id, long sem_num) -{ - Error("SemValue: not implemented", sem_num); - return 1L; -} - -#else - - -typedef struct{ - int state; - int pad[15]; -} lock_t; -static lock_t *locks; - - -static char template1[] = "/tmp/SEMA1.XXXXXX"; -static char *filename1 = (char *) NULL; -static sem_t *sem; -static lock_t *locks; - -#include - -long SemSetCreate(long n_sem, long value) -{ - int i; - shmem_size = MAX_SEMA*sizeof(lock_t); - - if ( (n_sem <= 0) || (n_sem >= MAX_SEMA) ) - Error("SemSetCreate: n_sem has invalid value",n_sem); - - /* allocate shared memory for locks and semaphore val */ - locks = (lock_t*) mmap((caddr_t)0, shmem_size, PROT_READ|PROT_WRITE, - MAP_ANON|MAP_SHARED, -1, (off_t)0); - if(locks == (lock_t*)-1)Error("SemSetCreate: failed to mmap",shmem_size); - - filename1 = mktemp(template1); - sem = sem_open(filename1, O_CREAT|O_EXCL, 0666, 1); - if(!sem)Error("SemSetCreate: failed to sem_open",0); - - /* initialize locks and semaphore values */ - bzero(locks,shmem_size); - return 1L; -} - -long SemSetDestroyAll() -{ - long status=0; - status = munmap((char *) locks, shmem_size); - if(status)status = -1; - sem_unlink(filename1); - return status; -} - -double __tcgmsg_fred__=0.0; -void Dummy() -{ - int n = 200; /* This seems optimal */ - while(n--) - __tcgmsg_fred__++; -} - -void SemWait(long sem_set_id, long sem_num) -{ - int value = 0, count=0; - - if ( (sem_num < 0) || (sem_num >= MAX_SEMA) ) - Error("SemWait: invalid sem_num",sem_num); - - while (value<=0) { - if(sem_wait(sem)<0)Error("SemWait: sem_op error",sem_num);; - value = locks[sem_num].state; - if (value>0) - locks[sem_num].state--; - if(sem_post(sem)<0)Error("SemWait: sem_op error",sem_num);; - if (value<=0) Dummy(); - count++; - if(count%1000 == 999)usleep(1); - } -} - -void SemPost(long sem_set_id, long sem_num) -{ - if ( (sem_num < 0) || (sem_num >= MAX_SEMA) ) - Error("SemPost: invalid sem_num",sem_num); - - if(sem_wait(sem)<0)Error("SemPost: sem_op error",sem_num);; - locks[sem_num].state++; - if(sem_post(sem)<0)Error("SemWait: sem_op error",sem_num);; -} - -long SemValue(long sem_set_id, long sem_num) -{ - if ( (sem_num < 0) || (sem_num >= MAX_SEMA) ) - Error("SemVal: invalid sem_num",sem_num); - return (long)locks[sem_num].state; -} -#endif -#endif diff --git a/armci/tcgmsg/ipcv4.0/sema.h b/armci/tcgmsg/ipcv4.0/sema.h deleted file mode 100644 index 4e24edaf0..000000000 --- a/armci/tcgmsg/ipcv4.0/sema.h +++ /dev/null @@ -1,85 +0,0 @@ -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/sema.h,v 1.4 1995-02-24 02:17:41 d3h325 Exp $ */ - -/* Header file declaring stubs for semaphore routines. */ - -/* - These routines simplify the interface to semaphores for use in mutual - exclusion and queuing. Hopefully I can also make this portable. - - Interruption by signals is not tested for. - - An external routine Error is assumed which is called upon an error - and tidies up by calling SemSetDestroyAll. - - In most cases errors cause an internal hard failure (by calling Error). -*/ - -/* - 1) make an array of n_sem semaphores, returning the id associated - with the entire set. All the semaphore values are initialized to value - which should be a positve integer (queuing) or 0 (synchronization). - The semaphores in the set are indexed from 0 to n_sem-1. - - long SemSetCreate(long n_sem, long value) -*/ -extern long SemSetCreate(); - - -/* - 2) Decrement and test the value associated with the semaphore specified by - (sem_set_id, sem_num). In effect this: - - decrement value - - if (value >= 0) { - continue execution - } - else { - wait in queue for the semaphore - } - - void SemWait(long sem_set_id, long sem_num) -*/ -extern void SemWait(); - - -/* - 3) Increment the value associated with the semaphore specified by - (sem_set_id, sem_num). If value <= 0 (i.e. there are processes - in the queue) this releases the next process. - - void SemPost(long sem_set_id, long sem_num) -*/ -extern void SemPost(); - - -/* - 4) Return the current value associated with the semaphore sepcified by - (sem_set_id, sem_num). - - long SemValue(long sem_set_id, long sem_num) -*/ -extern long SemValue(); - - -/* - 5) Destroy the set of semaphores. Any other processes that are accessing - or try to access the semaphore set should get an error. - On the SUN (all system V machines?) the semaphore sets should - be destroyed explicitly before the final process exits. - 0 is returned if OK. -1 implies an error. - - long SemSetDestroy(long sem_set_id) -*/ -extern long SemSetDestroy(); - - -/* - 6) Destroy all the semaphore sets that are known about. This is really - meant for an error routine to call to try and tidy up. Though all - applications could call it before the last process exits. - 0 is returned if OK. -1 implies an error. - - long SemSetDestroyAll() -*/ -extern long SemSetDestroyAll(); diff --git a/armci/tcgmsg/ipcv4.0/sema_alliant.c b/armci/tcgmsg/ipcv4.0/sema_alliant.c deleted file mode 100644 index 18b348753..000000000 --- a/armci/tcgmsg/ipcv4.0/sema_alliant.c +++ /dev/null @@ -1,206 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* - These routines simplify the interface to semaphores for use in mutual - exclusion and queuing. Hopefully I can also make this portable. - - An external routine Error is assumed which is called upon an error - and tidies up by calling SemSetDestroyAll. - - In most cases errors cause an internal hard failure (by calling Error). - - 1) make an array of n_sem semaphores, returning the id associated - with the entire set. All the semaphore values are initialized to value - which should be a positve integer (queuing) or 0 (synchronization). - The semaphores in the set are indexed from 0 to n_sem-1. - - long SemSetCreate(long n_sem, long value) - - 2) Decrement and test the value associated with the semaphore specified by - (sem_set_id, sem_num). In effect this: - - if (value >= 0) { - continue execution - } - else { - wait in queue for the semaphore - } - decrement value - - void SemWait(long sem_set_id, long sem_num) - - 3) Increment the value associated with the semaphore specified by - (sem_set_id, sem_num). If value <= 0 (i.e. there are processes - in the queue) this releases the next process. - - void SemPost(long sem_set_id, long sem_num) - - 4) Return the current value associated with the semaphore sepcified by - (sem_set_id, sem_num). - - long SemValue(long sem_set_id, long sem_num) - - 5) Destroy the set of semaphores. Any other processes that are accessing - or try to access the semaphore set should get an error. - On the SUN (all system V machines?) the semaphore sets should - be destroyed explicitly before the final process exits. - 0 is returned if OK. -1 implies an error. - - long SemSetDestroy(long sem_set_id) - - 6) Destroy all the semaphore sets that are known about. This is really - meant for an error routine to call to try and tidy up. Though all - applications could call it before the last process exits. - 0 is returned if OK. -1 implies an error. - - long SemSetDestroyAll() -*/ - -extern void Error(); - -/************************************************************* - Alliant Concentrix 5.0 and Concentrix FX/2800 - *************************************************************/ - -/* This is very specific to the Alliant. */ - -#include -#include - -extern int errno; - -/* On the alliant semaphores are handed out one at a time rather than - in sets, so have to maintain sets manually */ - -#define MAX_SEM_SETS 20 -#define MAX_N_SEM 128 - -static struct sem_set_list_struct { - int id[MAX_N_SEM]; /* alliant semaphore id */ - int n_sem; /* no. of semaphores in set */ -} sem_set_list[MAX_SEM_SETS]; - -static int num_sem_set = 0; - - -void InitSemSetList() -/* Initialise sem_set_list */ -{ - int i, j; - - for (i=0; i= MAX_N_SEM) ) - Error("SemSetCreate: n_sem has invalid value", (long) n_sem); - - if (num_sem_set == 0) - InitSemSetList(); - else if (num_sem_set >= MAX_SEM_SETS) - Error("SemSetCreate: Exceeded man no. of semaphore sets", - (long) num_sem_set); - - /* Find first empty slot in sem_set_list */ - - for (i=0; i < MAX_SEM_SETS; i++) - if (sem_set_list[i].n_sem == 0) - break; - - if (i == MAX_SEM_SETS) - Error("SemSetCreate: internal error puting semid in list", (long) i); - - /* Actually make the semaphore set */ - - for (j=0; j= 0) { - continue execution - } - else { - wait in queue for the semaphore - } - decrement value - - void SemWait(long sem_set_id, long sem_num) - - 3) Increment the value associated with the semaphore specified by - (sem_set_id, sem_num). If value <= 0 (i.e. there are processes - in the queue) this releases the next process. - - void SemPost(long sem_set_id, long sem_num) - - 4) Return the current value associated with the semaphore sepcified by - (sem_set_id, sem_num). - - long SemValue(long sem_set_id, long sem_num) - - 5) Destroy the set of semaphores. Any other processes that are accessing - or try to access the semaphore set should get an error. - On the SUN (all system V machines?) the semaphore sets should - be destroyed explicitly before the final process exits. - 0 is returned if OK. -1 implies an error. - - long SemSetDestroy(long sem_set_id) - - 6) Destroy all the semaphore sets that are known about. This is really - meant for an error routine to call to try and tidy up. Though all - applications could call it before the last process exits. - 0 is returned if OK. -1 implies an error. - - long SemSetDestroyAll() -*/ - -extern void Error(); - -#include -#include -#include -#include -#include - -#define MAX_SEM_SETS 20 -#define MAX_N_SEM 100 - -/* On the convex a semaphore is a structure but on the apollo - it is an array which does not need dereferencing. Use ADDR - to generate the address of a semaphore */ -#ifdef APOLLO -#define ADDR(x) x -#else -#define ADDR(x) &x -#endif - -extern char *mktemp(); - -struct sem_set_struct { - int n_sem; /* no. of semaphores in set */ - semaphore lock[MAX_N_SEM]; /* locks for changing value */ - semaphore wait[MAX_N_SEM]; /* locks for queing */ - int value[MAX_N_SEM]; /* values */ -}; - -static int num_sem_set = 0; -static struct sem_set_struct *sem_sets; -static int fd = -1; -static char template[] = "/tmp/SEMA.XXXXXX"; -static char *filename = (char *) NULL; - -void InitSemSets() -/* Initialise sem_sets and allocate associated shmem region */ -{ - int i, j; - unsigned size = sizeof(struct sem_set_struct) * MAX_SEM_SETS; - -#ifndef APOLLO - /* Generate scratch file to identify region ... mustn't do this - on the APOLLO */ - - filename = mktemp(template); - if ( (fd = open(filename, O_RDWR|O_CREAT, 0666)) < 0 ) - Error("InitSemSets: failed to open temporary file",0); -#endif - - sem_sets = (struct sem_set_struct *) mmap((caddr_t) 0, &size, - PROT_READ|PROT_WRITE, - MAP_ANON|MAP_HASSEMAPHORE|MAP_SHARED, fd, 0); - -#ifdef APOLLO - if (sem_sets == (struct sem_set_struct *) 0) - Error("InitSemSets: mmap failed", (long) -1); -#else - if (sem_sets == (struct sem_set_struct *) -1) - Error("InitSemSets: mmap failed", (long) -1); -#endif - - for (i=0; i= MAX_N_SEM) ) - Error("SemSetCreate: n_sem has invalid value",n_sem); - - if (num_sem_set == 0) - InitSemSets(); - else if (num_sem_set >= MAX_SEM_SETS) - Error("SemSetCreate: Exceeded man no. of semaphore sets", - num_sem_set); - - /* Initialize the values */ - - for (i=0; i= num_sem_set) ) - Error("SemWait: invalid sem_set_id",sem_set_id); - if ( (sem_num < 0) || (sem_num >= sem_sets[sem_set_id].n_sem) ) - Error("SemWait: invalid semaphore number in set",sem_num); - - while (1) { - - /* Get the lock around the whole semaphore */ - - (void) mset(ADDR(sem_sets[sem_set_id].lock[sem_num]), 1); - - /* If the value is positive fall thru, else wait */ - - if (sem_sets[sem_set_id].value[sem_num] > 0) - break; - else { - (void) mclear(ADDR(sem_sets[sem_set_id].lock[sem_num])); - (void) mset(ADDR(sem_sets[sem_set_id].wait[sem_num]), 1); - } - } - - /* Are ready to go ... decrement the value and release lock */ - - sem_sets[sem_set_id].value[sem_num]--; - (void) mclear(ADDR(sem_sets[sem_set_id].lock[sem_num])); - -} - -void SemPost(sem_set_id, sem_num) - long sem_set_id; - long sem_num; -{ - int i; - - if ( (sem_set_id < 0) || (sem_set_id >= num_sem_set) ) - Error("SemPost: invalid sem_set_id",sem_set_id); - if ( (sem_num < 0) || (sem_num >= sem_sets[sem_set_id].n_sem) ) - Error("SemPost: invalid semaphore number in set",sem_num); - - /* Get the lock around the whole semaphore */ - - (void) mset(ADDR(sem_sets[sem_set_id].lock[sem_num]), 1); - - /* Read and increment the value. If is now zero wake up - up the queue */ - - sem_sets[sem_set_id].value[sem_num]++; - i = sem_sets[sem_set_id].value[sem_num]; - - (void) mclear(ADDR(sem_sets[sem_set_id].lock[sem_num])); - if (i >= 0) - (void) mclear(ADDR(sem_sets[sem_set_id].wait[sem_num])); -} - -long SemValue(sem_set_id, sem_num) - long sem_set_id; - long sem_num; -{ - int i; - - if ( (sem_set_id < 0) || (sem_set_id >= num_sem_set) ) - Error("SemValue: invalid sem_set_id",sem_set_id); - if ( (sem_num < 0) || (sem_num >= sem_sets[sem_set_id].n_sem) ) - Error("SemValue: invalid semaphore number in set",sem_num); - - /* There seems no point in getting the lock just to read - the value and it seems more useful not to (e.g. debugging) */ - - i = sem_sets[sem_set_id].value[sem_num]; - - return (long) (i-1); -} - -long SemSetDestroy(sem_set_id) - long sem_set_id; -{ - - if ( (sem_set_id < 0) || (sem_set_id >= num_sem_set) ) - return -1; - - sem_sets[sem_set_id].n_sem = 0; - - return (long) 0; -} - -long SemSetDestroyAll() -{ - long i, status=0; - - for (i=0; i= 0) { - (void) close(fd); - fd = -1; - (void) unlink(filename); - } - - status += munmap((char *) sem_sets, 0); - - if (status) - status = -1; - - return status; -} diff --git a/armci/tcgmsg/ipcv4.0/sema_macx.c b/armci/tcgmsg/ipcv4.0/sema_macx.c deleted file mode 100644 index fbbe9d24d..000000000 --- a/armci/tcgmsg/ipcv4.0/sema_macx.c +++ /dev/null @@ -1,227 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* - These routines simplify the interface to semaphores for use in mutual - exclusion and queuing. Hopefully I can also make this portable. - - An external routine Error is assumed which is called upon an error - and tidies up by calling SemSetDestroyAll. - - In most cases errors cause an internal hard failure (by calling Error). - - 1) make an array of n_sem semaphores, returning the id associated - with the entire set. All the semaphore values are initialized to value - which should be a positve integer (queuing) or 0 (synchronization). - The semaphores in the set are indexed from 0 to n_sem-1. - - long SemSetCreate(long n_sem, long value) - - 2) Decrement and test the value associated with the semaphore specified by - (sem_set_id, sem_num). In effect this: - - if (value >= 0) { - continue execution - } - else { - wait in queue for the semaphore - } - decrement value - - void SemWait(long sem_set_id, long sem_num) - - 3) Increment the value associated with the semaphore specified by - (sem_set_id, sem_num). If value <= 0 (i.e. there are processes - in the queue) this releases the next process. - - void SemPost(long sem_set_id, long sem_num) - - 4) Return the current value associated with the semaphore sepcified by - (sem_set_id, sem_num). - - long SemValue(long sem_set_id, long sem_num) - - 5) Destroy the set of semaphores. Any other processes that are accessing - or try to access the semaphore set should get an error. - On the SUN (all system V machines?) the semaphore sets should - be destroyed explicitly before the final process exits. - 0 is returned if OK. -1 implies an error. - - long SemSetDestroy(long sem_set_id) - - 6) Destroy all the semaphore sets that are known about. This is really - meant for an error routine to call to try and tidy up. Though all - applications could call it before the last process exits. - 0 is returned if OK. -1 implies an error. - - long SemSetDestroyAll() -*/ - -extern void Error(); - -#include -#include -#include -#include -#include - -#define MAX_SEMA 32 -static int fd = -1; -static char template[] = "/tmp/SEMA.XXXXXX"; -static char *filename = (char *) NULL; -static unsigned shmem_size; - -#if defined(NAMED_SEMAPHORES_SUPPORTED) - -static sem_t *sem_arr; - -long SemSetCreate(long n_sem, long value) -{ - int i; - shmem_size = MAX_SEMA*sizeof(sem_t); - - if ( (n_sem <= 0) || (n_sem >= MAX_SEMA) ) - Error("SemSetCreate: n_sem has invalid value",n_sem); - - /* allocate shared memory for locks and semaphore val */ - filename = mktemp(template); - if ( (fd = shm_open(filename, O_CREAT|O_RDWR, 0666)) < 0 ) - Error("SemSetCreate: failed to open temporary shm file",0); - sem_arr = (sem_t*) mmap((caddr_t)0, shmem_size, PROT_READ|PROT_WRITE, - MAP_ANON|MAP_HASSEMAPHORE|MAP_SHARED, fd, (off_t)0); - if(!sem_arr)Error("SemSetCreate: failed to mmap",0); - - /* initialize locks and semaphore values */ - for (i=0; i= MAX_SEMA) ) - Error("SemWait: invalid sem_num",sem_num); - if(sem_wait(sem_arr+sem_num)<0) - Error("SemWait: failed",sem_num); -} - -void SemPost(long sem_set_id, long sem_num) -{ - if ( (sem_num < 0) || (sem_num >= MAX_SEMA) ) - Error("SemPost: invalid sem_num",sem_num); - if(sem_post(sem_arr+sem_num)<0) - Error("SemPost: failed",sem_num); - -} - -long SemValue(long sem_set_id, long sem_num) -{ - Error("SemValue: not implemented", sem_num); - return 1L; -} - -#else - - -typedef struct{ - int state; - int pad[15]; -} lock_t; -static lock_t *locks; - - -static char template1[] = "/tmp/SEMA1.XXXXXX"; -static char *filename1 = (char *) NULL; -static sem_t *sem; -static lock_t *locks; - -#include - -long SemSetCreate(long n_sem, long value) -{ - int i; - shmem_size = MAX_SEMA*sizeof(lock_t); - - if ( (n_sem <= 0) || (n_sem >= MAX_SEMA) ) - Error("SemSetCreate: n_sem has invalid value",n_sem); - - /* allocate shared memory for locks and semaphore val */ - locks = (lock_t*) mmap((caddr_t)0, shmem_size, PROT_READ|PROT_WRITE, - MAP_ANON|MAP_SHARED, -1, (off_t)0); - if(locks == (lock_t*)-1)Error("SemSetCreate: failed to mmap",shmem_size); - - filename1 = mktemp(template1); - sem = sem_open(filename1, O_CREAT|O_EXCL, 0666, 1); - if(!sem)Error("SemSetCreate: failed to sem_open",0); - - /* initialize locks and semaphore values */ - bzero(locks,shmem_size); - return 1L; -} - -long SemSetDestroyAll() -{ - long status=0; - status = munmap((char *) locks, shmem_size); - if(status)status = -1; - sem_unlink(filename1); - return status; -} - -double __tcgmsg_fred__=0.0; -void Dummy() -{ - int n = 200; /* This seems optimal */ - while(n--) - __tcgmsg_fred__++; -} - -void SemWait(long sem_set_id, long sem_num) -{ - int value = 0, count=0; - - if ( (sem_num < 0) || (sem_num >= MAX_SEMA) ) - Error("SemWait: invalid sem_num",sem_num); - - while (value<=0) { - if(sem_wait(sem)<0)Error("SemWait: sem_op error",sem_num);; - value = locks[sem_num].state; - if (value>0) - locks[sem_num].state--; - if(sem_post(sem)<0)Error("SemWait: sem_op error",sem_num);; - if (value<=0) Dummy(); - count++; - if(count%1000 == 999)usleep(1); - } -} - -void SemPost(long sem_set_id, long sem_num) -{ - if ( (sem_num < 0) || (sem_num >= MAX_SEMA) ) - Error("SemPost: invalid sem_num",sem_num); - - if(sem_wait(sem)<0)Error("SemPost: sem_op error",sem_num);; - locks[sem_num].state++; - if(sem_post(sem)<0)Error("SemWait: sem_op error",sem_num);; -} - -long SemValue(long sem_set_id, long sem_num) -{ - if ( (sem_num < 0) || (sem_num >= MAX_SEMA) ) - Error("SemVal: invalid sem_num",sem_num); - return (long)locks[sem_num].state; -} -#endif diff --git a/armci/tcgmsg/ipcv4.0/sema_sgius.c b/armci/tcgmsg/ipcv4.0/sema_sgius.c deleted file mode 100644 index b442b00fb..000000000 --- a/armci/tcgmsg/ipcv4.0/sema_sgius.c +++ /dev/null @@ -1,262 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* - These routines simplify the interface to semaphores for use in mutual - exclusion and queuing. Hopefully I can also make this portable. - - An external routine Error is assumed which is called upon an error - and tidies up by calling SemSetDestroyAll. - - In most cases errors cause an internal hard failure (by calling Error). - - 1) make an array of n_sem semaphores, returning the id associated - with the entire set. All the semaphore values are initialized to value - which should be a positve integer (queuing) or 0 (synchronization). - The semaphores in the set are indexed from 0 to n_sem-1. - - long SemSetCreate(long n_sem, long value) - - 2) Decrement and test the value associated with the semaphore specified by - (sem_set_id, sem_num). In effect this: - - if (value >= 0) { - continue execution - } - else { - wait in queue for the semaphore - } - decrement value - - void SemWait(long sem_set_id, long sem_num) - - 3) Increment the value associated with the semaphore specified by - (sem_set_id, sem_num). If value <= 0 (i.e. there are processes - in the queue) this releases the next process. - - void SemPost(long sem_set_id, long sem_num) - - 4) Return the current value associated with the semaphore sepcified by - (sem_set_id, sem_num). - - long SemValue(long sem_set_id, long sem_num) - - 5) Destroy the set of semaphores. Any other processes that are accessing - or try to access the semaphore set should get an error. - On the SUN (all system V machines?) the semaphore sets should - be destroyed explicitly before the final process exits. - 0 is returned if OK. -1 implies an error. - - long SemSetDestroy(long sem_set_id) - - 6) Destroy all the semaphore sets that are known about. This is really - meant for an error routine to call to try and tidy up. Though all - applications could call it before the last process exits. - 0 is returned if OK. -1 implies an error. - - long SemSetDestroyAll() -*/ - -extern void Error(); - -/* - SGI fast US library semaphores ... aren't any faster - than system V semaphores ... implement using spin locks -*/ - -#include -#include -#define MAX_SEMA 512 -static volatile int *val; -#define NAME_LEN 200 - -#ifdef SGI -# include - static usptr_t *arena_ptr; - static ulock_t *locks[MAX_SEMA]; - static char arena_name[NAME_LEN]; -# define EIGHT 8 -# define LOCK ussetlock -# define UNLOCK usunsetlock -#define JUMP EIGHT - -#include "sndrcvP.h" - -extern char *getenv(const char *); - -long SemSetCreate(long n_sem, long value) -{ - int i; - char *tmp; - if (!(tmp = getenv("ARENA_DIR"))) tmp = "/tmp"; - - sprintf(arena_name,"%s/tcgmsg.arena.%ld",tmp, (long)getpid()); -#ifdef PRIVATE_ARENA - (void) usconfig(CONF_ARENATYPE, US_SHAREDONLY); -#endif - (void) usconfig(CONF_INITUSERS, (unsigned int)SR_clus_info[SR_clus_id].nslave ); -#ifdef SGI - (void) usconfig(CONF_INITSIZE, 1024*1024); -#endif - - if (!(arena_ptr = usinit(arena_name))) - Error("SemSetCreate: failed to create arena", 0L); - - /* Magic factors of EIGHT here to ensure that values are - in different cache lines to avoid aliasing -- good on SGI and Convex */ - - if (!(val = (int *) usmalloc(EIGHT*MAX_SEMA*sizeof(int), arena_ptr))) - Error("SemSetCreate: failed to get shmem", (long) (MAX_SEMA*sizeof(int))); - - for (i=0; i -#include -#include -#include -#include - -#define SIXTEEN 16 -#define JUMP SIXTEEN -typedef struct{ - int state; - int pad[15]; -} lock_t; - -static lock_t *locks; - -# define LOCK(x) set_lock(&x.state) -# define UNLOCK(x) unset_lock(&x.state) -# define INILOCK(x) init_lock(&x.state) - - -void init_lock(int * volatile ip) -{ - *ip = 1; -} - -void set_lock(int * volatile ip) -{ - while (1) { - while (!(*ip)); - if (__ldcws32(ip)) - break; - } -} - -void unset_lock(int *ip) -{ - *ip = 1; - asm("sync"); -} - -static int fd = -1; -static char template[] = "/tmp/SEMA.XXXXXX"; -static char *filename = (char *) NULL; -static unsigned shmem_size; - -long SemSetCreate(long n_sem, long value) -{ - int i; - shmem_size = SIXTEEN*MAX_SEMA*sizeof(int)+MAX_SEMA*sizeof(lock_t); - - if ( (n_sem <= 0) || (n_sem >= MAX_SEMA) ) - Error("SemSetCreate: n_sem has invalid value",n_sem); - - /* allocate shared memory for locks and semaphore val */ - filename = mktemp(template); - if ( (fd = open(filename, O_RDWR|O_CREAT, 0666)) < 0 ) - Error("SemSetCreate: failed to open temporary file",0); - val = (int *) mmap((caddr_t) 0, shmem_size, - PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|CNX_MAP_SEMAPHORE|MAP_SHARED, fd, 0); - locks = (lock_t*)( val + SIXTEEN*MAX_SEMA); - - /* initialize locks and semaphore values */ - for (i=0; i= MAX_SEMA) ) - Error("SemWait: invalid sem_num",sem_num); - - while (value<=0) { - LOCK(locks[sem_num]); - value = val[off]; - if (value>0) - val[off]--; - UNLOCK(locks[sem_num]); - if (value<=0) - Dummy(); - } -} - -void SemPost(long sem_set_id, long sem_num) -{ - int off = sem_num*JUMP; - if ( (sem_num < 0) || (sem_num >= MAX_SEMA) ) - Error("SemPost: invalid sem_num",sem_num); - - LOCK(locks[sem_num]); - val[off]++; - UNLOCK(locks[sem_num]); -} - -long SemValue(long sem_set_id, long sem_num) -{ - Error("SemValue: not implemented", sem_num); - return 1; -} - -long SemSetDestroy(long sem_set_id) -{ - return(SemSetDestroyAll()); -} diff --git a/armci/tcgmsg/ipcv4.0/setdbg.c b/armci/tcgmsg/ipcv4.0/setdbg.c deleted file mode 100644 index e840c22ad..000000000 --- a/armci/tcgmsg/ipcv4.0/setdbg.c +++ /dev/null @@ -1,18 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/setdbg.c,v 1.4 1995-02-24 02:17:42 d3h325 Exp $ */ - -#include "sndrcv.h" -#include "sndrcvP.h" - -void SETDBG_(value) - long *value; -/* - set global debug flag for this process to value -*/ -{ - SR_debug = *value; -} - diff --git a/armci/tcgmsg/ipcv4.0/shmem.c b/armci/tcgmsg/ipcv4.0/shmem.c deleted file mode 100644 index 9db4ce475..000000000 --- a/armci/tcgmsg/ipcv4.0/shmem.c +++ /dev/null @@ -1,368 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/shmem.c,v 1.13 2000-10-13 20:55:40 d3h325 Exp $ */ - -/* - This stuff attempts to provide a simple interface to temporary shared - memory regions, loosely modelled after that of Alliant Concentrix 5.0 - - - Note that the input arguments switch between integers and pointers - to integers depending on if they are modified on return. - - - Create a shared region of at least size bytes, returning the actual size, - the id associated with the region. The return value is a pointer to the - the region. Any error is a hard fail. - - (char *) CreateSharedRegion((long *) id, (long *) size) - - - Detach a process from a shared memory region. 0 is returned on success, - -1 for failure. id, size, and addr must match exactly those items returned - from CreateSharedRegion - - long DetachSharedRegion((long) id, (long) size, (char *) addr) - - - Delete a shared region from the system. This has to be done on the SUN - to remove it from the system. On the Alliant the shared region disappears - when the last process dies or detaches. Returns 0 on success, -1 on error. - - long DeleteSharedRegion((long) id) - - - Delete all the shared regions associated with this process. - - long DeleteSharedAll() - - - Attach to a shared memory region of known id and size. Returns the - address of the mapped memory. Size must exactly match the size returned - from CreateSharedRegion (which in turn is the requested size rounded - up to a multiple of 4096). Any error is a hard fail. - - (char *) AttachSharedRegion((long) id, (long) size)) - -*/ - -extern void Error(); - -#ifdef ALLIANT - -#include -#include - -extern char *valloc(); - -char *CreateSharedRegion(id, size) - long *size, *id; -{ - struct timeval tp; - struct timezone tzp; - char *temp; - int status; - - /* Have to round up to a multiple of page size before allocating - on a page boundary */ - - *size = ( (*size + 4095) / 4096 ) * 4096; - - if ( (temp = valloc((unsigned) *size)) == (char *) NULL) - Error("CreateSharedRegion: failed in valloc", (long) 0); - - /* Now have to get a unique id ... try using time of day in centi-sec */ - - if ( (status = gettimeofday(&tp, &tzp)) != 0) - Error("CreateSharedRegion: error from gettimeofday", (long) status); - - *id = (tp.tv_sec + 10000*tp.tv_usec) & 0xffffff; - - /* Now make the region */ - - if ( (status = create_shared_region(*id, temp, *size, 0)) != 0) - Error("CreateSharedRegion: error from create_shared_region", (long) status); - - return temp; -} - -long DetachSharedRegion( id, size, addr) - long id, size; - char *addr; -{ - return detach_shared_region( id, addr, size); -} - -long DeleteSharedRegion(id) - long id; -{ - return delete_shared_region(id); -} - -char *AttachSharedRegion(id, size) - long id, size; -{ - char *temp; - int status; - - if (size != (((size + 4095) / 4096) * 4096)) - Error("AttachSharedRegion: input size is not multiple of 4096", - (long) size); - - if ( (temp = valloc((unsigned) size)) == (char *) NULL) - Error("AttachSharedRegion: failed in valloc", (long) 0); - - /* Now try to attach */ - - if ( (status = attach_shared_region(id, temp, size)) != 0) - Error("AttachSharedRegion: error from attach_shared_region", - (long) status); - - return temp; -} - -#endif -#if defined(SEQUENT) || defined(ENCORE) /* @**!ing SEQUENT and CRAY no elif */ - -#include - -#ifdef SEQUENT -#define SHMALLOC shmalloc -#define SHFREE shfree -#endif -#ifdef ENCORE -#define SHMALLOC share_malloc -#define SHFREE share_free -#endif - -extern char *SHMALLOC(); -extern int SHFREE(); - -#define MAX_ADDR 20 -static int next_id = 0; /* Keep track of id */ -static char *shaddr[MAX_ADDR]; /* Keep track of addresses */ - -char *CreateSharedRegion(id, size) - long *size, *id; -{ - char *temp; - - if (next_id >= MAX_ADDR) - Error("CreateSharedRegion: too many shared regions", (long) next_id); - - if ( (temp = SHMALLOC((unsigned) *size)) == (char *) NULL) - Error("CreateSharedRegion: failed in SHMALLOC", (long) *size); - - *id = next_id++; - shaddr[*id] = temp; - - return temp; -} - -/*ARGSUSED*/ -long DetachSharedRegion( id, size, addr) - long id, size; - char *addr; -{ - /* This needs improving to make more robust */ - return SHFREE(addr); -} - -long DeleteSharedRegion(id) - long id; -{ - /* This needs improving to make more robust */ - return SHFREE(shaddr[id]); -} - -/*ARGSUSED*/ -char *AttachSharedRegion(id, size) - long id, size; -{ - Error("AttachSharedRegion: cannot do this on SEQUENT or BALANCE", (long) -1); -} - - -#endif - /* Bizarre sequent has sysv semaphores but proprietary shmem */ - /* Encore has sysv shmem but is limited to total of 16384bytes! */ -#if defined(SYSV) && !defined(SEQUENT) && !defined(ENCORE) - -#include -#include -#include -#include - -char *CreateSharedRegion(id, size) - long *size, *id; -{ - char *temp; - - /* Create the region */ - - if ( (*id = shmget(IPC_PRIVATE, (int) *size, - (int) (IPC_CREAT | 00600))) < 0 ) - Error("CreateSharedRegion: failed to create shared region", (long) *id); - - /* Attach to the region */ - - if ( (long) (temp = shmat((int) *id, (char *) NULL, 0)) == -1L) - Error("CreateSharedRegion: failed to attach to shared region", (long) 0); - - return temp; -} - -/*ARGSUSED*/ -long DetachSharedRegion( id, size, addr) - long id, size; - char *addr; -{ - return shmdt(addr); -} - -long DeleteSharedRegion(id) - long id; -{ - return shmctl((int) id, IPC_RMID, (struct shmid_ds *) NULL); -} - -/*ARGSUSED*/ -char *AttachSharedRegion(id, size) - long id, size; -{ - char *temp; - - if ( (long) (temp = shmat((int) id, (char *) NULL, 0)) == -1L) - Error("AttachSharedRegion: failed to attach to shared region", (long) 0); - - return temp; -} - -#endif -#if (defined(CONVEX) || defined(APOLLO)) && !defined(HPUX) - -#include -#include -#include -#include -#include - -extern char *strdup(); -extern char *mktemp(); - -#define MAX_ID 20 -static struct id_list_struct { - char *addr; /* pointer to shmem region */ - unsigned size; /* size of region */ - char *filename; /* associated file name */ - int fd; /* file descriptor */ - int status; /* = 1 if in use */ -} id_list[MAX_ID]; - -static int next_id = 0; -static char template[] = "/tmp/SHMEM.XXXXXX"; - -char *CreateSharedRegion(id, size) - long *size, *id; -{ - char *temp; - - if (next_id == MAX_ID) - Error("CreateSharedRegion: MAX_ID exceeded ", MAX_ID); - *id = next_id; - -#ifdef APOLLO - id_list[*id].fd = -1; -#else - if ( (temp = strdup(template)) == (char *) NULL) - Error("CreateSharedRegion: failed to get space for filename", 0); - -/* Generate scratch file to identify region ... need to know this - name to attach to the region so need to establish some policy - before AttachtoSharedRegion can work */ - - id_list[*id].filename = mktemp(temp); - if ( (id_list[*id].fd = open(id_list[*id].filename, - O_RDWR|O_CREAT, 0666)) < 0) - Error("CreateSharedRegion: failed to open temporary file",0); -#endif - - id_list[*id].addr = mmap((caddr_t) 0, (unsigned *) size, - PROT_READ|PROT_WRITE, - MAP_ANON|MAP_SHARED, id_list[*id].fd, 0); -#ifdef APOLLO - if (id_list[*id].addr == (char *) 0) - Error("CreateSharedRegion: mmap failed",-1); -#else - if (id_list[*id].addr == (char *) -1) - Error("CreateSharedRegion: mmap failed",-1); -#endif - - id_list[*id].size = *size; - id_list[*id].status = 1; - - next_id++; - return id_list[*id].addr; -} - -/*ARGSUSED*/ -long DetachSharedRegion( id, size, addr) - long id, size; - char *addr; -{ - if ( (id < 0) || (id > next_id)) - return (long) -1; - - if (id_list[id].status != 1) - return (long) -1; - - id_list[id].status = 0; - - return (long) munmap(id_list[id].addr, 0); -} - -long DeleteSharedRegion(id) - long id; -{ - if ( (id < 0) || (id > next_id) ) - return (long) -1; - - if (id_list[id].status != 1) - return (long) -1; - - (void) DetachSharedRegion(id, 0, (char *) 0); - - if (id_list[id].fd >= 0) { - (void) close(id_list[id].fd); - (void) unlink(id_list[id].filename); - } - - return (long) 0; -} - -/*ARGSUSED*/ -char *AttachSharedRegion(id, size) - long id, size; -{ - Error("AttachSharedRegion: need mods for this to work on CONVEX", - (long) -1); -} - -long DeleteSharedAll() -{ - long id; - long status = 0; - - for (id=0; id -#endif -#if HAVE_SYS_TIME_H -# include -#endif -#if HAVE_MALLOC_H -# include -#endif - -char *CreateSharedRegion(long *id, long *size) -{ - struct timeval tp; - struct timezone tzp; - char *temp; - int status; - - /* Have to round up to a multiple of page size before allocating - on a page boundary */ - - *size = ( (*size + 4095) / 4096 ) * 4096; - - if ( (temp = valloc((unsigned) *size)) == (char *) NULL) { - Error("CreateSharedRegion: failed in valloc", (long) 0); - } - - /* Now have to get a unique id ... try using time of day in centi-sec */ - - if ( (status = gettimeofday(&tp, &tzp)) != 0) { - Error("CreateSharedRegion: error from gettimeofday", (long) status); - } - - *id = (tp.tv_sec + 10000*tp.tv_usec) & 0xffffff; - - /* Now make the region */ - - if ( (status = create_shared_region(*id, temp, *size, 0)) != 0) { - Error("CreateSharedRegion: error from create_shared_region", - (long) status); - } - - return temp; -} - - -long DetachSharedRegion(long id, long size, char *addr) -{ - return detach_shared_region( id, addr, size); -} - - -long DeleteSharedRegion(long id) -{ - return delete_shared_region(id); -} - - -char *AttachSharedRegion(long id, long size) -{ - char *temp; - int status; - - if (size != (((size + 4095) / 4096) * 4096)) { - Error("AttachSharedRegion: input size is not multiple of 4096", - (long) size); - } - - if ( (temp = valloc((unsigned) size)) == (char *) NULL) { - Error("AttachSharedRegion: failed in valloc", (long) 0); - } - - /* Now try to attach */ - - if ( (status = attach_shared_region(id, temp, size)) != 0) { - Error("AttachSharedRegion: error from attach_shared_region", - (long) status); - } - - return temp; -} diff --git a/armci/tcgmsg/ipcv4.0/shmem_convex.c b/armci/tcgmsg/ipcv4.0/shmem_convex.c deleted file mode 100644 index fe5af4cca..000000000 --- a/armci/tcgmsg/ipcv4.0/shmem_convex.c +++ /dev/null @@ -1,130 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include -#include -#include -#include -#include - -extern char *strdup(); -extern char *mktemp(); - -#define MAX_ID 20 -static struct id_list_struct { - char *addr; /* pointer to shmem region */ - unsigned size; /* size of region */ - char *filename; /* associated file name */ - int fd; /* file descriptor */ - int status; /* = 1 if in use */ -} id_list[MAX_ID]; - -static int next_id = 0; -static char template[] = "/tmp/SHMEM.XXXXXX"; - -char *CreateSharedRegion(long *id, long *size) -{ - char *temp; - - if (next_id == MAX_ID) { - Error("CreateSharedRegion: MAX_ID exceeded ", MAX_ID); - } - *id = next_id; - -#ifdef APOLLO - id_list[*id].fd = -1; -#else - if ( (temp = strdup(template)) == (char *) NULL) { - Error("CreateSharedRegion: failed to get space for filename", 0); - } - - /* Generate scratch file to identify region ... need to know this - name to attach to the region so need to establish some policy - before AttachtoSharedRegion can work */ - - id_list[*id].filename = mktemp(temp); - if ( (id_list[*id].fd = open(id_list[*id].filename, - O_RDWR|O_CREAT, 0666)) < 0) { - Error("CreateSharedRegion: failed to open temporary file",0); - } -#endif - - id_list[*id].addr = mmap((caddr_t) 0, (unsigned *) size, - PROT_READ|PROT_WRITE, - MAP_ANON|MAP_SHARED, id_list[*id].fd, 0); -#ifdef APOLLO - if (id_list[*id].addr == (char *) 0) { - Error("CreateSharedRegion: mmap failed",-1); - } -#else - if (id_list[*id].addr == (char *) -1) { - Error("CreateSharedRegion: mmap failed",-1); - } -#endif - - id_list[*id].size = *size; - id_list[*id].status = 1; - - next_id++; - return id_list[*id].addr; -} - -long DetachSharedRegion(long id, long size, char *addr) -{ - if ( (id < 0) || (id > next_id)) { - return (long) -1; - } - - if (id_list[id].status != 1) { - return (long) -1; - } - - id_list[id].status = 0; - - return (long) munmap(id_list[id].addr, 0); -} - -long DeleteSharedRegion(long id) -{ - if ( (id < 0) || (id > next_id) ) { - return (long) -1; - } - - if (id_list[id].status != 1) { - return (long) -1; - } - - (void) DetachSharedRegion(id, 0, (char *) 0); - - if (id_list[id].fd >= 0) { - (void) close(id_list[id].fd); - (void) unlink(id_list[id].filename); - } - - return (long) 0; -} - -char *AttachSharedRegion(long id, long size) -{ - Error("AttachSharedRegion: need mods for this to work on CONVEX", - (long) -1); -} - -long DeleteSharedAll() -{ - long id; - long status = 0; - - for (id=0; id -#endif -#if HAVE_SHMALLOC -# define SHMALLOC shmalloc -# define SHFREE shfree -#endif -#if HAVE_SHARE_MALLOC -# define SHMALLOC share_malloc -# define SHFREE share_free -#endif - -extern char *SHMALLOC(); -extern int SHFREE(); - -#define MAX_ADDR 20 -static int next_id = 0; /* Keep track of id */ -static char *shaddr[MAX_ADDR]; /* Keep track of addresses */ - - -char *CreateSharedRegion(long *id, long *size) -{ - char *temp; - - if (next_id >= MAX_ADDR) { - Error("CreateSharedRegion: too many shared regions", (long) next_id); - } - - if ( (temp = SHMALLOC((unsigned) *size)) == (char *) NULL) { - Error("CreateSharedRegion: failed in SHMALLOC", (long) *size); - } - - *id = next_id++; - shaddr[*id] = temp; - - return temp; -} - - -long DetachSharedRegion(long id, long size, char *addr) -{ - /* This needs improving to make more robust */ - return SHFREE(addr); -} - - -long DeleteSharedRegion(long id) -{ - /* This needs improving to make more robust */ - return SHFREE(shaddr[id]); -} - - -char *AttachSharedRegion(long id, long size) -{ - Error("AttachSharedRegion: cannot do this on SEQUENT or BALANCE", - (long) -1); -} diff --git a/armci/tcgmsg/ipcv4.0/signals.c b/armci/tcgmsg/ipcv4.0/signals.c deleted file mode 100644 index aee43aac1..000000000 --- a/armci/tcgmsg/ipcv4.0/signals.c +++ /dev/null @@ -1,150 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/signals.c,v 1.11 2004-04-01 02:04:57 manoj Exp $ */ - -#include -#include "sndrcvP.h" -#if defined(SUN) || defined(ALLIANT) || defined(ENCORE) || defined(SEQUENT) || \ - defined(AIX) || defined(NEXT) -#include -#else -#include -#include -#endif - -extern void Error(); -int SR_caught_sigint = 0; - -#if (defined(ENCORE) || defined(SEQUENT) || defined(ARDENT)) -# define SigType int -#else -# define SigType void -#endif - -#ifndef SIG_ERR -# define SIG_ERR (SigType (*)())-1 -#endif - - -#if (defined(ENCORE) || defined(SEQUENT) || defined(ARDENT)) || (defined(SUN) && !defined(SOLARIS)) -SigType SigintHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigintHandler(sig) -#endif - int sig; -{ - SR_caught_sigint = 1; - Error("SigintHandler: signal was caught",(long) sig); -} - - -void TrapSigint() -/* - Trap the signal SIGINT so that we can propagate error - conditions and also tidy up shared system resources in a - manner not possible just by killing everyone -*/ -{ - if ( signal(SIGINT, SigintHandler) == SIG_ERR) - Error("TrapSigint: error from signal setting SIGINT",(long) SIGINT); -} - - -void ZapChildren() -/* - kill -SIGINT all of my beloved children -*/ -{ - while (SR_numchild--) - (void) kill((int) SR_pids[SR_numchild], SIGINT); -} - -#if (defined(ENCORE) || defined(SEQUENT) || defined(ARDENT)) || (defined(SUN) && !defined(SOLARIS)) -SigType SigchldHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -void SigchldHandler(sig) -#endif - int sig; -{ - int status; - -#if defined(ALLIANT) || defined(ENCORE) || defined(SEQUENT) || defined(NEXT) - union wait ustatus; -#endif - -#if defined(ALLIANT) || defined(ENCORE) || defined(SEQUENT) || defined(NEXT) - (void) wait(&ustatus); - status = ustatus.w_status; -#else - (void) wait(&status); -#endif - SR_caught_sigint = 1; - Error("Child process terminated prematurely, status=",(long) status); -} - -void TrapSigchld() -/* - Trap SIGCHLD so that can tell if children die unexpectedly. -*/ -{ - if ( signal(SIGCHLD, SigchldHandler) == SIG_ERR) - Error("TrapSigchld: error from signal setting SIGCHLD", (long) SIGCHLD); -} - -#if (defined(ENCORE) || defined(SEQUENT) || defined(ARDENT)) || (defined(SUN) && !defined(SOLARIS)) -SigType SigsegvHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigsegvHandler(sig) -#endif - int sig; -{ - SR_caught_sigint = 1; - Error("SigsegvHandler: signal was caught",(long) sig); -} - - -void TrapSigsegv() -/* - parallel needs to trap the signal SIGSEGV under Solaris - that is generated when interrupted in NxtVal -*/ -{ - if ( signal(SIGSEGV, SigsegvHandler) == SIG_ERR) - Error("TrapSigsegv: error from signal setting SIGSEGV", (long) SIGSEGV); -} - -#if (defined(ENCORE) || defined(SEQUENT) || defined(ARDENT)) || (defined(SUN) && !defined(SOLARIS)) -SigType SigtermHandler(sig, code, scp, addr) - int code; - struct sigcontext *scp; - char *addr; -#else -SigType SigtermHandler(sig) -#endif - int sig; -{ - SR_caught_sigint = 1; - Error("SigtermHandler: signal was caught",(long) sig); -} - -void TrapSigterm() -/* - parallel needs to trap the SIGTERM for batch jobs -*/ -{ - if ( signal(SIGTERM, SigtermHandler) == SIG_ERR) - Error("TrapSigterm: error from signal setting SIGTERM", (long) SIGTERM); -} - - diff --git a/armci/tcgmsg/ipcv4.0/signals.h b/armci/tcgmsg/ipcv4.0/signals.h deleted file mode 100644 index 9aeb7d71e..000000000 --- a/armci/tcgmsg/ipcv4.0/signals.h +++ /dev/null @@ -1,6 +0,0 @@ -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/signals.h,v 1.5 2004-04-01 02:04:57 manoj Exp $ */ - -extern void ZapChildren(); -extern void TrapSigint(); -extern void TrapSigchld(); -extern void TrapSigterm(); diff --git a/armci/tcgmsg/ipcv4.0/snd.c b/armci/tcgmsg/ipcv4.0/snd.c deleted file mode 100644 index bb26b1a78..000000000 --- a/armci/tcgmsg/ipcv4.0/snd.c +++ /dev/null @@ -1,1052 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/snd.c,v 1.21 2004-04-01 02:04:57 manoj Exp $ */ - -#include -#include -#ifdef SEQUENT -#include -#else -#include -#endif - -#ifdef AIX -#include -#endif - -#include -#include - -#if defined(SHMEM) || defined(SYSV) -# if (defined(SGI_N32) || defined(SGITFP)) -# define PARTIALSPIN -# else -# define NOSPIN -# endif -#endif - -#if (defined(SUN) && !defined(SOLARIS)) - extern char *sprintf(); -#endif - -extern void Error(); - -#include "sndrcv.h" -#include "sndrcvP.h" -#include "tcgsockets.h" - -#ifdef GOTXDR -#include "xdrstuff.h" -#endif - -#if defined(SHMEM) || defined(SYSV) -#if !defined(SEQUENT) && !defined(CONVEX) -#include -#endif -#include "sema.h" -#include "tcgshmem.h" -#if defined(USE_SRMOVER) -extern void SRmover(); -#else -#define SRmover(a,b,n) memcpy(a,b,n) -#endif -#endif - -#ifdef EVENTLOG -#include "evlog.h" -#endif - -extern void ListenOnSock(int sock); -extern int AcceptConnection(int sock); - -void PrintProcInfo() -/* - Print out the SR_proc_info structure array for this process -*/ -{ - long i; - - (void) fprintf(stderr,"Process info for node %ld: \n",NODEID_()); - - for (i=0; itype, header->nodefrom, - header->nodeto, header->length, header->tag); - (void) fflush(stdout); -} - - -#if defined(SHMEM) || defined(SYSV) - -static int DummyRoutine() -{int i, sum=0; for(i=0; i<10; i++) sum += i; return sum;} - -static long flag(p) - long *p; -{ -#if defined(CONVEX) && defined(HPUX) - asm("sync"); -#endif - - return *p; -} - -static void Await(p, value) - long *p; - long value; -/* - Wait until the value pointed to by p equals value. - Since *ptr is volatile but cannot usually declare this - include another level of procedure call to protect - against compiler optimization. -*/ -{ - int nspin = 0; - if (DEBUG_) { - printf("%2ld: Await p=%p, value=%ld\n", NODEID_(), p, value); - fflush(stdout); - } - - for (; flag(p) != value; nspin++) { -#if defined(NOSPIN) && !defined(PARTIALSPIN) - if (nspin < 100) - (void) DummyRoutine(); - else - USleep((long) 10000); -#else - if (nspin < 10000000) - (void) DummyRoutine(); - else { -/* printf("%2ld: Await sleeping\n", NODEID_()); fflush(stdout); */ - USleep((long) 100000); - } -#endif - } -} - -static void rcv_local(type, buf, lenbuf, lenmes, nodeselect, nodefrom) - long *type; - char *buf; - long *lenbuf; - long *lenmes; - long *nodeselect; - long *nodefrom; -{ - long me = NODEID_(); - long node = *nodeselect; - MessageHeader *head = SR_proc_info[node].header; - long buflen = SR_proc_info[node].buflen; - char *buffer = SR_proc_info[node].buffer; - long nodeto, len; -#ifdef NOSPIN - long semid = SR_proc_info[node].semid; - long sem_read = SR_proc_info[node].sem_read; - long sem_written = SR_proc_info[node].sem_written; - long semid_to = SR_proc_info[me].semid; - long sem_pend = SR_proc_info[me].sem_pend; -#endif -#if !defined(NOSPIN) || defined(PARTIALSPIN) - long *buffer_full = SR_proc_info[node].buffer_full; -#endif - - /* Error checking */ - - if ( (buffer == (char *) NULL) || (head == (MessageHeader *) NULL) ) - Error("rcv_local: invalid shared memory", (long) node); - -#ifdef NOSPIN - if ( (semid < 0) || (sem_read < 0) || (sem_written < 0) || - (semid_to < 0) || (sem_pend < 0) ) - Error("rcv_local: invalid semaphore set", (long) node); -#endif - -#ifdef NOSPIN - SemWait(semid_to, sem_pend); -#endif - - Await(&head->nodeto, me); /* Still have this possible spin */ - -#if defined(NOSPIN) && !defined(PARTIALSPIN) - SemWait(semid, sem_written); -#else - Await(buffer_full, (long) TRUE); -#endif - - /* Now have a message for me ... check the header info and - copy the first block of the message. */ - - if (DEBUG_) - PrintMessageHeader("rcv_local ",head); - - nodeto = head->nodeto; /* Always me ... history here */ - head->nodeto = -1; - - *nodefrom = head->nodefrom; - - if (head->type != *type) { - PrintMessageHeader("rcv_local ",head); -/* printf("rcv_local: type mismatch ... strong typing enforced\n"); */ -/* abort(); */ - Error("rcv_local: type mismatch ... strong typing enforced", (long) *type); - } - - *lenmes = len = head->length; - - if ( *lenmes > *lenbuf ) - Error("rcv_local: message too long for buffer", (long) *lenmes); - if (nodeto != me) - Error("rcv_local: message meant for someone else?", (long) nodeto); - - if (len) - (void) SRmover(buf, buffer, (len > buflen) ? buflen : len); - -#if defined(NOSPIN) && !defined(PARTIALSPIN) - SemPost(semid, sem_read); -#else - *buffer_full = FALSE; -# if defined(CONVEX) && defined(HPUX) - asm("sync"); -# endif - -#endif - - len -= buflen; - buf += buflen; - - /* Copy the remainder of the message */ - - while (len > 0) { -#if defined(NOSPIN) && !defined(PARTIALSPIN) - SemWait(semid, sem_written); -#else - Await(buffer_full, (long) TRUE); -#endif - (void) SRmover(buf, buffer, (len > buflen) ? buflen : len); -#if defined(NOSPIN) && !defined(PARTIALSPIN) - SemPost(semid, sem_read); -#else - *buffer_full = FALSE; -#endif - len -= buflen; - buf += buflen; - } -} - -static void snd_local(type, buf, lenbuf, node) - long *type; - char *buf; - long *lenbuf; - long *node; -{ - long me = NODEID_(); - MessageHeader *head = SR_proc_info[me].header; - long buflen = SR_proc_info[me].buflen; - long len = *lenbuf; - char *buffer = SR_proc_info[me].buffer; - long tag = SR_proc_info[*node].n_snd; -#ifdef NOSPIN - long semid = SR_proc_info[me].semid; - long sem_read = SR_proc_info[me].sem_read; - long sem_written = SR_proc_info[me].sem_written; - long semid_to = SR_proc_info[*node].semid; - long sem_pend = SR_proc_info[*node].sem_pend; -#endif -#if !defined(NOSPIN) || defined(PARTIALSPIN) - long *buffer_full = SR_proc_info[me].buffer_full; -#endif - - /* Error checking */ - - if ( (buffer == (char *) NULL) || (head == (MessageHeader *) NULL) ) - Error("snd_local: invalid shared memory", (long) *node); - -#ifdef NOSPIN - if ( (semid < 0) || (semid_to < 0) || (sem_read < 0) || (sem_written < 0) ) - Error("snd_local: invalid semaphore set", (long) *node); -#endif - - /* Check that final segment of last message has been consumed */ - -#if defined(NOSPIN) && !defined(PARTIALSPIN) - SemWait(semid, sem_read); -#else - Await(buffer_full, (long) FALSE); -#endif - - /* Fill in message header */ - - head->nodefrom = (char) me; - head->type = *type; - head->length = *lenbuf; - head->tag = tag; - head->nodeto = (char) *node; -#if defined(CONVEX) && defined(HPUX) - asm("sync"); -#endif - - if (DEBUG_) { - PrintMessageHeader("snd_local ",head); - (void) fflush(stdout); - } - - /* Copy the first piece of the message so that send along with - header to minimize use of semaphores. Also need to send header - even for messages of zero length */ - - if (len) - (void) SRmover(buffer, buf, (len > buflen) ? buflen : len); - -#if defined(NOSPIN) && !defined(PARTIALSPIN) - SemPost(semid, sem_written); -#else - *buffer_full = TRUE; -# if defined(CONVEX) && defined(HPUX) - asm("sync"); -# endif -#endif -#ifdef NOSPIN - SemPost(semid_to, sem_pend); -#endif - - len -= buflen; - buf += buflen; - - while (len > 0) { -#if defined(NOSPIN) && !defined(PARTIALSPIN) - SemWait(semid, sem_read); -#else - Await(buffer_full, (long) FALSE); -#endif - (void) SRmover(buffer, buf, (len > buflen) ? buflen : len); -#if defined(NOSPIN) && !defined(PARTIALSPIN) - SemPost(semid, sem_written); -#else - *buffer_full = TRUE; -# if defined(CONVEX) && defined(HPUX) - asm("sync"); -# endif -#endif - len -= buflen; - buf += buflen; - } -} -#endif - -static void snd_remote(type, buf, lenbuf, node) - long *type; - char *buf; - long *lenbuf; - long *node; -/* - synchronous send to remote process - - long *type = user defined integer message type (input) - char *buf = data buffer (input) - long *lenbuf = length of buffer in bytes (input) - long *node = node to send to (input) - - for zero length messages only the header is sent -*/ -{ -#define SHORT_MSG_BUF_SIZE (2048 + 40) - static char fudge[SHORT_MSG_BUF_SIZE]; - MessageHeader header; - long me=NODEID_(); - int sock=SR_proc_info[*node].sock; - long len; -#ifdef SOCK_FULL_SYNC - char sync=0; -#endif - - if ( sock < 0 ) - Error("snd_remote: sending to process without socket", (long) *node); - - header.nodefrom = me; - header.nodeto = *node; - header.type = *type; - header.length = *lenbuf; - header.tag = SR_proc_info[*node].n_snd; - - /* header.length is the no. of items if XDR is used or just the - number of bytes */ - -#ifdef GOTXDR - if ( *type & MSGDBL ) - header.length = *lenbuf / sizeof(double); - else if ( *type & MSGINT ) - header.length = *lenbuf / sizeof(long); - else if ( *type & MSGCHR ) - header.length = *lenbuf / sizeof(char); - else - header.length = *lenbuf; -#else - header.length = *lenbuf; -#endif - - if (DEBUG_) - PrintMessageHeader("snd_remote",&header); - -#ifndef GOTXDR - /* Combine header and messages less than a certain size to avoid - * performance problem on (older?) linuxes */ - if ((*lenbuf + sizeof(header)) <= sizeof(fudge)) { - memcpy(fudge,(char *) &header, sizeof(header)); - memcpy(fudge+sizeof(header), buf, *lenbuf); - if ( (len = WriteToSocket(sock, fudge, sizeof(header)+*lenbuf)) != - ((long)sizeof(header)+*lenbuf)) - Error("snd_remote: writing message to socket", - (long) (len+100000*(sock + 1000* *node))); - return; - } -#endif - -#ifdef GOTXDR - (void) WriteXdrLong(sock, (long *) &header, - (long) (sizeof(header)/sizeof(long))); -#else - if ( (len = WriteToSocket(sock, (char *) &header, (long) sizeof(header))) - != sizeof(header) ) - Error("snd_remote: writing header to socket", len); -#endif - - if (*lenbuf) { -#ifdef GOTXDR - if ( *type & MSGDBL ) - (void) WriteXdrDouble(sock, (double *) buf, header.length); - else if ( *type & MSGINT ) - (void) WriteXdrLong(sock, (long *) buf, header.length); - else if ( *type & MSGCHR ) - (void) WriteXdrChar(sock, (char *) buf, header.length); - else if ( (len = WriteToSocket(sock, buf, header.length)) != - header.length) - Error("snd_remote: writing message to socket", - (long) (len+100000*(sock + 1000* *node))); -#else - if ( (len = WriteToSocket(sock, buf, header.length)) != - header.length) - Error("snd_remote: writing message to socket", - (long) (len+100000*(sock + 1000* *node))); -#endif - } - -#ifdef SOCK_FULL_SYNC - /* this read (and write in rcv_remote) of an acknowledgment - forces synchronous */ - - if ( ReadFromSocket(sock, &sync, (long) 1) != 1) - Error("snd_remote: reading acknowledgement", - (long) (len+100000*(sock + 1000* *node))); -#endif -} - -/*ARGSUSED*/ -void SND_(type, buf, lenbuf, node, sync) - long *type; - void *buf; - long *lenbuf; - long *node; - long *sync; -/* - mostly syncrhonous send - - long *type = user defined integer message type (input) - void *buf = data buffer (input) - long *lenbuf = length of buffer in bytes (input) - long *node = node to send to (input) - long *sync = flag for sync/async ... IGNORED - - for zero length messages only the header is sent -*/ -{ - long me=NODEID_(); - long nproc=NNODES_(); -#ifdef TIMINGS - double start; -#endif - - /* Error checking */ - - if (*node == me) - Error("SND_: cannot send message to self", (long) me); - - if ( (*node < 0) || (*node > nproc) ) - Error("SND_: out of range node requested", (long) *node); - - if ( (*lenbuf < 0) || (*lenbuf > (long)BIG_MESSAGE) ) - Error("SND_: message length out of range", (long) *lenbuf); - -#ifdef EVENTLOG - evlog(EVKEY_BEGIN, EVENT_SND, - EVKEY_MSG_LEN, (int) *lenbuf, - EVKEY_MSG_FROM, (int) me, - EVKEY_MSG_TO, (int) *node, - EVKEY_MSG_TYPE, (int) *type, - EVKEY_MSG_SYNC, (int) *sync, - EVKEY_LAST_ARG); -#endif - - /* Send via shared memory or sockets */ - -#ifdef TIMINGS - start = TCGTIME_(); -#endif - -#if defined(SHMEM) || defined(SYSV) - if (SR_proc_info[*node].local){ -#ifdef KSR_NATIVE - KSR_snd_local(type, buf, lenbuf, node); -#else - snd_local(type, buf, lenbuf, node); -#endif - } else { -#endif - snd_remote(type, buf, lenbuf, node); -#if defined(SHMEM) || defined(SYSV) - } -#endif - - /* Collect statistics */ - - SR_proc_info[*node].n_snd += 1; - SR_proc_info[*node].nb_snd += *lenbuf; - -#ifdef TIMINGS - SR_proc_info[*node].t_snd += TCGTIME_() - start; -#endif - -#ifdef EVENTLOG - evlog(EVKEY_END, EVENT_SND, EVKEY_LAST_ARG); -#endif -} - -static long MatchMessage(header, me, type) - MessageHeader *header; - long me, type; -/* - Wrapper round check on if header is to me and of required - type so that compiler does not optimize out fetching - header info from shared memory. -*/ -{ - return (long) ((header->nodeto == me) && (header->type == type)); -} - -static long NextReadyNode(type) - long type; -/* - Select a node from which input is pending ... also match the - desired type. - - next_node is maintained as the last node that NextReadyNode chose - plus one modulo NNODES_(). This aids in ensuring fairness. - - First use select to get info about the sockets and then loop - through processes looking either at the bit in the fd_set for - the socket (remote process) or the message header in the shared - memory buffer (local process). - - This may be an expensive operation but fairness seems important. - - If only sockets are in use, just block in select until data is - available. -*/ -{ - static long next_node = 0; - - long nproc = NNODES_(); - long me = NODEID_(); - int i, nspin = 0; - - if (!SR_using_shmem) { - int list[MAX_PROCESS]; - int nready; - nready = WaitForSockets(SR_nsock,SR_socks,list); - if (nready == 0) - Error("NextReadyNode: nready = 0\n", 0); - - /* Insert here type checking logic ... not yet done */ - - return SR_socks_proc[list[0]]; - } - - /* With both local and remote processes end up with a busy wait - as no way to wait for both a semaphore and a socket. - Moderate this slightly by having short timeout in select */ - - while (1) { - - for(i=0; i= 0) { - /* Look for message over socket */ - - int sock = SR_proc_info[next_node].sock; - - /* Have we already peeked at this socket? */ - - if (SR_proc_info[next_node].peeked) { - if (SR_proc_info[next_node].head_peek.type == type) - break; - } - else if (PollSocket(sock)) { - /* Data is available ... let's peek at it */ -#ifdef GOTXDR - (void) ReadXdrLong(sock, - (long *) &SR_proc_info[next_node].head_peek, - (long) (sizeof(MessageHeader)/sizeof(long))); -#else - if (ReadFromSocket(sock, - (char *) &SR_proc_info[next_node].head_peek, - (long) sizeof(MessageHeader)) - != sizeof(MessageHeader) ) - Error("NextReadyNode: reading header from socket", next_node); -#endif - SR_proc_info[next_node].peeked = TRUE; - if (DEBUG_) - PrintMessageHeader("peeked_at ", - &SR_proc_info[next_node].head_peek); - - if (SR_proc_info[next_node].head_peek.type == type) - break; - } - } - } - if (i < nproc) /* If found a node skip out of the while loop */ - break; - - nspin++; /* Compromise between low latency and low cpu use */ - if (nspin < 10) - continue; - else if (nspin < 100) - USleep((long) 1000); - else if (nspin < 600) - USleep((long) 10000); - else - USleep((long) 100000); - } - - i = next_node; - next_node = (next_node + 1) % nproc; - - return (long) i; -} - -long PROBE_(type, node) - long *type, *node; - /* - Return 1/0 (TRUE/FALSE) if a message of the given type is available - from the given node. If the node is specified as -1, then all nodes - will be examined. Some attempt is made at ensuring fairness. - - First use select to get info about the sockets and then loop - through processes looking either at the bit in the fd_set for - the socket (remote process) or the message header in the shared - memory buffer (local process). - - This may be an expensive operation but fairness seems important. - */ -{ - long nproc = NNODES_(); - long me = NODEID_(); - int i, proclo, prochi; - - if (*node == me) - Error("PROBE_ : cannot recv message from self, msgtype=", *type); - - if (*node == -1) { /* match anyone */ - proclo = 0; - prochi = nproc-1; - } - else - proclo = prochi = *node; - - for(i=proclo; i<=prochi; i++) { - - if (i == me) { - ; /* can't receive from self */ - } - else if (SR_proc_info[i].local) { - /* Look for local message */ - -#ifdef KSR_NATIVE - if (KSR_MatchMessage(i, me, type)) -#else - if (MatchMessage(SR_proc_info[i].header, me, *type)) -#endif - break; - } - else if (SR_proc_info[i].sock >= 0) { - /* Look for message over socket */ - - int sock = SR_proc_info[i].sock; - - /* Have we already peeked at this socket? */ - - if (SR_proc_info[i].peeked) { - if (SR_proc_info[i].head_peek.type == *type) - break; - } - else if (PollSocket(sock)) { - /* Data is available ... let's peek at it */ -#ifdef GOTXDR - (void) ReadXdrLong(sock, - (long *) &SR_proc_info[i].head_peek, - (long) (sizeof(MessageHeader)/sizeof(long))); -#else - if (ReadFromSocket(sock, - (char *) &SR_proc_info[i].head_peek, - (long) sizeof(MessageHeader)) - != sizeof(MessageHeader) ) - Error("NextReadyNode: reading header from socket", (long) i); -#endif - SR_proc_info[i].peeked = TRUE; - if (DEBUG_) - PrintMessageHeader("peeked_at ", - &SR_proc_info[i].head_peek); - - if (SR_proc_info[i].head_peek.type == *type) - break; - } - } - } - - if (i <= prochi) - return 1; - else - return 0; -} - - -static void rcv_remote(type, buf, lenbuf, lenmes, nodeselect, nodefrom) - long *type; - char *buf; - long *lenbuf; - long *lenmes; - long *nodeselect; - long *nodefrom; -/* - synchronous receive of data - - long *type = user defined type of received message (input) - char *buf = data buffer (output) - long *lenbuf = length of buffer in bytes (input) - long *lenmes = length of received message in bytes (output) - (exceeding receive buffer is hard error) - long *nodeselect = node to receive from (input) - -1 implies that any pending message may be received - - long *nodefrom = node message is received from (output) -*/ -{ - long me = NODEID_(); - long node = *nodeselect; - int sock = SR_proc_info[node].sock; - long len; - MessageHeader header; -#ifdef SOCK_FULL_SYNC - char sync = 0; -#endif - - if ( sock < 0 ) - Error("rcv_remote: receiving from process without socket", (long) node); - - /* read the message header and check contents */ - - if (SR_proc_info[node].peeked) { - /* Have peeked at this socket ... get message header from buffer */ - - if (DEBUG_) - printf("%2ld: rcv_remote message has been peeked at\n", me); - - (void) memcpy((char *) &header, (char *) &SR_proc_info[node].head_peek, - sizeof(MessageHeader)); - SR_proc_info[node].peeked = FALSE; - } - else { -#ifdef GOTXDR - (void) ReadXdrLong(sock, (long *) &header, - (long) (sizeof(header)/sizeof(long))); -#else - if ( (len = ReadFromSocket(sock, (char *) &header, (long) sizeof(header))) - != sizeof(header) ) - Error("rcv_remote: reading header from socket", len); -#endif - } - - if (DEBUG_) - PrintMessageHeader("rcv_remote",&header); - - if (header.nodeto != me) { - PrintMessageHeader("rcv_remote",&header); - Error("rcv_remote: got message meant for someone else", - (long) header.nodeto); - } - - *nodefrom = header.nodefrom; - if (*nodefrom != node) - Error("rcv_remote: got message from someone on incorrect socket", - (long) *nodefrom); - - if (header.type != *type) { - PrintMessageHeader("rcv_remote",&header); - printf("rcv_remote: type mismatch ... strong typing enforced\n"); - abort(); - Error("rcv_remote: type mismatch ... strong typing enforced", (long) *type); - } - -#ifdef GOTXDR - if ( *type & MSGDBL ) - *lenmes = header.length * sizeof(double); - else if ( *type & MSGINT ) - *lenmes = header.length * sizeof(long); - else if ( *type & MSGCHR ) - *lenmes = header.length * sizeof(char); - else - *lenmes = header.length; -#else - *lenmes = header.length; -#endif - - if ( (*lenmes < 0) || (*lenmes > (long)BIG_MESSAGE) || (*lenmes > *lenbuf) ) { - PrintMessageHeader("rcv_remote",&header); - (void) fprintf(stderr, "rcv_remote err: lenbuf=%ld\n",*lenbuf); - Error("rcv_remote: message length out of range",(long) *lenmes); - } - - if (*lenmes > 0) { -#ifdef GOTXDR - if ( *type & MSGDBL ) - (void) ReadXdrDouble(sock, (double *) buf, header.length); - else if ( *type & MSGINT ) - (void) ReadXdrLong(sock, (long *) buf, header.length); - else if ( *type & MSGCHR ) - (void) ReadXdrChar(sock, (char *) buf, header.length); - else if ( (len = ReadFromSocket(sock, buf, *lenmes)) != *lenmes) - Error("rcv_remote: reading message from socket", - (long) (len+100000*(sock+ 1000* *nodefrom))); -#else - if ( (len = ReadFromSocket(sock, buf, *lenmes)) != *lenmes) - Error("rcv_remote: reading message from socket", - (long) (len+100000*(sock+ 1000* *nodefrom))); -#endif - } - - /* this write (and read in snd_remote) makes the link synchronous */ - -#ifdef SOCK_FULL_SYNC - if ( WriteToSocket(sock, &sync, (long) 1) != 1) - Error("rcv_remote: writing sync to socket", (long) node); -#endif - -} - -/*ARGSUSED*/ -void RCV_(type, buf, lenbuf, lenmes, nodeselect, nodefrom, sync) - long *type; - void *buf; - long *lenbuf; - long *lenmes; - long *nodeselect; - long *nodefrom; - long *sync; -/* - long *type = user defined type of received message (input) - void *buf = data buffer (output) - long *lenbuf = length of buffer in bytes (input) - long *lenmes = length of received message in bytes (output) - (exceeding receive buffer is hard error) - long *nodeselect = node to receive from (input) - -1 implies that any pending message may be received - - long *nodefrom = node message is received from (output) - long *sync = 0 for asynchronous, 1 for synchronous (NOT USED) -*/ -{ - long me = NODEID_(); - long nproc = NNODES_(); - long node; -#ifdef TIMINGS - double start; -#endif - -#ifdef EVENTLOG - evlog(EVKEY_BEGIN, EVENT_RCV, - EVKEY_MSG_FROM, (int) *nodeselect, - EVKEY_MSG_TO, (int) me, - EVKEY_MSG_TYPE, (int) *type, - EVKEY_MSG_SYNC, (int) *sync, - EVKEY_LAST_ARG); -#endif - - /* Assign the desired node or the next ready node */ - -#ifdef TIMINGS - start = TCGTIME_(); -#endif - - if (*nodeselect == -1) - node = NextReadyNode(*type); - else - node = *nodeselect; - - /* Check for some errors ... need more checking here ... - note that the overall master process has id nproc */ - - if (node == me) - Error("RCV_: cannot receive message from self", (long) me); - - if ( (node < 0) || (node > nproc) ) - Error("RCV_: out of range node requested", (long) node); - - /* Receive the message ... use shared memory, switch or socket */ - -#if defined(SHMEM) || defined(SYSV) - if (SR_proc_info[node].local){ -#ifdef KSR_NATIVE - KSR_rcv_local(type, buf, lenbuf, lenmes, &node, nodefrom); -#else - rcv_local(type, buf, lenbuf, lenmes, &node, nodefrom); -#endif - } else { -#endif - rcv_remote(type, buf, lenbuf, lenmes, &node, nodefrom); -#if defined(SHMEM) || defined(SYSV) - } -#endif - - /* Collect statistics */ - - SR_proc_info[node].n_rcv += 1; - SR_proc_info[node].nb_rcv += *lenmes; - -#ifdef TIMINGS - SR_proc_info[node].t_rcv += TCGTIME_() - start; -#endif - -#ifdef EVENTLOG - evlog(EVKEY_END, EVENT_RCV, - EVKEY_MSG_FROM, (int) node, - EVKEY_MSG_LEN, (int) *lenmes, - EVKEY_LAST_ARG); -#endif -} - -void RemoteConnect(a, b, c) - long a, b, c; -/* - Make a socket connection between processes a and b via the - process c to which both are already connected. -*/ -{ - long me = NODEID_(); - long nproc = NNODES_(); - long type = TYPE_CONNECT; /* Overriden below */ - char cport[8]; - long tmp, lenmes, nodefrom, clusid, lenbuf, sync=1; - int sock, port; - long lport; - - if ((a == b) || (a == c) || (b == c) ) - return; /* Gracefully ignore redundant connections */ - - if ( (me != a) && (me != b) && (me != c) ) - return; /* I'm not involved in this connection */ - - - if (a < b) { - tmp = a; a = b; b = tmp; - } - - type = (a + nproc*b) | MSGINT; /* Create a unique type */ - - if (DEBUG_) { - (void) printf("RC a=%ld, b=%ld, c=%ld, me=%ld\n",a,b,c,me); - (void) fflush(stdout); - } - - if (a == me) { - CreateSocketAndBind(&sock, &port); /* Create port */ - if (DEBUG_) { - (void) printf("RC node=%ld, sock=%d, port=%d\n",me, sock, port); - (void) fflush(stdout); - } - lport = port; - lenbuf = sizeof lport; - ListenOnSock(sock); - SND_(&type, (char *) &lport, &lenbuf, &c, &sync); /* Port to intermediate */ - SR_proc_info[b].sock = AcceptConnection(sock); /* Accept connection - and save socket info */ - } - else if (b == me) { - clusid = SR_proc_info[a].clusid; - lenbuf = sizeof lport; - RCV_(&type, (char *) &lport, &lenbuf, &lenmes, &c, &nodefrom, &sync); - port = lport; - (void) sprintf(cport,"%d",port); - lenbuf = strlen(cport) + 1; - if (lenbuf > (long)sizeof(cport)) - Error("RemoteConnect: cport too small", (long) lenbuf); - SR_proc_info[a].sock = - CreateSocketAndConnect(SR_clus_info[clusid].hostname, cport); - } - else if (c == me) { - lenbuf = sizeof lport; - RCV_(&type, (char *) &lport, &lenbuf, &lenmes, &a, &nodefrom, &sync); - SND_(&type, (char *) &lport, &lenbuf, &b, &sync); - } -} diff --git a/armci/tcgmsg/ipcv4.0/sndrcvP.h b/armci/tcgmsg/ipcv4.0/sndrcvP.h deleted file mode 100644 index 765db55af..000000000 --- a/armci/tcgmsg/ipcv4.0/sndrcvP.h +++ /dev/null @@ -1,169 +0,0 @@ -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/sndrcvP.h,v 1.17 2002-05-14 22:12:14 d3h325 Exp $ */ - -/* - This include file contains definitions PRIVATE to the message - passing routines and not for public use. These items should not - be directly manipulated even in the message passing routines, except - by the appropriate lowlevel routines. - - Actual instances of the extern data items are declared in defglobals.h - which is included by cluster.c. -*/ - -#define SNDRCVP - -#ifndef MSGTYPES -#include "msgtypesc.h" -#endif - -/****************************** - Defines and macro definitions - *****************************/ - -#define MAX_CLUSTER 128 /* Maximum no. of clusters */ -#define MAX_SLAVE 512 /* Maximum no. of slaves per cluster */ -#define MAX_PROCESS 8192 /* Maximum no. of processes */ - -#define TYPE_SETUP 32768 /* used for setup communication */ -#define TYPE_CHECK 32769 /* used for checking communication */ -#define TYPE_END 32770 /* used for propagating end message */ -#define TYPE_NXTVAL (MSGINT | 32771) /* Used in nxtval */ -#define TYPE_CONNECT (MSGINT | 32772) /* Used in RemoteConnect */ -#define TYPE_BEGIN 32773 /* Used in pbegin and parallel */ -#define TYPE_CLOCK_SYNCH 32774; /* Used to synch clocks */ - -#ifdef BIG_MESSAGE_PROTECTION -#define BIG_MESSAGE 41943040ul /* 40Mb max message only for safety check. - Change as needed.*/ -#else -#define BIG_MESSAGE 2147483647ul /* 2GB */ -#endif - -/* Shared memory allocated per process .. make even multiple of - page size ... usually 4096 */ -#if defined(SGI) || defined(SGITFP) -#define SHMEM_BUF_SIZE 262144 -#endif -#ifdef KSR -#define SHMEM_BUF_SIZE 524288 -#endif -#ifdef ALLIANT -#define SHMEM_BUF_SIZE 524288 -#endif -#ifdef ENCORE -#define SHMEM_BUF_SIZE 4096 -#endif -#ifdef SEQUENT -#define SHMEM_BUF_SIZE 16384 -#endif -#ifdef HPUX -#define SHMEM_BUF_SIZE 262144 -#endif -#ifdef MACX -#define SHMEM_BUF_SIZE 65536 -#endif -#if defined(SOLARIS) -#define SHMEM_BUF_SIZE 253952 -#endif -#ifdef KSR_NATIVE -#include "ksr.h" -#undef SHMEM_BUF_SIZE -#define SHMEM_BUF_SIZE KSR_SHMEM_BUF_SIZE -#endif -#if !defined(SHMEM_BUF_SIZE) -#define SHMEM_BUF_SIZE 131072 -#endif - -#if defined(PARTIALSPIN) && !defined(NOSPIN) -#define NOSPIN -#endif - -#define SR_SOCK_BUF_SIZE 32768 /* Size that system buffers set to */ - -#define PACKET_SIZE SR_SOCK_BUF_SIZE /* Internal packet size over sockets */ - -#define TIMEOUT_ACCEPT 180 /* timeout for connection in secs */ - -#define TRUE 1 -#define FALSE 0 -#define DEBUG_ SR_debug /* substitute name of debug flag */ - -/********************************************************* - Global information and structures ... all begin with SR_ - ********************************************************/ - -extern long SR_n_clus; /* No. of clusters */ -extern long SR_n_proc; /* No. of processes excluding dummy - master process */ - -extern long SR_clus_id; /* Logical id of current cluster */ -extern long SR_proc_id; /* Logical id of current process */ - -extern long SR_debug; /* flag for debug output */ - -extern long SR_parallel; /* True if job started with parallel */ -extern long SR_exit_on_error; /* flag to exit on error */ -extern long SR_error; /* flag indicating error has been called - with SR_exit_on_error == FALSE */ - -extern long SR_numchild; /* no. of forked processes */ -extern long SR_pids[MAX_SLAVE]; /* pids of forked processes */ -extern int SR_socks[MAX_PROCESS]; /* Sockets used for each process */ -extern int SR_socks_proc[MAX_PROCESS]; /* Process associated with a given socket */ -extern int SR_nsock; /* No. of sockets in the list */ -extern long SR_using_shmem; /* 1=if shmem is used for an process, 0 if all - processes are connected to this one by sockets */ - - -/* This is used to store info from the PROCGRP file about each - cluster of processes */ - -struct cluster_info_struct { - char *user; /* user name */ - char *hostname; /* hostname */ - long nslave; /* no. slave on this host */ - char *image; /* path executable image */ - char *workdir; /* work directory */ - long masterid; /* process no. of cluster master */ - int swtchport; /* Switch port for alliant hippi */ -}; - -extern struct cluster_info_struct SR_clus_info[MAX_CLUSTER]; - -typedef struct message_header_struct { - long nodefrom; /* originating node of message */ - long nodeto; /* target node of message */ - long type; /* user defined type */ - long length; /* length of message in bytes */ - long tag; /* No. of this message for id */ -} MessageHeader; - -/* This is used to store all info about processes */ - -struct process_info_struct { - long clusid; /* cluster no. for this process */ - long slaveid; /* slave no. in cluster 0,1,...,nslave */ - long local; /* boolean flag for local/remote */ - int sock; /* socket to remote process */ - char *shmem; /* shared memory region */ - long shmem_size; /* shared memory region size */ - long shmem_id; /* shared memory region id */ - char *buffer; /* shared memory message buffer */ - long buflen; /* shared memory message buffer size */ - MessageHeader *header; /* shared memory message header */ - long semid; /* semaphore group id */ - long sem_pend; /* semaphore no. posted when data pending */ - long sem_read; /* semaphore no. posted when data read */ - long sem_written; /* semaphore no. posted when data written */ - long n_rcv; /* No. of messages received */ - double nb_rcv; /* No. of bytes received */ - double t_rcv; /* Time spent receiving in sec */ - long n_snd; /* No. of messages sent */ - double nb_snd; /* No. of bytes sent */ - double t_snd; /* Time spent sending in sec */ - long peeked; /* True if have peeked at socket */ - MessageHeader head_peek; /* Header that we peeked at */ - long *buffer_full; /* Flag indicating full buffer */ -}; - -extern struct process_info_struct SR_proc_info[MAX_PROCESS]; diff --git a/armci/tcgmsg/ipcv4.0/sockets.c b/armci/tcgmsg/ipcv4.0/sockets.c deleted file mode 100644 index 53baa70b6..000000000 --- a/armci/tcgmsg/ipcv4.0/sockets.c +++ /dev/null @@ -1,535 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/sockets.c,v 1.12 2005-04-08 16:55:04 vinodtipparaju Exp $ */ - - -#include -#include -#include - -#ifdef SEQUENT -#include -#else -#include -#endif -#if defined(SUN) || defined(ALLIANT) || defined(ENCORE) || \ - defined(SEQUENT) || defined(AIX) || \ - defined(NEXT) || defined(LINUX) -#include -#endif - -#ifdef AIX -#include -#endif -#ifdef CONVEX -#include -#else -#include -#endif -#include -#include -#include -#include -#include -#include - -#ifdef CRAY -#include -#include -#else -extern int errno; -#endif - -#include "sndrcv.h" -#include "sndrcvP.h" - -long WaitForSockets(int nsock, int *socks, int *list) -/* - Wait until one or more sockets are ready or have an exception. - - Returns the number of ready sockets and sets corresponding - numbers in list. I.e., list[i]=k meaning sock[k] is ready. -*/ -{ - fd_set ready; - int i; - long nready; - int sockmax = 0; - -again: - FD_ZERO(&ready); - for (i=0; i sockmax) sockmax = socks[i]; - } - nready = (long) select(sockmax+1, &ready, (fd_set *) NULL, (fd_set *) NULL, - (struct timeval *) NULL); - if (nready < 0) { - if (errno == EINTR) { - /*fprintf(stderr,"wait in sockets got interrupted\n");*/ - goto again; - } - else { - Error("WaitForSockets: error from select", 0L); - } - } - else { - int n = 0; - for (i=0; ip_proto; - - status = setsockopt(sock, level, TCP_NODELAY, &value, sizeof(int)); - - if (status != 0) - Error("TcpNoDelay: setsockopt failed", (long) status); -} - -void ShutdownAll() -/* - close all sockets discarding any pending data in either direction. -*/ -{ - int i; - - for (i=0; i= 0) { - (void) shutdown(SR_proc_info[i].sock, 2); - (void) close(SR_proc_info[i].sock); - } -} - -int ReadFromSocket(sock, buf, lenbuf) - int sock; - char *buf; - long lenbuf; -/* - Read from the socket until we get all we want. -*/ -{ - int nread, status; - - status = lenbuf; - while (lenbuf > 0) { -again: - if ( (nread = recv(sock, buf, (int) lenbuf, 0)) < 0) { - if (errno == EINTR) - goto again; - else { - (void) fprintf(stderr,"sock=%d, pid=%ld, nread=%d, len=%ld\n", - sock, NODEID_(), nread, lenbuf); - (void) fflush(stderr); - status = -1; - break; - } - } - buf += nread; - lenbuf -= nread; - } - - return status; -} - -int WriteToSocket(sock, buf, lenbuf) - int sock; - char *buf; - long lenbuf; -/* - Write to the socket in packets of PACKET_SIZE bytes -*/ -{ - int status = lenbuf; - int nsent, len; - - while (lenbuf > 0) { - - len = (lenbuf > PACKET_SIZE) ? PACKET_SIZE : lenbuf; - nsent = send(sock, buf, (int) len, 0); - - if (nsent < 0) { /* This is bad news */ - (void) fprintf(stderr,"sock=%d, pid=%ld, nsent=%d, len=%ld\n", - sock, NODEID_(), nsent, lenbuf); - (void) fflush(stderr); - status = -1; break; - } - - buf += nsent; - lenbuf -= nsent; - } - - return status; -} - -void CreateSocketAndBind(sock, port) - int *sock; - int *port; -/* - Create a socket, bind it to a wildcard internet name and return - the info so that its port number may be advertised -*/ -{ - unsigned int length; - struct sockaddr_in server; - int size = SR_SOCK_BUF_SIZE; - int on = 1; -#if defined(LINUX) && defined(__powerpc__) - int dupsock; -#endif - - length = sizeof (struct sockaddr_in); - - /* Create socket */ - - if ( (*sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) - Error("CreateSocketAndBind: socket creation failed", (long) *sock); - -#if defined(LINUX) && defined(__powerpc__) - if(*sock==0) - dupsock = dup(*sock); - *sock = dupsock; -#endif - - if(setsockopt(*sock, SOL_SOCKET, SO_REUSEADDR, - (char *) &on, sizeof on) == -1) - Error("CreateSocketAndBind: error from setsockopt", (long) -1); - - /* Increase size of socket buffers to improve long message - performance and increase size of message that goes asynchronously */ - - if(setsockopt(*sock, SOL_SOCKET, SO_RCVBUF, (char *) &size, sizeof size)) - Error("CreateSocketAndBind: error setting SO_RCVBUF", (long) size); - if(setsockopt(*sock, SOL_SOCKET, SO_SNDBUF, (char *) &size, sizeof size)) - Error("CreateSocketAndBind: error setting SO_SNDBUF", (long) size); - -#ifndef ARDENT - TcpNoDelay(*sock); -#endif - - /* Name socket with wildcards */ - - server.sin_family = AF_INET; - server.sin_addr.s_addr = INADDR_ANY; - server.sin_port = 0; - if (bind(*sock, (struct sockaddr *) &server, length) < 0) - Error("CreateSocketAndBind: bind failed", (long) 0); - - /* Find out port number etc. */ - - if (getsockname(*sock, (struct sockaddr *) &server, &length) < 0) - Error("CreateSocketAndBind: getsockname failed", (long) 0); - - *port = ntohs(server.sin_port); - -} - -void ListenOnSock(sock) - int sock; -/* - Listen for a connection on the specified socket - which was created with CreateSocketAndBind -*/ -{ -againlist: - if (listen(sock, 1) < 0) { - if (errno == EINTR) - goto againlist; - else - Error("ListenAndAccept: listen failed", (long) 0); - } - - if (DEBUG_) { - (void) printf("process %ld out of listen on socket %d\n",NODEID_(),sock); - (void) fflush(stdout); - } -} - -int AcceptConnection(sock) - int sock; -/* - Accept a connection on the specified socket - which was created with CreateSocketAndBind and - listen has been called. -*/ -{ - fd_set ready; - struct timeval timelimit; - int msgsock, nready; - int size = SR_SOCK_BUF_SIZE; - - /* Use select to wait for someone to try and establish a connection - so that we can add a short timeout to avoid hangs */ - -againsel: - FD_ZERO(&ready); - FD_SET(sock, &ready); - - timelimit.tv_sec = TIMEOUT_ACCEPT; - timelimit.tv_usec = 0; - nready = select(sock+1, &ready, (fd_set *) NULL, (fd_set *) NULL, - &timelimit); - if ( (nready <= 0) && (errno == EINTR) ) - goto againsel; - else if (nready < 0) - Error("ListenAndAccept: error from select", (long) nready); - else if (nready == 0) - Error("ListenAndAccept: timeout waiting for connection", - (long) nready); - - if (!FD_ISSET(sock, &ready)) - Error("ListenAndAccept: out of select but not ready!", (long) nready); - -againacc: - msgsock = accept(sock, (struct sockaddr *) NULL, (unsigned int *) NULL); - if (msgsock == -1) { - if (errno == EINTR) - goto againacc; - else - Error("ListenAndAccept: accept failed", (long) msgsock); - } - - if (DEBUG_) { - (void) printf("process %ld out of accept on socket %d\n", - NODEID_(),msgsock); - (void) fflush(stdout); - } - - /* Increase size of socket buffers to improve long message - performance and increase size of message that goes asynchronously */ - - if(setsockopt(msgsock, SOL_SOCKET, SO_RCVBUF, (char *) &size, sizeof size)) - Error("ListenAndAccept: error setting SO_RCVBUF", (long) size); - if(setsockopt(msgsock, SOL_SOCKET, SO_SNDBUF, (char *) &size, sizeof size)) - Error("ListenAndAccept: error setting SO_SNDBUF", (long) size); - -#ifndef ARDENT - TcpNoDelay(msgsock); -#endif - - (void) close(sock); /* will not be needing this again */ - return msgsock; -} - -int ListenAndAccept(sock) - int sock; -/* - Listen and accept a connection on the specified socket - which was created with CreateSocketAndBind -*/ -{ - fd_set ready; - struct timeval timelimit; - int msgsock, nready; - int size = SR_SOCK_BUF_SIZE; - -againlist: - if (listen(sock, 1) < 0) { - if (errno == EINTR) - goto againlist; - else - Error("ListenAndAccept: listen failed", (long) 0); - } - - if (DEBUG_) { - (void) printf("process %ld out of listen on socket %d\n",NODEID_(),sock); - (void) fflush(stdout); - } - - /* Use select to wait for someone to try and establish a connection - so that we can add a short timeout to avoid hangs */ - -againsel: - FD_ZERO(&ready); - FD_SET(sock, &ready); - - timelimit.tv_sec = TIMEOUT_ACCEPT; - timelimit.tv_usec = 0; - nready = select(sock+1, &ready, (fd_set *) NULL, (fd_set *) NULL, - &timelimit); - if ( (nready <= 0) && (errno == EINTR) ) - goto againsel; - else if (nready < 0) - Error("ListenAndAccept: error from select", (long) nready); - else if (nready == 0) - Error("ListenAndAccept: timeout waiting for connection", - (long) nready); - - if (!FD_ISSET(sock, &ready)) - Error("ListenAndAccept: out of select but not ready!", (long) nready); - -againacc: - msgsock = accept(sock, (struct sockaddr *) NULL, (unsigned int *) NULL); - if (msgsock == -1) { - if (errno == EINTR) - goto againacc; - else - Error("ListenAndAccept: accept failed", (long) msgsock); - } - - if (DEBUG_) { - (void) printf("process %ld out of accept on socket %d\n", - NODEID_(),msgsock); - (void) fflush(stdout); - } - - /* Increase size of socket buffers to improve long message - performance and increase size of message that goes asynchronously */ - - if(setsockopt(msgsock, SOL_SOCKET, SO_RCVBUF, (char *) &size, sizeof size)) - Error("ListenAndAccept: error setting SO_RCVBUF", (long) size); - if(setsockopt(msgsock, SOL_SOCKET, SO_SNDBUF, (char *) &size, sizeof size)) - Error("ListenAndAccept: error setting SO_SNDBUF", (long) size); - -#ifndef ARDENT - TcpNoDelay(msgsock); -#endif - - (void) close(sock); /* will not be needing this again */ - return msgsock; -} - -int CreateSocketAndConnect(hostname, cport) - char *hostname; - char *cport; -/* - Return the file descriptor of the socket which connects me to the - remote process on hostname at port in string cport - - hostname = hostname of the remote process - cport = asci string containing port number of remote socket -*/ -{ - int sock, status; - struct sockaddr_in server; - struct hostent *hp; - int on = 1; - int size = SR_SOCK_BUF_SIZE; -#ifndef SGI - struct hostent *gethostbyname(); -#endif -#if defined(LINUX) && defined(__powerpc__) - int dupsock; -#endif - - /* Create socket */ - - if ( (sock = socket(AF_INET, SOCK_STREAM, 0)) < 0 ) { - (void) fprintf(stderr,"trying to connect to host=%s, port=%s\n", - hostname, cport); - Error("CreateSocketAndConnect: socket failed", (long) sock); - } - -#if defined(LINUX) && defined(__powerpc__) - if(sock==0) - dupsock = dup(sock); - sock = dupsock; -#endif - - if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *) &on, sizeof on) == -1) - Error("CreateSocketAndConnect: error setting REUSEADDR", (long) -1); - - /* Increase size of socket buffers to improve long message - performance and increase size of message that goes asynchronously */ - - if(setsockopt(sock, SOL_SOCKET, SO_RCVBUF, (char *) &size, sizeof size)) - Error("CreateSocketAndConnect: error setting SO_RCVBUF", (long) size); - if(setsockopt(sock, SOL_SOCKET, SO_SNDBUF, (char *) &size, sizeof size)) - Error("CreateSocketAndConnect: error setting SO_SNDBUF", (long) size); - -#ifndef ARDENT - TcpNoDelay(sock); -#endif - - /* Connect socket */ - - server.sin_family = AF_INET; - hp = gethostbyname(hostname); - if (hp == 0) { - (void) fprintf(stderr,"trying to connect to host=%s, port=%s\n", - hostname, cport); - Error("CreateSocketAndConnect: gethostbyname failed", (long) 0); - } - - bcopy((char *) hp->h_addr, (char *) &server.sin_addr, hp->h_length); - server.sin_port = htons((ushort) atoi(cport)); - -againcon: - if ((status = - connect(sock, (struct sockaddr *) &server, sizeof server)) < 0) { - if (errno == EINTR) - goto againcon; - else { - (void) fprintf(stderr,"trying to connect to host=%s, port=%s\n", - hostname, cport); - Error("CreateSocketAndConnect: connect failed", (long) status); - } - } - - return sock; -} diff --git a/armci/tcgmsg/ipcv4.0/srmover.c b/armci/tcgmsg/ipcv4.0/srmover.c deleted file mode 100644 index db0b1ecf3..000000000 --- a/armci/tcgmsg/ipcv4.0/srmover.c +++ /dev/null @@ -1,55 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/srmover.c,v 1.4 1995-02-24 02:17:53 d3h325 Exp $ */ - -#if defined(SEQUENT) || defined(CONVEX) -#define memcpy(a ,b ,c) bcopy((b), (a), (c)) -#else -#include -#endif - -void SRmover(a, b, n) - char *a, *b; - long n; -/* - Move n bytes from b to a -*/ -{ -#if defined(ALLIANT) || defined(IBM) || defined(IBMNOEXT) || \ - defined(CRAY) || defined(CONVEX) || defined(APOLLO) - /* memcpy is fast, Cray is not actually used but - alignment crap below won't work in anycase */ - (void) memcpy(a, b, (int) n); -#else -#define UNALIGNED(a) (((unsigned long) (a)) % sizeof(int)) - - if (UNALIGNED(a) || UNALIGNED(b)) - (void) memcpy(a, b, (int) n); /* abdicate responsibility */ - else { - /* Data is integer aligned ... move first n/sizeof(int) bytes - as integers and the remainder as bytes */ - - int ni = n/sizeof(int); - int *ai = (int *) a; - int *bi = (int *) b; - int i; - -#ifdef ARDENT -#pragma ivdep -#endif - for (i=0; i - -#include "sndrcvP.h" -#include "sndrcv.h" - -void STATS_() -/* - Print out communication statistics for this node -*/ -{ - long me = NODEID_(); - long nproc = NNODES_(); - long i, msg_s, kb_s, s_s, r_s, msg_r, kb_r, s_r, r_r; - - - (void) printf("Communication statistics for node %ld\n",me); - (void) printf("-------------------------------------\n\n"); - (void) printf("\ - sending receiving\n\ - ------------------------- -------------------------\n\ - node #msgs. #kb secs kb/s #msgs. #kb secs kb/s\n\ - ---- ------ ----- ---- ---- ------ ----- ---- ----\n"); - - for (i=0; i 0) ? kb_s / s_s : 0; - msg_r = SR_proc_info[i].n_rcv; - kb_r = SR_proc_info[i].nb_rcv/1024.0; - s_r = SR_proc_info[i].t_rcv; - r_r = (s_r > 0) ? kb_r / s_r : 0; - - (void) printf("%5ld%9ld%7ld%6ld%6ld%9ld%7ld%6ld%6ld\n", i, - msg_s, kb_s, s_s, r_s, - msg_r, kb_r, s_r, r_r); - } -} - diff --git a/armci/tcgmsg/ipcv4.0/synch.c b/armci/tcgmsg/ipcv4.0/synch.c deleted file mode 100644 index 871867113..000000000 --- a/armci/tcgmsg/ipcv4.0/synch.c +++ /dev/null @@ -1,48 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/synch.c,v 1.5 2002-07-17 17:20:11 vinod Exp $ */ - -#include "sndrcv.h" - -#ifdef OLDSYNC -void SYNCH_(type) - long *type; -/* - Synchronize by forcing all process to exchange a zero length message - of given type with process 0. -*/ -{ - long me = NODEID_(); - long nproc = NNODES_(); - char *buf = ""; - long zero = 0; - long sync = 1; - long from, lenmes, i; - - /* First everyone sends null message to zero */ - - if (me == 0) - for (i=1; i -#include -#endif -#include -#include - -#ifdef STUPIDUSLEEP -void USleep(us) - long us; -{ - int s = us/1000000; - if (s == 0) - s = 1; - (void) sleep(s); -} -#else -void USleep(us) - long us; -/* - Sleep for the specified no. of micro-seconds ... uses the timeout - on select ... it seems to be accurate to about a few centiseconds - on a sun. I don't know how much system resources it eats. -*/ -{ - int width=0; - struct timeval timelimit; - - timelimit.tv_sec = (int) (us/1000000); - timelimit.tv_usec = (int) (us - timelimit.tv_sec*1000000); - - (void) select(width, (fd_set *) 0, (fd_set *) 0, (fd_set *) 0, - &timelimit); -} -#endif - diff --git a/armci/tcgmsg/ipcv4.0/waitall.c b/armci/tcgmsg/ipcv4.0/waitall.c deleted file mode 100644 index fc9c3a71f..000000000 --- a/armci/tcgmsg/ipcv4.0/waitall.c +++ /dev/null @@ -1,67 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/waitall.c,v 1.3 1995-02-24 02:18:05 d3h325 Exp $ */ - -#include -#if defined(SUN) || defined(ALLIANT) || defined(ENCORE) || defined(SEQUENT) || \ - defined(AIX) || defined(NEXT) || defined(DECOSF) || defined(LINUX) -#include -#endif - -int WaitAll(nchild) - long nchild; -/* - Wait for all children to finish and return appropriate status - 0 = OK - 1 = bad news -*/ -{ - int status, pid, child, stat=0, lo, hi; - -#if defined(ALLIANT) || defined(ENCORE) || defined(SEQUENT) || defined(NEXT) - union wait ustatus; -#endif - - for (child=0; child> 8) & 0xff; - - if ( lo == 0177 ) - (void) fprintf(stderr, "(stopped by signal %d).\n", hi); - else if ( (lo != 0) && (lo & 0200) ) - (void) fprintf(stderr, "(killed by signal %d, dumped core).\n", - lo & 0100); - else if ( lo != 0 ) - (void) fprintf(stderr, "(killed by signal %d).\n",lo); - else - (void) fprintf(stderr, "(exited with code %d).\n",hi); - - (void) fflush(stderr); - stat = 1; - } - - } - - return stat; -} diff --git a/armci/tcgmsg/ipcv4.0/waitcom.c b/armci/tcgmsg/ipcv4.0/waitcom.c deleted file mode 100644 index b9e144703..000000000 --- a/armci/tcgmsg/ipcv4.0/waitcom.c +++ /dev/null @@ -1,17 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/waitcom.c,v 1.3 1995-02-24 02:18:06 d3h325 Exp $ */ - -#include "sndrcv.h" - -/*ARGSUSED*/ -void WAITCOM_(node) - long *node; -/* - Wait for async communications to complete ... null operation in - the UNIX environment -*/ -{ -} diff --git a/armci/tcgmsg/ipcv4.0/xdrstuff.c b/armci/tcgmsg/ipcv4.0/xdrstuff.c deleted file mode 100644 index 6b42869f2..000000000 --- a/armci/tcgmsg/ipcv4.0/xdrstuff.c +++ /dev/null @@ -1,420 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/xdrstuff.c,v 1.5 2004-04-01 02:04:57 manoj Exp $ */ - -#ifdef GOTXDR - -#include -#include - -#ifdef CRAY -extern bool_t xdr_char(); -extern char *malloc(); -#define NULL 0 -#endif - -#ifdef SEQUENT -static bool_t xdr_char(); /* below from sun distribution tape */ -#define NULL 0 -#endif - -#ifdef HPUX -#define NULL 0 -#endif - -#if defined(ULTRIX) || defined(SGI) || defined(NEXT) || defined(HPUX)|| \ - defined(AIX) || defined(KSR) || defined(DECOSF) -extern void *malloc(); -#else -extern char *malloc(); -#endif - -#define XDR_BUF_LEN 4096 /* Size of XDR buffer in bytes */ -#define XDR_DOUBLE_LEN 8 /* Size of XDR double in bytes */ -#define XDR_LONG_LEN 4 /* Size of XDR long in bytes */ -#define XDR_CHAR_LEN 4 /* Size of XDR char in bytes */ - -static char *xdrbuf_decode; -static char *xdrbuf_encode; -static XDR xdr_decode; -static XDR xdr_encode; - -static int xdr_buf_allocated = 0; /* =1 if buffers allocated, 0 otherwise */ - -extern void Error(); - -void CreateXdrBuf() -/* - Call at start to allocate the XDR buffers -*/ -{ - if (!xdr_buf_allocated) { - - /* Malloc the buffer space */ - - if ( (xdrbuf_decode = malloc((unsigned) XDR_BUF_LEN)) == (char *) NULL) - Error("CreateXdrBuf: malloc of xdrbuf_decode failed", - (long) XDR_BUF_LEN); - - if ( (xdrbuf_encode = malloc((unsigned) XDR_BUF_LEN)) == (char *) NULL) - Error("CreateXdrBuf: malloc of xdrbuf_encode failed", - (long) XDR_BUF_LEN); - - /* Associate the xdr memory streams with the buffers */ - - xdrmem_create(&xdr_decode, xdrbuf_decode, XDR_BUF_LEN, XDR_DECODE); - - xdrmem_create(&xdr_encode, xdrbuf_encode, XDR_BUF_LEN, XDR_ENCODE); - - xdr_buf_allocated = 1; - } -} - -void DestroyXdrBuf() -/* - Call to free the xdr buffers -*/ -{ - if (xdr_buf_allocated) { - - /* Destroy the buffers and free the space */ - - xdr_destroy(&xdr_encode); - xdr_destroy(&xdr_decode); - (void) free(xdrbuf_encode); - (void) free(xdrbuf_decode); - - xdr_buf_allocated = 0; - } -} - -int WriteXdrDouble(sock, x, n_double) - int sock; - double *x; - long n_double; -/* - Write double x[n_double] to the socket translating to XDR representation. - - Returned is the number of bytes written to the socket. - - All errors are treated as fatal. -*/ -{ - int nd_per_buf = (XDR_BUF_LEN-4)/XDR_DOUBLE_LEN; - /* No. of XDR doubles per buf */ - int status, nb=0; - u_int len; - long lenb; - - if (!xdr_buf_allocated) - CreateXdrBuf(); - - /* Loop thru buffer loads */ - - while (n_double > 0) { - - len = (n_double > nd_per_buf) ? nd_per_buf : n_double; - - /* Position the xdr buffer to the beginning */ - - if (!xdr_setpos(&xdr_encode, (u_int) 0)) - Error("WriteXdrDouble: xdr_setpos failed", (long) -1); - - /* Translate the buffer and then write it to the socket */ - - if (!xdr_array(&xdr_encode, (char **) &x, &len, (u_int) XDR_BUF_LEN, - (u_int) sizeof(double), xdr_double)) - Error("WriteXdrDouble: xdr_array failed", (long) -1); - - lenb = xdr_getpos(&xdr_encode); - - if ((status = WriteToSocket(sock, xdrbuf_encode, lenb)) != lenb) - Error("WriteXdrDouble: WriteToSocket failed", (long) status); - - nb += lenb; - n_double -= len; - x += len; - } - - return nb; -} - -int ReadXdrDouble(sock, x, n_double) - int sock; - double *x; - long n_double; -/* - Read double x[n_double] from the socket translating from XDR representation. - - Returned is the number of bytes read from the socket. - - All errors are treated as fatal. -*/ -{ - int nd_per_buf = (XDR_BUF_LEN-4)/XDR_DOUBLE_LEN; - /* No. of XDR doubles per buf */ - int status, nb=0; - u_int len; - long lenb; - - if (!xdr_buf_allocated) - CreateXdrBuf(); - - /* Loop thru buffer loads */ - - while (n_double > 0) { - - len = (n_double > nd_per_buf) ? nd_per_buf : n_double; - lenb = 4 + len * XDR_DOUBLE_LEN; - - /* Position the xdr buffer to the beginning */ - - if (!xdr_setpos(&xdr_decode, (u_int) 0)) - Error("ReadXdrDouble: xdr_setpos failed", (long) -1); - - /* Read from the socket and then translate the buffer */ - - if ((status = ReadFromSocket(sock, xdrbuf_decode, lenb)) != lenb) - Error("ReadXdrDouble: ReadFromSocket failed", (long) status); - - if (!xdr_array(&xdr_decode, (char **) &x, &len, (u_int) XDR_BUF_LEN, - (u_int) sizeof(double), xdr_double)) - Error("ReadXdrDouble: xdr_array failed", (long) -1); - - nb += lenb; - n_double -= len; - x += len; - } - - return nb; -} - -int WriteXdrLong(sock, x, n_long) - int sock; - long *x; - long n_long; -/* - Write long x[n_long] to the socket translating to XDR representation. - - Returned is the number of bytes written to the socket. - - All errors are treated as fatal. -*/ -{ - int nd_per_buf = (XDR_BUF_LEN-4)/XDR_LONG_LEN; - /* No. of XDR longs per buf */ - int status, nb=0; - u_int len; - long lenb; - - if (!xdr_buf_allocated) - CreateXdrBuf(); - - /* Loop thru buffer loads */ - - while (n_long > 0) { - - len = (n_long > nd_per_buf) ? nd_per_buf : n_long; - - /* Position the xdr buffer to the beginning */ - - if (!xdr_setpos(&xdr_encode, (u_int) 0)) - Error("WriteXdrLong: xdr_setpos failed", (long) -1); - - /* Translate the buffer and then write it to the socket */ - - if (!xdr_array(&xdr_encode, (char **) &x, &len, (u_int) XDR_BUF_LEN, - (u_int) sizeof(long), xdr_long)) - Error("WriteXdrLong: xdr_array failed", (long) -1); - - lenb = xdr_getpos(&xdr_encode); - - if ((status = WriteToSocket(sock, xdrbuf_encode, lenb)) != lenb) - Error("WriteXdrLong: WriteToSocket failed", (long) status); - - nb += lenb; - n_long -= len; - x += len; - } - - return nb; -} - -int ReadXdrLong(sock, x, n_long) - int sock; - long *x; - long n_long; -/* - Read long x[n_long] from the socket translating from XDR representation. - - Returned is the number of bytes read from the socket. - - All errors are treated as fatal. -*/ -{ - int nd_per_buf = (XDR_BUF_LEN-4)/XDR_LONG_LEN; - /* No. of XDR longs per buf */ - int status, nb=0; - u_int len; - long lenb; - - if (!xdr_buf_allocated) - CreateXdrBuf(); - - /* Loop thru buffer loads */ - - while (n_long > 0) { - - len = (n_long > nd_per_buf) ? nd_per_buf : n_long; - lenb = 4 + len * XDR_LONG_LEN; - - /* Position the xdr buffer to the beginning */ - - if (!xdr_setpos(&xdr_decode, (u_int) 0)) - Error("ReadXdrLong: xdr_setpos failed", (long) -1); - - /* Read from the socket and then translate the buffer */ - - if ((status = ReadFromSocket(sock, xdrbuf_decode, lenb)) != lenb) - Error("ReadXdrLong: ReadFromSocket failed", (long) status); - - if (!xdr_array(&xdr_decode, (char **) &x, &len, (u_int) XDR_BUF_LEN, - (u_int) sizeof(long), xdr_long)) - Error("ReadXdrLong: xdr_array failed", (long) -1); - - nb += lenb; - n_long -= len; - x += len; - } - - return nb; -} - -int WriteXdrChar(sock, x, n_char) - int sock; - char *x; - long n_char; -/* - Write char x[n_char] to the socket translating to XDR representation. - - Returned is the number of bytes written to the socket. - - All errors are treated as fatal. -*/ -{ - int nc_per_buf = (XDR_BUF_LEN-4)/XDR_CHAR_LEN; - /* No. of XDR chars per buf */ - int status, nb=0; - u_int len; - long lenb; - - if (!xdr_buf_allocated) - CreateXdrBuf(); - - /* Loop thru buffer loads */ - - while (n_char > 0) { - - len = (n_char > nc_per_buf) ? nc_per_buf : n_char; - - /* Position the xdr buffer to the beginning */ - - if (!xdr_setpos(&xdr_encode, (u_int) 0)) - Error("WriteXdrChar: xdr_setpos failed", (long) -1); - - /* Translate the buffer and then write it to the socket */ - - if (!xdr_array(&xdr_encode, (char **) &x, &len, (u_int) XDR_BUF_LEN, - (u_int) sizeof(char), xdr_char)) - Error("WriteXdrChar: xdr_array failed", (long) -1); - - lenb = xdr_getpos(&xdr_encode); - - if ((status = WriteToSocket(sock, xdrbuf_encode, lenb)) != lenb) - Error("WriteXdrChar: WriteToSocket failed", (long) status); - - nb += lenb; - n_char -= len; - x += len; - } - - return nb; -} - -int ReadXdrChar(sock, x, n_char) - int sock; - char *x; - long n_char; -/* - Read char x[n_char] from the socket translating from XDR representation. - - Returned is the number of bytes read from the socket. - - All errors are treated as fatal. -*/ -{ - int nc_per_buf = (XDR_BUF_LEN-4)/XDR_CHAR_LEN; - /* No. of XDR chars per buf */ - int status, nb=0; - u_int len; - long lenb; - - if (!xdr_buf_allocated) - CreateXdrBuf(); - - /* Loop thru buffer loads */ - - while (n_char > 0) { - - len = (n_char > nc_per_buf) ? nc_per_buf : n_char; - lenb = 4 + len * XDR_CHAR_LEN; - - /* Position the xdr buffer to the beginning */ - - if (!xdr_setpos(&xdr_decode, (u_int) 0)) - Error("ReadXdrChar: xdr_setpos failed", (long) -1); - - /* Read from the socket and then translate the buffer */ - - if ((status = ReadFromSocket(sock, xdrbuf_decode, lenb)) != lenb) - Error("ReadXdrChar: ReadFromSocket failed", (long) status); - - if (!xdr_array(&xdr_decode, (char **) &x, &len, (u_int) XDR_BUF_LEN, - (u_int) sizeof(char), xdr_char)) - Error("ReadXdrChar: xdr_array failed", (long) -1); - - nb += lenb; - n_char -= len; - x += len; - } - - return nb; -} - -#ifdef SEQUENT -/* - * XDR a char - */ -static bool_t xdr_char(xdrs, cp) - XDR *xdrs; - char *cp; -{ - int i; - - i = (*cp); - if (!xdr_int(xdrs, &i)) { - return (FALSE); - } - *cp = i; - return (TRUE); -} -#endif - -#else -/* dummy function to make this source file legitimate */ -#include -#include -void _dummy_ZefP_() {printf("XDR:Illegal function call\n"); exit(1);} -#endif diff --git a/armci/tcgmsg/ipcv4.0/xdrstuff.h b/armci/tcgmsg/ipcv4.0/xdrstuff.h deleted file mode 100644 index d3dc0c65d..000000000 --- a/armci/tcgmsg/ipcv4.0/xdrstuff.h +++ /dev/null @@ -1,68 +0,0 @@ -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv4.0/xdrstuff.h,v 1.3 1995-02-24 02:18:08 d3h325 Exp $ */ - -/* - Called automatically at start to allocate the XDR buffers -*/ -extern void CreateXdrBuf(); - - -/* - Call to free the xdr buffers -*/ -extern void DestroyXdrBuf(); - - -/* - int WriteXdrDouble(sock, x, n_double) - int sock; - double *x; - long n_double; - Write double x[n_double] to the socket translating to XDR representation. - - Returned is the number of bytes written to the socket. - - All errors are treated as fatal. -*/ -extern int WriteXdrDouble(); - - -/* - int ReadXdrDouble(sock, x, n_double) - int sock; - double *x; - long n_double; - Read double x[n_double] from the socket translating from XDR representation. - - Returned is the number of bytes read from the socket. - - All errors are treated as fatal. -*/ -extern int ReadXdrDouble(); - - -/* -int WriteXdrLong(sock, x, n_long) - int sock; - long *x; - long n_long; - Write long x[n_long] to the socket translating to XDR representation. - - Returned is the number of bytes written to the socket. - - All errors are treated as fatal. -*/ -extern int WriteXdrLong(); - - -/* -int ReadXdrLong(sock, x, n_long) - int sock; - long *x; - long n_long; - Read long x[n_long] from the socket translating from XDR representation. - - Returned is the number of bytes read from the socket. - - All errors are treated as fatal. -*/ -extern int ReadXdrLong(); diff --git a/armci/tcgmsg/ipcv5.0/async_send.c b/armci/tcgmsg/ipcv5.0/async_send.c deleted file mode 100644 index bc04752f4..000000000 --- a/armci/tcgmsg/ipcv5.0/async_send.c +++ /dev/null @@ -1,289 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#if HAVE_STDIO_H -# include -#endif - -extern void exit(int status); - -#include "tcgmsgP.h" - -static const long false = 0; -static const long true = 1; - -extern void Busy(int); -extern void flush_send_q(void); - -/* All data movement to/from shared memory is done using the - COPY_TO/FROM_SHMEM macros */ -#define COPY_TO_LOCAL(src, dest, n, destnode) (void) memcpy(dest, src, n) -#define COPY_FROM_LOCAL(src, dest, n) (void)memcpy(dest, src, n) -#define COPY_FROM_REMOTE(src,dest,n,p) (void)memcpy(dest, src, n) -#define COPY_TO_REMOTE(src,dest,n,p) (void)memcpy(dest, src, n) -#ifndef FLUSH_CACHE -# define FLUSH_CACHE -#endif -#ifndef FLUSH_CACHE_LINE -# define FLUSH_CACHE_LINE(x) -#endif - -/* #define TCG_ABS(a) (((a) >= 0) ? (a) : (-(a))) */ - - -/** - * Return the value of a volatile variable in shared memory - * that is REMOTE to this processor - */ -static long remote_flag(long *p, long node) -{ - long tmp; - - /* FLUSH_CACHE;*/ /* no need to flush for one word only*/ - COPY_FROM_REMOTE(p, &tmp, sizeof(tmp), node); - return tmp; -} - - -/** - * Return the value of a volatile variable in shared memory - * that is LOCAL to this processor - */ -static long local_flag(long *p) -{ - FLUSH_CACHE_LINE(p); - return(*p); -} - - -/** - * Wait for (*p == value) - */ -static void local_await(long *p, long value) -{ - long pval; - long nspin = 0; - long spinlim = 100000000; - - while ((pval = local_flag(p)) != value) { - - if (pval && (pval != value)) { - fprintf(stdout,"%2ld: invalid value=%ld, local_flag=%p %ld\n", - TCGMSG_nodeid, (long)value, p, (long)pval); - fflush(stdout); - exit(1); - } - nspin++; - if((nspin&7)==0)flush_send_q(); - if (nspin < spinlim) - Busy(100); - else - usleep(1); - } -} - - -/** - * Entry points to info about a message ... determine which - * transport mechanism is appropriate and send as much as - * possible without blocking. - * - * Right now just shared memory ... when sockets are working this - * routine will become async_shmem_send. - * - * Shared-memory protocol aims for low latency. Each process has - * one buffer for every other process. Thus, to send a message U - * merely have to determine if the receivers buffer for you is empty - * and copy directly into the receivers buffer. - * - * Return 0 if more data is to be sent, 1 if the send is complete. - */ -long async_send(SendQEntry *entry) -{ - long node = entry->node; - ShmemBuf *sendbuf= TCGMSG_proc_info[node].sendbuf; - long nleft, ncopy; - long pval; - long info[4]; - -#ifdef DEBUG - (void) fprintf(stdout,"%2ld: sending to %ld buf=%lx len=%ld\n", - TCGMSG_nodeid, node, entry->buf, entry->lenbuf); - (void) fprintf(stdout,"%2ld: sendbuf=%lx\n", TCGMSG_nodeid, sendbuf); - (void) fflush(stdout); -#endif - - if ((pval = remote_flag(&sendbuf->info[3], node))) { -#ifdef DEBUG - { - long info[4]; - FLUSH_CACHE; - COPY_FROM_REMOTE(sendbuf->info, info, sizeof(info), node); - fprintf(stdout,"%2ld: snd info after full = %ld %ld %ld\n", - TCGMSG_nodeid, info[0], info[1], info[2]); - fflush(stdout); - } - sleep(1); -#endif - - return 0; - } - - info[0] = entry->type; info[1] = entry->lenbuf; info[2] = entry->tag; - - /* Copy over the first buffer load of the message */ - - nleft = entry->lenbuf - entry->written; - ncopy = (long) ((nleft <= SHMEM_BUF_SIZE) ? nleft : SHMEM_BUF_SIZE); - - if (ncopy&7) { -#ifdef DEBUG - printf("%2ld: rounding buffer up %ld->%ld\n", - TCGMSG_nodeid, ncopy, ncopy + 8 - (ncopy&7)); - fflush(stdout); -#endif - ncopy = ncopy + 8 - (ncopy&7); - } - - if (ncopy) { - COPY_TO_REMOTE(entry->buf+entry->written, sendbuf->buf, ncopy, node); - } - - - /* NOTE that SHMEM_BUF_SIZE is a multiple of 8 by construction so that - this ncopy is only rounded up on the last write */ - - ncopy = (long) ((nleft <= SHMEM_BUF_SIZE) ? nleft : SHMEM_BUF_SIZE); - entry->written += ncopy; - entry->buffer_number++; - - /* Copy over the header information include buffer full flag */ - - info[3] = entry->buffer_number; - COPY_TO_REMOTE(info, sendbuf->info, sizeof(info), node); - - return (long) (entry->written == entry->lenbuf); -} - - -/** - * Receive a message of given type from the specified node, returning - * the message and length of the message. - * - * Right now just shared memory ... when sockets are working this - * routine will become msg_shmem_rcv - * - * Shared-memory protocol aims for low latency. Each process has - * one buffer for every other process. Thus, to send a message U - * merely have to determine if the receivers buffer for you is empty - * and copy directly into the receivers buffer. - * - * Return 0 if more data is to be sent, 1 if the send is complete. - */ -void msg_rcv(long type, char *buf, long lenbuf, long *lenmes, long node) -{ - long me = TCGMSG_nodeid; - ShmemBuf *recvbuf; /* Points to receving buffer */ - long nleft; - long msg_type, msg_tag, msg_len; - long buffer_number = 1; - long expected_tag = TCGMSG_proc_info[node].tag_rcv++; - - if (node<0 || node>=TCGMSG_nnodes) - Error("msg_rcv: node is out of range", node); - - recvbuf = TCGMSG_proc_info[node].recvbuf; - - /* Wait for first part message to be written */ - -#ifdef DEBUG - (void) fprintf(stdout,"%2ld: receiving from %ld buf=%lx len=%ld\n", - me, node, buf, lenbuf); - - (void) fprintf(stdout,"%2ld: recvbuf=%lx\n", me, recvbuf); - (void) fflush(stdout); -#endif - - local_await(&recvbuf->info[3], buffer_number); - - /* Copy over the header information */ - - /* FLUSH_CACHE;*/ - msg_type = recvbuf->info[0]; - msg_len = recvbuf->info[1]; - msg_tag = recvbuf->info[2]; - - /* Check type and size information */ - - if (msg_tag != expected_tag) { - (void) fprintf(stdout, - "rcv: me=%ld from=%ld type=%ld, tag=%ld, expectedtag=%ld\n", - (long)me, (long)node, (long)type, (long)msg_tag, (long)expected_tag); - fflush(stdout); - Error("msg_rcv: tag mismatch ... transport layer failed????", 0L); - } - - if (msg_type != type) { - (void) fprintf(stdout, - "rcv: me=%ld from=%ld type=(%ld != %ld) tag=%ld len=%ld\n", - (long)me, (long)node, (long)type, (long)msg_type, (long)msg_tag, (long)msg_len); - fflush(stdout); - Error("msg_rcv: type mismatch ... strong typing enforced\n", 0L); - } - - if (msg_len > lenbuf) { - (void) fprintf(stderr, - "rcv: me=%ld from=%ld type=%ld tag=%ld len=(%ld > %ld)\n", - (long)me, (long)node, (long)type, (long)msg_tag, (long)msg_len, (long)lenbuf); - Error("msg_rcv: message too long for buffer\n", 0L); - } - - nleft = *lenmes = msg_len; - if (nleft == 0) { - recvbuf->info[3] = false; - } - - while (nleft) { - long ncopy = (long) ((nleft <= SHMEM_BUF_SIZE) ? nleft : SHMEM_BUF_SIZE); - { - long line; - if(ncopy < 321) - for(line = 0; line < ncopy; line+=32) - FLUSH_CACHE_LINE(recvbuf->buf+line); - else - FLUSH_CACHE; - } - - /* if (buffer_number > 1) FLUSH_CACHE;*/ - COPY_FROM_LOCAL(recvbuf->buf, buf, ncopy); - - recvbuf->info[3] = false; - - nleft -= ncopy; - buf += ncopy; - - if (nleft) { - buffer_number++; - local_await(&recvbuf->info[3], buffer_number); - } - } -} - - -long MatchShmMessage(long node, long type) -{ - ShmemBuf *recvbuf; - long msg_type; - - recvbuf = TCGMSG_proc_info[node].recvbuf; - - if(recvbuf->info[3] == false) return (0); /* no message to receive */ - - /* we have a message but let's see if want it */ - - FLUSH_CACHE_LINE(recvbuf->info); - COPY_FROM_LOCAL(recvbuf->info, &msg_type, sizeof(long)); - if(type == msg_type) return (1); - return (0); -} diff --git a/armci/tcgmsg/ipcv5.0/async_send_lapi.c b/armci/tcgmsg/ipcv5.0/async_send_lapi.c deleted file mode 100644 index a3246e9ce..000000000 --- a/armci/tcgmsg/ipcv5.0/async_send_lapi.c +++ /dev/null @@ -1,398 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#if HAVE_STDIO_H -# include -#endif - -#include "tcgmsgP.h" - -/*#define DEBUG 1*/ -/*#define DEBUG2 1*/ -static const long false = 0; -static const long true = 1; - -typedef struct { - int from:16; - int to:16; -} nodepair_t; - -typedef union{ - long fromto; - nodepair_t n; -} pair_t; - - -extern void Busy(int); - -/* All data movement to/from shared memory is done using the - COPY_TO/FROM_SHMEM macros */ -extern lapi_handle_t lapi_handle; -/* ShmemBuf *localbuf = &tmp_snd_buf; */ -extern void lapi_put_c(void* dest, void* src, long bytes, long node, lapi_cntr_t *cntr); -extern void lapi_put(void* dest, void* src, long bytes, long node); -extern void lapi_get(void* dest, void* src, long bytes, long node); -#define COPY_TO_LOCAL(src, dest, n) (void) memcpy(dest, src, (long) n) -#define COPY_FROM_LOCAL(src, dest, n) (void)memcpy(dest, src, (long) n) -#define COPY_TO_REMOTE(src,dest,n,node) lapi_put(dest, src, (long) n, node) -#define COPY_FROM_REMOTE(src,dest,n,node)lapi_get(dest, src, (long) n,node) -/* #define COPY_TO_REMOTE_CNTR(src, dest, n, node, pcntr) lapi_put_c(dest, src, (long) n, node, pcntr) */ -#define COPY_TO_REMOTE_CNTR(localbuf, dest, n, node, pcntr) do { \ - if (LAPI_Put(lapi_handle,(uint)node, (uint)n, dest,localbuf->info, pcntr, &localbuf->cntr, NULL)) { \ - Error("TCG:lapi_put_c failed",0); \ - } \ -} while (0) -#define NEXT_LOC_BUF(localbuf) localbuf = (sendbuf_t*)localbuf->next; -#define GET_LOC_BUF(localbuf) do { \ - if(LAPI_Waitcntr(lapi_handle, &localbuf->cntr, 1, NULL)) { \ - Error("TCG:LAPI_Waitcntr failed",0); \ - } \ -} while (0) -#ifndef FLUSH_CACHE -# define FLUSH_CACHE -#endif -#ifndef FLUSH_CACHE_LINE -# define FLUSH_CACHE_LINE(x) -#endif - -#define TCG_ABS(a) (((a) >= 0) ? (a) : (-(a))) - - -/** - * Return the value of a volatile variable in shared memory - * that is REMOTE to this processor - */ -static long remote_flag(long *p, long node) -{ - long tmp; - - /* FLUSH_CACHE;*/ /* no need to flush for one word only*/ - COPY_FROM_REMOTE(p, &tmp, sizeof(tmp), node); - return tmp; -} - - -/** - * Return the value of a volatile variable in shared memory - * that is LOCAL to this processor - */ -static long local_flag(void *p) -{ - long val; - FLUSH_CACHE_LINE(p); - val = *(long*)p; - return(val); -} - - -void set_local_flag(void *p, long val) -{ - *(long*)p = val; -} - - -void set_remote_flag(void *p, long val, long node) -{ - COPY_TO_REMOTE(&val, p, sizeof(long), node); -} - - -/** - * Wait on Lapi counter for data to appear - * check if *p == value - */ -static void lapi_await(long *p, long value, lapi_cntr_t* cntr) -{ - int val; - long pval; - - if(LAPI_Waitcntr(lapi_handle, cntr, 1, &val)) - Error("lapi_await: error",-1); - -#if 0 - if ( (pval = local_flag(p)) != value) { - fprintf(stdout,"%2ld: invalid value=%ld, local_flag=%lx %ld\n", - TCGMSG_nodeid, value, (unsigned long)p, pval); - fflush(stdout); - Error("lapi_await: exiting..",-1);; - } -#endif -} - - -/** - * Wait for (*p == value) - */ -static void local_await(long *p, long value) -{ - long pval; - long nspin = 0; - long spinlim = 100000000; - long waittim = 100000; - extern void flush_send_q(void); - - while ((pval = local_flag(p)) != value) { - - if (pval && (pval != value)) { - fprintf(stdout,"%2ld: invalid value=%ld, local_flag=%lx %ld\n", - TCGMSG_nodeid, value, (unsigned long)p, pval); - fflush(stdout); - exit(1); - } - nspin++; - if((nspin&7)==0)flush_send_q(); - if (nspin < spinlim) - Busy(100); - else - USleep(waittim); - } -} - - -/** - * Entry points to info about a message ... determine which - * transport mechanism is appropriate and send as much as - * possible without blocking. - * - * Right now just shared memory ... when sockets are working this - * routine will become async_shmem_send. - * - * Shared-memory protocol aims for low latency. Each process has - * one buffer for every other process. Thus, to send a message U - * merely have to determine if the receivers buffer for you is empty - * and copy directly into the receivers buffer. - * - * Return 0 data has not been sent, 1 if the send is complete. - */ -long async_send(SendQEntry *entry) -{ - long node = entry->node; - ShmemBuf *sendbuf= TCGMSG_proc_info[node].sendbuf; -#ifdef NOTIFY_SENDER - void *busy_flag = &TCGMSG_proc_info[node].recvbuf->flag; -#endif - long ncopy, complete; - long pval; - long info[4]; - pair_t pair; - -#ifdef DEBUG2 - (void) fprintf(stdout,"%2ld: sending to %ld buf=%lx len=%ld\n", - TCGMSG_nodeid, node, entry->buf, entry->lenbuf); - (void) fprintf(stdout,"%2ld: sendbuf=%lx\n", TCGMSG_nodeid, sendbuf); - (void) fflush(stdout); -#endif - - /* return if the receiver buffer is not available */ -#ifdef NOTIFY_SENDER - pval = local_flag(busy_flag); -#else - pval = remote_flag(&sendbuf->info[3], node); -#endif - if (pval) { -#ifdef DEBUG - { - long info[4]; - FLUSH_CACHE; - COPY_FROM_REMOTE(sendbuf->info, info, sizeof(info), node); - fprintf(stdout,"%2ld: snd info after full = %ld %ld %ld\n", - TCGMSG_nodeid, info[0], info[1], info[2]); - fflush(stdout); - sleep(1); - } -#endif - - return 0; - } - - /* if data has been written already and we are here, operation is complete */ - if(entry->written) return 1L; - -#ifdef NOTIFY_SENDER - set_local_flag(busy_flag,true); -#endif - - info[0] = entry->type; info[1] = entry->lenbuf; info[2] = entry->tag; -#if 0 - entry->buffer_number++; - info[3] = entry->buffer_number; -#else - pair.n.from = TCGMSG_nodeid; - pair.n.to = node; - info[3] = pair.fromto; -#endif - - /* Copy over the message if it fits in the receiver buffer */ - ncopy = (long) (( entry->lenbuf <= SHMEM_BUF_SIZE) ? entry->lenbuf : 0 ); - - GET_LOC_BUF(localbuf); - - if (ncopy) { -#ifdef DEBUG - printf("%ld:snd:copying data node=%ld adr=%lx %ld bytes\n", - TCGMSG_nodeid, node, sendbuf->buf, ncopy); - fflush(stdout); -#endif - COPY_TO_LOCAL(entry->buf+entry->written, localbuf->buf, ncopy); - complete = 1; - } else { -#ifdef DEBUG - printf("%ld:snd:copying addr node=%ld adr=%lx %ld bytes\n", - TCGMSG_nodeid, node, sendbuf->buf, ncopy); - fflush(stdout); -#endif - /* copy address of the user buffer to the send buffer */ - COPY_TO_LOCAL(&(entry->buf), localbuf->buf, sizeof(char*)); - ncopy = sizeof(char*); - complete = 0; /* sent is complete only when receiver gets the data */ - entry->written = 1; - } - -#ifdef DEBUG - printf("%ld:snd:copying info to node=%ld adr=%lx %ld bytes\n", - TCGMSG_nodeid, node, sendbuf->info, sizeof(info)); - fflush(stdout); -#endif - - COPY_TO_LOCAL(info, localbuf->info, sizeof(info)); - COPY_TO_REMOTE_CNTR(localbuf,sendbuf,sizeof(info)+ncopy,node,&sendbuf->cntr); - - /* advance to next buf */ - NEXT_LOC_BUF(localbuf); - - return complete; -} - - -/** - * Receive a message of given type from the specified node, returning - * the message and length of the message. - * - * Right now just shared memory ... when sockets are working this - * routine will become msg_shmem_rcv - * - * Shared-memory protocol aims for low latency. Each process has - * one buffer for every other process. Thus, to send a message U - * merely have to determine if the receivers buffer for you is empty - * and copy directly into the receivers buffer. - */ -void msg_rcv(long type, char *buf, long lenbuf, long *lenmes, long node) -{ - long me = TCGMSG_nodeid; - ShmemBuf *recvbuf; /* Points to receving buffer */ - long nleft; - long msg_type, msg_tag, msg_len; - long buffer_number = 1; - long expected_tag = TCGMSG_proc_info[node].tag_rcv++; -#ifdef NOTIFY_SENDER - void *busy_flag= &TCGMSG_proc_info[node].sendbuf->flag; -#endif - - if (node<0 || node>=TCGMSG_nnodes) - Error("msg_rcv: node is out of range", node); - - recvbuf = TCGMSG_proc_info[node].recvbuf; - - /* Wait for first part message to be written */ - -#ifdef DEBUG - (void) fprintf(stdout,"%2ld: receiving from %ld buf=%lx len=%ld\n", - me, node, recvbuf,lenbuf); - (void) fprintf(stdout,"%2ld: user buf=%lx len=%ld\n", me, buf, lenbuf); - (void) fflush(stdout); -#endif - - -#ifdef LAPI - lapi_await(&recvbuf->info[3], buffer_number, &recvbuf->cntr); -#else - local_await(&recvbuf->info[3], buffer_number); -#endif - - /* Copy over the header information */ - - msg_type = recvbuf->info[0]; - msg_len = recvbuf->info[1]; - msg_tag = recvbuf->info[2]; - -#ifdef DEBUG - (void) fprintf(stdout,"%2ld: received msg from %ld len=%ld\n", - me, node, msg_len); - (void) fflush(stdout); -#endif - - /* Check type and size information */ - if(msg_tag != expected_tag) { - pair_t pair; - pair.fromto = recvbuf->info[3]; - fprintf(stdout, - "rcv: me=%ld from=%ld type=%ld expectedtag=%ld lenbuf=%ld\ngot: to=%d from=%d type=%ld msg_tag=%ld msg_len=%ld info[3]=%ld\n", - me, node, type, expected_tag, lenbuf, - (int)pair.n.to, (int)pair.n.from, msg_type, msg_tag, msg_len, - recvbuf->info[3]); - fflush(stdout); - Error("msg_rcv: tag mismatch ... transport layer failed????", 0L); - } - - if (msg_type != type) { - (void) fprintf(stderr, - "rcv: me=%ld from=%ld type=(%ld != %ld) tag=%ld len=%ld\n", - me, node, type, msg_type, msg_tag, msg_len); - Error("msg_rcv: type mismatch ... strong typing enforced\n", 0L); - } - - if (msg_len > lenbuf) { - (void) fprintf(stderr, - "rcv: me=%ld from=%ld type=%ld tag=%ld len=(%ld > %ld)\n", - me, node, type, msg_tag, msg_len, lenbuf); - Error("msg_rcv: message too long for buffer\n", 0L); - } - - nleft = *lenmes = msg_len; - - if (nleft) { - long ncopy = nleft; - - /* for short messages data is in local buffer, for long in remote buffer */ - - if(nleft <= SHMEM_BUF_SIZE) { - - FLUSH_CACHE; - - COPY_FROM_LOCAL(recvbuf->buf, buf, ncopy); - - }else { - - char *addr = *((char**)recvbuf->buf); - - COPY_FROM_REMOTE(addr, buf, nleft, node); - - } - } - - recvbuf->info[3] = false; -#ifdef NOTIFY_SENDER - /* confirm that data has been transfered */ - set_remote_flag(busy_flag,false,node); -#endif - -} - - -long MatchShmMessage(long node, long type) -{ - ShmemBuf *recvbuf; - long msg_type; - - recvbuf = TCGMSG_proc_info[node].recvbuf; - - if(recvbuf->info[3] == false) return (0); /* no message to receive */ - - /* we have a message but let's see if want it */ - - FLUSH_CACHE_LINE(recvbuf->info); - COPY_FROM_LOCAL(recvbuf->info, &msg_type, sizeof(long)); - if(type == msg_type) return (1); - return (0); -} diff --git a/armci/tcgmsg/ipcv5.0/busy.c b/armci/tcgmsg/ipcv5.0/busy.c deleted file mode 100644 index 02ddb3809..000000000 --- a/armci/tcgmsg/ipcv5.0/busy.c +++ /dev/null @@ -1,12 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -int fred; - -void Busy(int n) -{ - fred = 0; - while (n-- >= 0) - fred++; -} diff --git a/armci/tcgmsg/ipcv5.0/checkbyte.c b/armci/tcgmsg/ipcv5.0/checkbyte.c deleted file mode 100644 index f7c039556..000000000 --- a/armci/tcgmsg/ipcv5.0/checkbyte.c +++ /dev/null @@ -1,25 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv5.0/checkbyte.c,v 1.2 1994-12-30 20:55:37 d3h325 Exp $ */ - -unsigned char CheckByte(unsigned char *c, long n) -{ -/* - unsigned char sum = (char) 0; - while (n-- > 0) - sum = sum ^ *c++; - - return sum; -*/ - - unsigned int sum = 0; - unsigned int mask = 0xff; - - while (n-- > 0) - sum += (int) *c++; - - sum = (sum + (sum>>8) + (sum>>16) + (sum>>24)) & mask; - return (unsigned char) sum; -} diff --git a/armci/tcgmsg/ipcv5.0/copyall.c b/armci/tcgmsg/ipcv5.0/copyall.c deleted file mode 100644 index ad3a09b71..000000000 --- a/armci/tcgmsg/ipcv5.0/copyall.c +++ /dev/null @@ -1,218 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#if HAVE_STDIO_H -# include -#endif -#if HAVE_MEMORY_H -# include -#endif - - -/** - * A copy optimized for DESTINATIONS in shared memory that - * are aligned and data is to be read by other processes. - * - * Both prefetch and poststore the destination. - */ -void copyto(const unsigned char *src, unsigned char *dest, long n) -{ - if (n < 32 || (dest - src) & 7) { - - /* small n, or - not possible to get src and dest even word aligned */ - - while (n--) - *dest++ = *src++; - /* memcpy(dest, src, (size_t) n); */ - return; - } - - /* Read ahead so that dest is aligned on a page boundary */ - - { - register long nbytes = (127 & (unsigned long) dest); - if (nbytes > 0) nbytes = 128 - nbytes; - if (nbytes > n) nbytes = n; - n -= nbytes; - - while (nbytes--) - *dest++ = *src++; - - if (n == 0) return; - } - - { - /* src is at least word aligned and dest is subpage aligned */ - - register long npage = n>>7; - register const unsigned long *from = (unsigned long *) src; - register unsigned long *to = (unsigned long *) dest; - register unsigned long a, b, c, d, e, f, g, h; - - src += npage<<7; - dest += npage<<7; - n -= npage<<7; - - /* _pcsp(to+16, "ex", "nbl"); - _pcsp(to+32, "ex", "nbl"); - _pcsp(to+48, "ex", "nbl"); */ - - while (npage--) { - - /* _pcsp(to+64, "ex", "nbl"); */ - - a = from[0]; - b = from[1]; - c = from[2]; - d = from[3]; - e = from[4]; - f = from[5]; - g = from[6]; - h = from[7]; - to[0] = a; - to[1] = b; - to[2] = c; - to[3] = d; - to[4] = e; - to[5] = f; - to[6] = g; - to[7] = h; - - a = from[8]; - b = from[9]; - c = from[10]; - d = from[11]; - e = from[12]; - f = from[13]; - g = from[14]; - h = from[15]; - to[8] = a; - to[9] = b; - to[10] = c; - to[11] = d; - to[12] = e; - to[13] = f; - to[14] = g; - to[15] = h; - - /* _pstsp((char *) to); */ - - to += 16; from+= 16; - } - } - - { - register long nbytes = n; - register const unsigned char *from = (unsigned char *) src; - register unsigned char *to = (unsigned char *) dest; - - while (nbytes--) - *to++ = *from++; - } -} - - -/** - * A copy optimized for SOURCES in shared memory that - * are aligned. - * - * Prefetch sources only. - */ -void copyfrom(const unsigned char *src, unsigned char *dest, long n) -{ - if (n < 32 || (dest - src) & 7) { - - /* small n, or - not possible to get src and dest even word aligned */ - - while (n--) - *dest++ = *src++; - /* memcpy(dest, src, (size_t) n);*/ - return; - } - - /* Read ahead so that src is aligned on a page boundary */ - - { - register long nbytes = (127 & (unsigned long) src); - if (nbytes > 0) nbytes = 128 - nbytes; - if (nbytes > n) nbytes = n; - n -= nbytes; - - while (nbytes--) - *dest++ = *src++; - - if (n == 0) return; - } - - { - /* dest is at least word aligned and src is subpage aligned */ - - register long npage = n>>7; - register const unsigned long *from = (unsigned long *) src; - register unsigned long *to = (unsigned long *) dest; - register unsigned long a, b, c, d, e, f, g, h; - - src += npage<<7; - dest += npage<<7; - n -= npage<<7; - - /* _pcsp(from+16, "ro", "nbl"); - _pcsp(from+32, "ro", "nbl"); - _pcsp(from+48, "ro", "nbl"); */ - - while (npage--) { - - /* _pcsp(from+64, "ro", "nbl"); */ - - a = from[0]; - b = from[1]; - c = from[2]; - d = from[3]; - e = from[4]; - f = from[5]; - g = from[6]; - h = from[7]; - to[0] = a; - to[1] = b; - to[2] = c; - to[3] = d; - to[4] = e; - to[5] = f; - to[6] = g; - to[7] = h; - - a = from[8]; - b = from[9]; - c = from[10]; - d = from[11]; - e = from[12]; - f = from[13]; - g = from[14]; - h = from[15]; - to[8] = a; - to[9] = b; - to[10] = c; - to[11] = d; - to[12] = e; - to[13] = f; - to[14] = g; - to[15] = h; - - /* _pstsp((char *) to); */ - - to += 16; from+= 16; - } - } - - { - register long nbytes = n; - register const unsigned char *from = (unsigned char *) src; - register unsigned char *to = (unsigned char *) dest; - - while (nbytes--) - *to++ = *from++; - } -} diff --git a/armci/tcgmsg/ipcv5.0/copyall.ksr.s b/armci/tcgmsg/ipcv5.0/copyall.ksr.s deleted file mode 100644 index 9550d9145..000000000 --- a/armci/tcgmsg/ipcv5.0/copyall.ksr.s +++ /dev/null @@ -1,330 +0,0 @@ - .file "/home/d3g681/tcgmsg/ipcv4.0/" - .file "copyall.c" - .vstamp 7 -# KSR1 ccom -OLM -X28 -X92 -X115 -X151 -X153 -X155 -X156 -X157 -X158 -X159 -# -X172 -X187 -# ccom: version 1.1.1. built Sun Dec 26 22:03:57 1993. - - .text - - .data - .def copyto$TXT; .val copyto$TXT; .scl 2; .endef - - .text - .def copyto; .val copyto; .scl 2; .type 513; .endef -copyto$TXT: - finop ; cxnop - finop ; cxnop - .def .bf; .val .; .scl 101; .line 11; .endef - mov8_8 %i3, %i9 ; ssub8.ntr 0, %sp, 128, %sp - itstle8 128, %i4 ; movb8_8 %i2, %c8 - add8.ntr 10, %i31, %i31 ; st8 %i13, 80(%sp) - mov8_8 %i4, %i13 ; st8 %cp, 112(%sp) - finop ; st8 %fp, 120(%sp) - finop ; mov8_8 %c10, %cp - finop ; sadd8.ntr 0, %sp, 128, %fp - finop ; bcc.qn @citst, .L2 - finop ; st8 %c14, 104(%sp) - finop ; st8 %i12, 88(%sp) - clrh8 7, %i9, %i5 ; movb8_8 %c8, %i1 - sub8.ntr 128, %i5, %i2 ; cxnop - add8.ntr 7, %i31, %i31 ; cxnop - sub8.ntr %i9, %i1, %i1 ; cxnop - clrh8 3, %i1, %i1 ; cxnop - itsteq8 0, %i1 ; cxnop - itstge8 0, %i5 ; bcs.qt @citst, .L10 -.L2: - mov8_8 %i13, %i4 ; ld8 16(%cp), %c6 - mov8_8 %i9, %i2 ; movb8_8 %c8, %i3 - add8.ntr 4, %i31, %i31 ; ld8 8(%cp), %c10 - finop ; jsr %c14, 16(%c6) - finop ; cxnop - finop ; cxnop - movi8 3, %i0 ; movi8 0, %c8 - .ln 7, .-32 # 17 - - add8.ntr 8, %i31, %i31 ; ld8 104(%sp), %c14 - finop ; ld8 112(%sp), %cp - finop ; ld8 120(%sp), %fp - finop ; ld8 88(%sp), %i12 - finop ; ld8 80(%sp), %i13 - finop ; jmp 32(%c14) - finop ; sadd8.ntr 0, %sp, 128, %sp - finop ; cxnop -.L10: - selsc8 %i5, %i2, %i5 ; cxnop - itstle8 %i5, %i13 ; cxnop - selsc8 %i5, %i13, %i5 ; cxnop - sub8.ntr %i13, %i5, %i13 ; cxnop - itsteq8 0, %i5 ; cxnop - sub8.ntr %i5, 1, %i5 ; bcs.qt @citst, .L8 -.L9: - add8.ntr 1, %i9, %i9 ; movb8_8 %i9, %c4 - itsteq8 0, %i5 ; ld1 0(%c8), %i10 - sub8.ntr %i5, 1, %i5 ; bcc.qn @citst, .L9 - add8.ntr 3, %i31, %i31 ; sadd8.ntr 0, %c8, 1, %c8 - finop ; st1 %i10, -1(%c4) -.L8: - itstne8 0, %i13 ; movb8_8 %i9, %c7 - ash8.ntr -7, %i13, %i10 ; bcc.qt @citst, .L1 - ash8.ntr 7, %i10, %i0 ; movb8_8 %i0, %c5 - add8.ntr %i9, %i0, %i9 ; mov8_8 %c8, %c6 - finop ; pcsp.ex.bl 128(%c7) - finop ; pcsp.ex.bl 256(%c7) - finop ; pcsp.ex.bl 384(%c7) - itsteq8 0, %i10 ; movb8_8 %i9, %c9 - sub8.ntr %i13, %i0, %i13 ; sadd8.ntr 0, %c5, %c8, %c8 - sub8.ntr %i10, 1, %i10 ; bcs.qt @citst, .L5 -.L6: - finop ; pcsp.ex.bl 512(%c7) - itsteq8 0, %i10 ; ld8.ro 0(%c6), %i11 - sub8.ntr %i10, 1, %i10 ; ld8.ro 8(%c6), %i0 - add8.ntr 33, %i31, %i31 ; ld8.ro 16(%c6), %i1 - finop ; ld8.ro 24(%c6), %i2 - finop ; ld8.ro 32(%c6), %i3 - finop ; ld8.ro 40(%c6), %i4 - finop ; ld8.ro 48(%c6), %i5 - finop ; ld8.ro 56(%c6), %i12 - finop ; sadd8.ntr 0, %c7, 128, %c7 - finop ; st8 %i11, -128(%c7) - finop ; st8 %i0, -120(%c7) - finop ; st8 %i1, -112(%c7) - finop ; st8 %i2, -104(%c7) - finop ; st8 %i3, -96(%c7) - finop ; st8 %i4, -88(%c7) - finop ; st8 %i5, -80(%c7) - finop ; st8 %i12, -72(%c7) - finop ; ld8.ro 120(%c6), %i12 - finop ; ld8.ro 112(%c6), %i5 - finop ; ld8.ro 104(%c6), %i4 - finop ; ld8.ro 96(%c6), %i3 - finop ; ld8.ro 88(%c6), %i2 - finop ; ld8.ro 80(%c6), %i1 - finop ; ld8.ro 72(%c6), %i0 - finop ; ld8.ro 64(%c6), %i11 - finop ; st8 %i1, -48(%c7) - finop ; st8 %i2, -40(%c7) - finop ; st8 %i0, -56(%c7) - finop ; st8 %i11, -64(%c7) - finop ; st8 %i3, -32(%c7) - finop ; st8 %i4, -24(%c7) - finop ; st8 %i5, -16(%c7) - finop ; bcc.qn @citst, .L6 - finop ; st8 %i12, -8(%c7) - finop ; sadd8.ntr 0, %c6, 128, %c6 -# finop ; pstsp 0(%c7) -.L5: - itsteq8 0, %i13 ; cxnop - sub8.ntr %i13, 1, %i4 ; bcs.qt @citst, .L1 -.L4: - itsteq8 0, %i4 ; ld1 0(%c8), %i11 - sub8.ntr %i4, 1, %i4 ; sadd8.ntr 0, %c9, 1, %c9 - add8.ntr 3, %i31, %i31 ; bcc.qn @citst, .L4 - finop ; sadd8.ntr 0, %c8, 1, %c8 - finop ; st1 %i11, -1(%c9) -.L1: - add8.ntr 8, %i31, %i31 ; ld8 104(%sp), %c14 - finop ; ld8 112(%sp), %cp - finop ; ld8 120(%sp), %fp - finop ; ld8 88(%sp), %i12 - finop ; ld8 80(%sp), %i13 - finop ; jmp 32(%c14) - finop ; sadd8.ntr 0, %sp, 128, %sp - finop ; cxnop - .def .ef; .val .; .scl 101; .line 93; .endef - .def copyto; .scl -1; .endef - - .data -# nbytes %i5 local -# npage %i10 local -# from %c6 local -# to %c7 local -# a %i11 local -# b %i0 local -# c %i1 local -# d %i2 local -# e %i3 local -# f %i4 local -# g %i5 local -# h %i12 local -# nbytes %i4 local -# from %c8 local -# to %c9 local - .half 0x0, 0x0, 0x60003000, 0x5800 -.L21: -copyto: .word copyto$TXT - .word memcpy - .word memcpy$TXT - -# src %c8 local -# dest %i9 local -# n %i13 local - - .text - - .data - .def copyfrom$TXT; .val copyfrom$TXT; .scl 2; .endef - - .text - .def copyfrom; .val copyfrom; .scl 2; .type 513; .endef -copyfrom$TXT: - finop ; cxnop - finop ; cxnop - .def .bf; .val .; .scl 101; .line 112; .endef - itstle8 128, %i4 ; ssub8.ntr 0, %sp, 128, %sp - add8.ntr 10, %i31, %i31 ; movb8_8 %i3, %c8 - finop ; st8 %i13, 80(%sp) - mov8_8 %i4, %i13 ; st8 %cp, 112(%sp) - finop ; st8 %fp, 120(%sp) - finop ; mov8_8 %c10, %cp - finop ; sadd8.ntr 0, %sp, 128, %fp - finop ; bcc.qn @citst, .L25 - finop ; st8 %c14, 104(%sp) - finop ; st8 %i12, 88(%sp) - clrh8 7, %i2, %i5 ; movb8_8 %c8, %i0 - sub8.ntr 128, %i5, %i1 ; cxnop - add8.ntr 7, %i31, %i31 ; cxnop - sub8.ntr %i0, %i2, %i0 ; cxnop - clrh8 3, %i0, %i0 ; cxnop - itsteq8 0, %i0 ; cxnop - itstge8 0, %i5 ; bcs.qt @citst, .L33 -.L25: - mov8_8 %i2, %i3 ; ld8 16(%cp), %c6 - mov8_8 %i13, %i4 ; movb8_8 %c8, %i2 - add8.ntr 4, %i31, %i31 ; ld8 8(%cp), %c10 - finop ; jsr %c14, 16(%c6) - finop ; cxnop - finop ; cxnop - movi8 3, %i0 ; movi8 0, %c8 - .ln 7, .-32 # 118 - - add8.ntr 8, %i31, %i31 ; ld8 104(%sp), %c14 - finop ; ld8 112(%sp), %cp - finop ; ld8 120(%sp), %fp - finop ; ld8 88(%sp), %i12 - finop ; ld8 80(%sp), %i13 - finop ; jmp 32(%c14) - finop ; sadd8.ntr 0, %sp, 128, %sp - finop ; cxnop -.L33: - selsc8 %i5, %i1, %i5 ; cxnop - itstle8 %i5, %i13 ; cxnop - selsc8 %i5, %i13, %i5 ; cxnop - sub8.ntr %i13, %i5, %i13 ; cxnop - itsteq8 0, %i5 ; cxnop - sub8.ntr %i5, 1, %i5 ; bcs.qt @citst, .L31 -.L32: - add8.ntr 1, %i2, %i2 ; movb8_8 %i2, %c4 - itsteq8 0, %i5 ; sadd8.ntr 0, %c8, 1, %c8 - sub8.ntr %i5, 1, %i5 ; cxnop - add8.ntr 5, %i31, %i31 ; ld1 -1(%c4), %i9 - finop ; bcc.qn @citst, .L32 - finop ; cxnop - finop ; st1 %i9, -1(%c8) -.L31: - itstne8 0, %i13 ; movb8_8 %i2, %c6 - finop ; pcsp.ro.bl 128(%c6) - finop ; pcsp.ro.bl 256(%c6) - finop ; pcsp.ro.bl 384(%c6) - ash8.ntr -7, %i13, %i9 ; bcc.qt @citst, .L24 - ash8.ntr 7, %i9, %i11 ; movb8_8 %i11, %c5 - add8.ntr %i2, %i11, %i2 ; mov8_8 %c8, %c7 - itsteq8 0, %i9 ; movb8_8 %i2, %c9 - sub8.ntr %i13, %i11, %i13 ; sadd8.ntr 0, %c5, %c8, %c8 - sub8.ntr %i9, 1, %i9 ; bcs.qt @citst, .L28 -.L29: - finop ; pcsp.ro.bl 512(%c6) - itsteq8 0, %i9 ; ld8.ro 0(%c6), %i10 - sub8.ntr %i9, 1, %i9 ; ld8.ro 8(%c6), %i11 - add8.ntr 33, %i31, %i31 ; ld8.ro 16(%c6), %i0 - finop ; ld8.ro 24(%c6), %i1 - finop ; ld8.ro 32(%c6), %i3 - finop ; ld8.ro 40(%c6), %i4 - finop ; ld8.ro 48(%c6), %i5 - finop ; ld8.ro 56(%c6), %i12 - finop ; sadd8.ntr 0, %c7, 128, %c7 - finop ; st8 %i10, -128(%c7) - finop ; st8 %i11, -120(%c7) - finop ; st8 %i0, -112(%c7) - finop ; st8 %i1, -104(%c7) - finop ; st8 %i3, -96(%c7) - finop ; st8 %i4, -88(%c7) - finop ; st8 %i5, -80(%c7) - finop ; st8 %i12, -72(%c7) - finop ; ld8.ro 120(%c6), %i12 - finop ; ld8.ro 112(%c6), %i5 - finop ; ld8.ro 104(%c6), %i4 - finop ; ld8.ro 96(%c6), %i3 - finop ; ld8.ro 88(%c6), %i1 - finop ; ld8.ro 80(%c6), %i0 - finop ; ld8.ro 72(%c6), %i11 - finop ; ld8.ro 64(%c6), %i10 - finop ; st8 %i0, -48(%c7) - finop ; st8 %i1, -40(%c7) - finop ; st8 %i11, -56(%c7) - finop ; st8 %i10, -64(%c7) - finop ; st8 %i3, -32(%c7) - finop ; st8 %i4, -24(%c7) - finop ; st8 %i5, -16(%c7) - finop ; bcc.qn @citst, .L29 - finop ; sadd8.ntr 0, %c6, 128, %c6 - finop ; st8 %i12, -8(%c7) -.L28: - itsteq8 0, %i13 ; cxnop - sub8.ntr %i13, 1, %i4 ; bcs.qt @citst, .L24 -.L27: - itsteq8 0, %i4 ; ld1 0(%c9), %i10 - sub8.ntr %i4, 1, %i4 ; sadd8.ntr 0, %c8, 1, %c8 - add8.ntr 3, %i31, %i31 ; bcc.qn @citst, .L27 - finop ; sadd8.ntr 0, %c9, 1, %c9 - finop ; st1 %i10, -1(%c8) -.L24: - add8.ntr 8, %i31, %i31 ; ld8 104(%sp), %c14 - finop ; ld8 112(%sp), %cp - finop ; ld8 120(%sp), %fp - finop ; ld8 88(%sp), %i12 - finop ; ld8 80(%sp), %i13 - finop ; jmp 32(%c14) - finop ; sadd8.ntr 0, %sp, 128, %sp - finop ; cxnop - .def .ef; .val .; .scl 101; .line 93; .endef - .def copyfrom; .scl -1; .endef - - .data -# nbytes %i5 local -# npage %i9 local -# from %c6 local -# to %c7 local -# a %i10 local -# b %i11 local -# c %i0 local -# d %i1 local -# e %i3 local -# f %i4 local -# g %i5 local -# h %i12 local -# nbytes %i4 local -# from %c9 local -# to %c8 local - .half 0x0, 0x0, 0x60003000, 0x5800 -.L44: -copyfrom: .word copyfrom$TXT - .word memcpy - .word memcpy$TXT - -# src %i2 local -# dest %c8 local -# n %i13 local - - .text - - .data - - .align 128 -.L47: - .globl copyfrom - .globl copyfrom$TXT - .globl copyto - .globl copyto$TXT - - .text diff --git a/armci/tcgmsg/ipcv5.0/copyall.save.c b/armci/tcgmsg/ipcv5.0/copyall.save.c deleted file mode 100644 index 0ecaff432..000000000 --- a/armci/tcgmsg/ipcv5.0/copyall.save.c +++ /dev/null @@ -1,213 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#if HAVE_STDIO_H -# include -#endif -#if HAVE_MEMORY_H -# include -#endif - - -/** - * A copy optimized for DESTINATIONS in shared memory that - * are aligned and data is to be read by other processes. - * - * Both prefetch and poststore the destination. - */ -void copyto(const unsigned char *src, unsigned char *dest, long n) -{ - if (n < 128 || (dest - src) & 7) { - - /* small n, or - not possible to get src and dest even word aligned */ - - memcpy(dest, src, (size_t) n); - return; - } - - /* Read ahead so that dest is aligned on a page boundary */ - - { - register long nbytes = (127 & (unsigned long) dest); - if (nbytes > 0) nbytes = 128 - nbytes; - if (nbytes > n) nbytes = n; - n -= nbytes; - - while (nbytes--) - *dest++ = *src++; - - if (n == 0) return; - } - - { - /* src is at least word aligned and dest is subpage aligned */ - - register long npage = n>>7; - register const unsigned long *from = (unsigned long *) src; - register unsigned long *to = (unsigned long *) dest; - register unsigned long a, b, c, d, e, f, g, h; - - src += npage<<7; - dest += npage<<7; - n -= npage<<7; - - /* _pcsp(to+16, "ex", "nbl"); - _pcsp(to+32, "ex", "nbl"); - _pcsp(to+48, "ex", "nbl"); */ - - while (npage--) { - - /* _pcsp(to+64, "ex", "nbl"); */ - - a = from[0]; - b = from[1]; - c = from[2]; - d = from[3]; - e = from[4]; - f = from[5]; - g = from[6]; - h = from[7]; - to[0] = a; - to[1] = b; - to[2] = c; - to[3] = d; - to[4] = e; - to[5] = f; - to[6] = g; - to[7] = h; - - a = from[8]; - b = from[9]; - c = from[10]; - d = from[11]; - e = from[12]; - f = from[13]; - g = from[14]; - h = from[15]; - to[8] = a; - to[9] = b; - to[10] = c; - to[11] = d; - to[12] = e; - to[13] = f; - to[14] = g; - to[15] = h; - - /* _pstsp((char *) to); */ - - to += 16; from+= 16; - } - } - - { - register long nbytes = n; - register const unsigned char *from = (unsigned char *) src; - register unsigned char *to = (unsigned char *) dest; - - while (nbytes--) - *to++ = *from++; - } -} - - -/** - * A copy optimized for SOURCES in shared memory that are aligned. - * - * Prefetch sources only. - */ -void copyfrom(const unsigned char *src, unsigned char *dest, long n) -{ - if (n < 128 || (dest - src) & 7) { - - /* small n, or - not possible to get src and dest even word aligned */ - - memcpy(dest, src, (size_t) n); - return; - } - - /* Read ahead so that src is aligned on a page boundary */ - - { - register long nbytes = (127 & (unsigned long) src); - if (nbytes > 0) nbytes = 128 - nbytes; - if (nbytes > n) nbytes = n; - n -= nbytes; - - while (nbytes--) - *dest++ = *src++; - - if (n == 0) return; - } - - { - /* dest is at least word aligned and src is subpage aligned */ - - register long npage = n>>7; - register const unsigned long *from = (unsigned long *) src; - register unsigned long *to = (unsigned long *) dest; - register unsigned long a, b, c, d, e, f, g, h; - - src += npage<<7; - dest += npage<<7; - n -= npage<<7; - - /* _pcsp(from+16, "ro", "nbl"); - _pcsp(from+32, "ro", "nbl"); - _pcsp(from+48, "ro", "nbl"); */ - - while (npage--) { - - /* _pcsp(from+64, "ro", "nbl"); */ - - a = from[0]; - b = from[1]; - c = from[2]; - d = from[3]; - e = from[4]; - f = from[5]; - g = from[6]; - h = from[7]; - to[0] = a; - to[1] = b; - to[2] = c; - to[3] = d; - to[4] = e; - to[5] = f; - to[6] = g; - to[7] = h; - - a = from[8]; - b = from[9]; - c = from[10]; - d = from[11]; - e = from[12]; - f = from[13]; - g = from[14]; - h = from[15]; - to[8] = a; - to[9] = b; - to[10] = c; - to[11] = d; - to[12] = e; - to[13] = f; - to[14] = g; - to[15] = h; - - /* _pstsp((char *) to); */ - - to += 16; from+= 16; - } - } - - { - register long nbytes = n; - register const unsigned char *from = (unsigned char *) src; - register unsigned char *to = (unsigned char *) dest; - - while (nbytes--) - *to++ = *from++; - } -} diff --git a/armci/tcgmsg/ipcv5.0/drand48.c b/armci/tcgmsg/ipcv5.0/drand48.c deleted file mode 100644 index 1f2faa218..000000000 --- a/armci/tcgmsg/ipcv5.0/drand48.c +++ /dev/null @@ -1,21 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv5.0/drand48.c,v 1.2 1994-12-30 20:55:40 d3h325 Exp $ */ - -#include "srftoc.h" - -extern long random(); -extern int srandom(); - -double DRAND48_() -{ - return ( (double) random() ) * 4.6566128752458e-10; -} - -void SRAND48_(seed) - unsigned *seed; -{ - (void) srandom(*seed); -} diff --git a/armci/tcgmsg/ipcv5.0/error.c b/armci/tcgmsg/ipcv5.0/error.c deleted file mode 100644 index 05c789a41..000000000 --- a/armci/tcgmsg/ipcv5.0/error.c +++ /dev/null @@ -1,63 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#if HAVE_ERRNO_H -# include -#endif -#if HAVE_STDIO_H -# include -#endif -#if HAVE_SIGNAL_H -# include -#endif - -#include "sndrcv.h" -#include "tcgmsgP.h" - -extern void perror(const char *); -extern void exit(int); -extern void ZapChildren(void); - -#define DEV stderr - - -void Error(char *string, long integer) -{ - (void) signal(SIGINT, SIG_IGN); - (void) signal(SIGCHLD, SIG_DFL); /* Death of children to be expected */ - - (void) fflush(stdout); - if (TCGMSG_caught_sigint) { - (void) fprintf(DEV,"%2ld: interrupt\n",(long)NODEID_()); - } - else { - (void) fprintf(DEV,"%3ld: %s %ld (%#lx).\n", (long)NODEID_(), string, - (long)integer,(long)integer); - if (errno != 0) - perror("system error message"); - } - (void) fflush(DEV); - - /* Shut down the sockets and remove shared memory and semaphores to - propagate an error condition to anyone that is trying to communicate - with me */ - -#ifndef LAPI - ZapChildren(); /* send interrupt to children which should trap it - and call Error in the handler */ - DeleteSharedRegion(TCGMSG_shmem_id); -#endif - - exit(1); -} - - -/** - * Interface from fortran to c error routine - */ -void PARERR_(long *code) -{ - long lcode = (long)(*code); - Error("User detected error in FORTRAN", lcode); -} diff --git a/armci/tcgmsg/ipcv5.0/evlog.c b/armci/tcgmsg/ipcv5.0/evlog.c deleted file mode 100644 index 5baf3852a..000000000 --- a/armci/tcgmsg/ipcv5.0/evlog.c +++ /dev/null @@ -1,384 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/** $Header: /tmp/hpctools/ga/tcgmsg/ipcv5.0/evlog.c,v 1.3 2003-06-27 13:53:12 manoj Exp $ */ - -/** Event logging routine with key driven varargs interface */ - -#if HAVE_STDIO_H -# include -#endif -#if HAVE_STDARG_H -# include -#endif -#if HAVE_STRINGS_H -# include -#endif -#if HAVE_STRING_H -# include -#endif - -extern long nodeid_(); - -#include "evlog.h" -#include "sndrcv.h" - -#define ERROR_RETURN() do { \ - error = 1; \ - return; \ -} while (0) - -#define DUMPBUF() do { \ - (void) fputs(buffer, file); \ - (void) fflush(file); \ - if (ferror(file)) { \ - ERROR_RETURN; \ - } \ - bufpt = buffer; \ - left = BUFLEN; \ -} while (0) - -#define RECORD(A) do { \ - A; \ - nchars = strlen(bufpt); \ - bufpt += nchars; \ - left -= nchars; \ -} while (0) - -static double walltime(); - - -/** - * The format of the argument list is as follows: - * - * evlog([(int) key, [values, ...]], ..., EVKEY_LAST_ARG) - * - * Arguments are read as keys with corresponding values. Recognised keys - * are defined in evlog.h and are described in detail below. - * - * Logging is enabled/disabled by calling evlog with one of EVKEY_ENABLE - * or EVKEY_DISABLE specified. Note that EVKEY_ENABLE must be the - * first key specified for it to be recognized and that all keys - * in the argument list after EVKEY_DISABLE are ignored. By default - * events are logged in the file events. This can be overridden with - * the key EVKEY_FILENAME, which takes the filename as its value. - * - * The model for logging events assumed by the post-analysis routines - * assumes that upon logging an event: - * - * a) no state chage occurs (EVKEY_EVENT). The event is just recorded. - * - * b) the process changes state by pushing the event onto the state stack - * (EVKEY_BEGIN). - * - * c) the process changes state by popping an event off the state stack - * (EVKEY_END). If the event or state popped off the stack does not - * match the specified event then the post-analysis may get confused - * but this does not interfere with the actual logging. - * - * EVKEY_EVENT, EVKEY_BEGIN or EVKEY_END must be the first key specified other - * than a possible EVKEY_ENABLE. - * - * Internally an event is stored as a large character string to simplify - * post-analysis. Users specify data for storage in addition to - * that which is automatically stored (only the time and process) with - * key, value combinations (EVKEY_STR_INT, EVKEY_STR_DBL, EVKEY_STR). - * Many such key-value combinations as required may be specified. - * Since the internal data format uses colons ':', double quotation - * marks '"' and carriage returns users should avoid these in their - * string data. - * - * ---------------------------- - * Sample calling sequence: - * - * evlog(EVKEY_ENABLE, EVKEY_FILENAME, "events.log", EVKEY_LAST_ARG); - * - * evlog(EVKEY_EVENT, "Finished startup code", - * EVKEY_STR, "Now do some real work", - * EVKEY_LAST_ARG); - * - * evlog(EVKEY_BEGIN, "Get Matrix", EVKEY_LAST_ARG); - * - * evlog(EVKEY_END, "Get matrix", - * EVKEY_STR_INT, "Size of matrix", (int) N, - * EVKEY_STR_DBL, "Norm of matrix", (double) matrix_norm, - * EVKEY_LAST_ARG); - * - * evlog(EVKEY_BEGIN, "Transform matrix", - * EVKEY_STR_DBL, "Recomputed norm", (double) matrix_norm, - * EVKEY_LAST_ARG); - * - * evlog(EVKEY_END, "Transform matrix", - * EVKEY_STR_INT, "No. of iterations", (int) niters, - * EVKEY_LAST_ARG); - * - * evlog(EVKEY_DUMP, EVKEY_DISABLE, EVKEY_LAST_ARG); - * - * evlog(EVKEY_EVENT, "Logging is disabled ... this should not print", - * EVKEY_DUMP, EVKEY_LAST_ARG); - * - * ---------------------------- - * - * EVKEY_LAST_ARG -* Terminates list ... takes no value and must be present -* -* EVKEY_EVENT, (char *) event -* Simply log occurence of the event -* -* EVKEY_BEGIN, (char *) event -* Push event onto process state stack -* -* EVKEY_END, (char *) event -* Pop event off process state stack -* -* EVKEY_MSG_LEN, (int) length -* Value is (int) mesage length SND/RCV only -* -* EVKEY_MSG_TO, (int) to -* Value is (int) to process id SND/RCV only -* -* EVKEY_MSG_FROM, (int) from -* Value is (int) from process SND/RCV only -* -* EVKEY_MSG_TYPE, (int) type -* Value is (int) message type SND/RCV only -* -* EVKEY_STR_INT, (char *) string, (int) data -* User data value pair -* - * EVKEY_STR_DBL, (char *) string, (double) data -* User data value pair (char *), (double) - * - * EVKEY_STR, (char *) string -* User data value (char *) - * - * EVKEY_ENABLE - * Enable logging - * - * EVKEY_DISABLE - * Disable logging - * - * EVKEY_DUMP - * Dump out the current buffer to disk - * - * EVKEY_FILE, (char *) filename - * Use specified file to capture events. Default is "events". - */ -void evlog(int farg_key, ...) -{ - static int logging=0; /* Boolean flag for login enabled/disabled */ - static int error=0; /* Boolean flag for error detected */ - static int ncall=0; /* Need to do stuff on first entry */ - static char *buffer; /* Logging buffer ... null terminated */ - static char *bufpt; /* Pointer to next free char in buffer */ - static int left; /* Amount of free space in buffer */ -#define BUFLEN 262144 /* Size allocated for buffer ... biggish */ -#define MAX_EV_LEN 1000 /* Assumed maximum size of single event record */ - static FILE *file; /* File where events will be dumped */ - static char *filename = "events"; /* Default name of events file */ - - va_list ap; /* For variable argument list */ - int key; /* Hold key being processed */ - int nchars; /* No. of chars printed by sprintf call */ - char *temp; /* Temporary copy of bufpt */ - char *string; /* Temporary */ - int integer; /* Temporary */ - double dbl; /* Temporary */ - int valid; /* Temporary */ - - /* If an error was detected on a previous call don't even try to - do anything */ - - if (error) { - ERROR_RETURN(); - } - - /* First time in need to allocate the buffer, open the file etc */ - - if (ncall == 0) { - ncall = 1; - if (!(bufpt = buffer = malloc((unsigned) BUFLEN))) { - ERROR_RETURN(); - } - left = BUFLEN; - - if (!(file = fopen(filename, "w"))) { - ERROR_RETURN(); - } - } - - /* Parse the arguments */ - - temp = bufpt; /* Save to check if anything has been logged */ - valid = 0; /* One of BEGIN, END or EVENT must preceed most keys */ - - va_start(ap, farg_key); - key = farg_key; - while (key != EVKEY_LAST_ARG) { - - if ( (!logging) && (key != EVKEY_ENABLE) ) - return; - - switch (key) { - - case EVKEY_ENABLE: - logging = 1; - break; - - case EVKEY_DISABLE: - logging = 0; - goto done; - /* break; */ - - case EVKEY_FILENAME: - if (!(filename = strdup(va_arg(ap, char *)))) - {ERROR_RETURN();} - if (!(file = freopen(filename, "w", file))) {ERROR_RETURN();} - break; - - case EVKEY_BEGIN: - valid = 1; - RECORD(sprintf(bufpt, ":BEGIN:%s", va_arg(ap, char *))); - RECORD(sprintf(bufpt, ":TIME:%.2f", walltime())); - break; - - case EVKEY_END: - valid = 1; - RECORD(sprintf(bufpt, ":END:%s", va_arg(ap, char *))); - RECORD(sprintf(bufpt, ":TIME:%.2f", walltime())); - break; - - case EVKEY_EVENT: - valid = 1; - RECORD(sprintf(bufpt, ":EVENT:%s", va_arg(ap, char *))); - RECORD(sprintf(bufpt, ":TIME:%.2f", walltime())); - break; - - case EVKEY_MSG_LEN: - if (!valid) {ERROR_RETURN();} - RECORD(sprintf(bufpt, ":MSG_LEN:%d", va_arg(ap, int))); - break; - - case EVKEY_MSG_TO: - if (!valid) {ERROR_RETURN();} - RECORD(sprintf(bufpt, ":MSG_TO:%d", va_arg(ap, int))); - break; - - case EVKEY_MSG_FROM: - if (!valid) {ERROR_RETURN();} - RECORD(sprintf(bufpt, ":MSG_FROM:%d", va_arg(ap, int))); - break; - - case EVKEY_MSG_TYPE: - if (!valid) {ERROR_RETURN();} - RECORD(sprintf(bufpt, ":MSG_TYPE:%d", va_arg(ap, int))); - break; - - case EVKEY_MSG_SYNC: - if (!valid) {ERROR_RETURN();} - RECORD(sprintf(bufpt, ":MSG_SYNC:%d", va_arg(ap, int))); - break; - - case EVKEY_STR_INT: - if (!valid) {ERROR_RETURN();} - string = va_arg(ap, char *); - integer = va_arg(ap, int); - RECORD(sprintf(bufpt, ":STR_INT:%s:%d", string, integer)); - break; - - case EVKEY_STR_DBL: - if (!valid) {ERROR_RETURN();} - string = va_arg(ap, char *); - dbl = va_arg(ap, double); - RECORD(sprintf(bufpt, ":STR_DBL:%s:%g", string, dbl)); - break; - - case EVKEY_STR: - if (!valid) {ERROR_RETURN();} - RECORD(sprintf(bufpt, ":STR:%s", va_arg(ap, char *))); - break; - - case EVKEY_DUMP: - {DUMPBUF();} - if (temp != bufpt) { - RECORD(sprintf(bufpt, "\n")); - temp = bufpt; - } - break; - - default: - {DUMPBUF();} - {ERROR_RETURN();} - } - key = va_arg(ap, int); - } - -done: - va_end(ap); - - /* Put a linefeed on the end of the record if something is written */ - - if (temp != bufpt) { - RECORD(sprintf(bufpt, "\n")); - temp = bufpt; - } - - /* Should really check on every access to the buffer that there is - enough space ... however just assume a very large maximum size - for a single event log entry and check here */ - - if (left <= 0) { - ERROR_RETURN(); - } - - if (left < MAX_EV_LEN) { - DUMPBUF(); - } -} - - -/** - * return the wall time in seconds as a double - */ -static double walltime() -{ - return ((double) MTIME_()) * 0.01; -} - -/* -int main(int argc, char **argv) -{ - int N = 19; - double matrix_norm = 99.1; - int niters = 5; - - evlog(EVKEY_ENABLE, EVKEY_FILENAME, "events.log", EVKEY_LAST_ARG); - - evlog(EVKEY_EVENT, "Finished startup code", - EVKEY_STR, "Now do some real work", - EVKEY_LAST_ARG); - - evlog(EVKEY_BEGIN, "Get Matrix", EVKEY_LAST_ARG); - - evlog(EVKEY_END, "Get matrix", - EVKEY_STR_INT, "Size of matrix", (int) N, - EVKEY_STR_DBL, "Norm of matrix", (double) matrix_norm, - EVKEY_LAST_ARG); - - evlog(EVKEY_BEGIN, "Transform matrix", - EVKEY_STR_DBL, "Recomputed norm", (double) matrix_norm, - EVKEY_LAST_ARG); - - evlog(EVKEY_END, "Transform matrix", - EVKEY_STR_INT, "No. of iterations", (int) niters, - EVKEY_LAST_ARG); - - evlog(EVKEY_DUMP, EVKEY_LAST_ARG); - - evlog(EVKEY_EVENT, "Logging is disabled ... this should not print", - EVKEY_DUMP, EVKEY_LAST_ARG); - - return 0; -} -*/ diff --git a/armci/tcgmsg/ipcv5.0/evlog.h b/armci/tcgmsg/ipcv5.0/evlog.h deleted file mode 100644 index 333e4d22f..000000000 --- a/armci/tcgmsg/ipcv5.0/evlog.h +++ /dev/null @@ -1,38 +0,0 @@ -/** @file - * Define EVENT and KEY values used when calling evlog. - */ -#ifndef EVLOG_H_ -#define EVLOG_H_ - -extern void evlog(int farg_key, ...); - -/* Values of keys in key value pairs */ - -#define EVKEY_LAST_ARG 0 /**> Terminates list ... takes no value */ - -#define EVKEY_BEGIN 1 /**> Push (char *) value onto state stack */ -#define EVKEY_END 2 /**> Pop (char *) value off state stack */ -#define EVKEY_EVENT 3 /**> Record (char *) value, no stack change */ - -#define EVKEY_MSG_LEN 4 /**> Value is (int) mesage length SND/RCV only */ -#define EVKEY_MSG_TO 5 /**> Value is (int) to process id SND/RCV only */ -#define EVKEY_MSG_FROM 6 /**> Value is (int) from process SND/RCV only */ -#define EVKEY_MSG_TYPE 7 /**> Value is (int) message type SND/RCV only */ -#define EVKEY_MSG_SYNC 8 /**> Value is (int) message sync SND/RCV only */ - -#define EVKEY_STR_INT 9 /**> User data value pair (char *), (int) */ -#define EVKEY_STR_DBL 10 /**> User data value pair (char *), (double) */ -#define EVKEY_STR 11 /**> User data value (char *) */ - -#define EVKEY_ENABLE 12 /**> Enable logging ... takes no value */ -#define EVKEY_DISABLE 13 /**> Disable logging ... takes no value */ - -#define EVKEY_DUMP 14 /**> Dump out the current buffer to disk */ - -#define EVKEY_FILENAME 15 /**> Set the name of the events file */ - -#define EVENT_SND "Snd" /**> Predefined strings for internal events */ -#define EVENT_RCV "Rcv" -#define EVENT_PROCESS "Process" - -#endif /* EVLOG_H_ */ diff --git a/armci/tcgmsg/ipcv5.0/getmem.c b/armci/tcgmsg/ipcv5.0/getmem.c deleted file mode 100644 index 63867df05..000000000 --- a/armci/tcgmsg/ipcv5.0/getmem.c +++ /dev/null @@ -1,40 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#if HAVE_MALLOC_H -# include -#endif -#if HAVE_STDLIB_H -# include -#endif - -#define getmem_ F77_FUNC(getmem,GETMEM) - -/** - * getmem gets n real*8 storage locations and returns its - * address (iaddr) and offset (ioff) within the real*8 array work - * so that the usable memory is (work(i+ioff),i=1,n). - * e.g. - * call getmem(n,work,iaddr,ioff) - * if (iaddr.eq.0) call error - * - * Mods are needed to release this later. - */ -void getmem_( - unsigned long *pn, - double *pwork, - unsigned long *paddr, - unsigned long *pioff) -{ - double *ptemp; - unsigned int size = 8; - -#if HAVE_MEMALIGN - ptemp = (double *) memalign(size, (unsigned) size* *pn); -#else - ptemp = (double *) malloc((unsigned) size* *pn); -#endif - *paddr = (unsigned long) ptemp; - *pioff = ptemp - pwork; -} diff --git a/armci/tcgmsg/ipcv5.0/globalop.c b/armci/tcgmsg/ipcv5.0/globalop.c deleted file mode 100644 index 3d43c4793..000000000 --- a/armci/tcgmsg/ipcv5.0/globalop.c +++ /dev/null @@ -1,235 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include "srftoc.h" -#include "tcgmsgP.h" - -#define BUF_SIZE 10000 -#define IBUF_SIZE (BUF_SIZE * sizeof(double)/sizeof(long)) -double _gops_work[BUF_SIZE]; - -long one=1; - -#define TCG_MAX(a,b) (((a) >= (b)) ? (a) : (b)) -#define TCG_MIN(a,b) (((a) <= (b)) ? (a) : (b)) -#define TCG_ABS(a) (((a) >= 0) ? (a) : (-(a))) - - -void BRDCST_(long *type, void *buf, long *len, long *originator) -{ - long me=NODEID_(), nproc=NNODES_(), lenmes, from, root=0; - long up, left, right; - - /* determine location in the binary tree */ - up = (me-1)/2; if(up >= nproc) up = -1; - left = 2* me + 1; if(left >= nproc) left = -1; - right = 2* me + 2; if(right >= nproc) right = -1; - - /* originator sends data to root */ - if (*originator != root ){ - if(me == *originator) SND_(type, buf, len, &root, &one); - if(me == root) RCV_(type, buf, len, &lenmes, originator, &from, &one); - } - - if (me != root) RCV_(type, buf, len, &lenmes, &up, &from, &one); - if (left > -1) SND_(type, buf, len, &left, &one); - if (right > -1) SND_(type, buf, len, &right, &one); -} - - -/** - * implements x = op(x,work) for integer datatype - * x[n], work[n] - arrays of n integers - */ -static void idoop(long n, char *op, long * x, long * work) -{ - if (strncmp(op,"+",1) == 0) { - while(n--) { - *x++ += *work++; - } - } - else if (strncmp(op,"*",1) == 0) { - while(n--) { - *x++ *= *work++; - } - } - else if (strncmp(op,"max",3) == 0) { - while(n--) { - *x = TCG_MAX(*x, *work); - x++; work++; - } - } - else if (strncmp(op,"min",3) == 0) { - while(n--) { - *x = TCG_MIN(*x, *work); - x++; work++; - } - } - else if (strncmp(op,"absmax",6) == 0) { - while(n--) { - register long x1 = TCG_ABS(*x), x2 = TCG_ABS(*work); - *x = TCG_MAX(x1, x2); - x++; work++; - } - } - else if (strncmp(op,"absmin",6) == 0) { - while(n--) { - register long x1 = TCG_ABS(*x), x2 = TCG_ABS(*work); - *x = TCG_MIN(x1, x2); - x++; work++; - } - } - else if (strncmp(op,"or",2) == 0) { - while(n--) { - *x |= *work; - x++; work++; - } - } - /* these are new */ - else if ((strncmp(op, "&&", 2) == 0) || (strncmp(op, "land", 4) == 0)) { - while(n--) { - *x = *x && *work; - x++; work++; - } - } - else if ((strncmp(op, "||", 2) == 0) || (strncmp(op, "lor", 3) == 0)) { - while(n--) { - *x = *x || *work; - x++; work++; - } - } - else if ((strncmp(op, "&", 1) == 0) || (strncmp(op, "band", 4) == 0)) { - while(n--) { - *x &= *work; - x++; work++; - } - } - else if ((strncmp(op, "|", 1) == 0) || (strncmp(op, "bor", 3) == 0)) { - while(n--) { - *x |= *work; - x++; work++; - } - } - else { - Error("idoop: unknown operation requested", n); - } -} - - - -/** - * implements x = op(x,work) for double datatype - * x[n], work[n] - arrays of n doubles - */ -static void ddoop(long n, char *op, double *x, double *work) -{ - if (strncmp(op,"+",1) == 0) { - while(n--) { - *x++ += *work++; - } - } - else if (strncmp(op,"*",1) == 0) { - while(n--) { - *x++ *= *work++; - } - } - else if (strncmp(op,"max",3) == 0) { - while(n--) { - *x = TCG_MAX(*x, *work); - x++; work++; - } - } - else if (strncmp(op,"min",3) == 0) { - while(n--) { - *x = TCG_MIN(*x, *work); - x++; work++; - } - } - else if (strncmp(op,"absmax",6) == 0) { - while(n--) { - register double x1 = TCG_ABS(*x), x2 = TCG_ABS(*work); - *x = TCG_MAX(x1, x2); - x++; work++; - } - } - else if (strncmp(op,"absmin",6) == 0) { - while(n--) { - register double x1 = TCG_ABS(*x), x2 = TCG_ABS(*work); - *x = TCG_MIN(x1, x2); - x++; work++; - } - } - else { - Error("ddoop: unknown operation requested", (long) n); - } -} - - -void DGOP_(long *type, double *x, long *n, char *op, int oplen) -{ - long me=NODEID_(), nproc=NNODES_(), len, lenmes, from, root=0; - double *work = _gops_work, *origx = x; - long ndo, up, left, right, np=*n, orign = *n; - - /* determine location in the binary tree */ - up = (me-1)/2; if(up >= nproc) up = -1; - left = 2* me + 1; if(left >= nproc) left = -1; - right = 2* me + 2; if(right >= nproc) right = -1; - - while ((ndo = (np <= BUF_SIZE) ? np : BUF_SIZE)) { - len = lenmes = ndo*sizeof(double); - - if (left > -1) { - RCV_(type, (char *) work, &len, &lenmes, &left, &from, &one); - ddoop(ndo, op, x, work); - } - if (right > -1) { - RCV_(type, (char *) work, &len, &lenmes, &right, &from, &one); - ddoop(ndo, op, x, work); - } - if (me != root) SND_(type, x, &len, &up, &one); - - np -=ndo; - x +=ndo; - } - - /* Now, root broadcasts the result down the binary tree */ - len = orign*sizeof(double); - BRDCST_(type, (char *) origx, &len, &root); -} - - -void IGOP_(long *type, long *x, long *n, char *op, int oplen) -{ - long me=NODEID_(), nproc=NNODES_(), len, lenmes, from, root=0; - long *work = (long*)_gops_work; - long *origx = x; - long ndo, up, left, right, np=*n, orign =*n; - - /* determine location in the binary tree */ - up = (me-1)/2; if(up >= nproc) up = -1; - left = 2* me + 1; if(left >= nproc) left = -1; - right = 2* me + 2; if(right >= nproc) right = -1; - - while ((ndo = (np<=IBUF_SIZE) ? np : IBUF_SIZE)) { - len = lenmes = ndo*sizeof(long); - - if (left > -1) { - RCV_(type, (char *) work, &len, &lenmes, &left, &from, &one); - idoop(ndo, op, x, work); - } - if (right > -1) { - RCV_(type, (char *) work, &len, &lenmes, &right, &from, &one); - idoop(ndo, op, x, work); - } - if (me != root) SND_(type, x, &len, &up, &one); - - np -=ndo; - x +=ndo; - } - - /* Now, root broadcasts the result down the binary tree */ - len = orign*sizeof(long); - BRDCST_(type, (char *) origx, &len, &root); -} diff --git a/armci/tcgmsg/ipcv5.0/lapi_putget.c b/armci/tcgmsg/ipcv5.0/lapi_putget.c deleted file mode 100644 index b894d480b..000000000 --- a/armci/tcgmsg/ipcv5.0/lapi_putget.c +++ /dev/null @@ -1,307 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#if HAVE_STDIO_H -# include -#endif -#if HAVE_SIGNAL_H -# include -#endif -#if HAVE_UNISTD_H -# include -#endif - -#include -#include - -#include "tcgmsgP.h" - -lapi_handle_t lapi_handle; -lapi_info_t lapi_info; -extern ShmemBuf TCGMSG_receive_buffer[]; - -#define LEN 2 -int nxtval_counter=0; -int *nxtval_cnt_adr = &nxtval_counter; -static lapi_cntr_t req_cnt; - -#define INCR 1 /* increment for NXTVAL */ -#define BUSY -1L /* indicates somebody else updating counter*/ -/*#define TRACEINFO 1*/ - - -/** - * initialize lapi - */ -void lapi_initialize() -{ - int myid, numtasks,rc; - - bzero(&lapi_info,sizeof(lapi_info)); /* needed under Mohonk */ - rc = LAPI_Init(&lapi_handle, &lapi_info); - if(rc) Error("lapi_init failed",rc); - - rc=LAPI_Qenv(lapi_handle, TASK_ID, &myid); - if(rc) Error("lapi_qenv failed",rc); - rc=LAPI_Qenv(lapi_handle, NUM_TASKS, &numtasks); - if(rc) Error("lapi_qenv failed 2",rc); - - TCGMSG_nodeid = (long)myid; - TCGMSG_nnodes = (long)numtasks; - - /* disable LAPI internal error checking */ - LAPI_Senv(lapi_handle, ERROR_CHK, 0); - - rc = LAPI_Setcntr(lapi_handle, &req_cnt, 0); - if(rc)Error("lapi_initialize: setcntr failed",rc); - -#ifdef DEBUG - printf("me=%d initialized %d processes\n", myid, numtasks); -#endif - fflush(stdout); -} - - -void lapi_adr_exchg() -{ - long node, tgt; - int rc; - void **table; - int i; - - table = (void **)malloc(TCGMSG_nnodes * sizeof(void *)); - if (!table) Error(" lapi_adr_exchg: malloc failed", 0); - - /* allocate and initialize send buffers */ - sendbuf_arr = (sendbuf_t*)malloc(SENDBUF_NUM*sizeof(sendbuf_t)); - if(!sendbuf_arr) Error(" lapi_adr_exchg:malloc 2 failed", 0); - /* - bzero(sendbuf_arr,SENDBUF_NUM*sizeof(sendbuf_t)); - */ - - for(i=0; i< SENDBUF_NUM; i++){ - LAPI_Setcntr(lapi_handle,&sendbuf_arr[i].cntr, 1); - sendbuf_arr[i].next = sendbuf_arr+i+1; - } - sendbuf_arr[SENDBUF_NUM-1].next = sendbuf_arr; - localbuf = sendbuf_arr; - if(sizeof(ShmemBuf) < sizeof(sendbuf_t)) - Error("lapi_adr_exchg: buffer size problem",0); - - /* exchange addresses */ - for(node = 0; node < TCGMSG_nnodes; node++){ - - /* Lapi does not like NULL address for buffer that we have - for sending msg to itself - use some invalid address */ - if (node == TCGMSG_nodeid) - TCGMSG_proc_info[node].recvbuf = (ShmemBuf *)1; - else - if(LAPI_Setcntr(lapi_handle, - &(TCGMSG_proc_info[node].recvbuf->cntr),0)) - Error("lapi_adr_exchg: setcntr failed",-1); - - rc = LAPI_Address_init(lapi_handle, TCGMSG_proc_info[node].recvbuf, - table); - if(rc) Error(" lapi_adr_exchg: address_init failed", node); - - if(rc) Error(" lapi_adr_exchg: cntr init failed", node); - - if(TCGMSG_nodeid == node) { - for(tgt=0; tgt 0 ... returns requested value - * mproc < 0 ... server blocks until abs(mproc) processes are queued - * and returns junk - * mproc = 0 ... indicates to server that I am about to terminate - */ -long NXTVAL_(long *mproc) -{ -#define INC 1 - int local; - long stype = INTERNAL_SYNC_TYPE; - lapi_cntr_t req_id; - int rc, inc = INC; - - int server = (int)NNODES_() -1; /* id of server process */ - - if (server>0) { - /* parallel execution */ - if (DEBUG_) { - (void) printf("%2ld: nxtval: mproc=%ld\n",NODEID_(), *mproc); - (void) fflush(stdout); - } - - if (*mproc < 0) { - SYNCH_(&stype); - /* reset the counter value to zero */ - if( NODEID_() == server) nxtval_counter = 0; - SYNCH_(&stype); - } - if (*mproc > 0) { - /* use atomic swap operation to increment nxtval counter */ - rc = LAPI_Setcntr(lapi_handle, &req_id, 0); - if(rc)Error("nxtval: setcntr failed",rc); - rc = LAPI_Rmw(lapi_handle, FETCH_AND_ADD, server, nxtval_cnt_adr, - &inc, &local, &req_id); - if(rc)Error("nxtval: rmw failed",rc); - rc = LAPI_Waitcntr(lapi_handle, &req_id, 1, NULL); - if(rc)Error("nxtval: waitcntr failed",rc); - } - } else { - /* Not running in parallel ... just do a simulation */ - static int count = 0; - if (*mproc == 1){ - int val = count; - count+=INCR; - local = val; - }else if (*mproc == -1) { - count = 0; - local = 0; - } - else - Error("nxtval: sequential version with silly mproc ", (long) *mproc); - } - - return (long)local; -} - - -/** blocking get */ -void lapi_get(void* dest, void* src, long bytes, long node) -{ - int rc; - -#ifdef DEBUG - printf("%ld getting %ld bytes from addr=%lx node %ld to adr=%lx\n", - TCGMSG_nodeid, bytes, src, node, dest ); - fflush(stdout); -#endif - - rc = LAPI_Get(lapi_handle, (uint)node, (uint)bytes, src, dest, NULL,&req_cnt); - if(rc)Error("lapi_get: get failed",rc); - rc = LAPI_Waitcntr(lapi_handle, &req_cnt, 1, NULL); - if(rc)Error("lapi_get: waitcntr failed",rc); -} - - -/** put with nonblocking semantics */ -void lapi_put(void* dest, void* src, long bytes, long node) -{ - int rc; - - /* LAPI_Fence(lapi_handle);*/ -#ifdef DEBUG - printf("%ld puting %ld bytes to addr=%lx node %ld\n", TCGMSG_nodeid, - bytes, dest, node); - fflush(stdout); -#endif -#ifdef ERR_CHECKING - if(dest < (void*)TCGMSG_receive_buffer){ - printf("%ld: Warning: Out of range? %lx(%ld) < %lx\n", - TCGMSG_nodeid, dest, node, TCGMSG_receive_buffer); - fflush(stdout); - } - if(dest + bytes > (void*)(TCGMSG_receive_buffer+MAX_PROC) ){ - printf("%ld: Warning: Out of range? %lx(%ld) < %lx\n", - TCGMSG_nodeid, dest+bytes, node, TCGMSG_receive_buffer+MAX_PROC); - fflush(stdout); - } -#endif - - rc=LAPI_Put(lapi_handle, (uint)node, (uint)bytes, dest, src,NULL, &req_cnt,NULL); - if(rc)Error("lapi_put: sdput failed",rc); - rc = LAPI_Waitcntr(lapi_handle, &req_cnt, 1, NULL); - if(rc)Error("lapi_put: waitcntr failed",rc); - -} - - -/** put with nonblocking semantics and counter */ -void lapi_put_c(void* dest, void* src, long bytes, long node, lapi_cntr_t* cntr) -{ - int rc; - rc = LAPI_Put(lapi_handle, (uint)node, (uint)bytes, dest, src,cntr,NULL,NULL); - if(rc)Error("lapi_put_c: put failed",rc); -} - - -void PBEGINF_() -{ - PBEGIN_(NULL,NULL); -} - - -double fred =0.; -void Busy(int n) -{ - while (n-- >= 0) fred++; - /* LAPI_Probe(lapi_handle); */ -} - - -void SYNCH_(long* type) -{ - int rc; - - rc=LAPI_Gfence(lapi_handle); - if(rc) Error("lapi_gfence failed",rc); -} - - -/** Interface from fortran to c error routine */ -void PARERR_(long *code) -{ - Error("User detected error in FORTRAN", *code); -} diff --git a/armci/tcgmsg/ipcv5.0/llog.c b/armci/tcgmsg/ipcv5.0/llog.c deleted file mode 100644 index 7eb0a1b2f..000000000 --- a/armci/tcgmsg/ipcv5.0/llog.c +++ /dev/null @@ -1,47 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#if HAVE_STDIO_H -# include -#endif -#if HAVE_SYS_TYPES_H -# include -#endif -#if HAVE_TIME_H -# include -#endif -#if HAVE_SYS_TIME_H -# include -#endif - -#include "sndrcv.h" - -extern void Error(); - -/** - * close and open stdin and stdout to append to a local logfile - * with the name log. in the current directory -*/ -void LLOG_() -{ - char name[12]; - time_t t; - - (void) sprintf(name, "log.%03ld",(long)NODEID_()); - - (void) fflush(stdout); - (void) fflush(stderr); - - if (freopen(name, "a", stdout) == (FILE *) NULL) { - Error("LLOG_: error re-opening stdout", (long) -1); - } - - if (freopen(name, "a", stderr) == (FILE *) NULL) { - Error("LLOG_: error re-opening stderr", (long) -1); - } - - (void) time(&t); - (void) printf("\n\nLog file opened : %s\n\n",ctime(&t)); - (void) fflush(stdout); -} diff --git a/armci/tcgmsg/ipcv5.0/mdtob.c b/armci/tcgmsg/ipcv5.0/mdtob.c deleted file mode 100644 index c9ad98c05..000000000 --- a/armci/tcgmsg/ipcv5.0/mdtob.c +++ /dev/null @@ -1,24 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/** @file - * $Header: /tmp/hpctools/ga/tcgmsg/ipcv5.0/mdtob.c,v 1.2 1994-12-30 20:55:49 d3h325 Exp $ - * - * These routines use C's knowledge of the sizes of data types - * to generate a portable mechanism for FORTRAN to translate - * between bytes, integers and doubles. Note that we assume that - * FORTRAN integers are the same size as C longs. - */ -#include "sndrcv.h" - -/** - * Return the no. of bytes that n doubles occupy - */ -long MDTOB_(long *n) -{ - if (*n < 0) - Error("MDTOB_: negative argument",*n); - - return (long) (*n * sizeof(double)); -} diff --git a/armci/tcgmsg/ipcv5.0/mdtoi.c b/armci/tcgmsg/ipcv5.0/mdtoi.c deleted file mode 100644 index b304f7ae7..000000000 --- a/armci/tcgmsg/ipcv5.0/mdtoi.c +++ /dev/null @@ -1,25 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/** @file - * - * $Header: /tmp/hpctools/ga/tcgmsg/ipcv5.0/mdtoi.c,v 1.1 1997-03-05 18:42:31 d3e129 Exp $ - * - * These routines use C's knowledge of the sizes of data types - * to generate a portable mechanism for FORTRAN to translate - * between bytes, integers and doubles. Note that we assume that - * FORTRAN integers are the same size as C longs. - */ -#include "sndrcv.h" - -/** - * Return the minimum no. of integers which will hold n doubles. - */ -long MDTOI_(long *n) -{ - if (*n < 0) - Error("MDTOI_: negative argument",*n); - - return (long) ( (MDTOB_(n) + sizeof(long) - 1) / sizeof(long) ); -} diff --git a/armci/tcgmsg/ipcv5.0/misc.c b/armci/tcgmsg/ipcv5.0/misc.c deleted file mode 100644 index 3885fbf2e..000000000 --- a/armci/tcgmsg/ipcv5.0/misc.c +++ /dev/null @@ -1,29 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#if HAVE_STDIO_H -# include -#endif - -#include "srftoc.h" -#include "tcgmsgP.h" - - -/** - * Define value of debug flag - */ -void SETDBG_(long *onoff) -{ - DEBUG_ = *onoff; -} - - -/** - * Print out statistics for communications ... not yet implemented - */ -void STATS_() -{ - (void) fprintf(stderr,"STATS_ not yet supported\n"); - (void) fflush(stderr); -} diff --git a/armci/tcgmsg/ipcv5.0/mitob.c b/armci/tcgmsg/ipcv5.0/mitob.c deleted file mode 100644 index 48d78cb92..000000000 --- a/armci/tcgmsg/ipcv5.0/mitob.c +++ /dev/null @@ -1,26 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/** @file - * - * $Header: /tmp/hpctools/ga/tcgmsg/ipcv5.0/mitob.c,v 1.2 1994-12-30 20:55:54 d3h325 Exp $ - * - * These routines use C's knowledge of the sizes of data types - * to generate a portable mechanism for FORTRAN to translate - * between bytes, integers and doubles. Note that we assume that - * FORTRAN integers are the same size as C longs. - */ -#include "sndrcv.h" - - -/** - * Return the no. of bytes that n ints=longs occupy - */ -long MITOB_(long *n) -{ - if (*n < 0) - Error("MITOB_: negative argument",*n); - - return (long) (*n * sizeof(long)); -} diff --git a/armci/tcgmsg/ipcv5.0/mitod.c b/armci/tcgmsg/ipcv5.0/mitod.c deleted file mode 100644 index 799b44c2e..000000000 --- a/armci/tcgmsg/ipcv5.0/mitod.c +++ /dev/null @@ -1,26 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/** @file - * - * $Header: /tmp/hpctools/ga/tcgmsg/ipcv5.0/mitod.c,v 1.1 1997-03-05 18:42:31 d3e129 Exp $ - * - * These routines use C's knowledge of the sizes of data types - * to generate a portable mechanism for FORTRAN to translate - * between bytes, integers and doubles. Note that we assume that - * FORTRAN integers are the same size as C longs. - */ -#include "sndrcv.h" - - -/** - * Return the minimum no. of doubles in which we can store n longs - */ -long MITOD_(long *n) -{ - if (*n < 0) - Error("MITOD_: negative argument",*n); - - return (long) ( (MITOB_(n) + sizeof(double) - 1) / sizeof(double) ); -} diff --git a/armci/tcgmsg/ipcv5.0/mtime.c b/armci/tcgmsg/ipcv5.0/mtime.c deleted file mode 100644 index 517057f74..000000000 --- a/armci/tcgmsg/ipcv5.0/mtime.c +++ /dev/null @@ -1,131 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv5.0/mtime.c,v 1.5 2002-03-12 18:59:31 d3h325 Exp $ */ - -#if HAVE_STDIO_H -# include -#endif - -#include "srftoc.h" - - -/** - * return wall clock time in centiseconds - */ -long MTIME_() -{ - double TCGTIME_(); - return (long) (TCGTIME_()*100.0); -} - -#if defined(LAPI) && defined(AIX) -# define LAPI_AIX -#endif - -#ifndef LAPI_AIX - -#if HAVE_SYS_TYPES_H -# include -#endif -#if HAVE_SYS_TIME_H -# include -#endif - -static unsigned firstsec=0; /* Reference for timer */ -static unsigned firstusec=0; /* Reference for timer */ - -void MtimeReset() /* Sets timer reference */ -{ - struct timeval tp; - struct timezone tzp; - - (void) gettimeofday(&tp,&tzp); - - firstsec = tp.tv_sec; - firstusec = tp.tv_usec; -} - - -/** - * Return wall clock time in seconds as accurately as possible - */ -double TCGTIME_() -{ - static int firstcall=1; - double low, high; - - struct timeval tp; - struct timezone tzp; - - if (firstcall) { - MtimeReset(); - firstcall = 0; - } - - (void) gettimeofday(&tp,&tzp); - - low = (double) (tp.tv_usec>>1) - (double) (firstusec>>1); - high = (double) (tp.tv_sec - firstsec); - - return high + 1.0e-6*(low+low); -} - -#endif - -#ifdef LAPI_AIX - -#if HAVE_SYS_TIME_H -# include -#endif -#if HAVE_SYS_SYSTEMCFG_H -# include -#endif - -static int firstsec=0; /* Reference for timer */ -static int firstnsec=0; - -void MtimeReset() /* Sets timer reference */ -{ - timebasestruct_t t; - read_real_time(&t, TIMEBASE_SZ); - time_base_to_time(&t, TIMEBASE_SZ); - - firstsec = t.tb_high; - firstnsec = t.tb_low; -} - - -/** - * Return wall clock time in seconds as accurately as possible - */ -double TCGTIME_() -{ - static int firstcall=1; - timebasestruct_t t; - int low, high; - int secs, nsecs; - - if (firstcall) { - MtimeReset(); - firstcall = 0; - } - - - read_real_time(&t, TIMEBASE_SZ); - time_base_to_time(&t, TIMEBASE_SZ); - - secs = t.tb_high - firstsec; - nsecs = t.tb_low - firstnsec; - - /* If there was a carry from low-order to high-order during - the measurement, we have to undo it */ - if(nsecs < 0){ - secs--; - nsecs+= 1000000000; - } - return (double)(secs + 1.0e-9*nsecs); -} - -#endif /* LAPI_AIX */ diff --git a/armci/tcgmsg/ipcv5.0/niceftn.c b/armci/tcgmsg/ipcv5.0/niceftn.c deleted file mode 100644 index d27eca35a..000000000 --- a/armci/tcgmsg/ipcv5.0/niceftn.c +++ /dev/null @@ -1,19 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#if HAVE_UNISTD_H -# include -#endif - -#include "tcgmsgP.h" - -/** - * Wrapper around nice for FORTRAN users courtesy of Rick Kendall - * ... C has the system interface. - */ -long NICEFTN_(long *ival) -{ - int val = (int)(*ival); - return nice(val); -} diff --git a/armci/tcgmsg/ipcv5.0/nnodes.c b/armci/tcgmsg/ipcv5.0/nnodes.c deleted file mode 100644 index 0c2b461be..000000000 --- a/armci/tcgmsg/ipcv5.0/nnodes.c +++ /dev/null @@ -1,10 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include "tcgmsgP.h" - -long NNODES_(void) -{ - return (long) TCGMSG_nnodes; -} diff --git a/armci/tcgmsg/ipcv5.0/nodeid.c b/armci/tcgmsg/ipcv5.0/nodeid.c deleted file mode 100644 index ef480895e..000000000 --- a/armci/tcgmsg/ipcv5.0/nodeid.c +++ /dev/null @@ -1,10 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include "tcgmsgP.h" - -long NODEID_(void) -{ - return (long) TCGMSG_nodeid; -} diff --git a/armci/tcgmsg/ipcv5.0/nxtval.shm.c b/armci/tcgmsg/ipcv5.0/nxtval.shm.c deleted file mode 100644 index d97490061..000000000 --- a/armci/tcgmsg/ipcv5.0/nxtval.shm.c +++ /dev/null @@ -1,122 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/** @file - * $Id: nxtval.shm.c,v 1.9 2005-02-21 21:51:40 manoj Exp $ - */ -#if HAVE_STDIO_H -# include -#endif - -#include "tcgmsgP.h" - -long nxtval_counter=0; -long *nxtval_shmem = &nxtval_counter; - -#define LEN 2 -#define INCR 1 /* increment for NXTVAL */ -#define BUSY -1L /* indicates somebody else updating counter*/ - - -#if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__) -# define TESTANDSET testandset - -static inline int testandset(int *spinlock) -{ - int ret; - __asm__ __volatile__("xchgl %0, %1" - : "=r"(ret), "=m"(*spinlock) - : "0"(1), "m"(*spinlock)); - - return ret; -} - -#elif defined(__APPLE__) && defined(__GNUC__) -# define TESTANDSET(x) ( krspin_lock((long int *)(x))) -static int krspin_lock(long int *p) -{ - unsigned long tmp; - int ret; - - __asm__ __volatile__( - "b 1f # spin_lock\n\ -2: lwzx %0,0,%1\n\ - cmpwi 0,%0,0\n\ - bne+ 2b\n\ -1: lwarx %0,0,%1\n\ - cmpwi 0,%0,0\n\ - bne- 2b\n" -" stwcx. %2,0,%1\n\ - bne- 2b\n\ - isync" - : "=&r"(tmp) - : "r"(p), "r"(1) - : "cr0", "memory"); - return ret == 0; -} -#endif - -#ifdef TESTANDSET -# define LOCK if(nproc>1)acquire_spinlock((int*)(nxtval_shmem+1)) -# define UNLOCK if(nproc>1)release_spinlock((int*)(nxtval_shmem+1)) - -static void acquire_spinlock(int *mutex) -{ - int loop=0, maxloop =10; - while (TESTANDSET(mutex)){ - loop++; - if(loop==maxloop){ usleep(1); loop=0; } - } -} - -static void release_spinlock(int *mutex) -{ - *mutex =0; -} - -#endif /* TESTANDSET */ - - -#ifndef LOCK -# define LOCK if(nproc>1)Error("nxtval: sequential version with silly mproc ", (long) *mproc); -# define UNLOCK -#endif - - -/** - * Get next value of shared counter. - * - * mproc > 0 ... returns requested value - * mproc < 0 ... server blocks until abs(mproc) processes are queued - * and returns junk - * mproc = 0 ... indicates to server that I am about to terminate - */ -long NXTVAL_(long *mproc) -{ - long shmem_swap(); - long local=0; - long sync_type= INTERNAL_SYNC_TYPE; - long nproc= NNODES_(); - long server=nproc-1; - - if (DEBUG_) { - (void) printf("%2ld: nxtval: mproc=%ld\n",(long)NODEID_(),(long)*mproc); - (void) fflush(stdout); - } - - if (*mproc < 0) { - SYNCH_(&sync_type); - /* reset the counter value to zero */ - if( NODEID_() == server) *nxtval_shmem = 0; - SYNCH_(&sync_type); - } - if (*mproc > 0) { - LOCK; - local = *nxtval_shmem; - *nxtval_shmem += INCR; - UNLOCK; - } - - return local; -} diff --git a/armci/tcgmsg/ipcv5.0/pbegin.c b/armci/tcgmsg/ipcv5.0/pbegin.c deleted file mode 100644 index 642983388..000000000 --- a/armci/tcgmsg/ipcv5.0/pbegin.c +++ /dev/null @@ -1,228 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#if HAVE_STDIO_H -# include -#endif -#if HAVE_STRINGS_H -# include -#endif -#if HAVE_SYS_TYPES_H -# include -#endif -#if HAVE_UNISTD_H -# include -#endif -#if HAVE_STDLIB_H -# include -#endif - -/* extern long atol(const char *nptr); */ -/* extern void exit(int status); */ - -/* Define PBEGIN_C so that global variables in tcgmsgP.h are defined here - and declared extern everywhere else ... SGI linker is a whiner */ -#define PBEGIN_C - -#include "tcgmsgP.h" - -#ifdef LAPI -ShmemBuf TCGMSG_receive_buffer[MAX_PROC]; -void lapi_initialize(); -#endif - -extern void TrapSigint(void); -extern void TrapSigchld(void); -extern int WaitAll(long); - -static int SR_initialized=0; - - -long TCGREADY_() -{ - return (long)SR_initialized; -} - - -/* Define what was externally declared in tcgmsgP.h */ -long TCGMSG_nodeid; -long TCGMSG_nnodes; -long DEBUG_=0; /* debug flag ... see setdbg */ -long TCGMSG_nodeid; -long TCGMSG_nnodes; -char* TCGMSG_shmem; -long TCGMSG_shmem_id; -long TCGMSG_shmem_size; -long TCGMSG_caught_sigint; -ProcInfo* TCGMSG_proc_info; -SendQEntry* TCGMSG_sendq_ring; - - -/** - * shared-memory version of TCGMSG - */ -void tcgi_pbegin(int argc, char **argv) -{ - long arg, node, i, max_n_msg; - - TCGMSG_nodeid = 0; - TCGMSG_nnodes = 1; /* By default just sequential */ - - if(SR_initialized)Error("TCGMSG initialized already???",-1); - else SR_initialized=1; - -#ifdef LAPI - lapi_initialize(); -#else /* LAPI */ - for (arg=1; arg<(argc-1); arg++) - if (strcmp(argv[arg],"-np") == 0) { - TCGMSG_nnodes = atol(argv[arg+1]); - break; - } -#endif /* LAPI */ - - if (TCGMSG_nnodes > MAX_PROC){ - if(NODEID_()){ - sleep(1); - return; - } - fprintf(stderr,"\nTCGMSG has been configured for up to %d processes\n", - MAX_PROC); - fprintf(stderr,"Please change MAX_PROC in `tcgmsgP.h` and recompile\n\n"); - sleep(1); - Error("aborting ... ",0); - } - if (TCGMSG_nnodes == 1) { - return; - }; - - /* Set up handler for SIGINT and SIGCHLD */ - -#ifndef LAPI - TrapSigint(); - TrapSigchld(); -#endif - - /* Allocate the process info structures */ - - if (!(TCGMSG_proc_info = (ProcInfo *) - malloc((size_t) (TCGMSG_nnodes*sizeof(ProcInfo))))) - Error("pbegin: failed to malloc procinfo", - (long) (TCGMSG_nnodes*sizeof(ProcInfo))); - bzero((char *) TCGMSG_proc_info, (int) (TCGMSG_nnodes*sizeof(ProcInfo))); - - /* Allocate a ring of message q entries to avoid having a malloc/free - pair for every message sent */ - - max_n_msg = 2*TCGMSG_nnodes; - if (max_n_msg < MAX_N_OUTSTANDING_MSG) max_n_msg = MAX_N_OUTSTANDING_MSG; - - if (!(TCGMSG_sendq_ring = (SendQEntry *) - malloc((size_t) (max_n_msg*sizeof(SendQEntry))))) - Error("pegin: failed to malloc entries for send q", 0L); - - for (i=0; i 1) { - int status; - int rc; - status = WaitAll(TCGMSG_nnodes-1); /* Wait for demise of children */ - rc=DeleteSharedRegion(TCGMSG_shmem_id); - if(rc)printf("DeleteSharedMem returned %d\n",rc); - if (status) exit(1); - } -#endif -} diff --git a/armci/tcgmsg/ipcv5.0/pbeginf.c b/armci/tcgmsg/ipcv5.0/pbeginf.c deleted file mode 100644 index 4231896b9..000000000 --- a/armci/tcgmsg/ipcv5.0/pbeginf.c +++ /dev/null @@ -1,31 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#if HAVE_STDIO_H -# include -#endif -#if HAVE_STRING_H -# include -#endif - -#include "srftoc.h" -#include "sndrcv.h" - -/** - * Hewlett Packard Risc box and new SparcWorks F77 2.* compilers. - * Have to construct the argument list by calling FORTRAN. - */ -void PBEGINF_() -{ -} - - -/** - * Alternative entry for those senstive to FORTRAN making reference - * to 7 character external names - */ -void PBGINF_() -{ - PBEGINF_(); -} diff --git a/armci/tcgmsg/ipcv5.0/pfilecopy.c b/armci/tcgmsg/ipcv5.0/pfilecopy.c deleted file mode 100644 index 32c11f180..000000000 --- a/armci/tcgmsg/ipcv5.0/pfilecopy.c +++ /dev/null @@ -1,150 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv5.0/pfilecopy.c,v 1.5 2004-04-01 02:23:05 manoj Exp $ */ - -#if HAVE_STDIO_H -# include -#endif -#if HAVE_STRINGS_H -# include -#endif -#if HAVE_STRING_H -# include -#endif -#if HAVE_STDLIB_H -# include -#endif - -/* extern void free(void *ptr); */ - -#include "msgtypesc.h" -#include "sndrcv.h" -#include "tcgmsgP.h" - -/** - * Process node0 has a file (assumed unopened) named fname. - * This file will be copied to all other processes which must - * simultaneously invoke pfilecopy. Since the processes may be - * using the same directory one probably ought to make sure - * that each process uses a different name in the call. - * - * e.g. - * - * on node 0 pfilecopy(99, 0, 'argosin') - * on node 1 pfilecopy(99, 0, 'argosin_001') - * on node 2 pfilecopy(99, 0, 'argosin_002') - */ -void tcgi_pfilecopy(long *type, long *node0, char *filename) -{ - char *buffer; - FILE *file; - long length, nread=32768, len_nread=sizeof(long); - long typenr = (*type & 32767) | MSGINT; /* Force user type integer */ - long typebuf =(*type & 32767) | MSGCHR; - - if (!(buffer = malloc((unsigned) nread))) - Error("pfilecopy: failed to allocate the I/O buffer",nread); - - if (*node0 == NODEID_()) { - - /* I have the original file ... open and check its size */ - - if ((file = fopen(filename,"r")) == (FILE *) NULL) { - (void) fprintf(stderr,"me=%ld, filename = %s.\n", - (long)NODEID_(),filename); - Error("pfilecopy: node0 failed to open original file", *node0); - } - - /* Quick sanity check on the length */ - - (void) fseek(file, 0L, (int) 2); /* Seek to end of file */ - length = ftell(file); /* Find the length of file */ - (void) fseek(file, 0L, (int) 0); /* Seek to beginning of file */ - if ( (length<0) || (length>1e12) ) - Error("pfilecopy: the file length is -ve or very big", length); - - /* Send the file in chunks of nread bytes */ - - while (nread) { - nread = fread(buffer, 1, (int) nread, file); - BRDCST_(&typenr, (char *) &nread, &len_nread, node0); - typenr++; - if (nread) { - BRDCST_(&typebuf, buffer, &nread, node0); - typebuf++; - } - } - } - else { - - /* Open the file for the duplicate */ - - if ((file = fopen(filename,"w+")) == (FILE *) NULL) { - (void) fprintf(stderr,"me=%ld, filename = %s.\n", - (long)NODEID_(),filename); - Error("pfilecopy: failed to open duplicate file", *node0); - } - - /* Receive data and write to file */ - - while (nread) { - BRDCST_(&typenr, (char *) &nread, &len_nread, node0); - typenr++; - if (nread) { - BRDCST_(&typebuf, buffer, &nread, node0); - typebuf++; - if (nread != fwrite(buffer, 1, (int) nread, file)) - Error("pfilecopy: error data to duplicate file", nread); - } - } - } - - /* Tidy up the stuff we have been using */ - - (void) fflush(file); - (void) fclose(file); - (void) free(buffer); -} - -/** The original C interface to PFCOPY_. */ -void PFILECOPY_(long *type, long *node0, char *filename) -{ - tcgi_pfilecopy(type, node0, filename); -} - -void PFCOPY_(long *type, long *node0, char *fname, int len) -{ - /* Fortran wrapper around pfilecopy */ - - char *filename; - -#ifdef DEBUG - (void) printf("me=%d, type=%d, node0=%d, fname=%x, fname=%.8s, len=%d\n", - NODEID_(), *type, *node0, fname, fname, len); -#endif - - /* Strip trailing blanks off the file name */ - - while ((len > 0) && (fname[len-1] == ' ')) - len--; - if (len <= 0) - Error("pfcopy_: file name length is toast", (long) len); - - /* Generate a NULL terminated string */ - - filename = malloc( (unsigned) (len+1) ); - if (filename) { - (void) bcopy(fname, filename, len); - filename[len] = '\0'; - } - else - Error("PFCOPY_: failed to malloc space for filename", (long) len); - - /* Now call the C routine to do the work */ - - tcgi_pfilecopy(type, node0, filename); - - (void) free(filename); -} diff --git a/armci/tcgmsg/ipcv5.0/queues.c b/armci/tcgmsg/ipcv5.0/queues.c deleted file mode 100644 index e5f8e74e1..000000000 --- a/armci/tcgmsg/ipcv5.0/queues.c +++ /dev/null @@ -1,205 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -extern void free(void *ptr); - -#include "tcgmsgP.h" - -static const long false = 0; -static const long true = 1; - -extern void Busy(int); - -extern long async_send(SendQEntry *); - - -/** - * Given a nodeid return a unqiue integer constructed by - * combining it with the value of a counter - */ -static long NextMsgID(long node) -{ - static long id = 0; - static long mask = (1<<20)-1; - - id = (id + 1) & mask; - if (id == 0) id = 1; - - return (node << 20) + id; -} - - -/** - * Given an id from NextMsgID extract the node - */ -static long NodeFromMsgID(long msgid) -{ - long node = msgid >> 20; - - if (node < 0 || node > NNODES_()) - Error("NodeFromMsgID: invalid msgid", msgid); - - return node; -} - - -/** - * Flush as many messages as possible without blocking from - * the send q to the specified node. - */ -static void flush_send_q_node(long node) -{ - while (TCGMSG_proc_info[node].sendq) { - - if (!async_send(TCGMSG_proc_info[node].sendq)) { - /* Send is incomplete ... stop processing this q*/ - break; - } - else { - SendQEntry *tmp = TCGMSG_proc_info[node].sendq; - - TCGMSG_proc_info[node].sendq = (SendQEntry *) TCGMSG_proc_info[node].sendq->next; - if (tmp->free_buf_on_completion) - (void) free(tmp->buf); - tmp->active = false; /* Matches NewSendQEntry() */ - } - } -} - - -/** - * Flush as many messages as possible without blocking - * from all of the send q's. - */ -void flush_send_q() -{ - long node; - long nproc = NNODES_(); - - for (node=0; nodenext) { - if (entry->msgid == msgid) { - status = 0; - break; - } - } - - return status; -} - - -/** - * Wait for the operation referred to by msgid to complete. - */ -void msg_wait(long msgid) -{ - long nspin = 0; - long spinlim = 1000000; - - while (!msg_status(msgid)) { - nspin++; - if (nspin < spinlim) - Busy(100); - else - usleep(1); - } -} - - -static SendQEntry *NewSendQEntry(void) -{ - SendQEntry *new = TCGMSG_sendq_ring; - - if (new->active) - Error("NewSendQEntry: too many outstanding sends\n", 0L); - - TCGMSG_sendq_ring = (SendQEntry *) TCGMSG_sendq_ring->next_in_ring; - - new->active = true; - - return new; -} - - -long msg_async_snd(long type, char *buf, long lenbuf, long node) -{ - long msgid; - SendQEntry *entry; - - if (node<0 || node>=TCGMSG_nnodes) - Error("msg_async_send: node is out of range", node); - - if (node == TCGMSG_nodeid) - Error("msg_async_send: cannot send to self", node); - - msgid = NextMsgID(node); - entry = NewSendQEntry(); - - /* Insert a new entry into the q */ - - entry->tag = TCGMSG_proc_info[node].n_snd++; /* Increment tag */ - entry->msgid = msgid; - entry->type = type; - entry->buf = buf; - entry->free_buf_on_completion = 0; - entry->lenbuf= lenbuf; - entry->node = node; - entry->next = (SendQEntry *) 0; - entry->written = 0; - entry->buffer_number = 0; - - /* Attach to the send q */ - - if (!TCGMSG_proc_info[node].sendq) - TCGMSG_proc_info[node].sendq = entry; - else { - SendQEntry *cur = TCGMSG_proc_info[node].sendq; - - while (cur->next) - cur = cur->next; - cur->next = entry; - } - - /* Attempt to flush the send q */ - - flush_send_q(); - - return msgid; -} - - -/** - * synchronous send of message to a process - * - * long *type = user defined integer message type (input) - * char *buf = data buffer (input) - * long *lenbuf = length of buffer in bytes (input) - * long *node = node to send to (input) - * - * for zero length messages only the header is sent - */ -void msg_snd(long type, char *buf, long lenbuf, long node) -{ - msg_wait(msg_async_snd(type, buf, lenbuf, node)); -} diff --git a/armci/tcgmsg/ipcv5.0/shmem.c b/armci/tcgmsg/ipcv5.0/shmem.c deleted file mode 100644 index 6cf22bade..000000000 --- a/armci/tcgmsg/ipcv5.0/shmem.c +++ /dev/null @@ -1,240 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/** @file - * - * $Header: /tmp/hpctools/ga/tcgmsg/ipcv5.0/shmem.c,v 1.4 2002-01-24 22:07:27 d3h325 Exp $ - * - * This stuff attempts to provide a simple interface to temporary shared - * memory regions, loosely modelled after that of Alliant Concentrix 5.0 - * - * - * Note that the input arguments switch between integers and pointers - * to integers depending on if they are modified on return. - * - * - * Create a shared region of at least size bytes, returning the actual size, - * the id associated with the region. The return value is a pointer to the - * the region. Any error is a hard fail. - * - * (char *) CreateSharedRegion((long *) id, (long *) size) - * - * - * Detach a process from a shared memory region. 0 is returned on success, - * -1 for failure. id, size, and addr must match exactly those items returned - * from CreateSharedRegion - * - * long DetachSharedRegion((long) id, (long) size, (char *) addr) - * - * - * Delete a shared region from the system. This has to be done on the SUN - * to remove it from the system. On the Alliant the shared region disappears - * when the last process dies or detaches. Returns 0 on success, -1 on error. - * - * long DeleteSharedRegion((long) id) - * - * - * Delete all the shared regions associated with this process. - * - * long DeleteSharedAll() - * - * - * Attach to a shared memory region of known id and size. Returns the - * address of the mapped memory. Size must exactly match the size returned - * from CreateSharedRegion (which in turn is the requested size rounded - * up to a multiple of 4096). Any error is a hard fail. - * - * (char *) AttachSharedRegion((long) id, (long) size)) - */ - -extern void Error(const char *, long); - -#if !defined(MMAP) || defined(MACX) - -#if HAVE_STDIO_H -# include -#endif -#if HAVE_SYS_TYPES_H -# include -#endif -#if HAVE_SYS_IPC_H -# include -#endif -#if HAVE_SYS_SHM_H -# include -#endif - -#ifdef SUN -extern int shmget(key_t, int, int); -extern int shmdt(void *); -extern int shmctl(int, int, struct shmid_ds *); -extern void *shmat(int, const void *, int); -#endif - - -char *CreateSharedRegion(long *id, long *size) -{ - char *temp; - - /* Create the region */ - - if ( (*id = shmget(IPC_PRIVATE, (int) *size, - (int) (IPC_CREAT | 00600))) < 0 ) - Error("CreateSharedRegion: failed to create shared region", (long) *id); - - /* Attach to the region */ - - if ( (long) (temp = shmat((int) *id, (char *) NULL, 0)) == -1L) - Error("CreateSharedRegion: failed to attach to shared region", 0L); - - return temp; -} - - -long DetachSharedRegion(long id, long size, char *addr) -{ - return shmdt(addr); -} - - -long DeleteSharedRegion(long id) -{ - return shmctl((int) id, IPC_RMID, (struct shmid_ds *) NULL); -} - - -char *AttachSharedRegion(long id, long size) -{ - char *temp; - - if ( (long) (temp = shmat((int) id, (char *) NULL, 0)) == -1L) - Error("AttachSharedRegion: failed to attach to shared region", 0L); - - return temp; -} - -#else /* MMAP */ - -#if HAVE_STDIO_H -# include -#endif -#if HAVE_SYS_TIME_H -# include -#endif -#if HAVE_SYS_TYPES_H -# include -#endif -#if HAVE_SYS_FILE_H -# include -#endif -#if HAVE_SYS_MMAN_H -# include -#endif - -extern char *strdup(); -extern char *mktemp(); - -#define MAX_ID 20 -static struct id_list_struct { - char *addr; /* pointer to shmem region */ - unsigned size; /* size of region */ - char *filename; /* associated file name */ - int fd; /* file descriptor */ - int status; /* = 1 if in use */ -} id_list[MAX_ID]; - -static int next_id = 0; -static char template[] = "/tmp/SHMEM.XXXXXX"; - -char *CreateSharedRegion(id, size) - long *size, *id; -{ - char *temp; - - if (next_id == MAX_ID) - Error("CreateSharedRegion: MAX_ID exceeded ", MAX_ID); - *id = next_id; - - if ( (temp = strdup(template)) == (char *) NULL) - Error("CreateSharedRegion: failed to get space for filename", 0); - - /* Generate scratch file to identify region ... need to know this - name to attach to the region so need to establish some policy - before AttachtoSharedRegion can work */ - - id_list[*id].filename = mktemp(temp); - if ( (id_list[*id].fd = open(id_list[*id].filename, - O_RDWR|O_CREAT, 0666)) < 0) - Error("CreateSharedRegion: failed to open temporary file",0); - - id_list[*id].addr = mmap((caddr_t) 0, (size_t)*size, - PROT_READ|PROT_WRITE, - MAP_ANON|MAP_SHARED, id_list[*id].fd, 0); - if (id_list[*id].addr == (char *) -1) - Error("CreateSharedRegion: mmap failed",-1); - - id_list[*id].size = *size; - id_list[*id].status = 1; - - next_id++; - return id_list[*id].addr; -} - - -long DetachSharedRegion(long id, long size, char *addr) -{ - if ( (id < 0) || (id > next_id)) - return (long) -1; - - if (id_list[id].status != 1) - return (long) -1; - - id_list[id].status = 0; - - return (long) munmap(id_list[id].addr, 0); -} - - -long DeleteSharedRegion(long id) -{ - if ( (id < 0) || (id > next_id) ) - return (long) -1; - - if (id_list[id].status != 1) - return (long) -1; - - (void) DetachSharedRegion(id, 0, (char *) 0); - - if (id_list[id].fd >= 0) { - (void) close(id_list[id].fd); - (void) unlink(id_list[id].filename); - } - - return (long) 0; -} - - -char *AttachSharedRegion(long id, long size) -{ - Error("AttachSharedRegion: need mods for this to work on CONVEX", - (long) -1); -} - - -long DeleteSharedAll() -{ - long id; - long status = 0; - - for (id=0; id -#endif -#if HAVE_SYS_WAIT_H -# include -#endif -#if HAVE_SYS_TYPES_H -# include -#endif - -#include "tcgmsgP.h" - -#ifndef SIG_ERR -# define SIG_ERR (RETSIGTYPE (*)(int))-1 -#endif - - -RETSIGTYPE SigintHandler(int sig) -{ - TCGMSG_caught_sigint = 1L; - Error("SigintHandler: signal was caught",0L); -} - - -/** - * Trap the signal SIGINT so that we can propagate error - * conditions and also tidy up shared system resources in a - * manner not possible just by killing everyone - */ -void TrapSigint() -{ - if ( signal(SIGINT, SigintHandler) == SIG_ERR) - Error("TrapSigint: error from signal setting SIGINT",(long) SIGINT); -} - - -/** - * kill -SIGINT all of my beloved children - */ -void ZapChildren() -{ - long node; - - for (node=0; node -#endif - -extern void qsort(void *base, size_t nmemb, size_t size, int(*compar)(const void *, const void *)); - -#include "srftoc.h" -#include "sndrcv.h" -#include "tcgmsgP.h" - -extern long MatchShmMessage(); -extern void msg_wait(); -extern long DEBUG_; - -#define INVALID_NODE -3333 /* used to stamp completed msg in the queue */ -#define MAX_Q_LEN MAX_PROC /* Maximum no. of outstanding messages */ -static volatile long n_in_msg_q = 0; /* actual no. in the message q */ -static struct msg_q_struct{ - long msg_id; - long node; - long type; -} msg_q[MAX_Q_LEN]; - - -/** - * Return 1/0 (TRUE/FALSE) if a message of the given type is available - * from the given node. If the node is specified as -1, then all nodes - * will be examined. Some attempt is made at ensuring fairness. - * - * If node is specified as -1 then this value is overwritten with the - * node that we got the message from. - */ -long ProbeNode(long *type, long *node) -{ - static long next_node = 0; - - long nproc = NNODES_(); - long me = NODEID_(); - long found = 0; - long cur_node; - int i, proclo, prochi; - - if (*node == me) - Error("PROBE_ : cannot recv message from self, msgtype=", *type); - - if (*node == -1) { /* match anyone */ - - proclo = 0; - prochi = nproc-1; - cur_node = next_node; - - } else - proclo = prochi = cur_node = *node; - - for(i = proclo; i<= prochi; i++) { - - if (cur_node != me){ /* can't receive from self */ - found = MatchShmMessage(cur_node, *type); - if (found) break; - } - cur_node = (cur_node +1)%nproc; - - } - - if(found) *node = cur_node; - - /* if wildcard node, determine which node we'll start with next time */ - if(*type == -1) next_node = (cur_node +1)%nproc; - return(found); -} - - -/** - * Return 1/0 (TRUE/FALSE) if a message of the given type is available - * from the given node. If the node is specified as -1, then all nodes - * will be examined. Some attempt is made at ensuring fairness. - */ -long PROBE_(long *type, long *node) -{ - long nnode = *node; - long result; - - result = ProbeNode(type, &nnode); - - return(result); -} - - -/** - * long *type = user defined type of received message (input) - * char *buf = data buffer (output) - * long *lenbuf = length of buffer in bytes (input) - * long *lenmes = length of received message in bytes (output) - * (exceeding receive buffer is hard error) - * long *nodeselect = node to receive from (input) - * -1 implies that any pending message of the specified - * type may be received - * long *nodefrom = node message is received from (output) - * long *sync = flag for sync(1) or async(0) receipt (input) - */ -void RCV_(long *type, void *buf, long *lenbuf, long *lenmes, long *nodeselect, long *nodefrom, long *sync) -{ - static long ttype; - static long node; - long me = NODEID_(); - void msg_rcv(); - - node = *nodeselect; - - ttype = *type; - - if (DEBUG_) { - printf("RCV_: node %ld receiving from %ld, len=%ld, type=%ld, sync=%ld\n", - (long)me, (long)*nodeselect, (long)*lenbuf, (long)*type, (long)*sync); - fflush(stdout); - } - - /* wait for a matching message */ - if(node==-1) while(ProbeNode(type, &node) == 0); - msg_rcv(ttype, buf, *lenbuf, lenmes, node); - *nodefrom = node; - - if (DEBUG_) { - (void) printf("RCV: me=%ld, from=%ld, len=%ld\n", - (long)me, (long)*nodeselect, (long)*lenbuf); - (void) fflush(stdout); - } -} - - -/** - * long *type = user defined integer message type (input) - * char *buf = data buffer (input) - * long *lenbuf = length of buffer in bytes (input) - * long *node = node to send to (input) - * long *sync = flag for sync(1) or async(0) communication (input) - */ -void SND_(long *type, void *buf, long *lenbuf, long *node, long *sync) -{ - long me = NODEID_(); - long msg_async_snd(); - - /*asynchronous communication not supported under LAPI */ -#ifdef LAPI - long block = 1; -#else - long block = *sync; -#endif - - if (DEBUG_) { - (void)printf("SND_: node %ld sending to %ld, len=%ld, type=%ld, sync=%ld\n", - (long)me, (long)*node, (long)*lenbuf, (long)*type, (long)*sync); - (void) fflush(stdout); - } - - if (block) - msg_wait(msg_async_snd(*type, buf, *lenbuf, *node)); - - else { - - if (n_in_msg_q >= MAX_Q_LEN) - Error("SND: overflowing async Q limit", n_in_msg_q); - - msg_q[n_in_msg_q].msg_id = msg_async_snd(*type, buf, *lenbuf, *node); - msg_q[n_in_msg_q].node = *node; - msg_q[n_in_msg_q].type = *type; - n_in_msg_q++; - } - - if (DEBUG_) { - (void) printf("SND: me=%ld, to=%ld, len=%ld \n", - (long)me, (long)*node, (long)*lenbuf); - (void) fflush(stdout); - } -} - - -int compare_msg_q_entries(const void* entry1, const void* entry2) -{ - /* nodes are nondistiguishable unless one of them is INVALID_NODE */ - if( ((struct msg_q_struct*)entry1)->node == - ((struct msg_q_struct*)entry2)->node) return 0; - if( ((struct msg_q_struct*)entry1)->node == INVALID_NODE) return 1; - if( ((struct msg_q_struct*)entry2)->node == INVALID_NODE) return -1; - return 0; -} - - -/** - * Wait for all messages (send/receive) to complete between - * this node and node *nodesel or everyone if *nodesel == -1. - */ -void WAITCOM_(long *nodesel) -{ - long i, found = 0; - - for (i=0; i -#endif -#if HAVE_UNISTD_H -# include -#endif -#if HAVE_SIGNAL_H -# include -#endif - -#ifdef LAPI -# include -#endif - -#include "tcgshmem.h" -#include "sndrcv.h" -#include "srftoc.h" - -/* TODO autoconf way to detect this?? */ -#define MAX_PROC 512 -/* #define MAX_PROC 16 */ -/* under Cygnus we got only serial execution */ -/* #define MAX_PROC 1 */ - -#define INTERNAL_SYNC_TYPE 33333 -#define MAX_N_OUTSTANDING_MSG 64 - -extern void USleep(long); -#ifndef LAPI -extern long *nxtval_shmem; -#endif - -extern long DEBUG_; -extern long TCGMSG_nodeid; /**> The id of this process */ -extern long TCGMSG_nnodes; /**> Total no. of processes */ -extern char * TCGMSG_shmem; /**> Pointer to shared-memory segment */ -extern long TCGMSG_shmem_id; /**> ID of shared-memory segment */ -extern long TCGMSG_shmem_size; /**> Size of shared-memory segment */ -extern long TCGMSG_caught_sigint; /**> True if SIGINT was trapped */ - -/* Structure defines shared memory buffer ... each process has - one for every process that can send to it via shared memory. - - Adjust SHMEM_BUF_SIZE so that sizeof(ShmemBuf) is an integer - multiple of page sizes. Structure of this buffer is exploited - in T3D code. */ - -#ifdef NOTIFY_SENDER -# ifdef LAPI -# define RESERVED (6*sizeof(long) + sizeof(lapi_cntr_t)) -# else -# define RESERVED 6*sizeof(long) -# endif -#else -# define RESERVED 4*sizeof(long) -#endif - -#if defined(MACX) -# define WHOLE_BUF_SIZE 2*65536 -#elif defined(LAPI) -# define WHOLE_BUF_SIZE (3*4096) -#else -# define WHOLE_BUF_SIZE (16*8192) -#endif - -#define SHMEM_BUF_SIZE (WHOLE_BUF_SIZE - RESERVED) - -#ifdef LAPI -# define SND_RESERVED (4*sizeof(long) + sizeof(lapi_cntr_t) + sizeof(void*)) -# define SEND_BUF_SIZE (WHOLE_BUF_SIZE - SND_RESERVED) -# define SENDBUF_NUM 2 -typedef struct { - lapi_cntr_t cntr; - void *next; - long info[4]; - char buf[SEND_BUF_SIZE]; -} sendbuf_t; -sendbuf_t *sendbuf_arr, *localbuf; -#endif - -typedef struct { - long info[4]; /**< 0=type, 1=length, 2=tag, 3=full */ - char buf[SHMEM_BUF_SIZE]; /**< Message buffer */ -#ifdef NOTIFY_SENDER - long stamp; -# ifdef LAPI - lapi_cntr_t cntr; -# endif - long flag; /**< JN: used by receiver to signal sender */ -#endif -} ShmemBuf; - -/* Structure defines an entry in the send q */ - -typedef struct { - long msgid; /**< Message id for msg_status */ - long type; /**< Message type */ - long node; /**< Destination node */ - long tag; /**< Message tag */ - char *buf; /**< User or internally malloc'd buffer */ - long lenbuf; /**< Length of user buffer in bytes */ - long written; /**< Amount already sent */ - long buffer_number; /**< No. of buffers alread sent */ - long free_buf_on_completion; /* Boolean true if free buffer using free */ - void *next; /**< Pointer to next entry in linked list */ - void *next_in_ring; /**< Pointer to next entry in ring of free entries */ - long active; /**< 0/1 if free/allocated */ -} SendQEntry; - -/* This structure holds basically all process specific information */ - -#define COMM_MODE_NONE 0 -#define COMM_MODE_SHMEM 1 -#define COMM_MODE_SOCK 2 - -typedef struct { - ShmemBuf *sendbuf; /**< Shared-memory buffer for sending to node*/ - ShmemBuf *recvbuf; /**< Shared-memory buffer for receiving from */ - int sock; /**< Socket for send/receive */ - int comm_mode; /**< Defines communication info */ - pid_t pid; /**< Unix process id (or 0 if unknown) */ - long tag_rcv; /**< Expected tag from next rcv() */ - long n_snd; /**< No. of messages sent from this process */ - long n_rcv; /**< No. of messages recv from this process */ - SendQEntry *sendq; /**< Queue of messages to be sent */ -} ProcInfo; - -extern ProcInfo *TCGMSG_proc_info; /**< Will point to array of structures */ - -extern SendQEntry *TCGMSG_sendq_ring; /**< Circular ring of SendQEntry - structures for fast allocation/free */ - -#endif /* TCGMSGP_H_ */ diff --git a/armci/tcgmsg/ipcv5.0/tcgshmem.h b/armci/tcgmsg/ipcv5.0/tcgshmem.h deleted file mode 100644 index 0e217cd24..000000000 --- a/armci/tcgmsg/ipcv5.0/tcgshmem.h +++ /dev/null @@ -1,39 +0,0 @@ -/** @file - * Header file which declares stubs for the shared memory interface. - * Note that the input arguments switch between integers and pointers - * to integers depending on if they are modified on return. - */ -#ifndef SHMEM_H_ -#define SHMEM_H_ - -/** - * Create a shared region of at least size bytes, returning the actual size, - * the id associated with the region. The return vaue is a pointer to the - * the region. Any error is a hard fail. - */ -extern char *CreateSharedRegion(long *id, long *size); - -/** - * Detach a process from a shared memory region. 0 is returned on success, - * -1 for failure. id, size, and addr much match exactly those items returned - * from CreateSharedRegion - * - */ -extern long DetachSharedRegion(long id, long size, char *addr); - -/** - * Delete a shared region from the system. This has to be done on the SUN - * to remove it from the system. On the Alliant the shared region disappears - * when the last process dies or detaches. Returns 0 on success, -1 on error. - */ -extern long DeleteSharedRegion(long id); - -/** - * Attach to a shared memory region of known id and size. Returns the - * address of the mapped memory. Size must exactly match the size returned - * from CreateSharedRegion (which in turn is the requested size rounded - * up to a multiple of 4096). Any error is a hard fail. - */ -extern char *AttachSharedRegion(long id, long size); - -#endif /* SHMEM_H_ */ diff --git a/armci/tcgmsg/ipcv5.0/usleep.c b/armci/tcgmsg/ipcv5.0/usleep.c deleted file mode 100644 index 96fd1959a..000000000 --- a/armci/tcgmsg/ipcv5.0/usleep.c +++ /dev/null @@ -1,50 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $Header: /tmp/hpctools/ga/tcgmsg/ipcv5.0/usleep.c,v 1.4 1997-11-07 23:44:20 d3h325 Exp $ */ - -#if HAVE_STDIO_H -# include -#endif -#if HAVE_SYS_SELECT_H -# include -#endif -#if HAVE_SYS_TYPES_H -# include -#endif -#if HAVE_SYS_TIME_H -# include -#endif - -#include "tcgmsgP.h" - -#ifdef STUPIDUSLEEP -void USleep(long us) -{ - int s = us/1000000; - if (s == 0) - s = 1; - (void) sleep(s); -} -#else /* STUPIDUSLEEP */ -/** - * Sleep for the specified no. of micro-seconds ... uses the timeout - * on select ... it seems to be accurate to about a few centiseconds - * on a sun. I don't know how much system resources it eats. - */ -void USleep(long us) -{ - int width=0; - struct timeval timelimit; - - /* printf("%2ld: sleeping for %ldus\n", TCGMSG_nodeid, us); - fflush(stdout);*/ - - timelimit.tv_sec = (int) (us/1000000); - timelimit.tv_usec = (int) (us - timelimit.tv_sec*1000000); - - (void) select(width, (fd_set *) 0, (fd_set *) 0, (fd_set *) 0, - &timelimit); -} -#endif /* STUPIDUSLEEP */ diff --git a/armci/tcgmsg/ipcv5.0/waitall.c b/armci/tcgmsg/ipcv5.0/waitall.c deleted file mode 100644 index 92923b6a2..000000000 --- a/armci/tcgmsg/ipcv5.0/waitall.c +++ /dev/null @@ -1,64 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* $$ */ - -#if HAVE_SYS_TYPES_H -# include -#endif -#if HAVE_SYS_WAIT_H -# include -#endif -#if HAVE_STDIO_H -# include -#endif - -#include "sndrcv.h" - - -/** - * Wait for all children to finish and return appropriate status - * 0 = OK - * 1 = bad news - */ -int WaitAll(long nchild) -{ - int status, pid, child, stat=0, lo, hi; - - for (child=0; child> 8) & 0xff; - - if ( lo == 0177 ) - (void) fprintf(stderr, "(stopped by signal %d).\n", hi); - else if ( (lo != 0) && (lo & 0200) ) - (void) fprintf(stderr, "(killed by signal %d, dumped core).\n", - lo & 0100); - else if ( lo != 0 ) - (void) fprintf(stderr, "(killed by signal %d).\n",lo); - else - (void) fprintf(stderr, "(exited with code %d).\n",hi); - - (void) fflush(stderr); - stat = 1; - } - - } - - return stat; -} diff --git a/armci/tcgmsg/msgtypesc.h b/armci/tcgmsg/msgtypesc.h deleted file mode 100644 index bcf1d983a..000000000 --- a/armci/tcgmsg/msgtypesc.h +++ /dev/null @@ -1,12 +0,0 @@ -/** @file - * This defines bit masks that can be OR'ed with user types (1-32767) - * to indicate the nature of the data to the message passing system - */ -#ifndef MSGTYPES_H_ -#define MSGTYPES_H_ - -#define MSGDBL 65536 -#define MSGINT 131072 -#define MSGCHR 262144 - -#endif /* MSGTYPES_H_ */ diff --git a/armci/tcgmsg/msgtypesf.h b/armci/tcgmsg/msgtypesf.h deleted file mode 100644 index 3dbc6bd2f..000000000 --- a/armci/tcgmsg/msgtypesf.h +++ /dev/null @@ -1,6 +0,0 @@ -C -C This defines bit masks that can be ORed with user types (1-32767) -C to indicate the nature of the data to the message passing system -C - integer msgdbl, msgint, msgchr - parameter (msgdbl=65536, msgint=131072, msgchr=262144) diff --git a/armci/tcgmsg/sndrcv.h b/armci/tcgmsg/sndrcv.h deleted file mode 100644 index 4f11eaa97..000000000 --- a/armci/tcgmsg/sndrcv.h +++ /dev/null @@ -1,64 +0,0 @@ -/** @file - * This header file declares stubs and show prototypes of the - * public sndrcv calls - * - * srftoc.h contains macros which define the names of c routines - * accessible from FORTRAN and vice versa - */ -#ifndef SNDRCV_H_ -#define SNDRCV_H_ - -#include "msgtypesc.h" -#include "srftoc.h" - -#ifdef __cplusplus -extern "C" { -#endif - -extern void BRDCST_(long *type, void *buf, long *lenbuf, long *originator); -extern void DGOP_(long *type, double *x, long *n, char *op, int oplen); -extern double DRAND48_(); -extern void IGOP_(long *type, long *x, long *n, char *op, int oplen); -extern void LLOG_(); -extern long MDTOB_(long *n); -extern long MDTOI_(long *n); -extern long MITOB_(long *n); -extern long MITOD_(long *n); -extern long MTIME_(); -extern long NICEFTN_(long *ival); -extern long NNODES_(); -extern long NODEID_(); -extern long NXTVAL_(long *mproc); -extern void PARERR_(long *code); -extern void PBEGINF_(); -extern void PBGINF_(); -extern void PEND_(); -extern void PFCOPY_(long *type, long *node0, char *filename, int flen); -extern void PFILECOPY_(long *type, long *node0, char *filename); -extern long PROBE_(long *type, long *node); -extern void RCV_(long *type, void *buf, long *lenbuf, long *lenmes, long *nodeselect, long *nodefrom, long *sync); -extern void SETDBG_(long *value); -extern void SND_(long *type, void *buf, long *lenbuf, long *node, long *sync); -extern void SRAND48_(long *seed); -extern void STATS_(); -extern void SYNCH_(long *type); -extern long TCGREADY_(); -extern double TCGTIME_(); -extern void WAITCOM_(long *node); - -/* - Miscellaneous routines for internal use only? -*/ - -extern void Error(char *string, long integer); -extern void MtimeReset(); -extern void PrintProcInfo(); -extern void RemoteConnect(long a, long b, long c); -extern void tcgi_pbegin(int argc, char **argv); -extern void USleep(long us); - -#ifdef __cplusplus -} -#endif - -#endif /* SNDRCV_H_ */ diff --git a/armci/tcgmsg/srftoc.h b/armci/tcgmsg/srftoc.h deleted file mode 100644 index 44935bfe1..000000000 --- a/armci/tcgmsg/srftoc.h +++ /dev/null @@ -1,45 +0,0 @@ -/** @file - This header file provides definitions for c for the names of the - c message passing routines accessible from FORTRAN. It need not - be included directly in user c code, assuming that sndrcv.h has already. - - It is needed as the FORTRAN naming convention varies between machines - and it is the FORTRAN interface that is portable, not the c interface. - However by coding with the macro defnition names c portability is - ensured. -*/ -#ifndef SRFTOC_H_ -#define SRFTOC_H_ - -#define BRDCST_ armci_tcgmsg_brdcst -#define DGOP_ armci_tcgmsg_dgop -#define DRAND48_ armci_tcgmsg_drand48 -#define IGOP_ armci_tcgmsg_igop -#define LLOG_ armci_tcgmsg_llog -#define MDTOB_ armci_tcgmsg_mdtob -#define MDTOI_ armci_tcgmsg_mdtoi -#define MITOB_ armci_tcgmsg_mitob -#define MITOD_ armci_tcgmsg_mitod -#define MTIME_ armci_tcgmsg_mtime -#define NICEFTN_ armci_tcgmsg_niceftn -#define NNODES_ armci_tcgmsg_nnodes -#define NODEID_ armci_tcgmsg_nodeid -#define NXTVAL_ armci_tcgmsg_nxtval -#define PARERR_ armci_tcgmsg_parerr -#define PBEGINF_ armci_tcgmsg_pbeginf -#define PBGINF_ armci_tcgmsg_pbginf -#define PEND_ armci_tcgmsg_pend -#define PFCOPY_ armci_tcgmsg_pfcopy -#define PFILECOPY_ armci_tcgmsg_pfilecopy -#define PROBE_ armci_tcgmsg_probe -#define RCV_ armci_tcgmsg_rcv -#define SETDBG_ armci_tcgmsg_setdbg -#define SND_ armci_tcgmsg_snd -#define SRAND48_ armci_tcgmsg_srand48 -#define STATS_ armci_tcgmsg_stats -#define SYNCH_ armci_tcgmsg_synch -#define TCGREADY_ armci_tcgmsg_tcgready -#define TCGTIME_ armci_tcgmsg_tcgtime -#define WAITCOM_ armci_tcgmsg_waitcom - -#endif /* SRFTOC_H_ */ diff --git a/armci/testing/clone.c b/armci/testing/clone.c deleted file mode 100644 index 8ca7a596d..000000000 --- a/armci/testing/clone.c +++ /dev/null @@ -1,94 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include -#include -#include -#include -#include -#include -#include - -#define FORK_BEFORE_NI_INIT -#ifndef FORK_BEFORE_NI_INIT -#define FORK_AFTER_NI_INIT -#endif - -char child_stack[256*1024]; -char *child_stack_top = &child_stack[256*1024-1]; -int iv; - -int -server(void *arg) -{ - - int ret; - int num_interfaces; - ptl_handle_ni_t nih; - ptl_handle_eq_t eqh; - ptl_ni_limits_t ptl_limits; - ptl_event_t ev_t; - ptl_event_t *ev = &ev_t; - - - printf("IN SERVER\n"); - - if ((ret = PtlInit(&num_interfaces)) != PTL_OK) { - printf("%s: PtlInit failed: %d\n", FUNCTION_NAME, ret); - exit(1); - } - printf("%s: PtlInit succeeds (%d:%d)\n", FUNCTION_NAME, ret, num_interfaces); - - if (((ret = PtlNIInit( - IFACE_FROM_BRIDGE_AND_NALID(PTL_BRIDGE_UK, PTL_IFACE_SS), - PTL_PID_ANY, NULL, &ptl_limits, &nih)) != PTL_OK) && (ret != PTL_IFACE_DUP)) { - printf("%s: PtlNIInit failed: %d\n", FUNCTION_NAME, ret); - exit(1); - } - printf("%s: PtlNIInit succeeds (%d)\n", FUNCTION_NAME, ret); - - if ((ret = PtlEQAlloc(nih, 4096, NULL, &eqh)) != PTL_OK) { - printf("%s: PtlEQAlloc failed: %d\n", - FUNCTION_NAME, ret); - exit(1); - } - iv = 11; - ret = PtlEQWait(nih, ev); - printf("%s: PtlEQAlloc succeeds\n", FUNCTION_NAME); - printf("%d\n", iv); - iv = 13; - - while (1); -} - -int -main(int argc, char **argv, char **envp) -{ - int ret; - pid_t child; - int status; - iv = 12; - - child = clone(server, (void *)child_stack_top, - CLONE_THREAD | CLONE_SIGHAND | CLONE_VM, NULL); - - if (child == -1) { - perror("clone"); - exit(1); - } - printf("clone returns...(ret=%d)\n", child); - while (iv != 11); - printf("\nbetween after %d\n", iv); - - - MPI_Init(&argc, &argv); - MPI_Barrier(MPI_COMM_WORLD); - MPI_Finalize(); - - printf("waiting...\n"); - waitpid(-1, &status, __WALL); - printf("\nafter %d\n", iv); - printf("done (%d)\n", status); -} - diff --git a/armci/testing/fork.c b/armci/testing/fork.c deleted file mode 100644 index 4f0b19e04..000000000 --- a/armci/testing/fork.c +++ /dev/null @@ -1,83 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include -#include -#include - -#include -#include -#include - -int -main(int argc, char **argv, char **envp) -{ - int i, ret, npes; - int num_interfaces; - ptl_handle_ni_t nih; - ptl_handle_eq_t eqh; - ptl_ni_limits_t ptl_limits; - pid_t child; - ptl_process_id_t rnk; - - child = fork(); - if ((ret = PtlInit(&num_interfaces)) != PTL_OK) { - printf("%s: PtlInit failed: %d\n", FUNCTION_NAME, ret); - exit(1); - } - printf("%s: PtlInit succeeds (%d)\n", FUNCTION_NAME, ret); - - - if ((ret = PtlNIInit( - IFACE_FROM_BRIDGE_AND_NALID(PTL_BRIDGE_UK, PTL_IFACE_SS), - PTL_PID_ANY, NULL, &ptl_limits, &nih)) != PTL_OK) { - printf("%s: PtlNIInit 1 failed: %d\n", FUNCTION_NAME, ret); - } - - if ((ret = PtlNIFini(nih)) != PTL_OK) { - printf("%s: PtlNIFini failed: %d\n", FUNCTION_NAME, ret); - } - PtlFini(); - - if ((ret = PtlInit(&num_interfaces)) != PTL_OK) { - printf("%s: PtlInit failed: %d\n", FUNCTION_NAME, ret); - exit(1); - } - if ((ret = PtlNIInit( - IFACE_FROM_BRIDGE_AND_NALID(PTL_BRIDGE_UK, PTL_IFACE_SS), - PTL_PID_ANY, NULL, &ptl_limits, &nih)) != PTL_OK) { - printf("%s: PtlNIInit 2 failed: %d\n", FUNCTION_NAME, ret); - exit(1); - } -#if 0 - if ((ret = PtlNIInit( - IFACE_FROM_BRIDGE_AND_NALID(PTL_BRIDGE_UK, PTL_IFACE_SS), - PTL_PID_ANY, NULL, &ptl_limits, &nih)) != PTL_OK) { - printf("%s: PtlNIInit failed: %d\n", FUNCTION_NAME, ret); - exit(1); - } -#endif - printf("%s: PtlNIInit succeeds (%d)\n", FUNCTION_NAME, ret); - - if ((ret = PtlEQAlloc(nih, 4096, NULL, &eqh)) != PTL_OK) { - printf("%s: PtlEQAlloc failed: %d(%d)\n", - FUNCTION_NAME, ret, child); - exit(1); - } - printf("%s: PtlEQAlloc succeeds (%d:%d)\n", FUNCTION_NAME, child, ret); - - - if ((ret = PtlGetId(nih, &rnk)) != PTL_OK) { - printf("%s: PtlGetId failed: %d(%d)\n", - FUNCTION_NAME, ret, child); - exit(1); - } - printf("%s: nid=%d pid=%d(%d)\n", FUNCTION_NAME, rnk.nid, rnk.pid, child); - - if (child) { - MPI_Init(&argc, &argv); - MPI_Finalize(); - printf("%s: mpi_init and finalize succeed(%d)\n", FUNCTION_NAME, child); - } -} diff --git a/armci/testing/msgcheck.c b/armci/testing/msgcheck.c index 047025720..e51d62ba9 100644 --- a/armci/testing/msgcheck.c +++ b/armci/testing/msgcheck.c @@ -12,8 +12,6 @@ #include "armci.h" #include "message.h" -#define armci_msg_brdcst__ armci_msg_bcast_lapi - int me, nproc; #define LOOP 20 diff --git a/armci/testing/origptl.c b/armci/testing/origptl.c deleted file mode 100644 index f9501a9ef..000000000 --- a/armci/testing/origptl.c +++ /dev/null @@ -1,58 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include -#include -#include -#include -#include -#include - -#define FORK_BEFORE_NI_INIT -#ifndef FORK_BEFORE_NI_INIT -#define FORK_AFTER_NI_INIT -#endif - -int -main(int argc, char **argv, char **envp) -{ - int ret; - int num_interfaces; - ptl_handle_ni_t nih; - ptl_handle_eq_t eqh; - ptl_ni_limits_t ptl_limits; - pid_t child; - - if ((ret = PtlInit(&num_interfaces)) != PTL_OK) { - printf("%s: PtlInit failed: %d\n", FUNCTION_NAME, ret); - exit(1); - } - printf("%s: PtlInit succeeds (%d)\n", FUNCTION_NAME, ret); - -#ifdef FORK_BEFORE_NI_INIT - child = fork(); -#endif - if ((ret = PtlNIInit( - IFACE_FROM_BRIDGE_AND_NALID(PTL_BRIDGE_UK, PTL_IFACE_SS), - PTL_PID_ANY, NULL, &ptl_limits, &nih)) != PTL_OK) { - printf("%s: PtlNIInit failed: %d\n", FUNCTION_NAME, ret); - exit(1); - } - printf("%s: PtlNIInit succeeds (%d)\n", FUNCTION_NAME, ret); - -#ifdef FORK_AFTER_NI_INIT - child = fork(); -#endif - if ((ret = PtlEQAlloc(nih, 4096, NULL, &eqh)) != PTL_OK) { - printf("%s: PtlEQAlloc failed: %d(%d)\n", - FUNCTION_NAME, ret, child); - exit(1); - } - printf("%s: PtlEQAlloc succeeds (%d:%d)\n", FUNCTION_NAME, child, ret); - if (child) { - MPI_Init(&argc, &argv); - MPI_Finalize(); - } -} - diff --git a/armci/testing/perf_aggr.c b/armci/testing/perf_aggr.c index 7b97b5b50..70ad4c175 100644 --- a/armci/testing/perf_aggr.c +++ b/armci/testing/perf_aggr.c @@ -60,11 +60,7 @@ #define MAXPROC 128 #define TIMES 100 -#ifdef CRAY -# define ELEMS 800 -#else # define ELEMS 200 -#endif /***************************** macros ************************/ @@ -77,54 +73,6 @@ int me, nproc; void *work[MAXPROC]; /* work array for propagating addresses */ - - -#ifdef PVM -void pvm_init(int argc, char *argv[]) -{ - int mytid, mygid, ctid[MAXPROC]; - int np, i; - - mytid = pvm_mytid(); - if ((argc != 2) && (argc != 1)) { - goto usage; - } - if (argc == 1) { - np = 1; - } - if (argc == 2) - if ((np = atoi(argv[1])) < 1) { - goto usage; - } - if (np > MAXPROC) { - goto usage; - } - - mygid = pvm_joingroup(MPGROUP); - - if (np > 1) - if (mygid == 0) { - i = pvm_spawn(argv[0], argv + 1, 0, "", np - 1, ctid); - } - - while (pvm_gsize(MPGROUP) < np) { - sleep(1); - } - - /* sync */ - pvm_barrier(MPGROUP, np); - - printf("PVM initialization done!\n"); - - return; - -usage: - fprintf(stderr, "usage: %s \n", argv[0]); - pvm_exit(); - exit(-1); -} -#endif - /*void create_array(void *a[], int elem_size, int ndim, int dims[])*/ void create_array(double *a[], int ndim, int dims[]) { diff --git a/armci/testing/perf_nb.c b/armci/testing/perf_nb.c index 0dc2d29db..8f7c33259 100644 --- a/armci/testing/perf_nb.c +++ b/armci/testing/perf_nb.c @@ -63,11 +63,7 @@ #define MAXPROC 8 #define TIMES 100 -#ifdef CRAY -# define ELEMS 800 -#else # define ELEMS 200 -#endif #define MAXELEMS 131072 /* 262144 */ #define MAX_REQUESTS MAXELEMS @@ -89,52 +85,6 @@ int me, nproc; void *work[MAXPROC]; /* work array for propagating addresses */ double *ddst[MAXPROC]; -#ifdef PVM -void pvm_init(int argc, char *argv[]) -{ - int mytid, mygid, ctid[MAXPROC]; - int np, i; - - mytid = pvm_mytid(); - if ((argc != 2) && (argc != 1)) { - goto usage; - } - if (argc == 1) { - np = 1; - } - if (argc == 2) - if ((np = atoi(argv[1])) < 1) { - goto usage; - } - if (np > MAXPROC) { - goto usage; - } - - mygid = pvm_joingroup(MPGROUP); - - if (np > 1) - if (mygid == 0) { - i = pvm_spawn(argv[0], argv + 1, 0, "", np - 1, ctid); - } - - while (pvm_gsize(MPGROUP) < np) { - sleep(1); - } - - /* sync */ - pvm_barrier(MPGROUP, np); - - printf("PVM initialization done!\n"); - - return; - -usage: - fprintf(stderr, "usage: %s \n", argv[0]); - pvm_exit(); - exit(-1); -} -#endif - /*void create_array(void *a[], int elem_size, int ndim, int dims[])*/ void create_array(double *a[], int ndim, int dims[]) { @@ -392,39 +342,6 @@ void test_perf_nb(int dry_run) ARMCI_Barrier(); } -#if PORTALS - /* See the note below why this part is disabled */ - /* ---------------------- nb-Accumulate ------------------------ */ - for (i = 0; i < elems[1]; i++) { - dsrc[me][i] = 1.0; - } - ARMCI_Barrier(); - stride = elems[1] * sizeof(double); - scale = 1.0; - for (j = 0; j < ntimes; j++) { - stime = armci_timer(); - if ((rc = ARMCI_NbAccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, - &ddst[0][0], &stride, &bytes, 0, 0, &hdl_acc))) { - ARMCI_Error("armci_nbacc failed\n", rc); - } - t8 += armci_timer() - stime; - stime = armci_timer(); - ARMCI_Wait(&hdl_acc); - t9 += armci_timer() - stime; - - ARMCI_Barrier(); - ARMCI_AllFence(); - ARMCI_Barrier(); - if (VERIFY) { - verify_results(ACC, elems); - } - for (i = 0; i < elems[0]*elems[1]; i++) { - ddst[me][i] = 0.0; - } - ARMCI_Barrier(); - } -#endif - /* print timings */ if (!dry_run) if (me == 0) printf("%d\t %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e\n", bytes, t4 / ntimes, t5 / ntimes, t6 / ntimes, t1 / ntimes, diff --git a/armci/testing/ptltest.c b/armci/testing/ptltest.c deleted file mode 100644 index 99e9ced67..000000000 --- a/armci/testing/ptltest.c +++ /dev/null @@ -1,102 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include -#include -#include -#include -#include -#include - -#define FORK_BEFORE_NI_INIT -#ifndef FORK_BEFORE_NI_INIT -#define FORK_AFTER_NI_INIT -#endif - - -#ifndef PMI_SUCCESS -#define PMI_SUCCESS 0 -#endif -extern int PMI_CNOS_Get_nidpid_map(void **); -int -main(int argc, char **argv, char **envp) -{ - int i, ret, *npes; - int num_interfaces; - ptl_handle_ni_t nih; - ptl_handle_eq_t eqh; - ptl_ni_limits_t ptl_limits; - pid_t child; - ptl_process_id_t rnk, *procid_map; - int spv, *spawned = &spv; - - - if ((ret = PtlInit(&num_interfaces)) != PTL_OK) { - printf("%s: PtlInit failed: %d\n", FUNCTION_NAME, ret); - exit(1); - } - printf("%s: PtlInit succeeds (%d)\n", FUNCTION_NAME, ret); - -#ifdef FORK_BEFORE_NI_INIT - child = fork(); -#endif - - if ((ret = PtlNIInit(IFACE_FROM_BRIDGE_AND_NALID(PTL_BRIDGE_UK, PTL_IFACE_SS), - PTL_PID_ANY, NULL, &ptl_limits, &nih)) != PTL_OK) { - printf("%s: PtlNIInit failed: %d\n", FUNCTION_NAME, ret); - /*exit(1);*/ - } - else { - printf("%s: PtlNIInit succeeds (%d)\n", FUNCTION_NAME, ret); - } - -#ifdef FORK_AFTER_NI_INIT - child = fork(); -#endif - - if ((ret = PtlEQAlloc(nih, 4096, NULL, &eqh)) != PTL_OK) { - printf("%s: PtlEQAlloc failed: %d(%d)\n", - FUNCTION_NAME, ret, child); - exit(1); - } - printf("%s: PtlEQAlloc succeeds (%d:%d)\n", FUNCTION_NAME, child, ret); - -#if 1 - if (child) { - MPI_Init(&argc, &argv); - } - - if (child) { - PMI_Init(spawned); - printf("\n%d:spanwned=%d", child, *spawned); - if ((ret = PMI_Get_size(npes)) != PMI_SUCCESS) { - printf("%s: PMI_Get_size failed: %d\n", FUNCTION_NAME, ret); - /*exit(1);*/ - } - else { - printf("%s: PMI_Get_size succeeds (%d)\n", FUNCTION_NAME, *npes); - } - /*procid_map = (ptl_process_id_t *)malloc(sizeof(ptl_process_id_t)*(*npes)); - if(procid_map==NULL)exit(1);*/ - if ((ret = PMI_CNOS_Get_nidpid_map(&procid_map)) != PMI_SUCCESS) { - printf("Getting proc map failed (npes=%d)\n", *npes); - } - for (i = 0; i < *npes; i++) { - printf("\npid=%d nid=%d npes=%d(%d)", procid_map[i].pid, procid_map[i].nid, *npes, child); - } - } -#endif - - if ((ret = PtlGetId(nih, &rnk)) != PTL_OK) { - printf("%s: PtlGetId failed: %d(%d)\n", - FUNCTION_NAME, ret, child); - exit(1); - } - printf("%s: nid=%d pid=%d(%d)\n", FUNCTION_NAME, rnk.nid, rnk.pid, child); - if (child) { - MPI_Finalize(); - printf("%s: mpi_init and finalize succeed(%d)\n", FUNCTION_NAME, child); - } - -} diff --git a/armci/testing/shmclean.c b/armci/testing/shmclean.c index a4baf3355..70bdeb796 100644 --- a/armci/testing/shmclean.c +++ b/armci/testing/shmclean.c @@ -25,10 +25,6 @@ # include #endif -#ifdef SUN -char *shmat(); -#endif - #define MAXID 1000000 int main(int argc, char **argv) { diff --git a/armci/testing/shmtest.c b/armci/testing/shmtest.c index 47b2b64de..b65730314 100644 --- a/armci/testing/shmtest.c +++ b/armci/testing/shmtest.c @@ -22,9 +22,6 @@ # include #endif -#ifdef SUN -char *shmat(); -#endif int armci_test_allocate(long size) { diff --git a/armci/testing/te.c b/armci/testing/te.c deleted file mode 100644 index e1a09d3dd..000000000 --- a/armci/testing/te.c +++ /dev/null @@ -1,48 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - - -#include -#include -#include -#include -#include -#include - - -#include -#include -#include - -int variable, fd; - -int do_something() -{ - variable = 42; - close(fd); - _exit(0); -} - -int main(int argc, char *argv[]) -{ - void **child_stack; - char tempch; - - variable = 9; - fd = open("test.file", O_RDONLY); - child_stack = (void **) malloc(16384); - printf("The variable was %d\n", variable); - - clone(do_something, child_stack, CLONE_VM | CLONE_FILES, NULL); - sleep(1); - - printf("The variable is now %d\n", variable); - if (read(fd, &tempch, 1) < 1) { - perror("File Read Error"); - exit(1); - } - printf("We could read from the file\n"); - return 0; -} - diff --git a/armci/testing/test.c b/armci/testing/test.c index 4c88c0f0e..97ecb8576 100644 --- a/armci/testing/test.c +++ b/armci/testing/test.c @@ -66,11 +66,7 @@ extern void armci_unlockmem(void); #define MAXPROC 128 #define TIMES 100 -#ifdef CRAY -# define ELEMS 800 -#else # define ELEMS 200 -#endif /***************************** macros ************************/ @@ -83,56 +79,6 @@ extern void armci_unlockmem(void); int me, nproc; int work[MAXPROC]; /* work array for propagating addresses */ - - -#ifdef PVM -void pvm_init(int argc, char *argv[]) -{ - int mytid, mygid, ctid[MAXPROC]; - int np, i; - - mytid = pvm_mytid(); - if ((argc != 2) && (argc != 1)) { - goto usage; - } - if (argc == 1) { - np = 1; - } - if (argc == 2) { - if ((np = atoi(argv[1])) < 1) { - goto usage; - } - } - if (np > MAXPROC) { - goto usage; - } - - mygid = pvm_joingroup(MPGROUP); - - if (np > 1) { - if (mygid == 0) { - i = pvm_spawn(argv[0], argv + 1, 0, "", np - 1, ctid); - } - } - - while (pvm_gsize(MPGROUP) < np) { - sleep(1); - } - - /* sync */ - pvm_barrier(MPGROUP, np); - - printf("PVM initialization done!\n"); - - return; - -usage: - fprintf(stderr, "usage: %s \n", argv[0]); - pvm_exit(); - exit(-1); -} -#endif - /*\ generate random range for a section of multidimensional array \*/ void get_range(int ndim, int dims[], int lo[], int hi[]) diff --git a/armci/testing/test2.c b/armci/testing/test2.c index ae2c81dc9..f4d1233ba 100644 --- a/armci/testing/test2.c +++ b/armci/testing/test2.c @@ -74,11 +74,7 @@ #define MAXPROC 1024 #define TIMES 100 -#ifdef CRAY -# define ELEMS 800 -#else # define ELEMS 200 -#endif typedef struct { float real; @@ -100,54 +96,6 @@ typedef struct { int me, nproc; void *work[MAXPROC]; /* work array for propagating addresses */ - - -#ifdef PVM -void pvm_init(int argc, char *argv[]) -{ - int mytid, mygid, ctid[MAXPROC]; - int np, i; - - mytid = pvm_mytid(); - if ((argc != 2) && (argc != 1)) { - goto usage; - } - if (argc == 1) { - np = 1; - } - if (argc == 2) - if ((np = atoi(argv[1])) < 1) { - goto usage; - } - if (np > MAXPROC) { - goto usage; - } - - mygid = pvm_joingroup(MPGROUP); - - if (np > 1) - if (mygid == 0) { - i = pvm_spawn(argv[0], argv + 1, 0, "", np - 1, ctid); - } - - while (pvm_gsize(MPGROUP) < np) { - sleep(1); - } - - /* sync */ - pvm_barrier(MPGROUP, np); - - printf("PVM initialization done!\n"); - - return; - -usage: - fprintf(stderr, "usage: %s \n", argv[0]); - pvm_exit(); - exit(-1); -} -#endif - void create_array(void *a[], int elem_size, int ndim, int dims[]) { int bytes = elem_size, i, rc; diff --git a/armci/testing/test_groups.c b/armci/testing/test_groups.c index 4b9e57e13..1cbf15881 100644 --- a/armci/testing/test_groups.c +++ b/armci/testing/test_groups.c @@ -36,52 +36,6 @@ int me, nproc; void *work[MAXPROC]; /* work array for propagating addresses */ -#ifdef PVM -void pvm_init(int argc, char *argv[]) -{ - int mytid, mygid, ctid[MAXPROC]; - int np, i; - - mytid = pvm_mytid(); - if ((argc != 2) && (argc != 1)) { - goto usage; - } - if (argc == 1) { - np = 1; - } - if (argc == 2) - if ((np = atoi(argv[1])) < 1) { - goto usage; - } - if (np > MAXPROC) { - goto usage; - } - - mygid = pvm_joingroup(MPGROUP); - - if (np > 1) - if (mygid == 0) { - i = pvm_spawn(argv[0], argv + 1, 0, "", np - 1, ctid); - } - - while (pvm_gsize(MPGROUP) < np) { - sleep(1); - } - - /* sync */ - pvm_barrier(MPGROUP, np); - - printf("PVM initialization done!\n"); - - return; - -usage: - fprintf(stderr, "usage: %s \n", argv[0]); - pvm_exit(); - exit(-1); -} -#endif - void create_array(void *a[], int elem_size, int ndim, int dims[]) { int bytes = elem_size, i, rc; diff --git a/armci/testing/testnotify.c b/armci/testing/testnotify.c index 99667bf49..5fb6582b2 100644 --- a/armci/testing/testnotify.c +++ b/armci/testing/testnotify.c @@ -65,12 +65,7 @@ #define MAXPROC 128 #define TIMES 100 -#ifdef CRAY -# define ELEMS 800 -#else # define ELEMS 200 -#endif - /***************************** macros ************************/ @@ -83,54 +78,6 @@ int me, nproc; void *work[MAXPROC]; /* work array for propagating addresses */ - - -#ifdef PVM -void pvm_init(int argc, char *argv[]) -{ - int mytid, mygid, ctid[MAXPROC]; - int np, i; - - mytid = pvm_mytid(); - if ((argc != 2) && (argc != 1)) { - goto usage; - } - if (argc == 1) { - np = 1; - } - if (argc == 2) - if ((np = atoi(argv[1])) < 1) { - goto usage; - } - if (np > MAXPROC) { - goto usage; - } - - mygid = pvm_joingroup(MPGROUP); - - if (np > 1) - if (mygid == 0) { - i = pvm_spawn(argv[0], argv + 1, 0, "", np - 1, ctid); - } - - while (pvm_gsize(MPGROUP) < np) { - sleep(1); - } - - /* sync */ - pvm_barrier(MPGROUP, np); - - printf("PVM initialization done!\n"); - - return; - -usage: - fprintf(stderr, "usage: %s \n", argv[0]); - pvm_exit(); - exit(-1); -} -#endif - /*\ generate random range for a section of multidimensional array \*/ void get_range(int ndim, int dims[], int lo[], int hi[]) diff --git a/armci/testing/timer.h b/armci/testing/timer.h index 16201a1ef..54f4cecbf 100644 --- a/armci/testing/timer.h +++ b/armci/testing/timer.h @@ -1,7 +1,7 @@ #ifndef ARMCI_TESTING_TIMER_H_ #define ARMCI_TESTING_TIMER_H_ -#if (defined(__i386__) || defined(__x86_64__) || defined(__powerpc__)) && !defined(_CRAYC) +#if (defined(__i386__) || defined(__x86_64__) || defined(__powerpc__)) # define HAVE_RDTSC 1 # if defined(__i386__) static __inline__ unsigned long long rdtsc(void) diff --git a/cca/INSTALL.cca b/cca/INSTALL.cca deleted file mode 100644 index 5e1535528..000000000 --- a/cca/INSTALL.cca +++ /dev/null @@ -1,19 +0,0 @@ - -The following packages are need to compile cca directory.These packages -can be obtained from www.cca-forum.org -CCAFFEINE - - cca-spec-classic - - ccafe - - -Set the following Environment Variables appropriately. Other envs are -taken from $CCAFE_HOME/Makefile.Vars (Makefile.Vars is included in -Makefile.Rules) - For example: - export CCAFE_HOME=/msrc/proj/cca/manoj/dccafe/cxx - export GA_HOME=/msrc/home/manoj/GA - -Supported Platforms: - GA-CCA stuff is curently tested only in Linux and Solaris as the - CCA software (CCAFFEINE) is still under development. - diff --git a/cca/ga_cca_classic/CCAFERC_Sample b/cca/ga_cca_classic/CCAFERC_Sample deleted file mode 100644 index 0e7fdeee4..000000000 --- a/cca/ga_cca_classic/CCAFERC_Sample +++ /dev/null @@ -1,32 +0,0 @@ -#!ccaffeine bootstrap file. -# ------- don't change anything ABOVE this line.------------- -path set /msrc/proj/cca/manoj/dccafe/cxx/dc/component:/msrc/proj/cca/manoj/dccafe/lib/babel-components - -# Added by Manoj -path append /msrc/home/manoj/ga_cca_classic -path append /msrc/home/manoj/ga_cca_classic/TestComponent - -! The above line should be path /usr/local/ccafe/cxx/dc/component -! unless you hack/add your own component directories in a : separated list. -repository get-global StarterComponent -repository get-global TimeStamper -repository get-global Timer -repository get-global PrinterComponent -repository get-global RevalidateTest -repository get ccafe1.StarterComponent -repository get ccafe0.PrinterComponent -repository get ccafe_eg.PortTranslatorStarter - -repository get-global GA::GAServices -repository get-global TestComponent - -# create component instatiations -create GA::GAServices ga -create TestComponent tc - -#connect the components -connect tc ga_classic_port ga ga_classic_port - -go - -quit \ No newline at end of file diff --git a/cca/ga_cca_classic/DADFAxisInfo.cxx b/cca/ga_cca_classic/DADFAxisInfo.cxx deleted file mode 100644 index abb1bc438..000000000 --- a/cca/ga_cca_classic/DADFAxisInfo.cxx +++ /dev/null @@ -1,142 +0,0 @@ -#include "DADFAxisInfo.h" - -/****************************************************************************** - * CollapsedAxisInfo - * - * Very little to implement here, since a collapsed axis has no info. - *****************************************************************************/ - -CollapsedAxisInfo::~CollapsedAxisInfo(){} - -DistArrayTemplate::DistType CollapsedAxisInfo::getDistType() { - return DistArrayTemplate::Collapsed; -} - -void CollapsedAxisInfo::printAxisInfo() { - cerr << "(collapsed axis has no info)"; -} - -/****************************************************************************** - * BlockAxisInfo - *****************************************************************************/ - -BlockAxisInfo::BlockAxisInfo(){ - _blockSize = -1; - _first = -1; -} - -BlockAxisInfo::BlockAxisInfo(const BlockAxisInfo & original) - : _blockSize( original._blockSize ), - _first( original._first) -{} - -BlockAxisInfo::~BlockAxisInfo(){} - -DistArrayTemplate::DistType BlockAxisInfo::getDistType() { - return DistArrayTemplate::Block; -} - -void BlockAxisInfo::setDistParameters(const int blockSize, const int first) { - _blockSize = blockSize; - _first = first; -} - -void BlockAxisInfo::getDistParameters(int & blockSize, int & first) { - blockSize = _blockSize; - first = _first; -} - -void BlockAxisInfo::printAxisInfo() { - cerr << "block size = " << _blockSize << - ", first block on process = " << _first; -} - -/****************************************************************************** - * GenBlockAxisInfo - *****************************************************************************/ - -GenBlockAxisInfo::GenBlockAxisInfo(const int size) { - _blockSizes.resize( size ); -} - -GenBlockAxisInfo::GenBlockAxisInfo(const GenBlockAxisInfo & original) - : _blockSizes( original._blockSizes ) -{} - -GenBlockAxisInfo::~GenBlockAxisInfo(){} - -DistArrayTemplate::DistType GenBlockAxisInfo::getDistType() { - return DistArrayTemplate::GenBlock; -} - -void GenBlockAxisInfo::setDistParameters(const int blockSizes[]) { - unsigned int i; - - for ( i=0; i < _blockSizes.size(); ++i ) { - _blockSizes[i] = blockSizes[i]; - } -} - -void GenBlockAxisInfo::getDistParameters(int blockSizes[]) { - unsigned int i; - - for ( i=0; i < _blockSizes.size(); ++i ) { - blockSizes[i] = _blockSizes[i]; - } -} - -void GenBlockAxisInfo::printAxisInfo() { - unsigned int i; - - cerr << "block sizes = ("; - - for ( i=0; i < _blockSizes.size()-1; ++i ) { - cerr << _blockSizes[i] << ", "; - } - cerr << _blockSizes[i] << ")"; -} - -/****************************************************************************** - * ImplicitAxisInfo - *****************************************************************************/ - -ImplicitAxisInfo::ImplicitAxisInfo(const int size) { - _map.resize( size ); -} - -ImplicitAxisInfo::ImplicitAxisInfo(const ImplicitAxisInfo & original) - : _map( original._map ) -{} - -ImplicitAxisInfo::~ImplicitAxisInfo(){} - -DistArrayTemplate::DistType ImplicitAxisInfo::getDistType() { - return DistArrayTemplate::Implicit; -} - -void ImplicitAxisInfo::setDistParameters(const int map[]) { - unsigned int i; - - for ( i=0; i < _map.size(); ++i ) { - _map[i] = map[i]; - } -} - -void ImplicitAxisInfo::getDistParameters(int map[]) { - unsigned int i; - - for ( i=0; i < _map.size(); ++i ) { - map[i] = _map[i]; - } -} - -void ImplicitAxisInfo::printAxisInfo() { - unsigned int i; - - cerr << "element to process map = ("; - - for ( i=0; i < _map.size()-1; ++i ) { - cerr << _map[i] << ", "; - } - cerr << _map[i] << ")"; -} diff --git a/cca/ga_cca_classic/DADFAxisInfo.h b/cca/ga_cca_classic/DADFAxisInfo.h deleted file mode 100644 index 2c50c3f04..000000000 --- a/cca/ga_cca_classic/DADFAxisInfo.h +++ /dev/null @@ -1,218 +0,0 @@ -#ifndef DADFAxisInfo_h_seen -#define DADFAxisInfo_h_seen - -#include -#include "DistArrayTemplate.h" - -/** A set of classes to hold distribution parameters on a per-axis basis. - - $Id: DADFAxisInfo.h,v 1.1 2003-08-01 00:41:53 manoj Exp $ - - DADFAxisInfo is the abstract base class, but includes only two - functions -- one to report the distribution type, and one to print - the distribution parameters on the stdout stream (as an aid in - debugging). - - Each type of distribution has a concrete class that inherits from - DADFAxisInfo and adds specialized functions to set and get the - parameters appropriate to the type of distribution. (Exception: - since Collapsed distributions have no such data, they have no such - functions.) - - At present, no error checking is done here, on the assumption that - whoever creates us is better equipped to do it -- our purpose is - mainly storage. -*/ - -class DADFAxisInfo { - public: - - /** The usual destructor. */ - virtual ~DADFAxisInfo(){} - - /** Get the type of distribution this object represents. - */ - virtual DistArrayTemplate::DistType getDistType() = 0; - - /** Print distribution parameters to stdout stream in a - human-readable form. - - This is mainly intended to facilitate debugging. - */ - virtual void printAxisInfo() = 0; -}; - -/** Concrete DADFAxisInfo for collapsed distribution. */ - -class CollapsedAxisInfo : public DADFAxisInfo { - public: - - /** The usual destructor. */ - virtual ~CollapsedAxisInfo(); - - /** Get the type of distribution this object represents. - */ - virtual DistArrayTemplate::DistType getDistType(); - - /** Print distribution parameters to stdout stream in a - human-readable form. - - This is mainly intended to facilitate debugging. - */ - virtual void printAxisInfo(); -}; - -/** Concrete DADFAxisInfo for block distribution. */ - -class BlockAxisInfo : public DADFAxisInfo { - public: - - /** Normal constructor. - */ - BlockAxisInfo(); - - /** Copy constructor. - - @param original (in) Instance of BlockAxisInfo from which to - initialize new object. - */ - BlockAxisInfo(const BlockAxisInfo & original); - - /** The usual destructor */ - virtual ~BlockAxisInfo(); - - /** Get the type of distribution this object represents. - */ - virtual DistArrayTemplate::DistType getDistType(); - - /** Set block distribution parameters. - - @param blockSize (in) Size of block. - @param first (in) process owning first block of distribution. - */ - void setDistParameters(const int blockSize, const int first); - - /** Get block distribution parameters. - - @param blockSize (out) Size of block. - @param first (out) process owning first block of distribution. - */ - void getDistParameters(int & blockSize, int & first); - - /** Print distribution parameters to stdout stream in a - human-readable form. - - This is mainly intended to facilitate debugging. - */ - virtual void printAxisInfo(); - - private: - /** Block size for block/cyclic distribution. */ - int _blockSize; - - /** Process on which first block resides */ - int _first; -}; - -/** Concrete DADFAxisInfo for generalized block distribution. */ - -class GenBlockAxisInfo : public DADFAxisInfo { - public: - - /** Normal constructor. - - @param size (in) Size of block size vector (should be same as - number of processes in this axis of process topology). - */ - GenBlockAxisInfo(const int size); - - /** Copy constructor. - - @param original (in) Instance of GenBlockAxisInfo from which to - initialize new object. - */ - GenBlockAxisInfo(const GenBlockAxisInfo & original); - - /** The usual destructor */ - virtual ~GenBlockAxisInfo(); - - /** Get the type of distribution this object represents. - */ - virtual DistArrayTemplate::DistType getDistType(); - - /** Set generalized block distribution parameters. - - @param blockSizes (in) Array of block sizes - */ - void setDistParameters(const int blockSizes[]); - - /** Get generalized block distribution parameters. - - @param blockSizes (out) Array of block sizes - */ - void getDistParameters(int blockSizes[]); - - /** Print distribution parameters to stdout stream in a - human-readable form. - - This is mainly intended to facilitate debugging. - */ - virtual void printAxisInfo(); - - private: - - /** Array of block sizes. Should size as the number of processes. */ - std::vector _blockSizes; -}; - -/** Concrete DADFAxisInfo for an implicit distribution. */ - -class ImplicitAxisInfo : public DADFAxisInfo { - public: - - /** Normal constructor. - - @param size (in) Size of element to process map (should be same - as number of elements in this axis of array template). - */ - ImplicitAxisInfo(const int size); - - /** Copy constructor. - - @param original (in) Instance of ImplicitAxisInfo from which to - initialize new object. - */ - ImplicitAxisInfo(const ImplicitAxisInfo & original); - - /** The usual destructor */ - virtual ~ImplicitAxisInfo(); - - /** Get the type of distribution this object represents. - */ - virtual DistArrayTemplate::DistType getDistType(); - - /** Set implicit map distribution parameters. - - @param map (in) Array mapping elements to processes - */ - void setDistParameters(const int map[]); - - /** Get implicit map distribution parameters. - - @param map (out) Array mapping elements to processes - */ - void getDistParameters(int map[]); - - /** Print distribution parameters to stdout stream in a - human-readable form. - - This is mainly intended to facilitate debugging. - */ - virtual void printAxisInfo(); - - private: - /** Mapping of elements to processes. */ - std::vector _map; -}; - -#endif // DADFAxisInfo_h_seen diff --git a/cca/ga_cca_classic/DADFDescriptor.cxx b/cca/ga_cca_classic/DADFDescriptor.cxx deleted file mode 100644 index d57669834..000000000 --- a/cca/ga_cca_classic/DADFDescriptor.cxx +++ /dev/null @@ -1,400 +0,0 @@ -#include "DADFDescriptor.h" -#include "DistArrayTemplate.h" - -/** This is our implementation of DistArrayDescriptor. It is kept - private within the DistArrayDescriptorFactory. Only - DistArrayDescriptor is exposed to the outside. - - $Id: DADFDescriptor.cxx,v 1.1 2003-08-01 00:41:53 manoj Exp $ - */ - -/****************************************************************************** - * Constructors and destructors - *****************************************************************************/ - -/** Constructor sets descriptor name. */ -DADFDescriptor::DADFDescriptor( const std::string name) { - _name = name; - _frozen = false; - _rank = -1; - _type = stv_Int; - _templ = 0; - _isExplicitDist = false; -} - -/** Construct a new descriptor as a copy of an old one. -*/ -DADFDescriptor::DADFDescriptor(const std::string name, DADFDescriptor & original) - : _rank( original._rank ), - _type( original._type), - _lowerBounds( original._lowerBounds ), - _upperBounds( original._upperBounds ), - _topology( original._topology ), - _procCoords( original._procCoords ), - _isExplicitDist( original._isExplicitDist ) -{ - // Use name provided by user rather than from original - _name = name; - - // Regardless of whether original was frozen, ths one should not be - _frozen = false; - - // Duplicate template - DADFTemplate * originalTemplDADF = - dynamic_cast(original._templ); - _templ = new DADFTemplate( *originalTemplDADF ); - - /** _regionList must be treated with similar care to _axisInfo. */ - - DADFRegionInfo * dri; // Used to copy _regionList - std::list::iterator driter; - - for ( driter=original._regionList.begin(); - driter != original._regionList.end(); ++driter ) { - - dri = new DADFRegionInfo( *(*driter) ); - _regionList.push_back( dri ); - } - -} - - DADFDescriptor::~DADFDescriptor() { - delete _templ; - - std::list::iterator driter; - - // Clean up _regionList - for ( driter= _regionList.begin(); driter != _regionList.end(); - ++driter ) { - delete *driter; - } - _regionList.resize(0); -} - -/****************************************************************************** - * Define the descriptor - *****************************************************************************/ - -/** Set data type. */ -int DADFDescriptor::setDataType(const enum DataType type) { - // Insure we haven't been commit()ed - if ( _frozen ) { return -11; } - - _type = type; - - return 0; -} - -/** Associate this data object with a distribution template. */ -int DADFDescriptor::setTemplate(DistArrayTemplate * & templ) { - // Insure we haven't been commit()ed - if ( _frozen ) { return -11; } - - // If there's already one there, replace it - if ( _templ != 0 ) { delete _templ; } - - DADFTemplate * templDADF = dynamic_cast(templ); - if ( templDADF == 0 ) { - cerr << "DADFDescriptor:setTemplate: " << - "Template is of the wrong type." << endl; - return -13; - } - - // Make a private copy of the template - DADFTemplate * newtempl = new DADFTemplate( *templDADF ); - _templ = dynamic_cast(newtempl); - - return 0; -} - -/** Sets this process's location in the process topology. */ -int DADFDescriptor::setMyProcCoords(const int procCoords[] ) { - int i; - - // Insure we haven't been commit()ed - if ( _frozen ) { return -11; } - - // Check bounds for validity - for ( i=0; i < _rank; ++i ) { - if ( procCoords[i] < 0 || procCoords[i] > _topology[i] ) { - return -6; - } - } - - // Copy arguments into our data structure - for ( i=0; i < _rank; ++i ) { _procCoords[ i ] = procCoords[ i ]; } - - return 0; -} - -/** Align object to template with identity mapping. */ -int DADFDescriptor::setIdentityAlignmentMap() { - int i, err; - int *proc; - int *lower, *upper; - DistArrayTemplate::DistType *dist; - - // Insure we haven't been commit()ed - if ( _frozen ) { return -11; } - - // Insure we have a template - if ( _templ == 0 ) { return -14; } - - /** At the moment, this simply means we're allowed to extract stuff - from the template more or less with impunity. */ - _rank = _templ->getRank(); - - // Resize our internal vectors - _lowerBounds.resize( _rank ); - _upperBounds.resize( _rank ); - _topology.resize( _rank ); - _procCoords.resize( _rank ); - - proc = new int[ _rank ]; - err = _templ->getProcTopology( &(*proc) ); - if ( err != 0 ) { return err; } - - for ( i=0; i < _rank; ++i ) { - _topology[i] = proc[i]; - } - delete [] proc; - - lower = new int[ _rank ]; - upper = new int[ _rank ]; - err = _templ->getGlobalBounds( &(*lower), &(*upper) ); - if ( err != 0 ) { return err; } - - for ( i=0; i < _rank; ++i ) { - _lowerBounds[i] = lower[i]; - _upperBounds[i] = upper[i]; - } - - delete [] lower; - delete [] upper; - - dist = new DistArrayTemplate::DistType[ _rank ]; - err = _templ->getDistType( &(*dist) ); - if ( err != 0 ) { return err; } - - _isExplicitDist = ( dist[0] == DistArrayTemplate::Explicit ); - - delete [] dist; - - return 0; -} - -/** Set pointer the local piece of the data object. */ -int DADFDescriptor::setLocalDataPointer(void* data, const int - strides[]) { - int i; - - // Insure we haven't been commit()ed - if ( _frozen ) { return -11; } - - // This call is only valid for non-explicit distributions - if ( _isExplicitDist ) { return -2; } - - /** Sanity check the strides. Until someone come up with a use case - to demonstrate why we need non-positive strides, we'll say - they're bad. - */ - for (i=0; i < _rank; ++i) { - if ( strides[i] < 1 ) { return -16; } - } - - // Create a region info and set the point & stride (but not the bounds) - DADFRegionInfo * dri = new DADFRegionInfo( _rank ); - dri->setDataLocation( data, strides ); - _regionList.push_back( dri ); - - return 0; -} - -/** Set pointer for a region of an explicitly distributed data object. */ -int DADFDescriptor::setRegionDataPointer(const int lower[], const int - upper[], void* data, const int - strides[]) { - int i; - - // Insure we haven't been commit()ed - if ( _frozen ) { return -11; } - - // This call is only valid for explicit distributions - if ( ! _isExplicitDist ) { return -2; } - - /** Sanity check the strides. Until someone come up with a use case - to demonstrate why we need non-positive strides, we'll say - they're bad. - */ - for (i=0; i < _rank; ++i) { - if ( strides[i] < 1 ) { return -16; } - } - - /** This is incorrect -- it checks our region list, not the - template's. At the moment we don't have query functions on the - template. Alternatively, we could initialize our region list - from the template's when we make the alignment map, but once - again we really need the template queries to do that. So for - now, we just punt, and hope the user is careful! - */ -// // Check that specified region matches one registered -// std::list::iterator driter; - -// driter= _regionList.begin(); -// for ( _regionList.begin(); driter != _regionList.end(); ++driter ) { -// cerr << " compare returns " << -// (*driter)->compareBounds( lower, upper) << endl; -// if ( (*driter)->compareBounds( lower, upper) == 1 ) { break; } -// } -// // If we reached the end, there's no match -// if ( driter == _regionList.end() ) { return -101; } - - // Create a region info and set everything - DADFRegionInfo * dri = new DADFRegionInfo( _rank ); - dri->setBounds( lower, upper ); - dri->setDataLocation( data, strides ); - _regionList.push_back( dri ); - - return 0; -} - -/** Signal that descriptor is completely defined. */ -int DADFDescriptor::commit() { - // Insure we haven't been commit()ed already - if ( _frozen ) { - return -11; - } else { - _frozen = true; - return 0; - } - - // Perform global consistency checks - - // Should test topology if dist is explicit - // Test if area of explicit regions == area of descriptor -} - -/****************************************************************************** - * Query the descriptor - *****************************************************************************/ - -std::string DADFDescriptor::getName() { - return _name; -} - -bool DADFDescriptor::isDefined() { - if ( _frozen ) { - return true; - } else { - return false; - } -} - -/** Get data type. */ -DistArrayDescriptor::DataType DADFDescriptor::getDataType() { - return _type; -} - -/** Return pointer to distribution template associated with this - data object. -*/ -DistArrayTemplate * DADFDescriptor::getTemplate() { - return _templ; -} - -/** Get this process's location in the process topology. */ -int DADFDescriptor::getMyProcCoords(int procCoords[] ) { - int i; - - // Sanity check our state - if ( _rank < 1 ) { return -8; } - - // Copy arguments out of our data structures - for ( i=0; i < _rank; ++i ) { procCoords[ i ] = _procCoords[ i ]; } - - return 0; -} - -// Kludges to check GenBlock and Explicit descriptors. -int DADFDescriptor::getNumLocalRegions() { - // if ( ! _frozen ) { return -100; } - - if ( _isExplicitDist ) { - return _regionList.size(); - } else { - return 1; - } -} -// Kludges to check GenBlock and Explicit descriptors. -int DADFDescriptor::getLocalRegionInfo(int region, int lower[], int upper[], - void * & data, int strides[]) { - int i; - - // if ( ! _frozen ) { return -100; } - - if ( _isExplicitDist ) { - if ( region >= (int)_regionList.size() ) { return -15; } - } else { - if ( region != 0 ) { return -15; } - } - - // Find the right region (assumes list not rearranged btw calls!) - std::list::iterator driter; - driter= _regionList.begin(); - for ( i = 0; i < region; ++i ) { ++driter; } - - (*driter)->getBounds( lower, upper ); - (*driter)->getDataLocation(data, strides ); - - return 0; -} - -/** Print the contents of the descriptor (for debugging) */ -void DADFDescriptor::printDescriptor() { - int i; - static const std::string typeLabels[12] - = { "stvInt", "stvFloat", "stvCplx", "stvDouble", "stvDcplx", - "stvLong", "stvShort", "stvStr", "stvUshort", "stvUint", - "stvUlong", "stvByte" }; - - cerr << "Distributed array descriptor `" << _name << "' rank " << - _rank << " type `" << typeLabels[ _type ] << "'" << - ((_frozen)? " (" : " (not" ) << " frozen)" << endl; - - if ( _templ != 0 ) { - cerr << " Associated with template: `" << _templ->getName() - << "'" << endl; - } - - if ( _rank < 1 ) { return; } - - cerr << " Bounds: " ; - for ( i=0; i < _rank-1 ; ++i ) { - cerr << _lowerBounds[i] << ":" << _upperBounds[i] << ", "; - } - cerr << _lowerBounds[i] << ":" << _upperBounds[i] << endl; - - cerr << " Process topology: " ; - for ( i=0; i < _rank-1 ; ++i ) { - cerr << _topology[i] << ", "; - } - cerr << _topology[i] << endl; - - cerr << " Process coordinates: " ; - for ( i=0; i < _rank-1 ; ++i ) { - cerr << _procCoords[i] << ", "; - } - cerr << _procCoords[i] << endl; - - // Print out the region list - std::list::iterator driter; - - cerr << " Regions registered: " << _regionList.size() << endl; - - for ( driter= _regionList.begin(); - driter != _regionList.end(); ++driter ) { - cerr << " Region "; - (*driter)->printRegionInfo(); - cerr << endl; - } -} diff --git a/cca/ga_cca_classic/DADFDescriptor.h b/cca/ga_cca_classic/DADFDescriptor.h deleted file mode 100644 index 2ba693880..000000000 --- a/cca/ga_cca_classic/DADFDescriptor.h +++ /dev/null @@ -1,130 +0,0 @@ -#ifndef DADFDescriptor_h_seen -#define DADFDescriptor_h_seen - -/** This is our implementation of DistArrayDescriptor. It is kept - private within the DistArrayDescriptorFactory. Only - DistArrayDescriptor is exposed to the outside. - - $Id: DADFDescriptor.h,v 1.1 2003-08-01 00:41:53 manoj Exp $ - */ - -#include -#include - -#include "DistArrayDescriptor.h" -#include "DADFTemplate.h" - -class DADFDescriptor : public DistArrayDescriptor { - public: - - /**************************************************************************** - * Constructors and destructors - ***************************************************************************/ - - /** Simple constructor for internal use. */ - DADFDescriptor() ; - - /** Normal constructor -- forces setting of name */ - DADFDescriptor( const std::string name ) ; - - /** Copy constructor */ - DADFDescriptor( const std::string name, DADFDescriptor & original ); - - /** The usual destructor */ - virtual ~DADFDescriptor(); - - /** Set data type. */ - virtual int setDataType(const enum DataType type); - - /** Associate this data object with a distribution template. */ - virtual int setTemplate(DistArrayTemplate * & templ); - - /** Sets this process's location in the process topology. */ - virtual int setMyProcCoords(const int procCoords[] ); - - /** Align object to template with identity mapping. */ - virtual int setIdentityAlignmentMap(); - - /** Set pointer for the local region of the data object. */ - virtual int setLocalDataPointer(void* data, const int strides[]); - - /** Set pointer for a local region of an explicitly distributed - data object. - */ - virtual int setRegionDataPointer(const int lower[], const int - upper[], void* data, const int - strides[]); - - /** Signal that data object is completely defined. */ - virtual int commit(); - - /**************************************************************************** - * Query the descriptor - ***************************************************************************/ - - /** Return name given to descriptor */ - virtual std::string getName(); - - /** Has commit() been called on this descriptor? */ - virtual bool isDefined(); - - /** Get data type. */ - virtual DistArrayDescriptor::DataType getDataType(); - - /** Return pointer to distribution template associated with this - data object. - */ - virtual DistArrayTemplate * getTemplate(); - - /** Get this process's location in the process topology. */ - virtual int getMyProcCoords(int procCoords[] ); - - /** Mainly for testing and debugging */ - virtual void printDescriptor(); - - /** Part of a kludge for debugging. */ - virtual int getNumLocalRegions(); - - /** Part of a kludge for debugging. */ - virtual int getLocalRegionInfo(int region, int lower[], int upper[], - void * & data, int strides[]); - - /**************************************************************************** - * Internals - ***************************************************************************/ - private: - - // Human-readable name for this descriptor - std::string _name; - - // Whether or not commit() has been called - int _frozen; - - // Rank of array - int _rank; - - // Type of data - DataType _type; - - // Shorthand check if this is an explicit distribution - bool _isExplicitDist; - - // Array distribution template for this descriptor - DistArrayTemplate * _templ; - - // Global lower bounds of array - std::vector _lowerBounds; - - // Global upper bounds of array - std::vector _upperBounds; - - // Process topology - std::vector _topology; - - // Coordinates of this process in topology - std::vector _procCoords; - - // List of regions associated with this process of array - std::list _regionList; -}; -#endif // DADFDescriptor_h_seen diff --git a/cca/ga_cca_classic/DADFRegionInfo.cxx b/cca/ga_cca_classic/DADFRegionInfo.cxx deleted file mode 100644 index c2a55959a..000000000 --- a/cca/ga_cca_classic/DADFRegionInfo.cxx +++ /dev/null @@ -1,109 +0,0 @@ -#include "DADFRegionInfo.h" - -DADFRegionInfo::DADFRegionInfo(const int size) { - _lowerBounds.resize( size ); - _upperBounds.resize( size ); - _strides.resize( size ); - _data = 0; -} - -DADFRegionInfo::DADFRegionInfo(const DADFRegionInfo & original) - : _lowerBounds( original._lowerBounds ), - _upperBounds( original._upperBounds ), - _strides( original._strides ), - _data( original._data ) -{} - -DADFRegionInfo::~DADFRegionInfo(){} - -void DADFRegionInfo::setBounds(const int lower[], const int upper[]) { - unsigned int i; - - for ( i=0; i < _lowerBounds.size(); ++i ) { - _lowerBounds[i] = lower[i]; - _upperBounds[i] = upper[i]; - } -} - -void DADFRegionInfo::getBounds(int lower[], int upper[]) { - unsigned int i; - - for ( i=0; i < _lowerBounds.size(); ++i ) { - lower[i] = _lowerBounds[i]; - upper[i] = _upperBounds[i]; - } -} - -/** Compare region bounds. */ -int DADFRegionInfo::compareBounds(const int lower[], const int upper[]) { - unsigned int i; int minUpper, maxLower; - bool identical = true; - bool overlap = true; - - /** Ranges in an axis overlap if the minimum upper bound is greater - than or equal to the maximum lower bound. If overlap occurs in - all axes, the regions overlap. Identity implies overlap. - */ - i = 0; - while ( (identical || overlap) && i < _lowerBounds.size() ) { - identical = identical && ( lower[i] == _lowerBounds[i]); - identical = identical && ( upper[i] == _upperBounds[i]); - - minUpper = (upper[i] < _upperBounds[i] ) ? upper[i] : _upperBounds[i]; - maxLower = (lower[i] > _lowerBounds[i] ) ? lower[i] : _lowerBounds[i]; - overlap = overlap && ( minUpper >= maxLower); - i++; - } - // If regions are identical they will also overlap. Identity has priority. - if ( identical ) { return 1; } - if ( overlap ) { return -1; } - - // The regions are disjoint - return 0; -} - -void DADFRegionInfo::setDataLocation(void * dataPtr, - const int strides[] ) { - unsigned int i; - - _data = dataPtr; - for ( i=0; i < _strides.size(); ++i ) { _strides[i] = strides[i]; } -} - -void DADFRegionInfo::getDataLocation( void * & dataPtr, int strides[] ) { - unsigned int i; - - dataPtr = _data; - - for ( i=0; i < _strides.size(); ++i ) { strides[i] = _strides[i]; } -} - -void DADFRegionInfo::printRegionInfo() { - unsigned int i; - - cerr << "("; - - for ( i=0; i < _lowerBounds.size()-1; ++i ) { - cerr << _lowerBounds[i] << ", "; - } - cerr << _lowerBounds[i] << ")"; - - cerr << " --> ("; - - for ( i=0; i < _upperBounds.size()-1; ++i ) { - cerr << _upperBounds[i] << ", "; - } - cerr << _upperBounds[i] << ")"; - - if ( _data != 0 ) { - cerr << " at " << _data << " with strides ("; - - for ( i=0; i < _strides.size()-1; ++i ) { - cerr << _strides[i] << ", "; - } - cerr << _strides[i] << ")"; - - // WARNING! Assumes double data type. Not general! - cerr << " first elem = " << *(static_cast(_data)); - } -} diff --git a/cca/ga_cca_classic/DADFRegionInfo.h b/cca/ga_cca_classic/DADFRegionInfo.h deleted file mode 100644 index 5355b54e0..000000000 --- a/cca/ga_cca_classic/DADFRegionInfo.h +++ /dev/null @@ -1,86 +0,0 @@ -#ifndef DADFRegionInfo_h_seen -#define DADFRegionInfo_h_seen - -#include - -/** A concrete class to hold a region specification for a - multidimensional array. - - $Id: DADFRegionInfo.h,v 1.1 2003-08-01 00:41:53 manoj Exp $ - - There's nothing fancy here, just lower bounds and upper bounds. - Maybe eventually we'll need something fancier, with IDs/handles for - the regions and other stuff. But for now this suffices. - */ - -class DADFRegionInfo { - public: - - /** Normal constructor. - - @param size (in) length of the upper/lower bounds arrays - */ - DADFRegionInfo( int size ); - - /** Copy constructor. - - @param original (in) Instance of DADFRegionInfo from which to - initialize new object. - */ - DADFRegionInfo( const DADFRegionInfo & original); - - ~DADFRegionInfo(); - - /** Set region bounds. - - @param lower (in) lower bounds of region - @param upper (in) upper bounds of region - */ - void setBounds(const int lower[], const int upper[]); - - /** Get region bounds. - - @param lower (in) lower bounds of region - @param upper (in) upper bounds of region - */ - void getBounds(int lower[], int upper[]); - - /** Compare region bounds. Determine if object's region and - argument region are identical, disjoint, or overlapping. - - @param lower (in) lower bounds of region - @param upper (in) upper bounds of region - - @retval 1 Object region and argument region are identical - @retval 0 Object region and argument region are disjoint - @retval -1 Object region and argument region overlap but aren't - identical - */ - int compareBounds(const int lower[], const int upper[]); - - /** Set data location. - - @param dataPtr (in) pointer to data - @param strides (in) stride in each dimension to access data locations - - */ - void setDataLocation(void * dataPtr, const int strides[] ); - - /** Get data location. - - @param dataPtr (out) pointer to data - @param strides (out) stride in each dimension to access data locations - - */ - void getDataLocation(void * & dataPtr, int strides[] ); - - void printRegionInfo(); - - private: - std::vector _lowerBounds; - std::vector _upperBounds; - std::vector _strides; - void * _data; -}; - -#endif // DADFRegionInfo_h_seen diff --git a/cca/ga_cca_classic/DADFTemplate.cxx b/cca/ga_cca_classic/DADFTemplate.cxx deleted file mode 100644 index 5e5763685..000000000 --- a/cca/ga_cca_classic/DADFTemplate.cxx +++ /dev/null @@ -1,646 +0,0 @@ -#include -#include "DADFTemplate.h" -#include "DADFAxisInfo.h" - -/** This is our implementation of DistArrayTemplate. It is kept - private within the DistArrayDescriptorFactory. Only - DistArrayTemplate is exposed to the outside. - - $Id: DADFTemplate.cxx,v 1.1 2003-08-01 00:41:53 manoj Exp $ - */ - -/****************************************************************************** - * Constructors and destructors - *****************************************************************************/ - -/** Basic constructor. - - Probably should not be used, since we prefer to force the user to - provide a name for the object. -*/ -// DADFTemplate::DADFTemplate() { - -// _name = "_UNNAMED"; -// _frozen = false; -// _rank = -1; -// _volume = 1; // Allows use of *= -// _volDefined = 0; -// } - -/** Constructor sets template name. */ -DADFTemplate::DADFTemplate( const std::string name) { - _name = name; - _frozen = false; - _rank = -1; - _volume = 1; // Allows use of *= - _volDefined = 0; -} - -/** Construct a new template as a copy of an old one. Of course a - name must be provided for the new one. -*/ -DADFTemplate::DADFTemplate( const std::string name, DADFTemplate & original) - : _rank( original._rank ), - _volume( original._volume ), - _volDefined( original._volDefined ), - _lowerBounds( original._lowerBounds ), - _upperBounds( original._upperBounds ), - _topology( original._topology ), - _dist( original._dist ) -{ - // Use name in argument instead of copied from original - _name = name; - - // Regardless of whether original was frozen, ths one should not be - _frozen = false; - - /** Setting up _axisInfo is a little complicated. We expect it to - be of size original._rank unless original hasn't had setRank() - called on it yet. This should take care of things. - */ - _axisInfo.resize( original._axisInfo.size() ); - - unsigned int i; - BlockAxisInfo * bai; // Used for casting to copy _axisInfo - GenBlockAxisInfo * gbai; // Used for casting to copy _axisInfo - ImplicitAxisInfo * iai; // Used for casting to copy _axisInfo - - // Regular distributions are per-axis - for ( i=0; i < original._axisInfo.size(); ++i ) { - if ( original._axisInfo[i] ) { - switch ( (original._axisInfo[i])->getDistType() ) { - case Collapsed: - // Nothing to copy, so no need for a copy constructor here - _axisInfo[i] = new CollapsedAxisInfo; - break; - case Block: - bai = dynamic_cast(original._axisInfo[i]); - _axisInfo[i] = new BlockAxisInfo( *bai ); - break; - case GenBlock: - gbai = dynamic_cast(original._axisInfo[i]); - _axisInfo[i] = new GenBlockAxisInfo( *gbai ); - break; - case Implicit: - iai = dynamic_cast(original._axisInfo[i]); - _axisInfo[i] = new ImplicitAxisInfo( *iai ); - break; - } - } else { - _axisInfo[i] = 0; - } - } - - /** _regionList must be treated with similar care to _axisInfo. */ - - DADFRegionInfo * dri; // Used to copy _regionList - std::list::iterator driter; - - for ( driter=original._regionList.begin(); - driter != original._regionList.end(); ++driter ) { - - dri = new DADFRegionInfo( *(*driter) ); - _regionList.push_back( dri ); - } - -} - -/** Construct a new template as an identical copy of an old one. */ -DADFTemplate::DADFTemplate( DADFTemplate & original) - : _name( original._name ), - _rank( original._rank ), - _volume( original._volume ), - _volDefined( original._volDefined ), - _lowerBounds( original._lowerBounds ), - _upperBounds( original._upperBounds ), - _topology( original._topology ), - _dist( original._dist ), - _frozen( original._frozen ) -{ - /** Setting up _axisInfo is a little complicated. We expect it to - be of size original._rank unless original hasn't had setRank() - called on it yet. This should take care of things. - */ - _axisInfo.resize( original._axisInfo.size() ); - - unsigned int i; - BlockAxisInfo * bai; // Used for casting to copy _axisInfo - GenBlockAxisInfo * gbai; // Used for casting to copy _axisInfo - ImplicitAxisInfo * iai; // Used for casting to copy _axisInfo - - // Regular distributions are per-axis - for ( i=0; i < original._axisInfo.size(); ++i ) { - if ( original._axisInfo[i] ) { - switch ( (original._axisInfo[i])->getDistType() ) { - case Collapsed: - // Nothing to copy, so no need for a copy constructor here - _axisInfo[i] = new CollapsedAxisInfo; - break; - case Block: - bai = dynamic_cast(original._axisInfo[i]); - _axisInfo[i] = new BlockAxisInfo( *bai ); - break; - case GenBlock: - gbai = dynamic_cast(original._axisInfo[i]); - _axisInfo[i] = new GenBlockAxisInfo( *gbai ); - break; - case Implicit: - iai = dynamic_cast(original._axisInfo[i]); - _axisInfo[i] = new ImplicitAxisInfo( *iai ); - break; - } - } else { - _axisInfo[i] = 0; - } - } - - /** _regionList must be treated with similar care to _axisInfo. */ - - DADFRegionInfo * dri; // Used to copy _regionList - std::list::iterator driter; - - for ( driter=original._regionList.begin(); - driter != original._regionList.end(); ++driter ) { - - dri = new DADFRegionInfo( *(*driter) ); - _regionList.push_back( dri ); - } - -} - -DADFTemplate::~DADFTemplate() { - std::vector::iterator iter; - std::list::iterator driter; - - // Clean up anything in _axisInfo - for ( iter = _axisInfo.begin(); iter != _axisInfo.end(); ++iter ) { - if ( *iter != 0 ) { delete *iter; } - } - - // Clean up _regionList - for ( driter= _regionList.begin(); driter != _regionList.end(); - ++driter ) { - delete *driter; - } - _regionList.resize(0); -} - -/****************************************************************************** - * Define the template - *****************************************************************************/ - -/** Name associated with this distribution. */ -int DADFTemplate::setName(const std::string name) { - // Insure we haven't been commit()ed - if ( _frozen ) { return -10; } - - _name = name; - return 0; -} - -/** Set rank (number of dimensions) of distribution template. */ -int DADFTemplate::setRank(const int rank) { - int i; - bool modifying = false; // Are we modifying the clone of an existing templ? - - // Insure we haven't been commit()ed - if ( _frozen ) { return -10; } - - // Check for invalid rank - if ( rank < 1 ) { return -3; } - - /** If we're modifying a cloned template, we want to do some nice - things, like resize all the arrays of size "rank" while - preserving as much of the original information as possible. To - do this, we depend on the constructor to initialize _rank to an - invalid value */ - if ( _rank > 0 ) { modifying = true; } - - // Rank defines the size of many arrays, so allocate them now. - - _lowerBounds.resize( rank ); - _upperBounds.resize( rank ); - _topology.resize( rank ); - _dist.resize( rank ); - _axisInfo.resize( rank ); - - // Finish initializing anything new. Do something mostly harmless. - for ( i = (modifying)? _rank : 0 ; i < rank; ++i ) { - _lowerBounds[ i ] = 0; - _upperBounds[ i ] = -1; - _topology[ i ] = 1; - _dist[ i ] = Collapsed; - _axisInfo[ i ] = 0; - } - - /** If the rank is changing, kill any explicit regions that may have - been defined. - */ - - if ( modifying && rank != _rank ) { - std::list::iterator driter; - - for ( driter= _regionList.begin(); - driter != _regionList.end(); ++driter ) { - delete *driter; - } - _regionList.resize(0); - } - - /** Don't forget to store the rank itself! - Intentionally saved for last, since above we use the fact that - _rank holds the original length if we're modifying. - */ - _rank = rank; - - return 0; -} - -/** Set the global upper and lower bounds of the array. */ -int DADFTemplate::setGlobalBounds(const int lower[], const int upper[]) { - int i; - - // Insure we haven't been commit()ed - if ( _frozen ) { return -10; } - - // Sanity check our state - if ( _rank < 1 ) { return -8; } - - // Check bounds for validity and compute volume - for ( i=0; i < _rank; ++i ) { - if ( lower[i] > upper[i] ) { return -1; } - - _volume *= ( upper[i]-lower[i]+1 ); - } - - // Copy arguments into our data structures - for ( i=0; i < _rank; ++i ) { - _lowerBounds[ i ] = lower[ i ]; - _upperBounds[ i ] = upper[ i ]; - } - - return 0; -} - -/** Sets process topology. */ -int DADFTemplate::setProcTopology(const int topology[] ) { - int i; - - // Insure we haven't been commit()ed - if ( _frozen ) { return -10; } - - // Sanity check our state - if ( _rank < 1 ) { return -8; } - - // Check bounds for validity - for ( i=0; i < _rank; ++i ) { if ( topology[i] < 1 ) { return -6; } } - - // Copy arguments into our data structure - for ( i=0; i < _rank; ++i ) { _topology[ i ] = topology[ i ]; } - - return 0; -} - -/** Sets distribution type on each axis. */ -int DADFTemplate::setDistType(const enum DistType dist[] ) { - int i; - //static const std::string distLabels[5] = { "Coll", "Bloc", "GenB", "Impl", "Expl" }; - - // Insure we haven't been commit()ed - if ( _frozen ) { return -10; } - - // Sanity check our state - if ( _rank < 1 ) { return -8; } - - // Check input for validity: if one is explicit, all must be - bool oneExpl = false; - bool allExpl = true; - for ( i=0; i < _rank; ++i) { - oneExpl = ( oneExpl || dist[i] == Explicit ); - allExpl = ( allExpl && dist[i] == Explicit ); - } - if ( oneExpl && ! allExpl ) { return -2; } - - // Copy arguments into our data structure, make sure axisInfo is consistent - for ( i=0; i < _rank; ++i ) { - _dist[ i ] = dist[ i ]; - - /** If we're modifying a cloned object, we only want to preserve - those axisInfos that are consistent - */ - if ( _axisInfo[i] && (_axisInfo[i])->getDistType() != _dist[i] ) { - delete _axisInfo[i]; - _axisInfo[i] = 0; - } - } - - return 0; -} - -/** Set distribution parameters for an axis with a regular distributions. */ -int DADFTemplate::setDistParameters(int axis, int blockSize, - int first) { - // Insure we haven't been commit()ed - if ( _frozen ) { return -10; } - - // Sanity check inputs - if ( axis < 0 || axis >= _rank ) { return -4; } - if ( _dist[ axis ] != Block ) { return -2; } - if ( blockSize < 1 || blockSize > (_upperBounds[ axis ] - - _lowerBounds[ axis ] + 1) ) { - return -5; - } - if ( first < 0 || first >= _topology[ axis ] ) { return -6; } - - // Create axis info object - - _axisInfo[ axis ] = new BlockAxisInfo; - - BlockAxisInfo* bai = dynamic_cast(_axisInfo[axis]); - - bai->setDistParameters( blockSize, first); - - return 0; -} - -/** Set distribution parameters for a GenBlock axis. */ -int DADFTemplate::setGenBlock(int axis, int blockSizes[]) { - int i; - int total = 0; - - // Insure we haven't been commit()ed - if ( _frozen ) { return -10; } - - // Sanity check inputs - if ( axis < 0 || axis >= _rank ) { return -4; } - if ( _dist[ axis ] != GenBlock ) { return -2; } - - for ( i=0; i < _topology[ axis ]; ++i ) { - if ( blockSizes[i] < 0 ) { return -5; } - total += blockSizes[i]; - } - if ( total > (_upperBounds[ axis ] - _lowerBounds[ axis ] + 1) ) { - return -5; - } - - _axisInfo[ axis ] = new GenBlockAxisInfo( _topology[axis] ); - - GenBlockAxisInfo* gbai = dynamic_cast(_axisInfo[axis]); - gbai->setDistParameters( blockSizes ); - - return 0; -} - -/** Set distribution parameters for an Implicit axis. */ -int DADFTemplate::setImplicitMap(int axis, int map[]) { - int i; - - // Insure we haven't been commit()ed - if ( _frozen ) { return -10; } - - // Sanity check inputs - if ( axis < 0 || axis >= _rank ) { return -4; } - if ( _dist[ axis ] != Implicit ) { return -2; } - - for ( i=0; i < (_upperBounds[ axis ] - _lowerBounds[ axis ] + 1) ; ++i ) { - if ( map[i] < 0 || map[i] >= _topology[axis] ) { return -6; } - } - - _axisInfo[ axis ] = new ImplicitAxisInfo( (_upperBounds[ axis ] - - _lowerBounds[ axis ] + 1) - ); - - ImplicitAxisInfo* iai = dynamic_cast(_axisInfo[axis]); - iai->setDistParameters( map ); - - return 0; -} - -/** Add a region to an Explicit distribution. */ -int DADFTemplate::addExplicitRegion(int lower[], int upper[]) { - int i; - int regVolume = 1; - - // Insure we haven't been commit()ed - if ( _frozen ) { return -10; } - - // Sanity check inputs - for ( i=0; i < _rank; ++i ) { - if ( lower[i] < _lowerBounds[i] ) { return -1; } - if ( upper[i] > _upperBounds[i] ) { return -1; } - regVolume *= upper[i] - lower[i] + 1; - } - - // Check for overlaps. Note: strictly local to this processr for now - std::list::iterator driter; - - driter= _regionList.begin(); - for ( _regionList.begin(); driter != _regionList.end(); ++driter ) { - if ( (*driter)->compareBounds( lower, upper) != 0 ) { break; } - } - // If we reached the end, there's no overlaps, otherwise there are! - if ( driter != _regionList.end() ) { return -7; } - - // If everything is okay, add this region - DADFRegionInfo * dri = new DADFRegionInfo( _rank ); - dri->setBounds( lower, upper ); - _regionList.push_back( dri ); - - // Eventually in parallel: Check for completeness to give proper return - _volDefined += regVolume; - return 0; -} - -/** Signal that template is completely defined. */ -int DADFTemplate::commit() { - // Insure we haven't been commit()ed already - if ( _frozen ) { - return -10; - } else { - _frozen = true; - return 0; - } - - // Perform global consistency checks - - // Should test topology if dist is explicit - // Test if area of explicit regions == area of template -} - -/****************************************************************************** - * Query the template - *****************************************************************************/ - -/** Name associated with this distribution. (default value is "_UNNAMED") */ -std::string DADFTemplate::getName() { - return _name; -} - -/** Get rank (number of dimensions) of distributed object. */ -int DADFTemplate::getRank() { - return _rank; -} - -/** The global upper and lower bounds of the array. - - @param lower (Out) array of global lower bounds of array - @param upper (Out) array of global upper bounds of array -*/ -int DADFTemplate::getGlobalBounds(int lower[], int upper[]) { - int i; - - // Sanity check our state - if ( _rank < 1 ) { return -8; } - - // Copy arguments out of our data structures - for ( i=0; i < _rank; ++i ) { - lower[ i ] = _lowerBounds[ i ]; - upper[ i ] = _upperBounds[ i ]; - } - - return 0; -} - -/** Returns process topology. */ -int DADFTemplate::getProcTopology(int topology[] ) { - int i; - - // Sanity check our state - if ( _rank < 1 ) { return -8; } - - // Copy arguments out of our data structure - for ( i=0; i < _rank; ++i ) { topology[ i ] = _topology[ i ]; } - - return 0; -} - -/** Set distribution parameters for an axis with a regular distributions. */ -int DADFTemplate::getDistParameters(int axis, int blockSize, - int first) { - // Sanity check our state - if ( _rank < 1 ) { return -8; } - - // Sanity check inputs - if ( axis < 0 || axis >= _rank ) { return -4; } - if ( _dist[ axis ] != Block ) { return -2; } - - // Get axis info object - - BlockAxisInfo* bai = dynamic_cast(_axisInfo[axis]); - - bai->getDistParameters( blockSize, first); - - return 0; -} - -/** Set distribution parameters for a GenBlock axis. */ -int DADFTemplate::getGenBlock(int axis, int blockSizes[]) { - // Sanity check our state - if ( _rank < 1 ) { return -8; } - - // Sanity check inputs - if ( axis < 0 || axis >= _rank ) { return -4; } - if ( _dist[ axis ] != GenBlock ) { return -2; } - - // Get axis info object - - GenBlockAxisInfo* gbai = dynamic_cast(_axisInfo[axis]); - gbai->getDistParameters( blockSizes ); - - return 0; -} - -/** Set distribution parameters for an Implicit axis. */ -int DADFTemplate::getImplicitMap(int axis, int map[]) { - // Sanity check our state - if ( _rank < 1 ) { return -8; } - - // Sanity check inputs - if ( axis < 0 || axis >= _rank ) { return -4; } - if ( _dist[ axis ] != Implicit ) { return -2; } - - // Get axis info object - - ImplicitAxisInfo* iai = dynamic_cast(_axisInfo[axis]); - iai->getDistParameters( map ); - - return 0; -} - -/** Returns distribution type on each axis. */ -int DADFTemplate::getDistType(enum DistType dist[] ) { - int i; - - // Sanity check our state - if ( _rank < 1 ) { return -8; } - - // Copy our data structure to the argument - for ( i=0; i < _rank; ++i ) { dist[ i ] = _dist[ i ]; } - - return 0; -} - -bool DADFTemplate::isDefined() { - if ( _frozen ) { - return true; - } else { - return false; - } -} - -/** Print the contents of the template (for debugging) */ -void DADFTemplate::printTemplate() { - int i; - static const std::string distLabels[5] - = { "Coll", "Bloc", "GenB", "Impl", "Expl" }; - - cerr << "Array distribution template `" << _name << "' rank " << - _rank << ((_frozen)? " (" : " (not" ) << " frozen)" << endl; - - if ( _rank < 1 ) { return; } - - cerr << " Bounds: " ; - for ( i=0; i < _rank-1 ; ++i ) { - cerr << _lowerBounds[i] << ":" << _upperBounds[i] << ", "; - } - cerr << _lowerBounds[i] << ":" << _upperBounds[i] << endl; - - cerr << " Process topology: " ; - for ( i=0; i < _rank-1 ; ++i ) { - cerr << _topology[i] << ", "; - } - cerr << _topology[i] << endl; - - cerr << " Distribution Types: " ; - for ( i=0; i < _rank-1 ; ++i ) { - cerr << distLabels[ _dist[i] ] << ", "; - } - cerr << distLabels[ _dist[i] ] << endl; - - // Print out axiswise details - for ( i=0; i < _rank; ++i ) { - if ( _axisInfo[ i ] ) { - cerr << " Axis " << i << " " << - distLabels[ (_axisInfo[ i ])->getDistType() ]; - - cerr << " "; - (_axisInfo[i])->printAxisInfo(); - cerr << endl; - - } else { - cerr << " Axis " << i << " (no info supplied)" << endl; - } - } - - // Print out the region list - std::list::iterator driter; - - cerr << " Regions registered: " << _regionList.size() << endl; - - for ( driter= _regionList.begin(); - driter != _regionList.end(); ++driter ) { - cerr << " Region "; - (*driter)->printRegionInfo(); - cerr << endl; - } -} diff --git a/cca/ga_cca_classic/DADFTemplate.h b/cca/ga_cca_classic/DADFTemplate.h deleted file mode 100644 index 31bb255d8..000000000 --- a/cca/ga_cca_classic/DADFTemplate.h +++ /dev/null @@ -1,156 +0,0 @@ -#ifndef DADFTemplate_h_seen -#define DADFTemplate_h_seen - -/** This is our implementation of DistArrayTemplate. It is kept - private within the DistArrayDescriptorFactory. Only - DistArrayTemplate is exposed to the outside. - - $Id: DADFTemplate.h,v 1.1 2003-08-01 00:41:53 manoj Exp $ - */ - -#include -#include -#include -#include "DistArrayTemplate.h" -#include "DADFAxisInfo.h" -#include "DADFRegionInfo.h" - -class DADFTemplate : public DistArrayTemplate { - public: - - /**************************************************************************** - * Constructors and destructors - ***************************************************************************/ - - /** Simple constructor for internal use */ - DADFTemplate() ; - - /** Normal constructor. Forces setting of name. */ - DADFTemplate( const std::string name ) ; - - /** Simple copy constructor for internal use */ - DADFTemplate( DADFTemplate & original ); - - /** Normal copy constructor. Forces setting of name. */ - DADFTemplate( const std::string name, DADFTemplate & original ); - - /** The usual destructor */ - virtual ~DADFTemplate(); - - /**************************************************************************** - * Define the template - ***************************************************************************/ - - /** Name associated with this distribution. */ - virtual int setName(const std::string name); - - /** Set rank (number of dimensions) of distribution template. */ - virtual int setRank(const int rank); - - /** Set the global upper and lower bounds of the array. */ - virtual int setGlobalBounds(const int lower[], const int upper[]); - - /** Sets process topology. */ - virtual int setProcTopology(const int topology[] ); - - /** Sets distribution type on each axis. */ - virtual int setDistType(const enum DistType dist[] ); - - /** Set distribution parameters for an axis with a regular distributions. */ - virtual int setDistParameters(int axis, int blockSize, - int first); - - - /** Set distribution parameters for a GenBlock axis. */ - virtual int setGenBlock(int axis, int blockSizes[]); - - /** Set distribution parameters for an Implicit axis. */ - virtual int setImplicitMap(int axis, int map[]); - - - /** Add a region to an Explicit distribution. */ - virtual int addExplicitRegion(int lower[], int upper[]); - - /** Signal that template is completely defined. */ - virtual int commit(); - - /**************************************************************************** - * Query the template - ***************************************************************************/ - - /** Name associated with this distribution. (default value is "_UNNAMED") */ - virtual std::string getName(); - - /** Get rank (number of dimensions) of distributed object. */ - virtual int getRank(); - - /** The global upper and lower bounds of the array. */ - virtual int getGlobalBounds(int lower[], int upper[]); - - /** Returns process topology. */ - virtual int getProcTopology(int topology[] ); - - /** Returns distribution type on each axis. */ - virtual int getDistType(enum DistType dist[] ); - - /** Get distribution parameters for an axis with a regular distributions. */ - virtual int getDistParameters(int axis, int blockSize, - int first); - - /** Get distribution parameters for a GenBlock axis. */ - virtual int getGenBlock(int axis, int blockSizes[]); - - /** Get distribution parameters for an Implicit axis. */ - virtual int getImplicitMap(int axis, int map[]); - - /** Has commit() been called on this template? */ - virtual bool isDefined(); - - /** Mainly for testing and debugging */ - virtual void printTemplate(); - - /**************************************************************************** - * Internals - ***************************************************************************/ - private: - // Human-readable name for this template - std::string _name; - - // Whether or not commit() has been called - int _frozen; - - // Rank of template - int _rank; - - /** Caution: it is possible the following (and how they're used) - could give rise to overflows for large problems. I'm inclined to - think that if the volume is that large, there will be other - overflow problems too so this is not important, but I might be - wrong. - */ - - // Number of elements - int _volume; - - // Running total of volume of explicit regions - int _volDefined; - - // Global lower bounds of array - std::vector _lowerBounds; - - // Global upper bounds of array - std::vector _upperBounds; - - // Process topology - std::vector _topology; - - // Distribution types - std::vector _dist; - - // Per-axis distribution parameters - std::vector _axisInfo; - - // List of regions associated with this process of array - std::list _regionList; -}; -#endif // DADFTemplate_h_seen diff --git a/cca/ga_cca_classic/DistArray.h b/cca/ga_cca_classic/DistArray.h deleted file mode 100644 index 080f761d8..000000000 --- a/cca/ga_cca_classic/DistArray.h +++ /dev/null @@ -1,188 +0,0 @@ -#ifndef DistArray_h_seen -#define DistArray_h_seen - -#include "DistArrayTemplate.h" -#include "DistArrayDescriptor.h" - -class DistArray { - public: - - /**************************************************************************** - * Constructors and destructors - ***************************************************************************/ - - virtual ~DistArray(){} - - /**************************************************************************** - * Define the descriptor - ***************************************************************************/ - - /** Support just CUMULVS's data types for now. - - Retain stv prefix to make them easier to locate in user code - because this _will_ be changed in the future. - */ - enum DataType { - stv_Int, stv_Float, stv_Cplx, stv_Double, stv_Dcplx, stv_Long, - stv_Short, stv_Str, stv_Ushort, stv_Uint, stv_Ulong, stv_Byte - }; - - /** Set data type. - - @param type (In) Data type specification. - - @retval 0 Success - @retval -11 Attempt to change commit()ed descriptor - - @note Called any time prior to commit(). Called by: cohort. - - @note Need a much more general typing mechanism - */ - virtual int setDataType(const enum DataType type) = 0; - - /** Associate this data object with a distribution template. - - @param template (In) Distribution template - - @retval 0 Success - @retval -11 Attempt to change commit()ed descriptor - - @note Calling sequence: 1. Called by: cohort. - */ - virtual int setTemplate(DistArrayTemplate * & templ) = 0; - - /** Sets this process's location in the process topology. - - Example: Consider a 2-d process topology composed of 6 processes - in a 3x2 arrangement. The coordinates might be labeled: -
-      {0,0} {0,1} {0,2}
-      {1,0} {1,1} {1,2}
-      
- and might be assigned to processes as follows: - - proc 0: {0,0} - - proc 1: {0,1} - - proc 2: {0,2} - - proc 3: {1,0} - - proc 4: {1,1} - - proc 5: {1,2} - Note that the association of processes with coordinates in the - process topology is entirely up to you, the user. The entire - purpose of having this routine in the interface is so that we do - not have to make assumptions about it. - - @param location (In) array containing the coordinates of this - process in the process topology. Coordinates are in the - range 0..N-1 for N processes. - - @retval 0 Success - @retval -6 Invalid process (not within declared topology) - @retval -11 Attempt to change commit()ed descriptor - - @todo Should probably throw an exception instead of returning - an error code. - - @note Calling sequence: 2. Called by: active processes, MIMD-style */ - virtual int setMyProcCoords(const int procCoords[] ) = 0; - - /** Align object to template with identity mapping. - - This alignment operation specifies how the actual data object - relates to the referenced distribution template. The data - object then inherits all the decomposition characteristics from - the template according to the specified alignment. This allows - many data objects to use the same distribution template, even if - the data objects are not the same size as each other or the - template, and even if they are positioned somewhere other than - the upperleft corner of the template. In fact, some rather - esoteric mappings are possible through the - setGeneralAlignmentMap() function. Note that alignment uses the - actual index space of the data object and template, based on - their declared global bounds. This means that the index space - of the template must always be at least as large as the index - space of the data object. - - This function specifies a simple identity mapping of data object - axes and elements to those of the template, in other words, - data[i,j,...] maps to template[i,j,...]. - - @retval 0 Success - @retval -8 Internal state invalid (probably in template) - @retval -11 Attempt to change commit()ed descriptor - @retval -13 Template not defined - - @todo Should probably throw an exception rather than returning - an error code. - - @note Calling sequence: 3. Called by: cohort. - */ - virtual int setIdentityAlignmentMap() = 0; - - /** Signal that data object is completely defined. This asserts - that pointers for all regions have been properly registered, and - gives the implementation a chance to verify that the alignment - mappings, etc. are valid. - - @retval 0 Success - @retval -11 Attempt to change commit()ed descriptor - - @note Calling sequence: 5. Called by: cohort. - - @todo Should probably throw an exception instead of returning - an error code. - */ - virtual int commit() = 0; - - /**************************************************************************** - * Query the descriptor - ***************************************************************************/ - /** Return the name given to the descriptor. */ - virtual std::string getName() = 0; - - /** Has the descriptor been commit()ed? */ - virtual bool isDefined() = 0; - - /** Get data type. - */ - virtual DataType getDataType() = 0; - - /** Return pointer to distribution template associated with this - data object. - */ - virtual DistArrayTemplate * getTemplate() = 0; - - /** Get this process's location in the process topology. - - @param location (Out) array containing the coordinates of this - process in the process topology. Coordinates are in the - range 0..N-1 for N processes. - - @retval 0 Success - @retval -8 Method used out of sequence or internal state invalid - */ - virtual int getMyProcCoords(int procCoords[] ) = 0; - - /** Part of a kludge for debugging. Supports only GenBlock and - Explicit distributions at present. - - */ - virtual int getNumLocalRegions() = 0; - - /** Part of a kludge for debugging. Supports only Explicit - distributions at present. - - @retval 0 Success - @retval -15 Invalid region ID - */ - virtual int getLocalRegionInfo(int region, int lower[], int upper[], - void * & data, int strides[]) = 0; - - /** Mainly for testing and debugging */ - virtual void printArray() = 0; - virtual void printArrayDistribution() = 0; - - - -}; -#endif // DistArray_h_seen - diff --git a/cca/ga_cca_classic/DistArrayDescrFactoryPort.h b/cca/ga_cca_classic/DistArrayDescrFactoryPort.h deleted file mode 100644 index 8e1069a46..000000000 --- a/cca/ga_cca_classic/DistArrayDescrFactoryPort.h +++ /dev/null @@ -1,80 +0,0 @@ -#ifndef DistArrayDescrFactoryPort_h_seen -#define DistArrayDescrFactoryPort_h_seen - -#include -#include "DistArrayDescriptor.h" -#include "DistArray.h" - -namespace classic { - namespace gov { - namespace cca { - - /** An interface for creating distributed array descriptors. - - $Id: DistArrayDescrFactoryPort.h,v 1.1 2003-08-01 00:41:53 manoj Exp $ - - This interface allows users to create, clone, and destroy - descriptors for distributed array objects. - */ - - class DistArrayDescrFactoryPort: public virtual ::classic::gov::cca::Port { - - public: - - /** Return an uninitialized descriptor object. - - @param name (in) Name associated with this descriptor. This - is a "forced convenience" for the user to allow intelligible - error messages, etc. - - @returns a reference to an uninitialized descriptor object. - */ - virtual DistArrayDescriptor * createDescriptor(std::string name) = 0; - - /** Return a descriptor object initialized with the contents of - another, but not frozen against modification. - - @param original (in) Descriptor from which to initialize new - descriptor. - - @param cloneName (in) Name associated with this descriptor. - This is a "forced convenience" for the user to allow - intelligible error messages, etc. - - @returns a reference to an initialized but not frozen - descriptor object. - */ - virtual DistArrayDescriptor * cloneDescriptor(DistArrayDescriptor * original, std::string cloneName) = 0; - - /** Destroy an existing descriptor. - - @param victim (in) Descriptor to be destroyed. - - @retval 0 Success - @retval -1 Descriptor not created by this factory - @retval -2 Descriptor has already been destroyed - @retval -3 Concrete class of descriptor is not is not the type - produced by this factory. - - @notes Should throw exceptions instead of returning an int. - */ - virtual int destroyDescriptor(DistArrayDescriptor * & victim) = 0; - - /** Return an uninitialized ga-dadf distributed array object. */ - virtual DistArray * createArray(std::string name) = 0; - - /** Return an distributed array object initialized with the contents of - another, but not frozen against modification. */ - virtual DistArray * cloneArray(DistArray* original, - std::string cloneName) = 0; - - /** Destroy an existing distributed array. */ - virtual int destroyArray(DistArray* & victim) = 0; - - }; // DistArrayDescrFactoryPort - - } // namespace cca - } // namespace gov -} // namespace classic - -#endif // DistArrayDescrFactoryPort_h_seen diff --git a/cca/ga_cca_classic/DistArrayDescriptor.h b/cca/ga_cca_classic/DistArrayDescriptor.h deleted file mode 100644 index d5b48e4e2..000000000 --- a/cca/ga_cca_classic/DistArrayDescriptor.h +++ /dev/null @@ -1,397 +0,0 @@ -#ifndef DistArrayDescriptor_h_seen -#define DistArrayDescriptor_h_seen - -#include "DistArrayTemplate.h" - -/** This is the public interface (abstract base class) for defining - and querying distributed array descriptors. - - This interface is an experimental first implementation of an - interface which has been under discussion in the CCA Forum's - Scientific Data Components Working Group. This implementation - does not conform exactly to the interface developed by the Working - Group, and is intended to be exploratory rather than normative. - - $Id: DistArrayDescriptor.h,v 1.1 2003-08-01 00:41:53 manoj Exp $ - - This interface is intended to support the creation of distributed - data objects structured like dense multi-dimensional distributed - arrays. The object is constructed from size and type information, - pointers to the local data on each process, and a mapping onto a - distribution template (see DistTemplateCreate). In this way, it - should be possible to describe any array-structured distributed - data object sufficiently to allow the construction of parallel - communications schedules and other data movement-related - operations. This interface is primarily intended to accomodate - existing applications and perhaps low-level use within new - components. We strongly advise developers of new code to use - "first class" scientific data objects appropriate to their - problem. - - This and the DistTemplateCreate interfaces have been modeled in - large measure on the array distribution capability present in High - Performance Fortran version 2.0, including the "Approved - Extensions.". This fact manifests itself most strongly in this - interface in the generality of mapping (or "aligning") the data - object to the distribution. Unfortunately, compilers can do this - a little more neatly than we can, so please read the documentation - carefully and if necessary refer to HPF2's alignment - capabilities. - - Ghost regions are not explicitly supported at this time. The - current suite of interfaces can be used with data objects - including ghost regions, but it may not be as pretty as it could - be. It is also possible to build a new set of interfaces on top - of these which include explicit support for ghosts. We should - revisit the question of how to support ghosts most effectively - once we have a little actual experience using these interfaces and - derived ghost-supporting ones. - - This interface is intended for use in a parallel environment. - Functions are identified by which processes are expected to call - them: - - "collectively" by the entire parallel cohort - - by individual processes ("MIMD style") active in this process - topology. - The term "collective" is used in quotes because all callers must - invoke the function with the same arguments, but except where - noted, no synchronization of the processes is implied or required. - The commit() function is truly collective, in that it does imply - synchronization of the calling processes. - - To construct a data object, the functions of this interface must - be called in an appropriate sequence. Functions with the same - position in the calling sequence can be called in any order. The - sequence is as follows (note that setName() and setDataType() can - be called any time before commit()): - -# setTemplate() - -# setIdentityAlignmentMap() - -# setMyProcCoords() - -# setRegionDataPointer() - -# commit() - - @returns In general, return codes >= 0 signify success and those < - 0 denote error conditions. Errors in this interface should be - user recoverable. Here is a list of error codes used in this - interface: - - @retval -2 Invalid or incompatible distribution type specification - @retval -6 Invalid process or process topology - @retval -8 Method used out of sequence or internal state invalid - @retval -9 Internal failure (i.e. memory allocation) - @retval -11 Attempt to change commit()ed descriptor - @retval -12 Template not commit()ed - @retval -13 Template is not of expected type - @retval -14 Template not defined - @retval -15 Invalid region ID - @retval -16 Invalid strides - - @note Error code definitions should be in harmony with those for - DistArrayTemplate. - - @note This interface miuses the term stride. The intent of - the Working Group was that actual memory strides be used in the - interface. These are the number of memory units to traverse to get - to the next location in a given dimension of an array. Using - strides allows you to express any storage order (i.e. row-major, - column-major) unambiguously. Because of time limitations, we have - used leading dimensions in this implementation, where you specify - just the size of the dimension and it gives no information - as to storage order, thus requiring agreement between the creator - and the user of the array descriptor as to what storage order is - being used. For the purposes of these demonstrations, the only - user was the CUMULVS-based MxN component, which (because it is - implemented in C) expects C storage order. This was done for - expediency and will change in subsequent versions of this - interface. - -*/ - -class DistArrayDescriptor { - public: - - /**************************************************************************** - * Constructors and destructors - ***************************************************************************/ - - virtual ~DistArrayDescriptor(){} - - /**************************************************************************** - * Define the descriptor - ***************************************************************************/ - - /** Support just CUMULVS's data types for now. - - Retain stv prefix to make them easier to locate in user code - because this _will_ be changed in the future. - */ - enum DataType { - stv_Int, stv_Float, stv_Cplx, stv_Double, stv_Dcplx, stv_Long, - stv_Short, stv_Str, stv_Ushort, stv_Uint, stv_Ulong, stv_Byte - }; - - /** Set data type. - - @param type (In) Data type specification. - - @retval 0 Success - @retval -11 Attempt to change commit()ed descriptor - - @note Called any time prior to commit(). Called by: cohort. - - @note Need a much more general typing mechanism - */ - virtual int setDataType(const enum DataType type) = 0; - - /** Associate this data object with a distribution template. - - @param template (In) Distribution template - - @retval 0 Success - @retval -11 Attempt to change commit()ed descriptor - - @note Calling sequence: 1. Called by: cohort. - */ - virtual int setTemplate(DistArrayTemplate * & templ) = 0; - - /** Sets this process's location in the process topology. - - Example: Consider a 2-d process topology composed of 6 processes - in a 3x2 arrangement. The coordinates might be labeled: -
-      {0,0} {0,1} {0,2}
-      {1,0} {1,1} {1,2}
-      
- and might be assigned to processes as follows: - - proc 0: {0,0} - - proc 1: {0,1} - - proc 2: {0,2} - - proc 3: {1,0} - - proc 4: {1,1} - - proc 5: {1,2} - Note that the association of processes with coordinates in the - process topology is entirely up to you, the user. The entire - purpose of having this routine in the interface is so that we do - not have to make assumptions about it. - - @param location (In) array containing the coordinates of this - process in the process topology. Coordinates are in the - range 0..N-1 for N processes. - - @retval 0 Success - @retval -6 Invalid process (not within declared topology) - @retval -11 Attempt to change commit()ed descriptor - - @todo Should probably throw an exception instead of returning - an error code. - - @note Calling sequence: 2. Called by: active processes, MIMD-style */ - virtual int setMyProcCoords(const int procCoords[] ) = 0; - - /** Align object to template with identity mapping. - - This alignment operation specifies how the actual data object - relates to the referenced distribution template. The data - object then inherits all the decomposition characteristics from - the template according to the specified alignment. This allows - many data objects to use the same distribution template, even if - the data objects are not the same size as each other or the - template, and even if they are positioned somewhere other than - the upperleft corner of the template. In fact, some rather - esoteric mappings are possible through the - setGeneralAlignmentMap() function. Note that alignment uses the - actual index space of the data object and template, based on - their declared global bounds. This means that the index space - of the template must always be at least as large as the index - space of the data object. - - This function specifies a simple identity mapping of data object - axes and elements to those of the template, in other words, - data[i,j,...] maps to template[i,j,...]. - - @retval 0 Success - @retval -8 Internal state invalid (probably in template) - @retval -11 Attempt to change commit()ed descriptor - @retval -13 Template not defined - - @todo Should probably throw an exception rather than returning - an error code. - - @note Calling sequence: 3. Called by: cohort. - */ - virtual int setIdentityAlignmentMap() = 0; - - /** Set pointer for the local region of the data object. - - For everything other than explicit distributions, we assume - there is one contiguous block of memory per process which - contains all that process's elements of the array. This implies - certain storage arrangements in the case of block-cyclic and - implicit which may seem a bit strange. I think this is - what other packages typically do anyway, but this needs further - investigation. As noted below, this routine is largely a - stopgap measure, so we might be better off rethinking the whole - thing instead of that little factor. - - @param data (In) Pointer to local memory holding region - @param strides (In) Array of leading dimensions of the actual - memory storage. These are the actual lengths of each axis in - memory. - - @note The combination of an arbitrary pointer and leading - dimensions allow support of windowing into other arrays. For - example, suppose a process holds a 14x14 array as part of a - distributed array. This array consists of a 10x10 patch of - "real" data and a 2 element wide ring of "ghost" elements, which - duplicate elements on adjacent processes. One can create a data - object representing the entire array, explicitly exposing the - ghost regions, by specifying (in the distribution template) that - the process's region is 14x14 and registering the pointer to the - [0,0] element (in C array indexing) with leading dimensions of - [14,14]. One can also create a data object which hides the - ghost regions, giving access to only the "real" data by - registering the region as 10x10 and registering the pointer to - the [2,2] element with the leading dimensions [14,14]. In other - words, the beginning of each row/column are 14 elements apart - instead of 10. - - @retval 0 Success - @retval -2 Incompatible distribution type (only valid for - non-Explicit distributions) - @retval -11 Attempt to change commit()ed descriptor - @retval -16 Invalid strides - - @todo Should probably throw an exception instead of returning an - error code. - - @note Calling sequence: 4. Called by: active processes once per - process. - - @todo This is just a stopgap until I get a better understanding - of SIDL and related interlanguage issues. We already know that - using pointers is not general, we just have to figure out the - right way to handle needs like this. */ - virtual int setLocalDataPointer(void * data, const int - strides[]) = 0; - - /** Set pointer for a local region of an explicitly distributed - data object. - - Each process must call this function for every region of the - data object they own. The region's bounds must be provided in - the data object's global coordinate system, but of course they - must map to the regions defined for the underlying distribution - template, taking into account the alignment of the data object - with the template. This is clearly a little ugly and perhaps - more than a little error-prone. Alternatives welcome! - - @param lower (In) Lower bounds of region, in the data object's - global coordinates - @param upper (In) Upper bounds of region, in the data object's - global coordinates - @param data (In) Pointer to local memory holding region - @param strides (In) Array of leading dimensions of the actual - memory storage. These are the actual lengths of each axis in - memory. - - @note The combination of an arbitrary pointer and leading - dimensions allow support of windowing into other arrays. For - example, suppose a process holds a 14x14 array as part of a - distributed array. This array consists of a 10x10 patch of - "real" data and a 2 element wide ring of "ghost" elements, which - duplicate elements on adjacent processes. One can create a data - object representing the entire array, explicitly exposing the - ghost regions, by specifying (in the distribution template) that - the process's region is 14x14 and registering the pointer to the - [0,0] element (in C array indexing) with leading dimensions of - [14,14]. One can also create a data object which hides the - ghost regions, giving access to only the "real" data by - registering the region as 10x10 and registering the pointer to - the [2,2] element with the leading dimensions [14,14]. In other - words, the beginning of each row/column are 14 elements apart - instead of 10. - - @retval 0 Success - @retval -2 Incompatible distribution type (only valid for - non-Explicit distributions) - @retval -11 Attempt to change commit()ed descriptor - @retval -16 Invalid strides - - @todo Should probably throw an exception instead of returning an - error code. - - @note Calling sequence: 4. Called by: active processes once per - region owned, MIMD-style. - - @todo This is just a stopgap until I get a better understanding - of SIDL and related interlanguage issues. We already know that - using pointers is not general, we just have to figure out the - right way to handle needs like this. */ - virtual int setRegionDataPointer(const int lower[], const int - upper[], void * data, const int - strides[]) = 0; - - /** Signal that data object is completely defined. This asserts - that pointers for all regions have been properly registered, and - gives the implementation a chance to verify that the alignment - mappings, etc. are valid. - - @retval 0 Success - @retval -11 Attempt to change commit()ed descriptor - - @note Calling sequence: 5. Called by: cohort. - - @todo Should probably throw an exception instead of returning - an error code. - */ - virtual int commit() = 0; - - /**************************************************************************** - * Query the descriptor - ***************************************************************************/ - /** Return the name given to the descriptor. */ - virtual std::string getName() = 0; - - /** Has the descriptor been commit()ed? */ - virtual bool isDefined() = 0; - - /** Get data type. - */ - virtual DataType getDataType() = 0; - - /** Return pointer to distribution template associated with this - data object. - */ - virtual DistArrayTemplate * getTemplate() = 0; - - /** Get this process's location in the process topology. - - @param location (Out) array containing the coordinates of this - process in the process topology. Coordinates are in the - range 0..N-1 for N processes. - - @retval 0 Success - @retval -8 Method used out of sequence or internal state invalid - */ - virtual int getMyProcCoords(int procCoords[] ) = 0; - - /** Part of a kludge for debugging. Supports only GenBlock and - Explicit distributions at present. - - */ - virtual int getNumLocalRegions() = 0; - - /** Part of a kludge for debugging. Supports only Explicit - distributions at present. - - @retval 0 Success - @retval -15 Invalid region ID - */ - virtual int getLocalRegionInfo(int region, int lower[], int upper[], - void * & data, int strides[]) = 0; - - /** Mainly for testing and debugging */ - virtual void printDescriptor() = 0; -}; -#endif // DistArrayDescriptor_h_seen - diff --git a/cca/ga_cca_classic/DistArrayTemplFactoryPort.h b/cca/ga_cca_classic/DistArrayTemplFactoryPort.h deleted file mode 100644 index 23c0ec8b5..000000000 --- a/cca/ga_cca_classic/DistArrayTemplFactoryPort.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef DistArrayTemplFactoryPort_h_seen -#define DistArrayTemplFactoryPort_h_seen - -#include -#include "DistArrayTemplate.h" - -namespace classic { - namespace gov { - namespace cca { - - /** An interface for creating distributed array template - descriptors. - - $Id: DistArrayTemplFactoryPort.h,v 1.1 2003-08-01 00:41:54 manoj Exp $ - - This interface allows users to create, clone, and destroy - descriptors for array distribution templates. - */ - - class DistArrayTemplFactoryPort: public virtual ::classic::gov::cca::Port { - - public: - - /** Return an uninitialized template object. - - @param name (in) Name associated with this template. This - is a "forced convenience" for the user to allow intelligible - error messages, etc. - - @returns a reference to an uninitialized template object. - */ - virtual DistArrayTemplate* createTemplate(std::string name) = 0; - - /** Return a template object initialized with the contents of - another, but not frozen against modification. - - @param original (in) Template from which to initialize new - template. - - @param cloneName (in) Name associated with this template. - This is a "forced convenience" for the user to allow - intelligible error messages, etc. - - @returns a reference to an initialized but not frozen - template object. - */ - virtual DistArrayTemplate* cloneTemplate(DistArrayTemplate * original, std::string cloneName) = 0; - - /** Destroy an existing template. - - @param victim (in) Template to be destroyed. - - @retval 0 Success - @retval -1 Template not created by this factory - @retval -2 Template has already been destroyed - @retval -3 Concrete class of template is not is not the type - produced by this factory. - - @notes Should throw exceptions instead of returning an int. - */ - virtual int destroyTemplate(DistArrayTemplate * & victim) = 0; - - }; // DistArrayTemplFactoryPort - - } // namespace cca - } // namespace gov -} // namespace classic - -#endif // DistArrayTemplFactoryPort_h_seen diff --git a/cca/ga_cca_classic/DistArrayTemplate.h b/cca/ga_cca_classic/DistArrayTemplate.h deleted file mode 100644 index b04985a6a..000000000 --- a/cca/ga_cca_classic/DistArrayTemplate.h +++ /dev/null @@ -1,586 +0,0 @@ -#ifndef DistArrayTemplate_h_seen -#define DistArrayTemplate_h_seen - -#include - -/** This is the public interface (abstract base class) for defining - and querying array distribution templates for dense - multi-dimensional rectangular arrays. - - This interface is an experimental first implementation of an - interface which has been under discussion in the CCA Forum's - Scientific Data Components Working Group. This implementation - does not conform exactly to the interface developed by the Working - Group, and is intended to be exploratory rather than normative. - - $Id: DistArrayTemplate.h,v 1.1 2003-08-01 00:41:54 manoj Exp $ - - This interface is intended to support the creation of distribution - templates to which actual data objects can later be - aligned. Following the model of High Performance Fortran, - distribution templates are conceptually arrays which have no real - existence, but for which the decomposition on the processor array - is specified. Data objects can be created referencing the template - for their decomposition. Many data objects can be "aligned" to - the same template. - - This interface is intended to be used in several ways. Many - existing distributed array and related packages do not have the - distribution as a first-class, standlone object. In this case, - the distribution template can be created separately, allowing - other components to access the information through the - complementary DataDistQuery interface. In newer packages, a - distribution template created here might be handed to a factory - to create an actual data object. This interface can also serve - as a guide for development of data distributions specification - interfaces for existing or new distibuted array packages which may - not support the full range of flexibility embodied here. - - This interface is desgned to support most of the array - distribution capability present in High Performance Fortran - version 2.0, including the "Approved Extensions.". It also - supports completely general "explicit" or table-based - distributions. Based on an informal survey of distributed array - and related packages, this seemed to be the best target. - - Data distributions are independent of the data type and storage - order (i.e. row-major vs column-major). - - Data distributions are essentially assumed to be static. It is - important that the user be able to make a set of queries about a - distribution and be confident of getting a consistent set of - results. Therefore, if an implementation wants to support dynamic - distributions, it should provide mechanisms/rules so that the user - can be confident of getting consistent results from properly - formulated inquiries. One way to do this might be by making any - redistribution operations explicit user calls, so that while not - in such a call, the data is consistent. - - Distribution templates do not allow the direct specification of - replication, but in the manner of HPF2, it is possible to align - real data to a template in such a way that the real data is - replicated across many elements of the template. - - Ghost regions are not explicitly supported at this time. The - current suite of interfaces can be used with data objects - including ghost regions, but it may not be as "pretty" as it could - be. It is also possible to build a new set of interfaces on top - of these which include explicit support for ghosts. We should - revisit the question of how to support ghosts most effectively - once we have a little actual experience using these interfaces and - derived ghost-supporting ones. - - This interface is intended for use in a parallel environment. - Functions are identified by which processes are expected to call - them: - - "collectively" by the entire parallel cohort - - by individual participating processes ("MIMD style"). - The term "collective" is used in quotes because all callers must - invoke the function with the same arguments, but except where - noted, no synchronization of the processes is implied or required. - - To construct a distribution template, the functions of this - interface must be called in an appropriate sequence. Functions - with the same position in the calling sequence can be called in - any order. The sequence is as follows: - -# setRank() - -# setGlobalBounds(), setProcTopology(), setDistType() - -# setDistParameters(), setGenBlock(), setImplicitMap(), - addExplicitRegion() - -# commit() - - @returns In general, return codes >= 0 signify success and those < - 0 denote error conditions. Errors in this interface should be - user recoverable. Here is a list of error codes used in this - interface: - - @retval -1 Invalid bounds - @retval -2 Invalid or incompatible distribution type specification - @retval -3 Invalid rank - @retval -4 Invalid axis - @retval -5 Invalid blockSize - @retval -6 Invalid process or process topology - @retval -7 Region overlaps existing region - @retval -8 Method used out of sequence or internal state invalid - @retval -9 Internal failure (i.e. memory allocation) - @retval -10 Attempt to change commit()ed template */ - -class DistArrayTemplate { - public: - - /**************************************************************************** - * Constructors and destructors - ***************************************************************************/ - - /** The usual destructor */ - virtual ~DistArrayTemplate(){} - - /**************************************************************************** - * Define the template - ***************************************************************************/ - - /** Name associated with this distribution. (default value is - "_UNNAMED") - - @param name (In) Name of distribution template, for convenience - of debugging. - - @retval 0 Success - @retval -10 Attempt to modify commit()ed template - - @note May be used at any time before commit() is called. Called - by: cohort. - */ - virtual int setName(const std::string name) = 0; - - /** Set rank (number of dimensions) of distribution template. - - @param rank (In) Rank (number of dimensions) in array template. - - @retval 0 Success - @retval -3 Invalid rank, must be a positive integer. - @retval -10 Attempt to modify commit()ed template - - @note Calling sequence: 1. Called by: cohort. - - @todo Should probably throw an exception instead of returning an - error code. - - @note Implementation note: the current implementation tries to - do sensible things of the rank is changed before the template is - commit()ed. (It is open to debate whether/under what - circumstances it is desirable to change the rank.) If the rank - is reduced, all rank-based data structures will be truncated to - the new rank. If the rank is increased, the new elements are - filled with something mildly invalid (i.e. array bounds of 0:-1) - or something innocuous (i.e collapsed distributions) with the - expectation that they will be properly set later on. For - explicit distributions, a change in rank will cause the region - list to be cleared. - */ - virtual int setRank(const int rank) = 0; - - /** Set the global upper and lower bounds of the array index space. - - @param lower (In) array of global lower bounds of array indices - @param upper (In) array of global upper bounds of array indices - - @retval 0 Success - @retval -1 Bounds invalid (i.e. upper bound smaller than lower) - @retval -8 Method used out of sequence or internal state invalid - @retval -10 Attempt to modify commit()ed template - - @note Calling sequence: 2. Called by: cohort. - - @todo Should probably throw an exception instead of returning an - error code. - */ - virtual int setGlobalBounds(const int lower[], const int upper[]) = 0; - - /** Sets process topology. This is the number of processes in each - in axis of the distribution. The topology array is of length - getRank(). Axes not distributed have an entry of 1 in the - topology array. The product of the elements of the topology - array should be the total number of processes participating in - the distribution template. - - For example, a 3-d array on a 6-process system might have a - topology with three processes in the first dimension, one in - the second (this dimension not distributed) and two in the - third, which would be a topology of [3, 1, 2]. 3x1x2 = 6. - - @param topology (In) array containing the number of processes - involved in each axis of the data distribution. - - @retval 0 Success - @retval -6 Invalid process topology (must be at least one - process in every dimension) - @retval -8 Method used out of sequence or internal state invalid - @retval -10 Attempt to modify commit()ed template - - @note Explicit distributions are by convention described by - effectively one-dimensional process topologies. In other - words, the topology array will be [getNumProcs(), 1, 1, ...]. - - @note Calling sequence: 2. Called by: cohort. - */ - - virtual int setProcTopology(const int topology[] ) = 0; - - /** The types of distributions supported. */ - - enum DistType { - Collapsed, /**< Designates a dimension held entirely on a single - process */ - - Block, /**< HPF-style block distribution. There may be zero or - more blocks on each process, depending on the block - size. Smaller block sizes lead to multiple blocks per - process laid out in cyclic fashion -- the block-cyclic - distribution widely used in linear algebra - packages. This type also describes "cyclic" - distributions. */ - - GenBlock, /**< HPF-style generalized block distribution. - - A generalization of the simple block distribution which allows - blocks to be of arbitrary size on each process, one block per - process. This DistType can be combined with any other - DistType in the other axes except for Explicit. However since - this DistType is irregular, it cannot be represented in the - simple parameters of getDistParameters -- the process-based - inquiry functions or the getMap function must be used instead. - - Here are some examples of 5x5 arrays distributed over 4 - processes using various combinations of GenBlock and other - DistTypes with with block sizes of 4+1 for the GenBlock. - Values indicate process number. -
-	Row DistType: GenBlock      GenBlock      Cyclic
-	Col DistType: GenBlock      Block         GenBlock
-	              0 0 0 0 2     0 0 0 2 2     0 0 0 0 2
-		      0 0 0 0 2     0 0 0 2 2     1 1 1 1 3
-		      0 0 0 0 2     0 0 0 2 2     0 0 0 0 2
-		      0 0 0 0 2     0 0 0 2 2     1 1 1 1 3
-		      1 1 1 1 3     1 1 1 3 3     0 0 0 0 2
-	
*/ - - Implicit, /**< HPF-style implicitly mapped distribution. - - Allows a completely general mapping of elements to processes - on a per-axis basis (overall distribution must be decomposible - as a cartesian product of the per-axis mapping). This - DistType can be combined with any other DistType in the other - axes except for Explicit. However since this DistType is - irregular, it cannot be represented in the simple parameters - of getDistParameters -- the process-based inquiry functions or - the getMaping function must be used instead. - - Each position in an implicit mapping array holds the process - number (on that axis) on which elements at that position of - the array live. By way of example, consider a 9x9 matrix laid - out on a process grid that has three processes across the top - and two down the side. We can label these processes A-F. The - matrix, shown on the left, have been filled in with the - label of the process on which that element lives, and the rows - and columns of the matrix have been labeled with process - number on that axis. The corresponding implicit maps for the - rows and columns are shown on the right. -
-           0 1 1 2 2 2 1 1 0    Row map array: [0, 1, 1, 2, 2, 2, 1, 1, 0]
-           - - - - - - - - -    Column map array: [0, 1, 0, 1, 1, 0, 0]
-        0: A B B C C C B B A
-        1: D E E F F F E E D    Process grid:
-        0: A B B C C C B B A           0 1 2
-        1: D E E F F F E E D        0: A B C
-        1: D E E F F F E E D        1: D E F
-        0: A B B C C C B B A
-        0: A B B C C C B B A
-	
*/ - - Explicit /**< Designates a distribution that is completely user - specified and in general the full distribution cannot be - represented as the cartesian product of the axes. The - Explicit DistType must be used on all dimensions of a - distribution. - - This DistType is a step beyond the GenBlock because this one - cannot in general be expressed one axis at a time. Here - are some examples of Explicit distributions of a 5x5 array - over 4 processes (values indicate process number): -
-	0 0 0 0 2     0 0 0 0 0     0 0 0 0 0
-	0 0 0 0 2     0 0 0 0 0     0 0 2 2 0 
-	1 1 3 3 3     1 1 2 2 2     1 1 2 2 3
-	1 1 3 3 3     1 1 2 2 2     1 1 3 3 3 
-	1 1 3 3 3     1 1 2 2 2     1 1 3 3 3
-	
- Note that in the third case, process 0 has 3 distinct patches - and process 3 has 2 (there are of course several choices for - the exact layout of these patches). */ - }; - - /** Sets distribution type on each axis. - - @param dist (In) array of distribution types of size getRank(). - Note that all axes or none must be specified as Explicit, or it - will return an error. - - @retval 0 Success - @retval -2 Invalid distribution type specification (either all - axes must be Explicit or none) - @retval -8 Method used out of sequence or internal state invalid - @retval -10 Attempt to modify commit()ed template - - @note Calling sequence: 2. Called by: cohort. - - @note An alternative (more object-oriented) approach would - involve the construction of objects defining each axis and then - using an array of them to define the distribution. Note that - this still wouldn't accomodate Explicit distributions, at least - not without further work. Comments welcome! - */ - virtual int setDistType(const enum DistType dist[]) = 0; - - /** Set distribution parameters for an axis with a regular distributions. - - Uses a compact set of parameters which, together with - information about the process topology, fully define the - distribution (only for regular distributions, of course). If - blockSize is smaller than that required to give one block per - process, blocks are laid out in cyclic fashion, allowing the - specification of block-cyclic distributions. - - @param axis (In) Axis for which descriptor is being defined - (0..getRank()-1) - @param blockSize (In) Block size for this axis - @param first (In) Process coordinate (0..N-1) to which first - block is assigned in this axis - - @retval 0 Success - @retval -2 Incompatible distribution type (not Block) - @retval -4 Invalid axis - @retval -5 Invalid blockSize (block size larger than declared bounds) - @retval -6 Invalid process (first does not fit declared topology) - @retval -10 Attempt to modify commit()ed template - - @note Calling sequence: 3. Called by: cohort - - @todo Should probably throw an exception instead of returning - an error code. - */ - virtual int setDistParameters(int axis, int blockSize, - int first) = 0; - - - /** Set distribution parameters for a GenBlock axis. - - @param axis (In) axis for which descriptor is being defined - (0..getRank()-1) - @param blockSizes (In) Array of block size on each process (in - this axis). Length of array is the number of processes on this - axis of the process topology. - - @retval 0 Success - @retval -2 Incompatible distribution type (not GenBlock) - @retval -4 Invalid axis - @retval -5 Invalid blockSize (Size of individual block or total - size inconsistent with declared bounds) - @retval -10 Attempt to modify commit()ed template - - @note Calling sequence: 3. Called by: cohort. - - @todo Should probably throw an exception instead of returning an - error code. - */ - virtual int setGenBlock(int axis, int blockSizes[]) = 0; - - /** Set distribution parameters for an Implicit axis. - - @param axis (In) axis for which descriptor is being defined - (0..getRank()-1) - @param map (In) Array mapping elements to processes. Length of - array is the number of elements in this axis of the template, - values refer processes on this axis of the process topology - (0..N-1). - - @retval 0 Success - @retval -2 Incompatible distribution type (not Implicit) - @retval -4 Invalid axis - @retval -6 Invalid process (in map array) - @retval -10 Attempt to modify commit()ed template - - @note Calling sequence: 3. Called by: cohort. - - @todo Should probably throw an exception instead of returning an - error code. - */ - virtual int setImplicitMap(int axis, int map[]) = 0; - - - /** Add a region to an Explicit distribution. - - Specifies the ownership of a given sub-array of the template. - Applicable only to templates with Explicit distribution on all - axes. - - Such regions must tile the template through repeated calls to - this function. This is an MIMD-style call -- each region must - be specified exactly once across the entire cohort. Typical - usage would involve each process computing its own portion of - the distribution, however it is allowed for one process to add - regions that live on another (in case, for example, the entire - distribution is computed on one node). - - @param lower (In) lower bounds of sub-array region - @param upper (In) upper bounds of sub-array region - - @retval 0 Success - @retval -1 Invalid bounds - @retval -7 Region overlaps with existing region - @retval -10 Attempt to modify commit()ed template - - @note Calling sequence: 3. Called by: individual (MIMD - style). - - @note At one point in its design, this method had an additional - argument: "procCoords (In) Coordinates of process of interest. - Recall that Explicit distributions are by convention described - by effectively one-dimensional process topologies." There is - some question in my mind as to whether this argument is - necessary or desirable. Without it, the owning process must - register the region, while with it any process can register any - region. This may be useful, because it allows an arbitrary - process to compute and register the distribution. - - @note There is no way to remove a region. Perhaps there should - be? This particularly effects cloned templates. - - @todo Should probably throw an exception instead of returning - an error code. - - */ - virtual int addExplicitRegion(int lower[], int upper[]) = 0; - - /** Signal that template is completely defined. Up to this point, - things can be added or changed (as long as there is no other - conflict. After calling commit(), the distribution template is - set in stone and cannot be changed. - - @retval 0 Success - @retval -10 Attempt to modify commit()ed template - - @note Calling sequence: 4. Called by: cohort. - - @todo Should probably throw an exception instead of returning - an error code. - - @note Implementation note: There are a lot of consistency checks - that could/should be done at this point but aren't. If you - build up a template in the normal fashion, many of those checks - will have been applied piecemeal. But if you clone and then - modify a template, or just abuse the calling sequence too much, - it is possible you could have set some things that are not - globally consistent. - -*/ - virtual int commit() = 0; - - /**************************************************************************** - * Query the template - * - * Should these be available only after the template is commit()ed? - ***************************************************************************/ - - /** Name associated with this distribution. (default value is "_UNNAMED") */ - virtual std::string getName() = 0; - - /** Get rank (number of dimensions) of distributed object. */ - virtual int getRank() = 0; - - /** The global upper and lower bounds of the array. - - @param lower (Out) array of global lower bounds of array - @param upper (Out) array of global upper bounds of array - - @retval 0 Success - @retval -8 Internal state invalid - */ - virtual int getGlobalBounds(int lower[], int upper[]) = 0; - - /** Returns process topology. This is the number of processes in - each in axis of the distribution. The topology array is of - length getRank(). Dimensions not distributed have an entry of 1 - in the topology array. The product of the elements of the - topology array should be getNumProcs(). - - For example, a 3-d array on a 6-process system might have a - topology with three processes in the first axis, one in - the second (this axis not distributed) and two in the - third, which would be a topology of [3, 1, 2]. 3x1x2 = 6. - - @param topology (Out) array containing the number of - processes involved in each axis of the data distribution. - - @retval 0 Success - @retval -8 Internal state invalid - - @note Explicit distributions are by convention described by - effectively one-dimensional process topologies. In other - words, the topology array will be [getNumProcs(), 1, 1, ...]. - - */ - virtual int getProcTopology(int topology[] ) = 0; - - /** Returns distribution type on each axis. - - @param dist (Out) array of distribution types of size getRank() - - @retval 0 Success - @retval -8 Internal state invalid - */ - virtual int getDistType(enum DistType dist[] ) = 0; - - /** Get distribution parameters for an axis with a regular distributions. - - Support block/block-cyclic/cyclic distributions - - @param axis (In) Axis for which descriptor is being defined - (0..getRank()-1) - @param blockSize (Out) Block size for this axis - @param first (Out) Process coordinate (0..N-1) to which first - block is assigned in this axis - - @retval 0 Success - @retval -2 Incompatible distribution type (not Block) - @retval -4 Invalid axis - @retval -8 Method used out of sequence or internal state invalid - */ - virtual int getDistParameters(int axis, int blockSize, - int first) = 0; - - - /** Get distribution parameters for a GenBlock axis. - - @param axis (In) axis for which descriptor is being defined - (0..getRank()-1) - @param blockSizes (Out) Array of block size on each process (in - this axis). Length of array is the number of processes on this - axis of the process topology. - - @retval 0 Success - @retval -2 Incompatible distribution type (not GenBlock) - @retval -4 Invalid axis - @retval -8 Method used out of sequence or internal state invalid - */ - virtual int getGenBlock(int axis, int blockSizes[]) = 0; - - /** Get distribution parameters for an Implicit axis. - - @param axis (In) axis for which descriptor is being defined - (0..getRank()-1) - @param map (Out) Array mapping elements to processes. Length of - array is the number of elements in this axis of the template, - values refer processes on this axis of the process topology - (0..N-1). - - @retval 0 Success - @retval -2 Incompatible distribution type (not Implicit) - @retval -4 Invalid axis - @retval -8 Method used out of sequence or internal state invalid - */ - virtual int getImplicitMap(int axis, int map[]) = 0; - - /** Check if the tempate is fully defined (i.e. has been - commit()ed). - - @retval true if commit() has been successfully called on this template - @retval false otherwise - */ - virtual bool isDefined() = 0; - - /** Mainly for testing and debugging */ - virtual void printTemplate() = 0; - -}; -#endif // DistArrayTemplate_h_seen diff --git a/cca/ga_cca_classic/GAClassicPort.h b/cca/ga_cca_classic/GAClassicPort.h deleted file mode 100644 index c98b7d101..000000000 --- a/cca/ga_cca_classic/GAClassicPort.h +++ /dev/null @@ -1,52 +0,0 @@ -#ifndef _GA_CLASSIC_PORT_H -#define _GA_CLASSIC_PORT_H - -class GAClassicPort: public virtual ::classic::gov::cca::Port { - - public: - virtual GlobalArray* createGA(int type, int ndim, int dims[], - char *arrayname, int chunk[]) = 0; - virtual GlobalArray* createGA(int type, int ndim, int dims[], - char *arrayname, int maps[], int block[]) = 0; - virtual GlobalArray* createGA(const GlobalArray *g_b, char *arrayname) = 0; - virtual GlobalArray* createGA(const GlobalArray &g_b) = 0; - virtual GlobalArray* createGA() = 0; - virtual GlobalArray* createGA_Ghosts(int type, int ndim, int dims[], - int width[], char *array_name, - int chunk[]) = 0; - virtual GlobalArray* createGA_Ghosts(int type, int ndim, int dims[], - int width[], char *array_name, - int map[], int nblock[]) = 0; - - virtual void brdcst(void *buf, int lenbuf, int root) = 0; - virtual int clusterNnodes() = 0; - virtual int clusterNodeid() = 0; - virtual int clusterNprocs(int inode) = 0; - virtual int clusterProcid(int inode, int iproc) = 0; - virtual int createMutexes(int number) = 0; - virtual int destroyMutexes() = 0; - virtual void dgop(double x[], int n, char *op) = 0; - virtual int duplicate(int g_a, char* array_name) = 0; - virtual void error(const char *message, int code) = 0; - virtual void fence() = 0; - virtual void igop(int x[], int n, char *op) = 0; - virtual void initFence() = 0; - virtual size_t inquireMemory() = 0; - virtual void lgop(long x[], int n, char *op) = 0; - virtual void lock(int mutex) = 0; - virtual void maskSync(int first, int last) = 0; - virtual int memoryAvailable() = 0; - virtual int memoryLimited() = 0; - virtual int nodeid() = 0; - virtual int nodes() = 0; - virtual void printStats() = 0; - virtual void setMemoryLimit(size_t limit) = 0; - virtual void summarize(int verbose) = 0; - virtual void sync() = 0; - virtual void unlock(int mutex) = 0; - virtual int usesMA() = 0; - virtual int usesFAPI() = 0; - virtual void setServices(::classic::gov::cca::Services *cc) = 0; -}; - -#endif // _GA_CLASSIC_PORT_H diff --git a/cca/ga_cca_classic/GAServices.cca b/cca/ga_cca_classic/GAServices.cca deleted file mode 100644 index f96b39ee4..000000000 --- a/cca/ga_cca_classic/GAServices.cca +++ /dev/null @@ -1,23 +0,0 @@ -# generated CCAFFEINE dynamic library index. -# The ! lines should not contain whitespace before the = -# unless they begin with something other than date,builder,location. -# There may be more than 1 library in a .cca file, in which case -# last ! meta data seen wins for following library and component info. -# Note that if you want to change the user's "apparent" class name -# seen in the UI, you can change the name following the constructor function -# name in this file. -# N.B.: -# This is the exceedingly-poor-mans version of some of the info -# we'd want from a repository. We'd also want a list of ports -# that must be known (dependencies), a list of ports provided or -# at least potentially provided, parallel fabric expectations, -# version knowledge, and the usual ton of other metadata. -# Finally, we'd like to get all this info directly from the -# library itself (perhaps by filtering the source or by running -# 'strings' over the libraries. -!date=Fri Jan 31 11:24:52 PST 2003 -!builder=manoj@ -!location=/msrc/home/manoj/GlobalArray/GA-Component/Version5 -GAServices.so -create_GAServices GA::GAServices - diff --git a/cca/ga_cca_classic/GAServices.cxx b/cca/ga_cca_classic/GAServices.cxx deleted file mode 100644 index 9bfd5ed0d..000000000 --- a/cca/ga_cca_classic/GAServices.cxx +++ /dev/null @@ -1,283 +0,0 @@ -/** - * module: GAServices.cc - * Author: Manoj Kumar Krishnan, PNNL. - */ - -#include "gacca.h" - -#define GA_STACKSIZE 50000 -#define GA_HEAPSIZE 50000 - - -/** - * Constructor and Destructor of GAServices - */ - -GA::GAServices::GAServices() { - - svc = 0; /* services to NULL */ - - // GA Initialization - GA_Initialize(); - // later do it with parameter ports - if(!MA_init(MT_F_DBL, GA_STACKSIZE, GA_HEAPSIZE)) - GA_Error((char *)"MA_init failed", GA_STACKSIZE+GA_HEAPSIZE); -} - -GA::GAServices::~GAServices() { - svc = 0; - - // GA Termination - GA_Terminate(); -} - -GA::GlobalArray * -GA::GAServices::createGA(int type, int ndim, int dims[], - char *arrayname, int chunk[]) { - - GA::GlobalArray * GA = new GA::GlobalArray(type, ndim, dims, arrayname, - chunk); - return GA; -} - -GA::GlobalArray * -GA::GAServices::createGA(int type, int ndim, int dims[], - char *arrayname, int maps[], int block[]) { - - GA::GlobalArray * GA = new GA::GlobalArray(type, ndim, dims, arrayname, - maps, block); - return GA; -} - -GA::GlobalArray * -GA::GAServices::createGA(const GA::GlobalArray *g_b, char *arrayname) { - GA::GlobalArray * GA = new GA::GlobalArray(*g_b, arrayname); - return GA; -} - -GA::GlobalArray * -GA::GAServices::createGA(const GA::GlobalArray &g_b) { - GA::GlobalArray * GA = new GA::GlobalArray(g_b); - return GA; -} - -GA::GlobalArray * -GA::GAServices::createGA() { - GA::GlobalArray * GA = new GA::GlobalArray(); - return GA; -} - -GA::GlobalArray * -GA::GAServices::createGA_Ghosts(int type, int ndim, int dims[], int width[], - char *array_name, int chunk[]) { - /* last argument is a dummy argument, just to increase the count of the - number of arguments, inorder to avoid conflict in # of args */ - GA::GlobalArray * GA = new GA::GlobalArray(type, ndim, dims, width, - array_name, chunk, 'g'); - return GA; -} - -GA::GlobalArray * -GA::GAServices::createGA_Ghosts(int type, int ndim, int dims[], int width[], - char *array_name, int map[], int nblock[]) { - GA::GlobalArray * GA = new GA::GlobalArray(type, ndim, dims, width, - array_name, map, nblock, 'g'); - return GA; -} - -void -GA::GAServices::brdcst(void *buf, int lenbuf, int root) { - GA_Brdcst(buf, lenbuf, root); -} - -int -GA::GAServices::clusterNnodes() { - return GA_Cluster_nnodes(); -} - -int -GA::GAServices::clusterNodeid() { - return GA_Cluster_nodeid(); -} - -int -GA::GAServices::clusterNprocs(int inode) { - return GA_Cluster_nprocs(inode) ; -} - -int -GA::GAServices::clusterProcid(int inode, int iproc) { - return GA_Cluster_procid(inode, iproc); -} - -int -GA::GAServices::createMutexes(int number) { - return GA_Create_mutexes(number); -} - -int -GA::GAServices::destroyMutexes() { - return GA_Destroy_mutexes(); -} - -void -GA::GAServices::dgop(double x[], int n, char *op) { - GA_Dgop(x, n, op); -} - -int -GA::GAServices::duplicate(int g_a, char* array_name) { - return GA_Duplicate(g_a, array_name); -} - -void -GA::GAServices::error(const char *message, int code) { - GA_Error((char *)message, code); -} - -void -GA::GAServices::fence() { - GA_Fence(); -} - -void -GA::GAServices::igop(Integer x[], int n, char *op) { - GA_Igop(x, n, op); -} - -void -GA::GAServices::initFence() { - GA_Init_fence(); -} - -size_t -GA::GAServices::inquireMemory() { - return GA_Inquire_memory(); -} - -void -GA::GAServices::lgop(long x[], int n, char *op) { - GA_Lgop(x, n, op); -} - -void -GA::GAServices::lock(int mutex) { - GA_Lock(mutex); -} - -void -GA::GAServices::maskSync(int first, int last) { - GA_Mask_sync(first, last); -} - -int -GA::GAServices::memoryAvailable() { - return GA_Memory_avail(); -} - -int -GA::GAServices::memoryLimited() { - return GA_Memory_limited(); -} - -int -GA::GAServices::nodeid() { - return GA_Nodeid(); -} - -int -GA::GAServices::nodes() { - return GA_Nnodes(); -} - -void -GA::GAServices::printStats() { - GA_Print_stats(); -} - -void -GA::GAServices::setMemoryLimit(size_t limit) { - GA_Set_memory_limit(limit); -} - -void -GA::GAServices::summarize(int verbose) { - GA_Summarize(verbose); -} - -void -GA::GAServices::sync() { - GA_Sync(); -} - -void -GA::GAServices::unlock(int mutex) { - GA_Unlock(mutex); -} - -int -GA::GAServices::usesMA() { - return GA_Uses_ma(); -} - -int -GA::GAServices::usesFAPI() { - return GA_Uses_fapi(); -} - -void -GA::GAServices::setServices(::classic::gov::cca::Services *cc){ - - int err = 0; - classic::gov::cca::PortInfo* pInfo; - - IO_dn1("In DistArrayDescriptorFactory::setServices entry\n"); - - // We're being shut down - if (cc == 0) { - // Are we shut down already? - if (svc == 0) { return; } - - // Shutdown - svc->removeProvidesPort("ga_classic_port"); - svc->removeProvidesPort("TemplateFactory"); - svc->removeProvidesPort("DescriptorFactory"); - svc = cc; - return; - } - - svc = cc; - - IO_dn1("In GA::GAServices::setServices entry\n"); - - /****** Provide GA Classic Port ******/ - err = svc->addProvidesPort(dynamic_cast(this), - svc->createPortInfo("ga_classic_port", "GAClassicPort", 0)); - if ( err != 0) { - IO_dn1("In GA::GAServices::setServices addProvidesPort(ga_classic) failed\n"); - ::abort(); - } - pInfo = 0; // Current version of spec says we should destroy our copy - - - /****** Provide DADF Template Factory Port *******/ - pInfo = svc->createPortInfo("TemplateFactory", - "DistArrayTemplFactoryPort", 0); - err = svc->addProvidesPort(this, pInfo); - if ( err != 0) { - IO_dn1("In GA::GAServices::setServices addProvidesPort(ga_dadf) failed\n"); - ::abort(); - } - pInfo = 0; // Current version of spec says we should destroy our copy - - - /****** Provide DADF Array Descriptor Factory Port *******/ - pInfo = svc->createPortInfo("DescriptorFactory", - "DistArrayDescrFactoryPort", 0); - err = svc->addProvidesPort(this, pInfo); - if ( err != 0) { - IO_dn1("In GA::GAServices::setServices addProvidesPort(ga_dadf) failed\n"); - ::abort(); - } - pInfo = 0; // Current version of spec says we should destroy our copy -} diff --git a/cca/ga_cca_classic/GAServices.h b/cca/ga_cca_classic/GAServices.h deleted file mode 100644 index 936794de1..000000000 --- a/cca/ga_cca_classic/GAServices.h +++ /dev/null @@ -1,570 +0,0 @@ -#ifndef _GA_SERVICES_H -#define _GA_SERVICES_H - -/** - * GAServices : Global Arrays Services class. - * - * Author: Manoj Kumar Krishnan, PNNL. - * - * Collecting the global information: who am I, and how many processors - * are being used. Initialize the communication library (either MPI or - * TCSMSG) and Global array. Allocate momory to be used by GA by calling MA - * and create global arrays. - */ - -class GAServices : public virtual ::classic::gov::cca::Component, - public virtual GAClassicPort, - public virtual ::classic::gov::cca::DistArrayTemplFactoryPort, - public virtual ::classic::gov::cca::DistArrayDescrFactoryPort { - - public: - /** - * Null-constructor. The component won't really be 'alive' - * much at all until after setServices is called on it. - */ - GAServices(); - - /** Destructor. */ - virtual ~GAServices(); - - /** - * Creates an ndim-dimensional array using the regular distribution model - * and returns integer handle representing the array. - - * The array can be distributed evenly or not. The control over the - * distribution is accomplished by specifying chunk (block) size for all or - * some of array dimensions. - - * For example, for a 2-dimensional array, setting chunk[0]=dim[0] gives - * distribution by vertical strips (chunk[0]*dims[0]); - * setting chunk[1]=dim[1] gives distribution by horizontal strips - * (chunk[1]*dims[1]). Actual chunks will be modified so that they are at - * least the size of the minimum and each process has either zero or one - * chunk. Specifying chunk[i] as <1 will cause that dimension to be - * distributed evenly. - - * As a convenience, when chunk is specified as NULL, the entire array is - * distributed evenly. - - * \n This is a collective operation. - - * @param arrayname - a unique character string [input] - * @param type - data type(MT_F_DBL,MT_F_INT,MT_F_DCPL) [input] - * @param ndim - number of array dimensions [input] - * @param dims[ndim] - array of dimensions [input] - * @param chunk[ndim] - array of chunks, each element specifies - * minimum size that given dimensions should be chunked up into [input] - - * @return Returns pointer to GlobalArray object created. Returns - * NULL if it fails to create a GA object. - */ - GlobalArray * createGA(int type, int ndim, int dims[], char *arrayname, - int chunk[]); - - /** - * Creates an array by following the user-specified distribution and - * returns integer handle representing the array. - - * The distribution is specified as a Cartesian product of distributions - * for each dimension. The array indices start at 0. For example, the - * following figure demonstrates distribution of a 2-dimensional array 8x10 - * on 6 (or more) processors. nblock[2]={3,2}, the size of map array is s=5 - * and array map contains the following elements map={0,2,8, 0, 5}. The - * distribution is nonuniform because, P1 and P4 get 20 elements each and - * processors P0,P2,P3, and P5 only 10 elements each. - * - * - * - * - * - * - *
5 5
P0 P3 2
P1 P4 4
P2 P5 2
- * - * \n This is a collective operation. - * @param arrayname - a unique character string [input] - * @param type - MA data type (MT_F_DBL,MT_F_INT,MT_F_DCPL) [input] - * @param ndim - number of array dimensions [input] - * @param dims - array of dimension values [input] - * @param block[ndim] - no. of blocks each dimension is divided into [input] - * @param maps[s] - starting index for for each block; the size s is a sum - * all elements of nblock array [input] - * @return Returns pointer to GlobalArray object created. Returns - * NULL if it fails to create a GA object. - */ - GlobalArray * createGA(int type, int ndim, int dims[], char *arrayname, - int maps[], int block[]); - - /** - * Creates a new array by applying all the properties of another existing - * array. - * \n This is a collective operation. - * @param arrayname - a character string [input] - * @param g_b - integer handle for reference array [input] - * @return Returns pointer to GlobalArray object created. Returns - * NULL if it fails to create a GA object. - */ - GlobalArray * createGA(const GlobalArray *g_b, char *arrayname); - - /** - * Creates a new array by applying all the properties of another existing - * array. - * \n This is a collective operation. - * @param g_b - integer handle for reference array [input] - * @return Returns pointer to GlobalArray object created. Returns - * NULL if it fails to create a GA object. - */ - GlobalArray * createGA(const GlobalArray &g_b); - - /** - * Creates a 10x10 global array of type "double"(default). - * @return Returns pointer to GlobalArray object created. Returns - * NULL if it fails to create a GA object. - */ - GlobalArray * createGA(); - - /** - * Creates an ndim-dimensional array with a layer of ghost cells around - * the visible data on each processor using the regular distribution - * model and returns an integer handle representing the array. - * The array can be distributed evenly or not evenly. The control over - * the distribution is accomplished by specifying chunk (block) size for - * all or some of the array dimensions. For example, for a 2-dimensional - * array, setting chunk(1)=dim(1) gives distribution by vertical strips - * (chunk(1)*dims(1)); setting chunk(2)=dim(2) gives distribution by - * horizontal strips (chunk(2)*dims(2)). Actual chunks will be modified - * so that they are at least the size of the minimum and each process - * has either zero or one chunk. Specifying chunk(i) as <1 will cause - * that dimension (i-th) to be distributed evenly. The width of the - * ghost cell layer in each dimension is specified using the array - * width(). The local data of the global array residing on each - * processor will have a layer width[n] ghosts cells wide on either - * side of the visible data along the dimension n. - * - * @param array_name - a unique character string [input] - * @param type - data type (MT_DBL,MT_INT,MT_DCPL) [input] - * @param ndim - number of array dimensions [input] - * @param dims[ndim] - array of dimensions [input] - * @param width[ndim] - array of ghost cell widths [input] - * @param chunk[ndim] - array of chunks, each element specifies - * minimum size that given dimensions should be - * chunked up into [input] - * - * @returns Returns pointer to GlobalArray object created. Returns - * NULL if it fails to create a GA object. - */ - GlobalArray * createGA_Ghosts(int type, int ndim, int dims[], - int width[], char *array_name, - int chunk[]); - - /** - * Creates an array with ghost cells by following the user-specified - * distribution and returns integer handle representing the array. - * The distribution is specified as a Cartesian product of distributions - * for each dimension. For example, the following figure demonstrates - * distribution of a 2-dimensional array 8x10 on 6 (or more) processors. - * nblock(2)={3,2}, the size of map array is s=5 and array map contains - * the following elements map={1,3,7, 1, 6}. The distribution is - * nonuniform because, P1 and P4 get 20 elements each and processors - * P0,P2,P3, and P5 only 10 elements each. - * - * - * - * - * - * - *
5 5
P0 P3 2
P1 P4 4
P2 P5 2
- * - * The array width[] is used to control the width of the ghost cell - * boundary around the visible data on each processor. The local data - * of the global array residing on each processor will have a layer - * width[n] ghosts cells wide on either side of the visible data along - * the dimension n. - * - * @param array_name - a unique character string [input] - * @param type - data type (MT_DBL,MT_INT,MT_DCPL) [input] - * @param ndim - number of array dimensions [input] - * @param dims[ndim] - array of dimensions [input] - * @param width[ndim] - array of ghost cell widths [input] - * @param nblock[ndim] - no. of blocks each dimension is divided into[input] - * @param map[s] - starting index for for each block; the size - * s is a sum of all elements of nblock array[input] - * - * @return Returns pointer to GlobalArray object created. Returns - * NULL if it fails to create a GA object. - * \n This is a collective operation. - */ - GlobalArray * createGA_Ghosts(int type, int ndim, int dims[], - int width[], char *array_name, int map[], - int nblock[]); - - /** - * @param lenbuf - length of buffer [input] - * @param buf[lenbuf] - data [input/output] - * @param root - root process [input] - * - * Broadcast from process root to all other processes a message of - * length lenbuf. This is operation is provided only for convenience - * purposes: it is available regardless of the message-passing library - * that GA is running with. - * \n This is a collective operation. - */ - void brdcst(void *buf, int lenbuf, int root); - - /** - * This functions returns the total number of nodes that the program is - * running on. On SMP architectures, this will be less than or equal to - * the total number of processors. - * \n This is a local operation. - */ - int clusterNnodes(); - - /** - * This function returns the node ID of the process. On SMP architectures - * with more than one processor per node, several processes may return the - * same node id. - * \n This is a local operation. - */ - int clusterNodeid(); - - /** - * This function returns the number of processors available on node inode. - * \n This is a local operation. - * @param inode [input] - */ - int clusterNprocs(int inode); - - /** - * This function returns the processor id associated with node inode and - * the local processor id iproc. If node inode has N processors, then the - * value of iproc lies between 0 and N-1. - * @param inode,iproc [input] - * \n This is a local operation. - */ - int clusterProcid(int inode, int iproc); - - /** - * Creates a set containing the number of mutexes. Returns 0 if the - * opereation succeeded or 1 when failed. Mutex is a simple - * synchronization object used to protect Critical Sections. Only one - * set of mutexes can exist at a time. Array of mutexes can be created - * and destroyed as many times as needed. - * Mutexes are numbered: 0, ..., number -1. - * \n This is a collective operation. - * @param number - number of mutexes in mutex array [input] - */ - int createMutexes(int number); - - /** - * Destroys the set of mutexes created with ga_create_mutexes. Returns 0 - * if the operation succeeded or 1 when failed. - * \n This is a collective operation. - */ - int destroyMutexes(); - - /** - * @param n - number of elements [input] - * @param x[n] - array of elements [input/output] - * @param op - operator [input] - * - * Double Global OPeration. - * - * X(1:N) is a vector present on each process. DGOP 'sums' elements of - * X accross all nodes using the commutative operator OP. The result is - * broadcast to all nodes. Supported operations include '+', '*', 'max', - * 'min', 'absmax', 'absmin'. The use of lowerecase for operators is - * necessary. This is operation is provided only for convenience purposes: - * it is available regardless of the message-passing library that GA is - * running with. \n This is a collective operation. - */ - void dgop(double x[], int n, char *op); - - /** - * @param array_name - a character string [input] - * @param g_a - integer handle for reference array [input] - * - * Creates a new array by applying all the properties of another existing - * array. It returns array handle. - * Return value: a non-zero array handle means the call was succesful. - * \n This is a collective operation. - */ - int duplicate(int g_a, char* array_name); - - /** - * To be called in case of an error. Print an error message and an integer - * value that represents error code. Releases some system resources. - * This is the required way of aborting the program execution. - * This operation is local. - * @param message - string to print [input] - * @param code - code to print [input] - */ - void error(const char *message, int code); - - /** - * Blocks the calling process until all the data transfers corresponding to - * GA operations called after ga_init_fence complete. For example, since - * ga_put might return before the data reaches the final destination, - * ga_init_fence and ga_fence allow process to wait until the data tranfer - * is fully completed: - * - * ga_init_fence(); - * ga_put(g_a, ...); - * ga_fence(); - * - * ga_fence must be called after ga_init_fence. A barrier, ga_sync, assures - * completion of all data transfers and implicitly cancels all outstanding - * ga_init_fence calls. ga_init_fence and ga_fence must be used in pairs, - * multiple calls to ga_fence require the same number of corresponding - * ga_init_fence calls. ga_init_fence/ga_fence pairs can be nested. - * - * ga_fence works for multiple GA operations. For example: - * - * ga_init_fence(); - * ga_put(g_a, ...); - * ga_scatter(g_a, ...); - * ga_put(g_b, ...); - * ga_fence(); - * - * The calling process will be blocked until data movements initiated by - *two calls to ga_put and one ga_scatter complete. - */ - void fence(); - - /** - * @param n - number of elements [input] - * @param x[n] - array of elements [input/output] - * @param op - operator [input] - * - * Integer Global OPeration. The integer (more precisely long) version - * of ga_dgop described above, also include the bitwise OR operation. - * This is operation is provided only for convenience purposes: it is - * available regardless of the message-passing library that GA is running - * with. \n This is a collective operation. - */ - void igop(Integer x[], int n, char *op); - - /** - * Initializes tracing of completion status of data movement operations. - * This operation is local. - */ - void initFence(); - - /** - * Returns amount of memory (in bytes) used in the allocated global - * arrays on the calling processor. This operation is local. - */ - size_t inquireMemory(); - - /** - * - * Long Global OPeration. - * - * X(1:N) is a vector present on each process. LGOP 'sums' elements of - * X accross all nodes using the commutative operator OP. The result is - * broadcast to all nodes. Supported operations include '+', '*', 'max', - * 'min', 'absmax', 'absmin'. The use of lowerecase for operators is - * necessary. This is operation is provided only for convenience purposes: - * it is available regardless of the message-passing library that GA is - * running with. \n This is a collective operation. - * @param n - number of elements [input] - * @param x[n] - array of elements [input/output] - * @param op - operator [input] - */ - void lgop(long x[], int n, char *op); - - /** - * @param mutex - mutex object id [input] - * - * Locks a mutex object identified by the mutex number. It is a fatal - * error for a process to attempt to lock a mutex which was already - * locked by this process. - */ - void lock(int mutex); - - /** - * GA Collective calls has Sync calls at the begining and ending of - * of the call. Sometimes there may be some redundacy in sync calls, which - * can be avoided by masking the sync operations. - * @ param first - masks the sync at the begining of the collective call. - * @ param last - masks the sync at the end of the collective call. - * setting the parameters as zero will mask (disable) the call. Any non-zero - * value will enable the call. Initially these params are set to non-zero - * value. - */ - void maskSync(int first, int last); - - /** - * @return Returns amount of memory (in bytes) left for allocation of new - * global arrays on the calling processor. - - * @note If GA_uses_ma returns true, then GA_Memory_avail returns the - * lesser of the amount available under the GA limit and the amount - * available from MA (according to ma_inquire_avail operation). - * If no GA limit has been set, it returns what MA says is available. - * If ( ! GA_Uses_ma() && ! GA_Memory_limited() ) returns < 0, indicating - * that the bound on currently available memory cannot be determined. - * This operation is local. - */ - int memoryAvailable() ; - - /** - * Indicates if limit is set on memory usage in Global Arrays on the - * calling processor. "1" means "yes", "0" means "no". This operation - * is local. - */ - int memoryLimited(); - - /** - * Returns the GA process id (0, ..., ga_Nnodes()-1) of the requesting - * compute process. This operation is local. - */ - int nodeid(); - - /** - * Returns the number of the GA compute (user) processes. - * This operation is local. - */ - int nodes(); - - /** - * This non-collective (MIMD) operation prints information about: - * - * number of calls to the GA create/duplicate, destroy, get, put, scatter, - * gather, and read_and_inc operations total amount of data moved in the - * GA primitive operations amount of data moved in GA primitive operations - * to logicaly remote locations maximum memory consumption in global - * arrays, and number of requests serviced in the interrupt-driven - * implementations by the calling process. This operation is local. - */ - void printStats(); - - /** - * @param limit - the amount of memory in bytes per process [input] - * - * Sets the amount of memory to be used (in bytes) per process. - * \n This is a local operation. - */ - void setMemoryLimit(size_t limit); - - /** - * @param verbose - If true print distribution info [input] - * Prints info about allocated arrays. - */ - void summarize(int verbose); - - /** - * Synchronize processes (a barrier) and ensure that all GA operations - * completed. - * \n This is a collective operation. - */ - void sync(); - - /** - * @param mutex - mutex object id [input] - * - * Unlocks a mutex object identified by the mutex number. It is a fatal - * error for a process to attempt to unlock a mutex which has not been - * locked by this process. - */ - void unlock(int mutex); - - /** - * Returns "1" if memory in global arrays comes from the Memory Allocator - * (MA). "0"means that memory comes from another source, for example - * System V shared memory is used. This operation is local. - */ - int usesMA(); - - /** - * Returns "1" if uses fortran API, else returns "0" - */ - int usesFAPI(); - - /** - * ***************************** - * DADF Interfaces - * ***************************** - */ - /**************************************************************************** - * DistArrayTemplFactoryPort - ***************************************************************************/ - - /** Return an uninitialized template object. */ - virtual DistArrayTemplate * createTemplate(std::string name); - - /** Return a template object initialized with the contents of - another, but not frozen against modification. */ - virtual DistArrayTemplate * cloneTemplate( - DistArrayTemplate * original, std::string cloneName); - - /** Destroy an existing template. */ - virtual int destroyTemplate(DistArrayTemplate * & victim); - - /**************************************************************************** - * DistArrayDescrFactoryPort - ***************************************************************************/ - - /** Return an uninitialized descriptor object. */ - virtual DistArrayDescriptor * createDescriptor(std::string name); - - /** Return a descriptor object initialized with the contents of - another, but not frozen against modification. */ - virtual DistArrayDescriptor * cloneDescriptor( - DistArrayDescriptor * original, std::string cloneName); - - /** Destroy an existing descriptor. */ - virtual int destroyDescriptor(DistArrayDescriptor * & victim); - - /** Return an uninitialized ga-dadf distributed array object. */ - virtual DistArray * createArray(std::string name); - - /** Return an distributed array object initialized with the contents of - another, but not frozen against modification. */ - virtual DistArray * cloneArray(DistArray* original, - std::string cloneName); - - /** Destroy an existing distributed array. */ - virtual int destroyArray(DistArray* & victim); - - - /** - * The components containing framework provides services through - * the Services interface. This will be called with a Services - * when the component is created and with 0/NULL when the component - * is about to be destroyed. - * If the framework unrecoverably misbehaves (a port definition - * call on 'cc' fails, then this call will not return (abort() is invoked) - * This component is not compatible with such a framework. - */ - void setServices(::classic::gov::cca::Services *cc); - - private: - /** The services we receive from the outer framework or container. */ - ::classic::gov::cca::Services *svc; - - /** Map to track templates we've handed out and destroyed */ - std::map templateRecord; - - /** Map to track descriptors we've handed out and destroyed */ - std::map descriptorRecord; - - /** Map to track arrays we've handed out and destroyed */ - std::map arrayRecord; - - /** List outstanding templates on stderr - @param label (in) String to label output - */ - void listTemplates(std::string label); - - /** List outstanding descriptors on stderr - @param label (in) String to label output - */ - void listDescriptors(std::string label); - - /** List outstanding arrays on stderr - @param label (in) String to label output - */ - void listArrays(std::string label); -}; - - -#endif // _GA_SERVICES_H diff --git a/cca/ga_cca_classic/GAServices_DADF.cxx b/cca/ga_cca_classic/GAServices_DADF.cxx deleted file mode 100644 index 15f62fd5c..000000000 --- a/cca/ga_cca_classic/GAServices_DADF.cxx +++ /dev/null @@ -1,609 +0,0 @@ -/** - Creation, cloning, and destruction of array distribution templates and - distributed array descriptors through two CCA ports. - - Templates conform to the DistArrayTemplate abstract base class, - and descriptors to the DistArrayDescriptor abstract base class. - - In this implementation, some care is taken to try to check for bad - pointers and other problems in order to return an error rather - than cause the code to seg fault. -*/ - -#include -#include -#include - -// Our header -#include "gacca.h" - -// Concrete classes for templates, descriptors and distributed arrays -#include "DADFTemplate.h" -#include "DADFDescriptor.h" -#include "GA_DADFArray.h" - -/****************************************************************************** - * DistArrayTemplFactoryPort -******************************************************************************/ - -/** Return an uninitialized template object. */ -DistArrayTemplate* -GA::GAServices::createTemplate(std::string name) { - DADFTemplate* templDADF; - - /** Create a template of our private class */ - - templDADF = new DADFTemplate( name ); - if ( templDADF == 0 ) { - cerr << "GA::GAServices::createTemplate:" << - "Unable to create new template." << endl; - return 0; - } - - /** Record the address for later reference */ - - templateRecord[ templDADF ]++; - - /** Cast from our private class to the abstract base class we - publicized */ - - return dynamic_cast (templDADF); -} - -/** Return a template object initialized with the contents of - another, but not frozen against modification. */ -DistArrayTemplate* -GA::GAServices::cloneTemplate( - DistArrayTemplate * original, std::string cloneName) { - - DADFTemplate * originalDADF, * templDADF; - - /** Check that the original template is one we provided. */ - - if ( templateRecord.count( original ) ) { - - /** How many templates do we have with this address? - 0 means we had one, but its been destroyed already. */ - - if ( templateRecord[ original ] == 0 ) { - cerr << "GA::GAServices::cloneTemplate: " << - "Original template invalid: previously destroyed." << endl; - return 0; - } - - /** Anything other than 1 shouldn't happen. We'll let it go with - a warning -- in case it is our mistake -- but it will probably - bomb. Maybe this should be an error? */ - else if ( templateRecord[ original ] != 1 ) { - cerr << "GA::GAServices::cloneTemplate: " << - "Internal records indicate " << templateRecord[ original ] << - "outstanding templates at address of original." << endl << - "Something has gone badly wrong. Will try to proceed anyway." - << endl; - } - } - - /** If we don't even have a record of this template, it means we - didn't create it. So either its a bad pointer, or it was - created with another factory. In either case, we don't really - want to mess with it. */ - - else { - cerr << "GA::GAServices::cloneTemplate: " << - "Original template not from this factory." << endl; - return 0; - } - - /** Cast it from the abstract class to our private class. This is a - further check that we should be able to deal with it properly, - and something we have to do to create a new instance. */ - - originalDADF = dynamic_cast(original); - if ( originalDADF == 0 ) { - cerr << "GA::GAServices::cloneTemplate: " << - "Original template is of the wrong type." << endl; - return 0; - } - - /** Create a new instance with a copy constructor from the original */ - - templDADF = new DADFTemplate( cloneName, *originalDADF ); - if ( templDADF == 0 ) { - cerr << "GA::GAServices::createTemplate: " << - "Unable to create new template." << endl; - return 0; - } - - /** Record the address for our records */ - - templateRecord[ templDADF ]++; - - /** Cast from our private class to the abstract base class we - publicized */ - - return dynamic_cast (templDADF); -} - -/** Destroy an existing template. - - @returns a null pointer in victim -*/ -int GA::GAServices::destroyTemplate( - DistArrayTemplate * & victim) { - DADFTemplate * victimDADF; - - /** Check that the victim template is one we provided. */ - - if ( templateRecord.count( victim ) ) { - - /** How many templates do we have with this address? - 0 means we had one, but its been destroyed already. */ - - if ( templateRecord[ victim ] == 0 ) { - cerr << "GA::GAServices::destroyTemplate: " << - "Template invalid: previously destroyed." << endl; - return -2; - } - - /** Anything other than 1 shouldn't happen. We'll let it go with - a warning -- in case it is our mistake -- but it will probably - bomb. Maybe this should be an error? */ - else if ( templateRecord[ victim ] != 1 ) { - cerr << "GA::GAServices::destroyTemplate: " << - "Internal records indicate " << templateRecord[ victim ] << - "outstanding templates at address of victim." << endl << - "Something has gone badly wrong. Will try to proceed anyway." - << endl; - } - } - - /** If we don't even have a record of this template, it means we - didn't create it. So either its a bad pointer, or it was - created with another factory. In either case, we don't really - want to mess with it. */ - - else { - cerr << "GA::GAServices::destroyTemplate: " << - "Template not from this factory." << endl; - return -1; - } - - /** Cast it from the abstract class to our private class. This is a - further check that we should be able to deal with it properly. */ - - victimDADF = dynamic_cast(victim); - if ( victimDADF == 0 ) { - cerr << "GA::GAServices::destroyTemplate: " << - "Template is of the wrong type." << endl; - return -3; - } - - /** Get rid of the victim */ - delete victimDADF; - - /** Record the fact that we got rid of it */ - templateRecord[ victim ]--; - - /** Clear the pointer so user knows its no longer valid */ - victim = 0; - - /** Success */ - return 0; -} - -/****************************************************************************** - * DistArrayDescrFactoryPort -******************************************************************************/ - -/** Return an uninitialized descriptor object. */ -DistArrayDescriptor* -GA::GAServices::createDescriptor(std::string name) { - DADFDescriptor* descrDADF; - - /** Create a descriptor of our private class */ - - descrDADF = new DADFDescriptor( name ); - if ( descrDADF == 0 ) { - cerr << "GA::GAServices::createDescriptor:" << - "Unable to create new descriptor." << endl; - return 0; - } - - /** Record the address for later reference */ - - descriptorRecord[ descrDADF ]++; - - /** Cast from our private class to the abstract base class we - publicized */ - - return dynamic_cast (descrDADF); -} - -/** Return a descriptor object initialized with the contents of - another, but not frozen against modification. */ -DistArrayDescriptor* -GA::GAServices::cloneDescriptor( - DistArrayDescriptor * original, std::string cloneName) { - - DADFDescriptor * originalDADF, * descrDADF; - - /** Check that the original descriptor is one we provided. */ - - if ( descriptorRecord.count( original ) ) { - - /** How many descriptors do we have with this address? - 0 means we had one, but its been destroyed already. */ - - if ( descriptorRecord[ original ] == 0 ) { - cerr << "GA::GAServices::cloneDescriptor: " << - "Original descriptor invalid: previously destroyed." << endl; - return 0; - } - - /** Anything other than 1 shouldn't happen. We'll let it go with - a warning -- in case it is our mistake -- but it will probably - bomb. Maybe this should be an error? */ - else if ( descriptorRecord[ original ] != 1 ) { - cerr << "GA::GAServices::cloneDescriptor: " << - "Internal records indicate " << descriptorRecord[ original ] << - "outstanding descriptors at address of original." << endl << - "Something has gone badly wrong. Will try to proceed anyway." - << endl; - } - } - - /** If we don't even have a record of this descriptor, it means we - didn't create it. So either its a bad pointer, or it was - created with another factory. In either case, we don't really - want to mess with it. */ - - else { - cerr << "GA::GAServices::cloneDescriptor: " << - "Original descriptor not from this factory." << endl; - return 0; - } - - /** Cast it from the abstract class to our private class. This is a - further check that we should be able to deal with it properly, - and something we have to do to create a new instance. */ - - originalDADF = dynamic_cast(original); - if ( originalDADF == 0 ) { - cerr << "GA::GAServices::cloneDescriptor: " << - "Original descriptor is of the wrong type." << endl; - return 0; - } - - /** Create a new instance with a copy constructor from the original */ - - descrDADF = new DADFDescriptor( cloneName, *originalDADF ); - if ( descrDADF == 0 ) { - cerr << "GA::GAServices::createDescriptor: " << - "Unable to create new descriptor." << endl; - return 0; - } - - /** Record the address for our records */ - - descriptorRecord[ descrDADF ]++; - - /** Cast from our private class to the abstract base class we - publicized */ - - return dynamic_cast (descrDADF); -} - -/** Destroy an existing descriptor. - - @returns a null pointer in victim -*/ -int GA::GAServices::destroyDescriptor( - DistArrayDescriptor * & victim) { - DADFDescriptor * victimDADF; - - /** Check that the victim descriptor is one we provided. */ - - if ( descriptorRecord.count( victim ) ) { - - /** How many descriptors do we have with this address? - 0 means we had one, but its been destroyed already. */ - - if ( descriptorRecord[ victim ] == 0 ) { - cerr << "GA::GAServices::destroyDescriptor: " << - "Descriptor invalid: previously destroyed." << endl; - return -2; - } - - /** Anything other than 1 shouldn't happen. We'll let it go with - a warning -- in case it is our mistake -- but it will probably - bomb. Maybe this should be an error? */ - else if ( descriptorRecord[ victim ] != 1 ) { - cerr << "GA::GAServices::destroyDescriptor: " << - "Internal records indicate " << descriptorRecord[ victim ] << - "outstanding descriptors at address of victim." << endl << - "Something has gone badly wrong. Will try to proceed anyway." - << endl; - } - } - - /** If we don't even have a record of this descriptor, it means we - didn't create it. So either its a bad pointer, or it was - created with another factory. In either case, we don't really - want to mess with it. */ - - else { - cerr << "GA::GAServices::destroyDescriptor: " << - "Descriptor not from this factory." << endl; - return -1; - } - - /** Cast it from the abstract class to our private class. This is a - further check that we should be able to deal with it properly. */ - - victimDADF = dynamic_cast(victim); - if ( victimDADF == 0 ) { - cerr << "GA::GAServices::destroyDescriptor: " << - "Descriptor is of the wrong type." << endl; - return -3; - } - - /** Get rid of the victim */ - delete victimDADF; - - /** Record the fact that we got rid of it */ - descriptorRecord[ victim ]--; - - /** Clear the pointer so user knows its no longer valid */ - victim = 0; - - /** Success */ - return 0; -} - - -/** Return an uninitialized distributed array object. */ -DistArray* -GA::GAServices::createArray(std::string name) { - DADFArray* arrayDADF; - - /** Create an array of our private class */ - - arrayDADF = new DADFArray( name ); - if ( arrayDADF == 0 ) { - cerr << "GA::GAServices::createArray:" << - "Unable to create new array." << endl; - return 0; - } - - /** Record the address for later reference */ - arrayRecord[ arrayDADF ]++; - - /** Cast from our private class to the abstract base class we - publicized */ - - return dynamic_cast (arrayDADF); -} - -/** Return an array object initialized with the contents of - another, but not frozen against modification. */ -DistArray* -GA::GAServices::cloneArray( - DistArray * original, std::string cloneName) { - - DADFArray * originalDADF, * arrayDADF; - - /** Check that the original array is one we provided. */ - - if ( arrayRecord.count( original ) ) { - - /** How many arrays do we have with this address? - 0 means we had one, but its been destroyed already. */ - - if ( arrayRecord[ original ] == 0 ) { - cerr << "GA::GAServices::cloneArray: " << - "Original array invalid: previously destroyed." << endl; - return 0; - } - - /** Anything other than 1 shouldn't happen. We'll let it go with - a warning -- in case it is our mistake -- but it will probably - bomb. Maybe this should be an error? */ - else if ( arrayRecord[ original ] != 1 ) { - cerr << "GA::GAServices::cloneArray: " << - "Internal records indicate " << arrayRecord[ original ] << - "outstanding arrays at address of original." << endl << - "Something has gone badly wrong. Will try to proceed anyway." - << endl; - } - } - - /** If we don't even have a record of this array, it means we - didn't create it. So either its a bad pointer, or it was - created with another factory. In either case, we don't really - want to mess with it. */ - - else { - cerr << "GA::GAServices::cloneArray: " << - "Original array not from this factory." << endl; - return 0; - } - - /** Cast it from the abstract class to our private class. This is a - further check that we should be able to deal with it properly, - and something we have to do to create a new instance. */ - - originalDADF = dynamic_cast(original); - if ( originalDADF == 0 ) { - cerr << "GA::GAServices::cloneArray: " << - "Original array is of the wrong type." << endl; - return 0; - } - - /** Create a new instance with a copy constructor from the original */ - - arrayDADF = new DADFArray( cloneName, *originalDADF ); - if ( arrayDADF == 0 ) { - cerr << "GA::GAServices::createArray: " << - "Unable to create new array." << endl; - return 0; - } - - /** Record the address for our records */ - - arrayRecord[ arrayDADF ]++; - - /** Cast from our private class to the abstract base class we - publicized */ - - return dynamic_cast (arrayDADF); -} - -/** Destroy an existing array. - - @returns a null pointer in victim -*/ -int GA::GAServices::destroyArray( - DistArray * & victim) { - DADFArray * victimDADF; - - /** Check that the victim array is one we provided. */ - - if ( arrayRecord.count( victim ) ) { - - /** How many arrays do we have with this address? - 0 means we had one, but its been destroyed already. */ - - if ( arrayRecord[ victim ] == 0 ) { - cerr << "GA::GAServices::destroyArray: " << - "Array invalid: previously destroyed." << endl; - return -2; - } - - /** Anything other than 1 shouldn't happen. We'll let it go with - a warning -- in case it is our mistake -- but it will probably - bomb. Maybe this should be an error? */ - else if ( arrayRecord[ victim ] != 1 ) { - cerr << "GA::GAServices::destroyArray: " << - "Internal records indicate " << arrayRecord[ victim ] << - "outstanding arrays at address of victim." << endl << - "Something has gone badly wrong. Will try to proceed anyway." - << endl; - } - } - - /** If we don't even have a record of this array, it means we - didn't create it. So either its a bad pointer, or it was - created with another factory. In either case, we don't really - want to mess with it. */ - - else { - cerr << "GA::GAServices::destroyArray: " << - "Array not from this factory." << endl; - return -1; - } - - /** Cast it from the abstract class to our private class. This is a - further check that we should be able to deal with it properly. */ - - victimDADF = dynamic_cast(victim); - if ( victimDADF == 0 ) { - cerr << "GA::GAServices::destroyArray: " << - "Array is of the wrong type." << endl; - return -3; - } - - /** Get rid of the victim */ - delete victimDADF; - - /** Record the fact that we got rid of it */ - arrayRecord[ victim ]--; - - /** Clear the pointer so user knows its no longer valid */ - victim = 0; - - /** Success */ - return 0; -} - -/****************************************************************************** - * Private stuff -******************************************************************************/ - -/** List outstanding templates on stderr - - @param label (in) String to label output -*/ -void GA::GAServices::listTemplates(std::string label) { - std::map::iterator it = templateRecord.begin(); - - for ( ; it != templateRecord.end(); ++it) { - if ( it->second == 1) { - cerr << label << ": " << - "WARNING! template `" << (it->first)->getName() << - "' at address 0x" << it->first << " still exists." << endl; - } - - /** Report any bizarrities */ - - else if ( it->second < 0 || it->second > 1) { - cerr << label << ": " << - "WARNING! template named `" << (it->first)->getName() << - "' at address 0x" << it->first << " claims to have " << - it->second << "instances!" << endl; - } - - } -} - -/** List outstanding descriptors on stderr - - @param label (in) String to label output -*/ -void GA::GAServices::listDescriptors(std::string label) { - std::map::iterator it = descriptorRecord.begin(); - - for ( ; it != descriptorRecord.end(); ++it) { - if ( it->second == 1) { - cerr << label << ": " << - "WARNING! descriptor `" << (it->first)->getName() << - "' at address 0x" << it->first << " still exists." << endl; - } - - /** Report any bizarrities */ - - else if ( it->second < 0 || it->second > 1) { - cerr << label << ": " << - "WARNING! descriptor named `" << (it->first)->getName() << - "' at address 0x" << it->first << " claims to have " << - it->second << "instances!" << endl; - } - - } -} - -/** List outstanding arrays on stderr - - @param label (in) String to label output -*/ -void GA::GAServices::listArrays(std::string label) { - std::map::iterator it = arrayRecord.begin(); - - for ( ; it != arrayRecord.end(); ++it) { - if ( it->second == 1) { - cerr << label << ": " << - "WARNING! array `" << (it->first)->getName() << - "' at address 0x" << it->first << " still exists." << endl; - } - - /** Report any bizarrities */ - - else if ( it->second < 0 || it->second > 1) { - cerr << label << ": " << - "WARNING! array named `" << (it->first)->getName() << - "' at address 0x" << it->first << " claims to have " << - it->second << "instances!" << endl; - } - - } -} - diff --git a/cca/ga_cca_classic/GAServices_wrapper.cxx b/cca/ga_cca_classic/GAServices_wrapper.cxx deleted file mode 100644 index b8af20b25..000000000 --- a/cca/ga_cca_classic/GAServices_wrapper.cxx +++ /dev/null @@ -1,26 +0,0 @@ -// This is a generated file. Do not commit to CVS. -#include -#include -#include -#include "gacca.h" -#include "GAServices.h" - -extern "C" { - -classic::gov::cca::Component *create_GAServices() { - classic::gov::cca::Component *wanker; - GA::GAServices *component; - component = new GA::GAServices(); - wanker = dynamic_cast(component); - return wanker; -} - -char **getComponentList() { - static char *list[2]; - list[0] = "create_GAServices GA::GAServices"; - list[1] = 0; - return list; -} - -} -static char id[]="$Id: GAServices_wrapper.cxx,v 1.1 2003-08-01 00:41:54 manoj Exp $"; diff --git a/cca/ga_cca_classic/GA_DADFArray.cxx b/cca/ga_cca_classic/GA_DADFArray.cxx deleted file mode 100644 index 0a4d4147a..000000000 --- a/cca/ga_cca_classic/GA_DADFArray.cxx +++ /dev/null @@ -1,426 +0,0 @@ -#include "gacca.h" -#include "GA_DADFArray.h" -#include "DistArrayTemplate.h" - -/** This is our implementation of DistArray. It is kept - private within the GAServices. Only - DistArray is exposed to the outside. - */ - -/****************************************************************************** - * Constructors and destructors - *****************************************************************************/ - -/** Constructor sets distributed array name. */ -DADFArray::DADFArray( const std::string name) { - _name = name; - _frozen = false; - _rank = -1; - _type = stv_Int; - _templ = 0; - _isExplicitDist = false; -} - -/** Construct a new distributed array as a copy of an old one. -*/ -DADFArray::DADFArray(const std::string name, DADFArray & original) - : _rank( original._rank ), - _type( original._type), - _lowerBounds( original._lowerBounds ), - _upperBounds( original._upperBounds ), - _topology( original._topology ), - _procCoords( original._procCoords ), - _isExplicitDist( original._isExplicitDist ) -{ - // Use name provided by user rather than from original - _name = name; - - _frozen = true; - - // Duplicate template - DADFTemplate * originalTemplDADF = - dynamic_cast(original._templ); - _templ = new DADFTemplate( *originalTemplDADF ); - - /** _regionList must be treated with similar care to _axisInfo. */ - - DADFRegionInfo * dri; // Used to copy _regionList - std::list::iterator driter; - - for ( driter=original._regionList.begin(); - driter != original._regionList.end(); ++driter ) { - - dri = new DADFRegionInfo( *(*driter) ); - _regionList.push_back( dri ); - } - - _handle = GA_Duplicate(original._handle, (char *)_name.c_str()); - if(!_handle) GA_Error(" DADFArray::DADFArray(): GA creation failed",0); - -} - -DADFArray::~DADFArray() { - delete _templ; - - std::list::iterator driter; - - // Clean up _regionList - for ( driter= _regionList.begin(); driter != _regionList.end(); - ++driter ) { - delete *driter; - } - _regionList.resize(0); - - // Destroy the Global Array - GA_Destroy(_handle); -} - -/****************************************************************************** - * Define the distributed array - *****************************************************************************/ - -/** Set data type. */ -int DADFArray::setDataType(const enum DataType type) { - // Insure we haven't been commit()ed - if ( _frozen ) { return -11; } - - _type = type; - - return 0; -} - -/** Associate this data object with a distribution template. */ -int DADFArray::setTemplate(DistArrayTemplate * & templ) { - // Insure we haven't been commit()ed - if ( _frozen ) { return -11; } - - // If there's already one there, replace it - if ( _templ != 0 ) { delete _templ; } - - DADFTemplate * templDADF = dynamic_cast(templ); - if ( templDADF == 0 ) { - cerr << "DADFArray:setTemplate: " << - "Template is of the wrong type." << endl; - return -13; - } - - // Make a private copy of the template - DADFTemplate * newtempl = new DADFTemplate( *templDADF ); - _templ = dynamic_cast(newtempl); - - return 0; -} - -/** Sets this process's location in the process topology. */ -int DADFArray::setMyProcCoords(const int procCoords[] ) { - int i; - - // Insure we haven't been commit()ed - if ( _frozen ) { return -11; } - - // Check bounds for validity - for ( i=0; i < _rank; ++i ) { - if ( procCoords[i] < 0 || procCoords[i] > _topology[i] ) { - return -6; - } - } - - // Copy arguments into our data structure - // for ( i=0; i < _rank; ++i ) { _procCoords[ i ] = procCoords[ i ]; } - - GA_Error("DADFArray::setMyProcCoords(): This method is currently not supported in GA\n", 0); - - return 0; -} - -/** Align object to template with identity mapping. */ -int DADFArray::setIdentityAlignmentMap() { - int i, err; - int *proc; - int *lower, *upper; - DistArrayTemplate::DistType *dist; - - // Insure we haven't been commit()ed - if ( _frozen ) { return -11; } - - // Insure we have a template - if ( _templ == 0 ) { return -14; } - - /** At the moment, this simply means we're allowed to extract stuff - from the template more or less with impunity. */ - _rank = _templ->getRank(); - - // Resize our internal vectors - _lowerBounds.resize( _rank ); - _upperBounds.resize( _rank ); - _topology.resize( _rank ); - _procCoords.resize( _rank ); - - proc = new int[ _rank ]; - err = _templ->getProcTopology( &(*proc) ); - if ( err != 0 ) { return err; } - - for ( i=0; i < _rank; ++i ) { - _topology[i] = proc[i]; - } - delete [] proc; - - lower = new int[ _rank ]; - upper = new int[ _rank ]; - err = _templ->getGlobalBounds( &(*lower), &(*upper) ); - if ( err != 0 ) { return err; } - - for ( i=0; i < _rank; ++i ) { - _lowerBounds[i] = lower[i]; - _upperBounds[i] = upper[i]; - } - - delete [] lower; - delete [] upper; - - dist = new DistArrayTemplate::DistType[ _rank ]; - err = _templ->getDistType( &(*dist) ); - if ( err != 0 ) { return err; } - - _isExplicitDist = ( dist[0] == DistArrayTemplate::Explicit ); - - delete [] dist; - - return 0; -} - -/** Signal that distributed array is completely defined. */ -int DADFArray::commit() { - // Insure we haven't been commit()ed already - if ( _frozen ) { - return -11; - } else { - _frozen = true; - // return 0; - } - - int err, me = GA_Nodeid(), type; - int chunk[GA_MAX_DIM], dims[GA_MAX_DIM], strides[GA_MAX_DIM]; - int lower[GA_MAX_DIM], upper[GA_MAX_DIM]; - void *data; - DistArrayTemplate::DistType dist[GA_MAX_DIM]; - - // get the name of the array - _name = _templ->getName(); - - err = _templ->getGlobalBounds( &(*lower), &(*upper) ); - if ( err != 0 ) { return err; } - - err = _templ->getDistType( dist ); - if ( err != 0 ) { return err; } - - // Identify GA Data Type - switch(_type) { - case DistArray::stv_Int: - type = C_INT; - break; - case DistArray::stv_Float: - type = C_FLOAT; - break; - case DistArray::stv_Double: - type = C_DBL; - break; - case DistArray::stv_Dcplx: - type = C_DCPL; - break; - case DistArray::stv_Cplx: - type = C_SCPL; - break; - case DistArray::stv_Long: - type = C_LONG; - break; - default: - GA_Error("DADFArray::commit(): Invalid Data Type\n", 0); - } - - if(dist[0] == DistArrayTemplate::Block) { - int first; - for(int i=0; i<_rank; ++i) { - _templ->getDistParameters(i, chunk[i], first); - dims[i] = upper[i] - lower[i];// + 1; - } - _handle = NGA_Create(type, _rank, dims, (char *)_name.c_str(), chunk); - if(!_handle) GA_Error(" GA creation failed",0); - if(!me) cout << "NGA_Create called\n"; - } - else if(dist[0] == DistArrayTemplate::GenBlock){ - int *blockSize[GA_MAX_DIM], total=0; - for(int i=0; i<_rank; ++i) { - blockSize[i] = new int[_topology[i]]; - _templ->getGenBlock(i, blockSize[i]); - total += _topology[i]; - dims[i] = upper[i] - lower[i]; // + 1; - chunk[i] = _topology[i]; - } - int *ga_map = new int [total]; - int offset=0; - for(int i=0; i<_rank; i++) { - for(int j=0; j<_topology[i]; ++j) { - if(j!=0) ga_map[offset] = ga_map[offset-1] + blockSize[i][j-1]; - else ga_map[offset] = 0; - offset++; - } - } - _handle = NGA_Create_irreg(type, _rank, dims, (char *)_name.c_str(), - chunk, ga_map); - if(!_handle) GA_Error(" GA creation failed",0); - GA_Print_distribution(_handle); - for(int i=0; i<_rank; i++) { delete blockSize[i]; blockSize[i] = NULL; } - delete[] ga_map; ga_map = NULL; - if(!me) cout << "NGA_Create_irreg called\n"; - } - else - cerr << "Error: Invalid Distribution Type\n"; - - // Create a region info and set everything - NGA_Distribution(_handle, me, lower, upper); - NGA_Access(_handle, lower, upper, &data, strides); - DADFRegionInfo * dri = new DADFRegionInfo( _rank ); - dri->setBounds( lower, upper ); - dri->setDataLocation( data, strides ); - _regionList.push_back( dri ); - - return 0; - - // Perform global consistency checks - - // Should test topology if dist is explicit - // Test if area of explicit regions == area of distributed array -} - -/****************************************************************************** - * Query the distributed array - *****************************************************************************/ - -std::string DADFArray::getName() { - return _name; -} - -bool DADFArray::isDefined() { - if ( _frozen ) { - return true; - } else { - return false; - } -} - -/** Get data type. */ -DistArray::DataType DADFArray::getDataType() { - return _type; -} - -/** Return pointer to distribution template associated with this - data object. -*/ -DistArrayTemplate * DADFArray::getTemplate() { - return _templ; -} - -/** Get this process's location in the process topology. */ -int DADFArray::getMyProcCoords(int procCoords[] ) { - int me=GA_Nodeid(); - - // Sanity check our state - if ( _rank < 1 ) { return -8; } - - // Copy arguments out of our data structures - // for ( int i=0; i < _rank; ++i ) { procCoords[ i ] = _procCoords[ i ]; } - NGA_Proc_topology(_handle, me, procCoords); - - return 0; -} - -// Kludges to check GenBlock and Explicit distributed arrays. -int DADFArray::getNumLocalRegions() { - // if ( ! _frozen ) { return -100; } - - if ( _isExplicitDist ) { - return _regionList.size(); - } else { - return 1; - } -} -// Kludges to check GenBlock and Explicit distributed arrays. -int DADFArray::getLocalRegionInfo(int region, int lower[], int upper[], - void * & data, int strides[]) { - int i; - - // if ( ! _frozen ) { return -100; } - - if ( _isExplicitDist ) { - if ( region >= (int)_regionList.size() ) { return -15; } - } else { - if ( region != 0 ) { return -15; } - } - - // Find the right region (assumes list not rearranged btw calls!) - std::list::iterator driter; - driter= _regionList.begin(); - for ( i = 0; i < region; ++i ) { ++driter; } - - (*driter)->getBounds( lower, upper ); - (*driter)->getDataLocation(data, strides ); - - return 0; -} - -/** Print the contents of the distributed array (for debugging) */ -void DADFArray::printArray() { - GA_Print(_handle); -} - - -/** Print the distribution info of the distributed array (for debugging) */ -void DADFArray::printArrayDistribution() { - int i; - static const std::string typeLabels[12] - = { "stvInt", "stvFloat", "stvCplx", "stvDouble", "stvDcplx", - "stvLong", "stvShort", "stvStr", "stvUshort", "stvUint", - "stvUlong", "stvByte" }; - - cerr << "Distributed array `" << _name << "' rank " << - _rank << " type `" << typeLabels[ _type ] << "'" << - ((_frozen)? " (" : " (not" ) << " frozen)" << endl; - - if ( _templ != 0 ) { - cerr << " Associated with template: `" << _templ->getName() - << "'" << endl; - } - - if ( _rank < 1 ) { return; } - - cerr << " Bounds: " ; - for ( i=0; i < _rank-1 ; ++i ) { - cerr << _lowerBounds[i] << ":" << _upperBounds[i] << ", "; - } - cerr << _lowerBounds[i] << ":" << _upperBounds[i] << endl; - - cerr << " Process topology: " ; - for ( i=0; i < _rank-1 ; ++i ) { - cerr << _topology[i] << ", "; - } - cerr << _topology[i] << endl; - - cerr << " Process coordinates: " ; - for ( i=0; i < _rank-1 ; ++i ) { - cerr << _procCoords[i] << ", "; - } - cerr << _procCoords[i] << endl; - - // Print out the region list - std::list::iterator driter; - - cerr << " Regions registered: " << _regionList.size() << endl; - - for ( driter= _regionList.begin(); - driter != _regionList.end(); ++driter ) { - cerr << " Region "; - (*driter)->printRegionInfo(); - cerr << endl; - } -} diff --git a/cca/ga_cca_classic/GA_DADFArray.h b/cca/ga_cca_classic/GA_DADFArray.h deleted file mode 100644 index bbcc16a69..000000000 --- a/cca/ga_cca_classic/GA_DADFArray.h +++ /dev/null @@ -1,125 +0,0 @@ -#ifndef GA_DADFArray_h_seen -#define GA_DADFArray_h_seen - -/** This is our implementation of DistArray. It is kept - private within the GAServices. Only DistArray is exposed - to the outside. -*/ - -#include -#include - -#include "DistArray.h" -#include "DADFTemplate.h" -#include "DADFDescriptor.h" - -class DADFArray : public DistArray { - public: - - /**************************************************************************** - * Constructors and destructors - ***************************************************************************/ - - /** Simple constructor for internal use. */ - DADFArray() ; - - /** Normal constructor -- forces setting of name */ - DADFArray( const std::string name ) ; - - /** Copy constructor */ - DADFArray( const std::string name, DADFArray & original ); - - /** The usual destructor */ - virtual ~DADFArray(); - - /** Set data type. */ - virtual int setDataType(const enum DataType type); - - /** Associate this data object with a distribution template. */ - virtual int setTemplate(DistArrayTemplate * & templ); - - /** Sets this process's location in the process topology. */ - virtual int setMyProcCoords(const int procCoords[] ); - - /** Align object to template with identity mapping. */ - virtual int setIdentityAlignmentMap(); - - /** Signal that data object is completely defined. */ - virtual int commit(); - - /**************************************************************************** - * Query the descriptor - ***************************************************************************/ - - /** Return name given to descriptor */ - virtual std::string getName(); - - /** Has commit() been called on this descriptor? */ - virtual bool isDefined(); - - /** Get data type. */ - virtual DistArray::DataType getDataType(); - - /** Return pointer to distribution template associated with this - data object. - */ - virtual DistArrayTemplate * getTemplate(); - - /** Get this process's location in the process topology. */ - virtual int getMyProcCoords(int procCoords[] ); - - /** Mainly for testing and debugging */ - virtual void printArrayDistribution(); - - virtual void printArray(); - - /** Part of a kludge for debugging. */ - virtual int getNumLocalRegions(); - - /** Part of a kludge for debugging. */ - virtual int getLocalRegionInfo(int region, int lower[], int upper[], - void * & data, int strides[]); - - /**************************************************************************** - * Internals - ***************************************************************************/ - private: - - // GA specific array Handle - int _handle; - - // Human-readable name for this descriptor - std::string _name; - - // Whether or not commit() has been called - int _frozen; - - // Rank of array - int _rank; - - // Type of data - DataType _type; - - // Shorthand check if this is an explicit distribution - bool _isExplicitDist; - - // Array distribution template for this descriptor - DistArrayTemplate * _templ; - - // Global lower bounds of array - std::vector _lowerBounds; - - // Global upper bounds of array - std::vector _upperBounds; - - // Process topology - std::vector _topology; - - // Coordinates of this process in topology - std::vector _procCoords; - - // List of regions associated with this process of array - std::list _regionList; -}; - -#endif // GA_DADFArray_h_seen diff --git a/cca/ga_cca_classic/GlobalArray.cxx b/cca/ga_cca_classic/GlobalArray.cxx deleted file mode 100644 index 59e4429c5..000000000 --- a/cca/ga_cca_classic/GlobalArray.cxx +++ /dev/null @@ -1,634 +0,0 @@ -#include "gacca.h" - -#define GA_DATA_TYPE C_DBL - -static int sTmpVar = 0; - - -/** - * Constructors and Destructor of GlobalArray - */ - - -GA::GlobalArray::GlobalArray(int type, int ndim, int dims[], char *arrayname, - int chunk[]) { - mHandle = NGA_Create(type, ndim, dims, arrayname, chunk); - if(!mHandle) GA_Error((char *)" GA creation failed",0); -} - -GA::GlobalArray::GlobalArray(int type, int ndim, int dims[], char *arrayname, - int block[], int maps[]) { - mHandle = NGA_Create_irreg(type, ndim, dims, arrayname, block, maps); - if(!mHandle) GA_Error((char *)" GA creation failed",0); -} - -GA::GlobalArray::GlobalArray(int type, int ndim, int dims[], int width[], - char *arrayname, int chunk[], char ghosts) { - mHandle = NGA_Create_ghosts(type, ndim, dims, width, arrayname, chunk); - if(!mHandle) GA_Error((char *)" GA creation failed",0); -} - -GA::GlobalArray::GlobalArray(int type, int ndim, int dims[], int width[], - char *arrayname, int block[], int maps[], - char ghosts) { - mHandle = NGA_Create_ghosts_irreg(type, ndim, dims, width, arrayname, block, - maps); - if(!mHandle) GA_Error((char *)" GA creation failed",0); -} - -GA::GlobalArray::GlobalArray(const GA::GlobalArray &g_a, char *arrayname) { - mHandle = GA_Duplicate(g_a.mHandle, arrayname); - if(!mHandle) GA_Error((char *)" GA creation failed",0); -} - -GA::GlobalArray::GlobalArray(const GA::GlobalArray &g_a) { - char temp_name[20]; - - sprintf(temp_name, "tmpGA%d", sTmpVar++); - mHandle = GA_Duplicate(g_a.mHandle, temp_name); - if(!mHandle) GA_Error((char *)" GA creation failed",0); - GA_Copy(g_a.mHandle, mHandle); -} - -GA::GlobalArray::GlobalArray() { - char temp_name[20]; - int n_dim; - int *dimensions; - - sprintf(temp_name, "tmpGA%d", sTmpVar++); - n_dim = DEF_NDIM; - dimensions = new int [n_dim]; - for(int i=0; imHandle, beta, g_b->mHandle, mHandle); -} - -void -GA::GlobalArray::addPatch (void *alpha, - const GA::GlobalArray * g_a, int alo[], int ahi[], - void *beta, - const GA::GlobalArray * g_b, int blo[], int bhi[], - int clo[], int chi[]) const { - NGA_Add_patch(alpha, g_a->mHandle, alo, ahi, beta, - g_b->mHandle, blo, bhi, mHandle, clo, chi); -} - -void -GA::GlobalArray::checkHandle(char* string) const { - GA_Check_handle(mHandle, string); -} - -int -GA::GlobalArray::compareDistr(const GA::GlobalArray *g_a) const { - return GA_Compare_distr(mHandle, g_a->mHandle); -} - -void -GA::GlobalArray::copy(const GA::GlobalArray *g_a) const { - GA_Copy(g_a->mHandle, mHandle); -} - -void -GA::GlobalArray::copyPatch(char trans, const GA::GlobalArray* ga, int alo[], - int ahi[], int blo[], int bhi[]) const { - NGA_Copy_patch(trans, ga->mHandle, alo, ahi, mHandle, blo, bhi); -} - -double -GA::GlobalArray::ddot(const GA::GlobalArray * g_a) const { - return GA_Ddot(mHandle, g_a->mHandle); -} - -double -GA::GlobalArray::ddotPatch(char ta, int alo[], int ahi[], - const GA::GlobalArray * g_a, char tb, - int blo[], int bhi[]) const { - return NGA_Ddot_patch(mHandle, ta, alo, ahi, g_a->mHandle, tb, blo, bhi); -} - -void -GA::GlobalArray::destroy() const { - GA_Destroy(mHandle); -} - -void -GA::GlobalArray::dgemm(char ta, char tb, int m, int n, int k, double alpha, - const GA::GlobalArray *g_a, const GA::GlobalArray *g_b, - double beta) const { - GA_Dgemm(ta, tb, m, n, k, alpha, g_a->mHandle, g_b->mHandle, - beta, mHandle); -} - -void -GA::GlobalArray::diag(const GA::GlobalArray *g_s, GA::GlobalArray *g_v, - void *eval) const { - GA_Diag(mHandle, g_s->mHandle, g_v->mHandle, eval); -} - -void -GA::GlobalArray::diagReuse(int control, const GA::GlobalArray *g_s, - GA::GlobalArray *g_v, void *eval) const { - GA_Diag_reuse(control, mHandle, g_s->mHandle, g_v->mHandle, eval); -} - -void -GA::GlobalArray::diagStd(GlobalArray *g_v, void *eval) const { - GA_Diag_std(mHandle, g_v->mHandle, eval); -} - -void -GA::GlobalArray::diagSeq(const GA::GlobalArray * g_s, - const GA::GlobalArray * g_v, void *eval) const { - GA_Diag_seq(mHandle, g_s->mHandle, g_v->mHandle, eval); -} - -void -GA::GlobalArray::diagStdSeq(const GA::GlobalArray * g_v, void *eval) const { - GA_Diag_std_seq(mHandle, g_v->mHandle, eval); -} - -void -GA::GlobalArray::distribution(int me, int* lo, int* hi) const { - NGA_Distribution(mHandle, me, lo, hi); -} - -float -GA::GlobalArray::fdot(const GA::GlobalArray * g_a) const { - return GA_Fdot(mHandle, g_a->mHandle); -} - -float -GA::GlobalArray::fdotPatch(char t_a, int alo[], int ahi[], - const GA::GlobalArray * g_b, char t_b, int blo[], - int bhi[]) const { - return NGA_Fdot_patch(mHandle, t_a, alo, ahi, - g_b->mHandle, t_b, blo, bhi); -} - -void -GA::GlobalArray::fill(void *value) const { - GA_Fill(mHandle, value); -} - -void -GA::GlobalArray::fillPatch (int lo[], int hi[], void *val) const { - NGA_Fill_patch(mHandle, lo, hi, val); -} - -void -GA::GlobalArray::gather(void *v, int * subsarray[], int n) const { - NGA_Gather(mHandle, v, subsarray, n); -} - -void -GA::GlobalArray::get(int lo[], int hi[], void *buf, int ld[]) const { - NGA_Get(mHandle, lo, hi, buf, ld); -} - -int -GA::GlobalArray::hasGhosts() const { - return GA_Has_ghosts(mHandle); -} - -Integer -GA::GlobalArray::idot(const GA::GlobalArray * g_a) const { - return GA_Idot(mHandle, g_a->mHandle); -} - -long -GA::GlobalArray::idotPatch(char ta, int alo[], int ahi[], - const GA::GlobalArray * g_a, char tb, - int blo[], int bhi[]) const { - return NGA_Idot_patch(mHandle, ta, alo, ahi, g_a->mHandle, tb, blo, bhi); -} - -void -GA::GlobalArray::inquire(int *type, int *ndim, int dims[]) const { - NGA_Inquire(mHandle, type, ndim, dims); -} - -char* -GA::GlobalArray::inquireName() const { - return GA_Inquire_name(mHandle); -} - -long -GA::GlobalArray::ldot(const GA::GlobalArray * g_a) const { - return GA_Ldot(mHandle, g_a->mHandle); -} - -int -GA::GlobalArray::lltSolve(const GA::GlobalArray * g_a) const { - return GA_Llt_solve(g_a->mHandle, mHandle); -} - -int -GA::GlobalArray::locate(int subscript[]) const { - return NGA_Locate(mHandle, subscript); -} - -int -GA::GlobalArray::locateRegion(int lo[], int hi[], int map[], int procs[]) const { - return NGA_Locate_region(mHandle, lo, hi, map, procs); -} - -void -GA::GlobalArray::luSolve(char trans, const GA::GlobalArray * g_a) const { - GA_Lu_solve(trans, g_a->mHandle, mHandle); -} - -void -GA::GlobalArray::matmulPatch(char transa, char transb, void* alpha, - void *beta, const GA::GlobalArray *g_a, - int ailo, int aihi, int ajlo, int ajhi, - const GA::GlobalArray *g_b, - int bilo, int bihi, int bjlo, int bjhi, - int cilo, int cihi, int cjlo, int cjhi) const { - GA_Matmul_patch(transa, transb, alpha, beta, - g_a->mHandle, ailo, aihi, ajlo, ajhi, - g_b->mHandle, bilo, bihi, bjlo, bjhi, - mHandle, cilo, cihi, cjlo, cjhi); -} - -void -GA::GlobalArray::matmulPatch(char transa, char transb, void* alpha, void *beta, - const GA::GlobalArray *g_a, int *alo, int *ahi, - const GA::GlobalArray *g_b, int *blo, int *bhi, - int *clo, int *chi) const { - NGA_Matmul_patch(transa, transb, alpha, beta, g_a->mHandle, alo, ahi, - g_b->mHandle, blo, bhi, mHandle, clo, chi); -} - -void -GA::GlobalArray::nblock(int numblock[]) const { - GA_Nblock(mHandle, numblock); -} - -int -GA::GlobalArray::ndim() const { - return GA_Ndim(mHandle); -} - -void -GA::GlobalArray::periodicAcc(int lo[], int hi[], void* buf, int ld[], - void* alpha) const { - NGA_Periodic_acc(mHandle, lo, hi, buf, ld, alpha); -} - -void -GA::GlobalArray::periodicGet(int lo[], int hi[], void* buf, int ld[]) const { - NGA_Periodic_get(mHandle, lo, hi, buf, ld); -} - -void -GA::GlobalArray::periodicPut(int lo[], int hi[], void* buf, int ld[]) const { - NGA_Periodic_put(mHandle, lo, hi, buf, ld); -} - -void -GA::GlobalArray::print() const { - GA_Print(mHandle); -} - -void -GA::GlobalArray::printDistribution() const { - GA_Print_distribution(mHandle); -} - -void -GA::GlobalArray::printFile(FILE *file) const { - GA_Print_file(file, mHandle); -} - -void -GA::GlobalArray::printPatch(int* lo, int* hi, int pretty) const { - NGA_Print_patch(mHandle, lo, hi, pretty); -} - -void -GA::GlobalArray::procTopology(int proc, int coord[]) const { - NGA_Proc_topology(mHandle, proc, coord); -} - -void -GA::GlobalArray::put(int lo[], int hi[], void *buf, int ld[]) const { - NGA_Put(mHandle, lo, hi, buf, ld); -} - -long -GA::GlobalArray::readInc(int subscript[], long inc) const { - return NGA_Read_inc(mHandle, subscript, inc); -} - -void -GA::GlobalArray::release(int lo[], int hi[]) const { - NGA_Release(mHandle, lo, hi); -} - -void -GA::GlobalArray::releaseUpdate(int lo[], int hi[]) const { - NGA_Release_update(mHandle, lo, hi); -} - -void -GA::GlobalArray::scale(void *value) const { - GA_Scale(mHandle, value); -} - -void -GA::GlobalArray::scalePatch (int lo[], int hi[], void *val) const { - NGA_Scale_patch(mHandle, lo, hi, val); -} - -void -GA::GlobalArray::scatter(void *v, int *subsarray[], int n) const { - NGA_Scatter(mHandle, v, subsarray, n); -} - -int -GA::GlobalArray::solve(const GA::GlobalArray * g_a) const { - return GA_Solve(g_a->mHandle, mHandle); -} - -int -GA::GlobalArray::spdInvert() const { - return GA_Spd_invert(mHandle); -} - -void -GA::GlobalArray::selectElem(char *op, void* val, int index[]) const { - NGA_Select_elem(mHandle, op, val, index); -} - -void -GA::GlobalArray::sgemm(char ta, char tb, int m, int n, int k, float alpha, - const GA::GlobalArray *g_a, const GA::GlobalArray *g_b, - float beta) const { - GA_Sgemm(ta, tb, m, n, k, alpha, g_a->mHandle, g_b->mHandle, - beta, mHandle); -} - -void -GA::GlobalArray::symmetrize() const { - GA_Symmetrize(mHandle); -} - -void -GA::GlobalArray::transpose(const GA::GlobalArray * g_a) const { - GA_Transpose(mHandle, g_a->mHandle); -} - -void -GA::GlobalArray::updateGhosts() const { - GA_Update_ghosts(mHandle); -} - -int -GA::GlobalArray::updateGhostDir(int dimension, int idir, int cflag) const { - return NGA_Update_ghost_dir(mHandle, dimension, idir, cflag); -} - -DoubleComplex -GA::GlobalArray::zdot(const GA::GlobalArray * g_a) const { - return GA_Zdot(mHandle, g_a->mHandle); -} - -DoubleComplex -GA::GlobalArray::zdotPatch(char ta, int alo[], int ahi[], - const GA::GlobalArray * g_a, char tb, - int blo[], int bhi[]) const { - return NGA_Zdot_patch(mHandle, ta, alo, ahi, g_a->mHandle, tb, blo, bhi); -} - -void -GA::GlobalArray::zero() const { - GA_Zero(mHandle); -} - -void -GA::GlobalArray::zeroPatch (int lo[], int hi[]) const { - NGA_Zero_patch(mHandle, lo, hi); -} - -void -GA::GlobalArray::zgemm(char ta, char tb, int m, int n, int k, - DoubleComplex alpha, - const GA::GlobalArray *g_a, const GA::GlobalArray *g_b, - DoubleComplex beta) const { - GA_Zgemm(ta, tb, m, n, k, alpha, g_a->mHandle, g_b->mHandle, - beta, mHandle); -} - -/* recent additions */ - -void -GA::GlobalArray::absValue() const { - GA_Abs_value(mHandle); -} - -void -GA::GlobalArray::addConstant(void* alpha) const { - GA_Add_constant(mHandle, alpha); -} - -void -GA::GlobalArray::recip() const { - GA_Recip(mHandle); -} - -void -GA::GlobalArray::elemMultiply(const GA::GlobalArray * g_a, - const GA::GlobalArray * g_b) const { - GA_Elem_multiply(g_a->mHandle, g_b->mHandle, mHandle); -} - -void -GA::GlobalArray::elemDivide(const GA::GlobalArray * g_a, - const GA::GlobalArray * g_b) const { - GA_Elem_divide(g_a->mHandle, g_b->mHandle, mHandle); -} - - -void -GA::GlobalArray::elemMaximum(const GA::GlobalArray * g_a, - const GA::GlobalArray * g_b) const { - GA_Elem_maximum(g_a->mHandle, g_b->mHandle, mHandle); -} - - -void -GA::GlobalArray::elemMinimum(const GA::GlobalArray * g_a, - const GA::GlobalArray * g_b) const { - GA_Elem_minimum(g_a->mHandle, g_b->mHandle, mHandle); -} - -void -GA::GlobalArray::absValuePatch(int *lo, int *hi) const { - GA_Abs_value_patch(mHandle, lo, hi); -} - -void -GA::GlobalArray::addConstantPatch(int *lo,int *hi, void *alpha) const { - GA_Add_constant_patch(mHandle, lo, hi, alpha); -} - -void -GA::GlobalArray::recipPatch(int *lo, int *hi) const { - GA_Recip_patch(mHandle, lo, hi); -} - -void -GA::GlobalArray::stepMax(const GA::GlobalArray * g_a, double *step) const {// CHECK all Step Max functions - GA_Step_max(mHandle, g_a->mHandle, step); -} - -void -GA::GlobalArray::stepMaxPatch(int *alo, int *ahi, - const GA::GlobalArray * g_b, int *blo, int *bhi, - double *step) const { - GA_Step_max_patch(mHandle, alo, ahi, g_b->mHandle, blo, bhi, step); -} - -void -GA::GlobalArray::elemMultiplyPatch(const GA::GlobalArray * g_a, - int *alo,int *ahi, - const GA::GlobalArray * g_b, - int *blo,int *bhi, - int *clo,int *chi) const { - GA_Elem_multiply_patch(g_a->mHandle, alo, ahi, g_b->mHandle, blo, bhi, - mHandle, clo, chi); -} - -void -GA::GlobalArray::elemDividePatch(const GA::GlobalArray * g_a,int *alo,int *ahi, - const GA::GlobalArray * g_b,int *blo,int *bhi, - int *clo,int *chi) const { - GA_Elem_divide_patch(g_a->mHandle, alo, ahi, g_b->mHandle, blo, bhi, - mHandle, clo, chi); -} - -void -GA::GlobalArray::elemMaximumPatch(const GA::GlobalArray * g_a, - int *alo,int *ahi, - const GA::GlobalArray * g_b, - int *blo,int *bhi, - int *clo,int *chi) const { - GA_Elem_maximum_patch(g_a->mHandle, alo, ahi, g_b->mHandle, blo, bhi, - mHandle, clo, chi); -} - -void -GA::GlobalArray::elemMinimumPatch(const GA::GlobalArray * g_a, - int *alo,int *ahi, - const GA::GlobalArray * g_b, - int *blo,int *bhi, - int *clo,int *chi) const { - GA_Elem_minimum_patch(g_a->mHandle, alo, ahi, g_b->mHandle, blo, bhi, - mHandle, clo, chi); -} - - - -/*Added by Limin for matrix operations*/ - -void -GA::GlobalArray::shiftDiagonal(void *c) const { - GA_Shift_diagonal(mHandle, c); -} - -void -GA::GlobalArray::setDiagonal(const GA::GlobalArray * g_v) const { - GA_Set_diagonal(mHandle, g_v->mHandle); -} - -void -GA::GlobalArray::zeroDiagonal() const { - GA_Zero_diagonal(mHandle); -} - -void -GA::GlobalArray::addDiagonal(const GA::GlobalArray * g_v) const { - GA_Add_diagonal(mHandle, g_v->mHandle); -} - -void -GA::GlobalArray::getDiagonal(const GA::GlobalArray * g_a) const { - GA_Get_diag(g_a->mHandle, mHandle); -} - -void -GA::GlobalArray::scaleRows(const GA::GlobalArray * g_v) const { - GA_Scale_rows(mHandle, g_v->mHandle); -} - -void -GA::GlobalArray::scaleCols(const GA::GlobalArray * g_v) const { - GA_Scale_cols(mHandle, g_v->mHandle); -} - -void -GA::GlobalArray::norm1(double *nm) const { - GA_Norm1(mHandle, nm); -} - -void -GA::GlobalArray::normInfinity(double *nm) const { - GA_Norm_infinity(mHandle, nm); -} - -void -GA::GlobalArray::median(const GA::GlobalArray * g_a, - const GA::GlobalArray * g_b, - const GA::GlobalArray * g_c) const { - GA_Median(g_a->mHandle, g_b->mHandle, g_c->mHandle, mHandle); -} - -void -GA::GlobalArray::medianPatch(const GA::GlobalArray * g_a, int *alo, int *ahi, - const GA::GlobalArray * g_b, int *blo, int *bhi, - const GA::GlobalArray * g_c, int *clo, int *chi, - int *mlo, int *mhi) const { - GA_Median_patch(g_a->mHandle, alo, ahi, g_b->mHandle, blo, bhi, - g_c->mHandle, clo, chi, mHandle, mlo, mhi); -} - - - - diff --git a/cca/ga_cca_classic/GlobalArray.h b/cca/ga_cca_classic/GlobalArray.h deleted file mode 100644 index e3f8b8b8a..000000000 --- a/cca/ga_cca_classic/GlobalArray.h +++ /dev/null @@ -1,1385 +0,0 @@ -#ifndef _GLOBALARRAY_H -#define _GLOBALARRAY_H - - -#define DEF_NDIM 2 -#define DEF_DIMS 10 - -/** - * This is the GlobalArray class. - */ -class GlobalArray { - - public: - - /** - * Creates an ndim-dimensional array using the regular distribution model - * and returns integer handle representing the array. - - * The array can be distributed evenly or not. The control over the - * distribution is accomplished by specifying chunk (block) size for all or - * some of array dimensions. - - * For example, for a 2-dimensional array, setting chunk[0]=dim[0] gives - * distribution by vertical strips (chunk[0]*dims[0]); - * setting chunk[1]=dim[1] gives distribution by horizontal strips - * (chunk[1]*dims[1]). Actual chunks will be modified so that they are at - * least the size of the minimum and each process has either zero or one - * chunk. Specifying chunk[i] as <1 will cause that dimension to be - * distributed evenly. - - * As a convenience, when chunk is specified as NULL, the entire array is - * distributed evenly. - - * This is a collective operation. - - * @param arrayname - a unique character string [input] - * @param type - data type(MT_F_DBL,MT_F_INT,MT_F_DCPL) [input] - * @param ndim - number of array dimensions [input] - * @param dims[ndim] - array of dimensions [input] - * @param chunk[ndim] - array of chunks, each element specifies - * minimum size that given dimensions should be chunked up into [input] - */ - GlobalArray(int type, int ndim, int dims[], char *arrayname, int chunk[]); - - /** - * Creates an array by following the user-specified distribution and - * returns integer handle representing the array. - - * The distribution is specified as a Cartesian product of distributions - * for each dimension. The array indices start at 0. For example, the - * following figure demonstrates distribution of a 2-dimensional array 8x10 - * on 6 (or more) processors. nblock[2]={3,2}, the size of map array is s=5 - * and array map contains the following elements map={0,2,8, 0, 5}. The - * distribution is nonuniform because, P1 and P4 get 20 elements each and - * processors P0,P2,P3, and P5 only 10 elements each. - * - * - * - * - * - * - *
5 5
P0 P3 2
P1 P4 4
P2 P5 2
- * - * This is a collective operation. - * @param arrayname - a unique character string [input] - * @param type - MA data type (MT_F_DBL,MT_F_INT,MT_F_DCPL) [input] - * @param ndim - number of array dimensions [input] - * @param dims - array of dimension values [input] - * @param block[ndim] - no. of blocks each dimension is divided into [input] - * @param maps[s] - starting index for for each block; the size s is a sum - * all elements of nblock array [input] - */ - GlobalArray(int type, int ndim, int dims[], char *arrayname, int block[], - int maps[]); - - /** - * Creates an ndim-dimensional array with a layer of ghost cells around - * the visible data on each processor using the regular distribution - * model and returns an integer handle representing the array. - * The array can be distributed evenly or not evenly. The control over - * the distribution is accomplished by specifying chunk (block) size for - * all or some of the array dimensions. For example, for a 2-dimensional - * array, setting chunk(1)=dim(1) gives distribution by vertical strips - * (chunk(1)*dims(1)); setting chunk(2)=dim(2) gives distribution by - * horizontal strips (chunk(2)*dims(2)). Actual chunks will be modified - * so that they are at least the size of the minimum and each process - * has either zero or one chunk. Specifying chunk(i) as <1 will cause - * that dimension (i-th) to be distributed evenly. The width of the - * ghost cell layer in each dimension is specified using the array - * width(). The local data of the global array residing on each - * processor will have a layer width[n] ghosts cells wide on either - * side of the visible data along the dimension n. - * - * @param array_name - a unique character string [input] - * @param type - data type (MT_DBL,MT_INT,MT_DCPL) [input] - * @param ndim - number of array dimensions [input] - * @param dims[ndim] - array of dimensions [input] - * @param width[ndim] - array of ghost cell widths [input] - * @param chunk[ndim] - array of chunks, each element specifies - * minimum size that given dimensions should be - * chunked up into [input] - * @param ghosts - this is a dummy parameter: added to increase the - * number of arguments, inorder to avoid the conflicts - * among constructors. (ghosts = 'g' or 'G') - */ - GlobalArray(int type, int ndim, int dims[], int width[], char *arrayname, - int chunk[], char ghosts); - - - /** - * Creates an array with ghost cells by following the user-specified - * distribution and returns integer handle representing the array. - * The distribution is specified as a Cartesian product of distributions - * for each dimension. For example, the following figure demonstrates - * distribution of a 2-dimensional array 8x10 on 6 (or more) processors. - * nblock(2)={3,2}, the size of map array is s=5 and array map contains - * the following elements map={1,3,7, 1, 6}. The distribution is - * nonuniform because, P1 and P4 get 20 elements each and processors - * P0,P2,P3, and P5 only 10 elements each. - * - * - * - * - * - * - *
5 5
P0 P3 2
P1 P4 4
P2 P5 2
- * - * The array width[] is used to control the width of the ghost cell - * boundary around the visible data on each processor. The local data - * of the global array residing on each processor will have a layer - * width[n] ghosts cells wide on either side of the visible data along - * the dimension n. This is a collective operation. - * - * @param array_name - a unique character string [input] - * @param type - data type (MT_DBL,MT_INT,MT_DCPL) [input] - * @param ndim - number of array dimensions [input] - * @param dims[ndim] - array of dimensions [input] - * @param width[ndim] - array of ghost cell widths [input] - * @param nblock[ndim] - no. of blocks each dimension is divided into[input] - * @param map[s] - starting index for for each block; the size - * s is a sum of all elements of nblock array[input] - * @param ghosts - this is a dummy parameter: added to increase the - * number of arguments, inorder to avoid the conflicts - * among constructors. (ghosts = 'g' or 'G') - * @return Returns pointer to GlobalArray object created. Returns - * NULL if it fails to create a GA object. - */ - GlobalArray(int type, int ndim, int dims[], int width[], char *arrayname, - int block[], int maps[], char ghosts); - - /** - * Creates a new array by applying all the properties of another existing - * array. - * This is a collective operation. - * @param arrayname - a character string [input] - * @param g_b - integer handle for reference array [input] - */ - GlobalArray(const GlobalArray &g_a, char *arrayname); - - /** - * Creates a new array by applying all the properties of another existing - * array. - * This is a collective operation. - * @param g_b - integer handle for reference array [input] - */ - GlobalArray(const GlobalArray &g_a);/* copy constructor */ - - /** Creates a 10x10 array of type "double"(default).*/ - GlobalArray(); - - /** Destructor */ - ~GlobalArray(); - - /* access the data */ - /** @return returns the array handler*/ - int handle() const { return mHandle; } - - /* Global Array operations */ - - /** - * Combines data from local array buffer with data in the global array - * section. The local array is assumed to be have the same number of - * dimensions as the global array. - - * global array section (lo[],hi[]) += *alpha * buffer - - * This is a one-sided and atomic operation. - * @param lo[ndim] - array of starting indices for array section[input] - * @param hi[ndim] - array of ending indices for array section [input] - * @param buf - pointer to the local buffer array [input] - * @param ld[ndim-1] - array specifying leading dimensions/strides/extents - * for buffer array [input] - * @param alpha - scale factor (double/DoubleComplex/long *) [input] - */ - void acc(int lo[], int hi[], void *buf, int ld[], void *alpha) const; - - /** - * Provides access to the specified patch of a global array. Returns - * array of leading dimensions ld and a pointer to the first element - * in the patch. This routine allows to access directly, in place - * elements in the local section of a global array. It useful for - * writing new GA operations. A call to ga_access normally follows a - * previous call to ga_distribution that returns coordinates of the - * patch associated with a processor. You need to make sure that the - * coordinates of the patch are valid (test values returned from - * ga_distribution). - * - * Each call to ga_access has to be followed by a call to either - * ga_release or ga_release_update. You can access in this fashion only - * local data. Since the data is shared with other processes, you need - * to consider issues of mutual exclusion. This operation is local. - * - * @param ndim - number of dimensions of the global array - * @param lo[ndim] - array of starting indices for array section [input] - * @param hi[ndim] - array of ending indices for array section [input] - * @param ptr - points to location of first element in patch[output] - * @param ld[ndim-1]- leading dimensions for the pacth elements [output] - */ - void access(int lo[], int hi[], void *ptr, int ld[]) const; - - /** - * Provides access to the local patch of the global array. Returns - * leading dimension ld and and pointer for the data. This routine - * will provide access to the ghost cell data residing on each processor. - * Calls to NGA_Access_ghosts should normally follow a call to - * NGA_Distribution that returns coordinates of the visible data patch - * associated with a processor. You need to make sure that the coordinates - * of the patch are valid (test values returned from NGA_Distribution). - * - * You can only access local data. - * This is a local operation. - * - * @param g_a [input] - * @param dims[ndim] - array of dimensions of local patch, - * including ghost cells [output] - * @param ptr - returns an index corresponding to the origin - * the global array patch held locally on the - * processor [output] - * @param ld[ndim-1] - physical dimenstions of the local array patch, - * including ghost cells [output] - */ - void accessGhosts(int dims[], void *ptr, int ld[]) const; - - /** - * @param g_a [input] - * @param index - index pointing to location of element - * indexed by subscript[] [output] - * @param subscript[ndim] - array of integers that index desired - * element [input] - * @param ld[ndim-1] - array of strides for local data patch. - * These include ghost cell widths. [output] - * - * This function can be used to return a pointer to any data element - * in the locally held portion of the global array and can be used to - * directly access ghost cell data. The array subscript refers to the - * local index of the element relative to the origin of the local - * patch (which is assumed to be indexed by (0,0,...)). - * This is a local operation. - */ - void accessGhostElement(void *ptr, int subscript[], int ld[]) const; - - /** - * The arrays are aded together elemet-wise: - * [for example: g_c.add(...,g_a, .., g_b);] - * c = alpha * a + beta * b - * The result c may replace one of he input arrays(a/b). - * This is a collective operation. - */ - void add(void *alpha, const GlobalArray * g_a, - void *beta, const GlobalArray * g_b) const; - - - /** - * Patches of arrays (which must have the same number of elements) are - * added together element-wise. - * c[ ][ ] = alpha * a[ ][ ] + beta * b[ ][ ]. - * This is a collective operation. - * @param g_a, g_b, g_c global array [input] - * @param alo[], ahi[] patch of g_a [input] - * @param blo[], bhi[] patch of g_b [input] - * @param clo[], chi[] patch of g_c [input] - * @param alpha, beta scale factors [input] - */ - void addPatch (void *alpha, const GlobalArray * g_a, int alo[], int ahi[], - void *beta, const GlobalArray * g_b, int blo[], int bhi[], - int clo[], int chi[]) const; - - - /** - * @param string - message string [input] - * - * Check that the global array handle g_a is valid ... if not call - * ga_error with the string provided and some more info. - * This operation is local. - */ - void checkHandle(char* string) const; - - /** - * Compares distributions of two global arrays. Returns 0 if - * distributions are identical and 1 when they are not. - * This is a collective operation. - * @param g_a - global array [input] - */ - int compareDistr(const GlobalArray *g_a) const; - - /** - * Copies elements in array represented by g_a into the array - * represented by g_b [say for example: g_b.copy(g_a);]. The arrays must be the same type, shape, - * and identically aligned. - * This is a collective operation. - * @param g_a - global array [input] - */ - void copy(const GlobalArray *g_a) const; - - /** - * Copies elements in a patch of one array (ga) into another one (say for - * example:gb.copyPatch(...,ga,....); ). The patches of arrays may be of - * different shapes but must have the same number of elements. Patches must - * be nonoverlapping (if gb=ga). - * - * trans = 'N' or 'n' means that the transpose operator should not be - * applied. trans = 'T' or 't' means that transpose operator should be - * applied. This is a collective operation. - * @param ga - global array [input] - * @param alo[] - ga patch coordinates [input] - * @param ahi[] - ga patch coordinates [input] - * @param blo[] - gb patch coordinates [input] - * @param bhi[] - gb patch coordinates [input] - */ - void copyPatch(char trans, const GlobalArray* ga, int alo[], int ahi[], - int blo[], int bhi[]) const; - - /** - * Computes element-wise dot product of the two arrays which must be of - * the same types and same number of elements. - * return value = SUM_ij a(i,j)*b(i,j) - * This is a collective operation. - * @param g_a - array handle [input] - */ - double ddot(const GlobalArray * g_a) const; - - /** - * Computes the element-wise dot product of the two (possibly transposed) - * patches which must be of the same type and have the same number of - * elements. - * @param g_a - global array [input] - * @param alo[], ahi[] - g_a patch coordinates [input] - * @param blo[], bhi[] - g_b patch coordinates [input] - * @param ta, tb - transpose flags [input] - */ - double ddotPatch(char ta, int alo[], int ahi[], const GlobalArray * g_a, - char tb, int blo[], int bhi[]) const; - - /** Deallocates the array and frees any associated resources. */ - void destroy() const; - - /** - * Performs one of the matrix-matrix operations: - * [say: g_c.dgemm(..., g_a, g_b,..);] - * - * C := alpha*op( A )*op( B ) + beta*C, \n - * where op( X ) is one of \n - * op( X ) = X or op( X ) = X', \n - * alpha and beta are scalars, and A, B and C are matrices, with op( A ) - * an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. - * On entry, transa specifies the form of op( A ) to be used in the - * matrix multiplication as follows:\n - * ta = 'N' or 'n', op( A ) = A. \n - * ta = 'T' or 't', op( A ) = A'. \n - * This is a collective operation. - * @param g_a,g_b- input arrays [input] - * @param ta, tb - transpose operators [input] - * @param m - number of rows of op(A) and of matrix C [input] - * @param n - number of columns of op(B) and of matrix C [input] - * @param k - number of columns of op(A) and rows of matrix op(B)[input] - * @param alpha, beta - scale factors [input] - */ - void dgemm(char ta, char tb, int m, int n, int k, double alpha, - const GlobalArray *g_a, const GlobalArray *g_b,double beta) const; - - /** - * @param g_s - Metric [input] - * @param g_v - Global matrix to return evecs [output] - * @param eval - Local array to return evals [output] - * - * Solve the generalized eigen-value problem returning all eigen-vectors - * and values in ascending order. The input matrices are not overwritten - * or destroyed. - * This is a collective operation. - */ - void diag(const GlobalArray *g_s, GlobalArray *g_v, void *eval) const; - - /** - * Solve the generalized eigen-value problem returning all eigen-vectors - * and values in ascending order. Recommended for REPEATED calls if g_s - * is unchanged. Values of the control flag: - * - * value action/purpose - * - * 0 indicates first call to the eigensolver - * - * >0 consecutive calls (reuses factored g_s) - * - * <0 only erases factorized g_s; g_v and eval unchanged - * (should be called after previous use if another - * eigenproblem, i.e., different g_a and g_s, is to - * be solved) - * - * The input matrices are not destroyed. - * This is a collective operation. - * @param control - Control flag [input] - * @param g_a - Matrix to diagonalize [input] - * @param g_s - Metric [input] - * @param g_v - Global matrix to return evecs [output] - * @param eval - Local array to return evals [output] - */ - void diagReuse(int control, const GlobalArray *g_s, GlobalArray *g_v, - void *eval) const; - - /** - * Solve the standard (non-generalized) eigenvalue problem returning - * all eigenvectors and values in the ascending order. The input matrix - * is neither overwritten nor destroyed. - * This is a collective operation. - * @param g_v - Global matrix to return evecs [output] - * @param eval - Local array to return evals [output] - */ - void diagStd(GlobalArray *g_v, void *eval) const; - - void diagSeq(const GlobalArray * g_s, const GlobalArray * g_v, - void *eval) const; - - void diagStdSeq(const GlobalArray * g_v, void *eval) const; - - /** - * If no array elements are owned by process 'me', the range is returned - * as lo[]=0 and hi[]=-1 for all dimensions. The operation is local. - * @param iproc - process number [input] - * @param ndim - number of dimensions of the global array - * @param lo[ndim] - array of starting indices for array section[input] - * @param hi[ndim] - array of ending indices for array section [input] - */ - void distribution(int me, int* lo, int* hi) const; - - float fdot(const GlobalArray * g_a) const; - - float fdotPatch(char t_a, int alo[], int ahi[], const GlobalArray * g_b, - char t_b, int blo[], int bhi[]) const; - - /** - * @param value - pointer to the value of appropriate type - * (double/DoubleComplex/long) that matches array type. - * - * Assign a single value to all elements in the array. - * This is a collective operation. - */ - void fill(void *value) const; - - /** - * @param lo[], hi[] patch of g_a [input] - * @param val value to fill [input] - * - * Fill the patch with value of 'val' - * This is a collective operation. - */ - void fillPatch (int lo[], int hi[], void *val) const; - - /** - * Gathers array elements from a global array into a local array. - * The contents of the input arrays (v, subscrArray) are preserved, - * but their contents might be (consistently) shuffled on return. - - * for(k=0; k<= n; k++){ - * - * v[k] = a[subsArray[k][0]][subsArray[k][1]][subsArray[k][2]]...; - * - * } - * - * This is a one-sided operation. - * @param n - number of elements [input] - * @param v[n] - array containing values [input] - * @param subsarray[n][ndim] - array of subscripts for each element [input] - */ - void gather(void *v, int * subsarray[], int n) const; - - /** - * One-side operations. - * Copies data from global array section to the local array buffer. The - * local array is assumed to be have the same number of dimensions as the - * global array. Any detected inconsitencies/errors in the input arguments - * are fatal. - * - * Example: For ga_get operation transfering data from the [10:14,0:4] - * section of 2-dimensional 15x10 global array into local buffer 5x10 - * array we have: lo={10,0}, hi={14,4}, ld={10} - * - * @param lo[ndim] -array of starting indices for global array section[input] - * @param hi[ndim] - array of ending indices for global array section[input] - * @param buf - pointer to the local buffer array where the data goes[output] - * @param ld[ndim-1] - array specifying leading dimensions/strides/extents - * for buffer array [input] - */ - void get(int lo[], int hi[], void *buf, int ld[]) const; - - /** - * This function returns 1 if the global array has some dimensions for - * which the ghost cell width is greater than zero, it returns 0 otherwise. - * This is a collective operation. - */ - int hasGhosts() const; - - /** - * Computes element-wise dot product of the two arrays which must be of - * the same types and same number of elements. - * return value = SUM_ij a(i,j)*b(i,j) - * This is a collective operation. - * @param g_a - array handle [input] - */ - Integer idot(const GlobalArray * g_a) const; - - /** - * @param g_a - global array [input] - * @param alo[], ahi[] - g_a patch coordinates [input] - * @param blo[], bhi[] - g_b patch coordinates [input] - * @param ta, tb - transpose flags [input] - * - * Computes the element-wise dot product of the two (possibly transposed) - * patches which must be of the same type and have the same number of - * elements. - */ - long idotPatch(char ta, int alo[], int ahi[], const GlobalArray * g_a, - char tb, int blo[], int bhi[]) const; - - /** - * Returns data type and dimensions of the array. - * This operation is local. - * @param type - data type [output] - * @param ndim - number of dimensions [output] - * @param dims - array of dimensions [output] - */ - void inquire(int *type, int *ndim, int dims[]) const; - - /** - * Returns the name of an array represented by the handle g_a. - * This operation is local. - */ - char* inquireName() const; - - /** - * Computes element-wise dot product of the two arrays which must be of - * the same types and same number of elements. - * return value = SUM_ij a(i,j)*b(i,j) - * This is a collective operation. - * @param g_a - array handle [input] - */ - long ldot(const GlobalArray * g_a) const; - - /** - * @param g_a - coefficient matrix [input] - * Solves a system of linear equations - * - * A * X = B - * using the Cholesky factorization of an NxN double precision symmetric - * positive definite matrix A (epresented by handle g_a). On successful - * exit B will contain the solution X. - * It returns: - * - * = 0 : successful exit - * - * > 0 : the leading minor of this order is not positive - * definite and the factorization could - * not be completed - * - * This is a collective operation. - */ - int lltSolve(const GlobalArray * g_a) const; - - /** - * Return in owner the GA compute process id that 'owns' the data. If any - * element of subscript[] is out of bounds "-1" is returned. This operation - * is local. - * @param subscript[ndim] element subscript [output] - */ - int locate(int subscript[]) const; - - /** - * Return the list of the GA processes id that 'own' the data. Parts of the - * specified patch might be actually 'owned' by several processes. If lo/hi - * are out of bounds "0" is returned, otherwise return value is equal to the - * number of processes that hold the data. This operation is local. - * - * map[i][0:ndim-1] - lo[i] - * - * map[i][ndim:2*ndim-1] - hi[i] - * - * procs[i] - processor id that owns data in patch - * lo[i]:hi[i] - * - * @param ndim - number of dimensions of the global array - * @param lo[ndim] - array of starting indices for array section[input] - * @param hi[ndim] - array of ending indices for array section [input] - * @param map[][2*ndim] - array with mapping information [output] - * @param procs[nproc] - list of processes that own a part of array - * section[output] - */ - int locateRegion(int lo[], int hi[], int map[], int procs[]) const; - - /** - * @param trans - transpose or not transpose [input] - * @param g_a - coefficient matrix [input] - * - * Solve the system of linear equations op(A)X = B based on the LU - * factorization. - * - * op(A) = A or A' depending on the parameter trans: - * - * trans = 'N' or 'n' means that the transpose operator should not - * be applied. - * - * trans = 'T' or 't' means that the transpose operator should be applied. - * - * Matrix A is a general real matrix. Matrix B contains possibly multiple - * rhs vectors. The array associated with the handle g_b is overwritten - * by the solution matrix X. - * This is a collective operation. - */ - void luSolve(char trans, const GlobalArray * g_a) const; - - /** - * @param g_a, g_b global array [input] - * @param ailo, aihi, ajlo, ajhi patch of g_a [input] - * @param bilo, bihi, bjlo, bjhi patch of g_b [input] - * @param cilo, cihi, cjlo, cjhi patch of g_c [input] - * @param alpha, beta scale factors [input] - * @param transa, transb transpose operators [input] - * - * ga_matmul_patch is a patch version of ga_dgemm: - * - * C[cilo:cihi,cjlo:cjhi] := alpha* AA[ailo:aihi,ajlo:ajhi] * - * BB[bilo:bihi,bjlo:bjhi] ) + - * beta*C[cilo:cihi,cjlo:cjhi], - * - * where AA = op(A), BB = op(B), and op( X ) is one of - * op( X ) = X or op( X ) = X', - * - * Valid values for transpose arguments: 'n', 'N', 't', 'T'. It works - * for both double and DoubleComplex data tape. - * This is a collective operation. - */ - void matmulPatch(char transa, char transb, void* alpha, void *beta, - const GlobalArray *g_a, - int ailo, int aihi, int ajlo, int ajhi, - const GlobalArray *g_b, - int bilo, int bihi, int bjlo, int bjhi, - int cilo, int cihi, int cjlo, int cjhi) const; - - /** - * N-dimensional Arrays: - * @param g_a, g_b global array [input] - * @param alo, ahi array of patch of g_a [input] - * @param blo, bhi array of patch of g_b [input] - * @param clo, chi array of patch of g_c [input] - * @param alpha, beta scale factors [input] - * @param transa, transb transpose operators [input] - * - * nga_matmul_patch is a n-dimensional patch version of ga_dgemm: - * - * C[clo[]:chi[]] := alpha* AA[alo[]:ahi[]] * - * BB[blo[]:bhi[]]) + - * beta*C[clo[]:chi[]], - * - * where AA = op(A), BB = op(B), and op( X ) is one of - * op( X ) = X or op( X ) = X', - * - * Valid values for transpose arguments: 'n', 'N', 't', 'T'. It works - * for both double and DoubleComplex data tape. - * This is a collective operation. - */ - void matmulPatch(char transa, char transb, void* alpha, void *beta, - const GlobalArray *g_a, int *alo, int *ahi, - const GlobalArray *g_b, int *blo, int *bhi, - int *clo, int *chi) const; - - /** - * @param nblock[ndim] - number of partitions for each dimension [output] - * - * Given a distribution of an array represented by the handle g_a, - * returns the number of partitions of each array dimension. - * This operation is local. - */ - void nblock(int numblock[]) const; - - /** - * Returns the number of dimensions in array represented by the handle - * g_a. This operation is local. - */ - int ndim() const; - - /** - * @param ndim - number of dimensions of the global array - * @param lo[ndim] - array of starting indices for array section [input] - * @param hi[ndim] - array of ending indices for array section [input] - * @param buf - pointer to the local buffer array [input] - * @param ld[ndim-1] - array specifying leading - * dimensions/strides/extents for buffer array [input] - * @param double/DoubleComplex/long *alpha scale factor - * - * Same as nga_acc except the indices can extend beyond the array - * boundary/dimensions in which case the library wraps them around. - * This is a one-sided and atomic operation. - */ - void periodicAcc(int lo[], int hi[], void* buf, int ld[], void* alpha) const; - - /** - * @param ndim - number of dimensions of the global array - * @param lo[ndim] - array of starting indices for global array - * section [input] - * @param hi[ndim] - array of ending indices for global array - * section [input] - * @param buf - pointer to the local buffer array where the data - * goes [output] - * @param ld[ndim-1] - array specifying leading dimensions/strides/extents - * for buffer array [input] - * - * Same as nga_get except the indices can extend beyond the array - * boundary/dimensions in which case the library wraps them around. - * This is a one-sided operation. - */ - void periodicGet(int lo[], int hi[], void* buf, int ld[]) const; - - /** - * @param ndim - number of dimensions of the global array - * @param lo[ndim] - array of starting indices for global array - * section [input] - * @param hi[ndim] - array of ending indices for global array - * section [input] - * @param buf - pointer to the local buffer array where the data - * goes [output] - * @param ld[ndim-1] - array specifying leading dimensions/strides/extents - * for buffer array [input] - * - * Same as nga_put except the indices can extend beyond the array - * boundary/dimensions in which case the library wraps them around. - * This is a one-sided operation. - */ - void periodicPut(int lo[], int hi[], void* buf, int ld[]) const; - - /** - * Prints an entire array to the standard output. - * This is a collective operation. - */ - void print() const ; - - /** - * Prints the array distribution. - * This is a collective operation. - */ - void printDistribution() const ; - - /** - * Prints the array distribution to a file. - * This is a collective operation. - */ - void printFile(FILE *file) const; - - /** - * Prints a patch of g_a array to the standard output. If pretty has the - * value 0 then output is printed in a dense fashion. If pretty has the - * value 1 then output is formatted and rows/columns labeled. - - * This is a collective operation. - * @param lo[] - coordinates of the patch [input] - * @param hi[] - coordinates of the patch [input] - * @param int pretty - formatting flag [input] - */ - void printPatch(int* lo, int* hi, int pretty) const; - - /** - * @param ndim number of array dimensions - * @param proc process id [input] - * @param coord[ndim] coordinates in processor grid [output] - * - * Based on the distribution of an array associated with handle g_a, - * determines coordinates of the specified processor in the virtual - * processor grid corresponding to the distribution of array g_a. The - * numbering starts from 0. The values of -1 means that the processor - * doesn't 'own' any section of array represented by g_a. - * This operation is local. - */ - void procTopology(int proc, int coord[]) const; - - /*void procTopology(int proc, int *prow, int *pcol);*/ - - /** - * Copies data from local array buffer to the global array section . The - * local array is assumed to be have the same number of dimensions as the - * global array. Any detected inconsitencies/errors in input arguments are - * fatal. This is a one-sided operation. - * - * @param lo[ndim]-array of starting indices for global array section[input] - * @param hi[ndim]- array of ending indices for global array section [input] - * @param buf - pointer to the local buffer array where the data is [input] - * @param ld[ndim-1]-array specifying leading dimensions/strides/extents for - * @param buffer array [input] - */ - void put(int lo[], int hi[], void *buf, int ld[]) const; - - - /** - * @param ndim - number of dimensions of the global array - * @param subscript[ndim] - subscript array for the referenced element[input] - * - * Atomically read and increment an element in an integer array. - * - * *BEGIN CRITICAL SECTION* - * - * old_value = a(subscript) - * - * a(subscript) += inc - * - * *END CRITICAL SECTION* - * - * return old_value - * - * This is a one-sided and atomic operation. - */ - long readInc(int subscript[], long inc) const; - - /** - * @param ndim - number of dimensions of the global array - * @param lo[ndim] - array of starting indices for array section [input] - * @param hi[ndim] - array of ending indices for array section [input] - * - * Releases access to a global array when the data was read only. - * Your code should look like: - * - * NGA_Distribution(g_a, myproc, lo,hi); - * - * NGA_Access(g_a, lo, hi, &ptr, ld); - * - * - * - * GA_Release(g_a, lo, hi); - * - * NOTE: see restrictions specified for ga_access. - * This operation is local. - */ - void release(int lo[], int hi[]) const; - - /** - * @param ndim - number of dimensions of the global array - * @param lo[ndim] - array of starting indices for array section [input] - * @param hi[ndim] - array of ending indices for array section [input] - * - * Releases access to the data. It must be used if the data was accessed - * for writing. NOTE: see restrictions specified for ga_access. - * This operation is local. - */ - void releaseUpdate(int lo[], int hi[]) const; - - /** - * Scales an array by the constant s. Note that the library is unable - * to detect errors when the pointed value is of different type than - * the array. - * This is a collective operation. - * @param value - pointer to the value of appropriate type - * (double/DoubleComplex/long) that matches array type - */ - void scale(void *value) const; - - /** - * @param lo[], hi[] patch of g_a [input] - * @param val scale factor [input] - * - * Scale an array by the factor 'val'. - * This is a collective operation. - */ - void scalePatch (int lo[], int hi[], void *val) const; - - /** - * Scatters array elements into a global array. The contents of the input - * arrays (v,subscrArray) are preserved, but their contents might be - * (consistently) shuffled on return. - * - * for(k=0; k<= n; k++){ - * - * a[subsArray[k][0]][subsArray[k][1]][subsArray[k][2]]... = v[k]; - * - * } - * This is a one-sided operation. - * @param n - number of elements [input] - * @param v[n] - array containing values [input] - * @param subsarray[n][ndim] - array of subscripts for each element [input] - */ - void scatter(void *v, int *subsarray[], int n) const; - - /** - * @param g_a,g_b- handles to input arrays [input] - * @param g_c - handles to output array [input] - * @param ta, tb - transpose operators [input] - * @param m - number of rows of op(A) and of matrix C [input] - * @param n - number of columns of op(B) and of matrix C [input] - * @param k - number of columns of op(A) and rows of matrix op(B)[input] - * @param alpha, beta - scale factors [input] - * - * Performs one of the matrix-matrix operations: - * - * C := alpha*op( A )*op( B ) + beta*C, - * where op( X ) is one of - * op( X ) = X or op( X ) = X', - * alpha and beta are scalars, and A, B and C are matrices, with op( A ) - * an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. - * On entry, transa specifies the form of op( A ) to be used in the - * matrix multiplication as follows: - * - * ta = 'N' or 'n', op( A ) = A. - * - * ta = 'T' or 't', op( A ) = A'. - * - * This is a collective operation. - */ - void sgemm(char ta, char tb, int m, int n, int k, float alpha, - const GlobalArray *g_a, const GlobalArray *g_b, float beta) const; - - /** - * @param g_a - coefficient matrix [input] - * - * Solves a system of linear equations - * A * X = B - * It first will call the Cholesky factorization routine and, if - * sucessfully, will solve the system with the Cholesky solver. If - * Cholesky will be not be able to factorize A, then it will call the - * LU factorization routine and will solve the system with forward/backward - * substitution. On exit B will contain the solution X. - * It returns - * - * = 0 : Cholesky factoriztion was succesful - * - * > 0 : the leading minor of this order - * is not positive definite, Cholesky factorization - * could not be completed and LU factoriztion was used - * - * This is a collective operation. - */ - int solve(const GlobalArray * g_a) const; - - /** - * It computes the inverse of a double precision using the Cholesky - * factorization of a NxN double precision symmetric positive definite - * matrix A stored in the global array represented by g_a. On successful - * exit, A will contain the inverse. - * It returns - * - * = 0 : successful exit * - - * > 0 : the leading minor of this order is not positive - * definite and the factorization could not be completed - * - * < 0 : it returns the index i of the (i,i) - * element of the factor L/U that is zero and - * the inverse could not be computed - * - * This is a collective operation. - */ - int spdInvert() const; - - /** - * @param op - operator {"min","max"} [input] - * @param val - address where value should be stored [output] - * @param subscript[ndim] - array index for the selected element [output] - * - * Returns the value and index for an element that is selected by the - * specified operator in a global array corresponding to g_a handle. - * This is a collective operation. - */ - void selectElem(char *op, void* val, int index[]) const; - - /** - * Symmmetrizes matrix A with handle A:=.5 * (A+A'). - * This is a collective operation - */ - void symmetrize() const; - - /** - * Transposes a matrix: B = A', where A and B are represented by - * handles g_a and g_b [say, g_b.transpose(g_a);]. This is a collective - * operation. - */ - void transpose(const GlobalArray * g_a) const; - - /** - * This call updates the ghost cell regions on each processor with the - * corresponding neighbor data from other processors. The operation assumes - * that all data is wrapped around using periodic boundary data so that - * ghost cell data that goes beyound an array boundary is wrapped around to - * the other end of the array. The GA_Update_ghosts call contains two - * GA_Sync calls before and after the actual update operation. For some - * applications these calls may be unecessary, if so they can be removed - * using the GA_Mask_sync subroutine. - * This is a collective operation. - */ - void updateGhosts() const; - - /** - * This function can be used to update the ghost cells along individual - * directions. It is designed for algorithms that can overlap updates - * with computation. The variable dimension indicates which coordinate - * direction is to be updated (e.g. dimension = 1 would correspond to the - * y axis in a two or three dimensional system), the variable idir can take - * the values +/-1 and indicates whether the side that is to be updated lies - * in the positive or negative direction, and cflag indicates whether or not - * the corners on the side being updated are to be included in the update. - * The following calls would be equivalent to a call to GA_Update_ghosts - * for a 2-dimensional system: - * - * status = NGA_Update_ghost_dir(g_a,0,-1,1);\n - * status = NGA_Update_ghost_dir(g_a,0,1,1);\n - * status = NGA_Update_ghost_dir(g_a,1,-1,0);\n - * status = NGA_Update_ghost_dir(g_a,1,1,0);\n - * - * The variable cflag is set equal to 1 (or non-zero) in the first two calls so that the corner ghost cells are update, it is set equal to 0 in the second two calls to avoid redundant updates of the corners. Note that updating the ghosts cells using several independent calls to the nga_update_ghost_dir functions is generally not as efficient as using GA_Update_ghosts unless the individual calls can be effectively overlapped with computation. - * This is a collective operation. - * @param g_a [input] - * @param dimension - array dimension that is to be updated [input] - * @param idir - direction of update (+/- 1) [input] - * @param cflag - flag (0/1) to include corners in update [input] - */ - int updateGhostDir(int dimension, int idir, int cflag) const; - - - /** - * Computes element-wise dot product of the two arrays which must be of - * the same types and same number of elements. - * return value = SUM_ij a(i,j)*b(i,j) - * This is a collective operation. - * @param g_a - array handle [input] - */ - DoubleComplex zdot(const GlobalArray * g_a) const; - - /** - * @param g_a - global array [input] - * @param alo[], ahi[] - g_a patch coordinates [input] - * @param blo[], bhi[] - g_b patch coordinates [input] - * @param ta, tb - transpose flags [input] - * - * Computes the element-wise dot product of the two (possibly transposed) - * patches which must be of the same type and have the same number of - * elements. - */ - DoubleComplex zdotPatch(char ta, int alo[], int ahi[], - const GlobalArray * g_a, char tb, int blo[], - int bhi[]) const; - - /** - * Sets value of all elements in the array to zero. - * This is a collective operation. - */ - void zero() const; - - /** - * @param lo[], hi[] [input] - * - * Set all the elements in the patch to zero. - * This is a collective operation. - */ - void zeroPatch (int lo[], int hi[]) const; - - /** - * @param g_a,g_b- handles to input arrays [input] - * @param g_c - handles to output array [input] - * @param ta, tb - transpose operators [input] - * @param m - number of rows of op(A) and of matrix C [input] - * @param n - number of columns of op(B) and of matrix C [input] - * @param k - number of columns of op(A) and rows of matrix op(B)[input] - * @param alpha, beta - scale factors [input] - * - * Performs one of the matrix-matrix operations: - * C := alpha*op( A )*op( B ) + beta*C, - * where op( X ) is one of - * op( X ) = X or op( X ) = X', - * alpha and beta are scalars, and A, B and C are matrices, with op( A ) - * an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. - * On entry, transa specifies the form of op( A ) to be used in the - * matrix multiplication as follows: - * - * ta = 'N' or 'n', op( A ) = A. - * - * ta = 'T' or 't', op( A ) = A'. * - * - * This is a collective operation. - */ - void zgemm(char ta, char tb, int m, int n, int k, DoubleComplex alpha, - const GlobalArray *g_a, const GlobalArray *g_b, - DoubleComplex beta) const; - - - /** - * New additional functionalities from Limin. - */ - /** - * Take element-wise absolute value of the array. - * This is a collective operation. - */ - void absValue() const; - - /** - * Take element-wise absolute value of the patch. - * This is a collective operation. - * @param lo[], hi[] - g_a patch coordinates [input] - */ - void absValuePatch(int *lo, int *hi) const; - - /** - * Add the constant pointed by alpha to each element of the array. - * This is a collective operation. - * @param double/complex/int/long/float *alpha [input] - */ - void addConstant(void* alpha) const; - - /** - * Add the constant pointed by alpha to each element of the patch. - * This is a collective operation. - * @param lo[], hi[] - g_a patch coordinates [input] - * @param double/complex/int/long/float *alpha [input] - */ - void addConstantPatch(int *lo, int *hi, void *alpha) const; - - /** - * Take element-wise reciprocal of the array. - * This is a collective operation. - */ - void recip() const; - - /** - * Take element-wise reciprocal of the patch. - * This is a collective operation. - * @param lo[], hi[] - g_a patch coordinates [input] - */ - void recipPatch(int *lo, int *hi) const; - - - /** - * Computes the element-wise product of the two arrays - * which must be of the same types and same number of - * elements. For two-dimensional arrays, - * - * c(i, j) = a(i,j)*b(i,j) - * - * The result (c) may replace one of the input arrays (a/b). - * This is a collective operation. - * @param g_a, g_b - global array [input] - */ - void elemMultiply(const GlobalArray * g_a, const GlobalArray * g_b) const; - - - /** - * Computes the element-wise product of the two patches - * which must be of the same types and same number of - * elements. For two-dimensional arrays, - * - * c(i, j) = a(i,j)*b(i,j) - * - * The result (c) may replace one of the input arrays (a/b). - * This is a collective operation. - * @param g_a, g_b - global array [input] - * @param alo[], ahi[] - g_a patch coordinates [input] - * @param blo[], bhi[] - g_b patch coordinates [input] - * @param clo[], chi[] - g_c patch coordinates [output] - */ - void elemMultiplyPatch(const GlobalArray * g_a,int *alo,int *ahi, - const GlobalArray * g_b,int *blo,int *bhi, - int *clo,int *chi) const; - /** - * @param g_a, g_b - global array [input] - * - * Computes the element-wise quotient of the two arrays - * which must be of the same types and same number of - * elements. For two-dimensional arrays, - * - * c(i, j) = a(i,j)/b(i,j) - * - * The result (c) may replace one of the input arrays (a/b). If one of - * the elements of array g_b is zero, the quotient for the element of g_c - * will be set to GA_NEGATIVE_INFINITY. - * This is a collective operation. - */ - void elemDivide(const GlobalArray * g_a, const GlobalArray * g_b) const; - - /** - * Computes the element-wise quotient of the two patches - * which must be of the same types and same number of - * elements. For two-dimensional arrays, - * - * c(i, j) = a(i,j)/b(i,j) - * - * The result (c) may replace one of the input arrays (a/b). - * This is a collective operation. - * @param g_a, g_b - global array [input] - * @param alo[], ahi[] - g_a patch coordinates [input] - * @param blo[], bhi[] - g_b patch coordinates [input] - * @param clo[], chi[] - g_c patch coordinates [output] - * - */ - void elemDividePatch(const GlobalArray * g_a,int *alo,int *ahi, - const GlobalArray * g_b,int *blo,int *bhi, - int *clo,int *chi) const; - /** - * Computes the element-wise maximum of the two arrays - * which must be of the same types and same number of - * elements. For two dimensional arrays, - * - * c(i, j) = max{a(i,j), b(i,j)} - * - * The result (c) may replace one of the input arrays (a/b). - * This is a collective operation. - * @param g_a, g_b - global array [input] - */ - void elemMaximum(const GlobalArray * g_a, const GlobalArray * g_b) const; - - - /** - * Computes the element-wise maximum of the two patches - * which must be of the same types and same number of - * elements. For two-dimensional of noncomplex arrays, - * - * c(i, j) = max{a(i,j), b(i,j)} - * - * If the data type is complex, then - * c(i, j).real = max{ |a(i,j)|, |b(i,j)|} while c(i,j).image = 0. - * - * The result (c) may replace one of the input arrays (a/b). - * This is a collective operation. - * @param g_a, g_b - global array [input] - * @param alo[], ahi[] - g_a patch coordinates [input] - * @param blo[], bhi[] - g_b patch coordinates [input] - * @param clo[], chi[] - g_c patch coordinates [output] - */ - void elemMaximumPatch(const GlobalArray * g_a,int *alo,int *ahi, - const GlobalArray * g_b,int *blo,int *bhi, - int *clo,int *chi) const; - /** - * Computes the element-wise minimum of the two arrays - * which must be of the same types and same number of - * elements. For two dimensional arrays, - * - * c(i, j) = min{a(i,j), b(i,j)} - * - * The result (c) may replace one of the input arrays (a/b). - * This is a collective operation. - * @param g_a, g_b - global array [input] - */ - void elemMinimum(const GlobalArray * g_a, const GlobalArray * g_b) const; - - /** - * Computes the element-wise minimum of the two patches - * which must be of the same types and same number of - * elements. For two-dimensional of noncomplex arrays, - * - * c(i, j) = min{a(i,j), b(i,j)} - * - * If the data type is complex, then - * c(i, j).real = min{ |a(i,j)|, |b(i,j)|} while c(i,j).image = 0. - * - * The result (c) may replace one of the input arrays (a/b). - * This is a collective operation. - * @param g_a, g_b - global array [input] - * @param alo[], ahi[] - g_a patch coordinates [input] - * @param blo[], bhi[] - g_b patch coordinates [input] - * @param clo[], chi[] - g_c patch coordinates [output] - */ - void elemMinimumPatch(const GlobalArray * g_a,int *alo,int *ahi, - const GlobalArray * g_b,int *blo,int *bhi, - int *clo,int *chi) const; - - /** - * Calculates the largest multiple of a vector g_b that can be added - * to this vector g_a while keeping each element of this vector - * nonnegative. - * This is a collective operation. - * @param g_a, g_b - global array where g_b is the step direction.[input] - * @param step - the maximum step [output] - */ - void stepMax(const GlobalArray * g_a, double *step) const; - - void stepMaxPatch(int *alo, int *ahi, - const GlobalArray * g_b, int *blo, int *bhi, - double *step) const; - - /** Matrix Operations */ - - /** - * Adds this constant to the diagonal elements of the matrix. - * This is a collective operation. - * @param double/complex/int/long/float *c [input] - */ - void shiftDiagonal(void *c) const; - - /** - * Sets the diagonal elements of this matrix g_a with the elements of the - * vector g_v. This is a collective operation. - * @param g_v - global array [input] - */ - void setDiagonal(const GlobalArray * g_v) const; - - /** - * Sets the diagonal elements of this matrix g_a with zeros. - * This is a collective operation. - */ - void zeroDiagonal() const; - - /** - * Adds the elements of the vector g_v to the diagonal of this matrix g_a. - * This is a collective operation. - * @param g_v - global array [input] - */ - void addDiagonal(const GlobalArray * g_v) const; - - /** - * Inserts the diagonal elements of this matrix g_a into the vector g_v. - * This is a collective operation. - * @param g_v - global array [input] - */ - void getDiagonal(const GlobalArray * g_a) const; - - /** - * Scales the rows of this matrix g_a using the vector g_v. - * This is a collective operation. - * @param g_v - global array [input] - */ - void scaleRows(const GlobalArray * g_v) const; - - /** - * Scales the columns of this matrix g_a using the vector g_v. - * This is a collective operation. - * @param g_v - global array [input] - */ - void scaleCols(const GlobalArray * g_v) const; - - /** - * Computes the 1-norm of the matrix or vector g_a. - * This is a collective operation. - * @param nm - matrix/vector 1-norm value - */ - void norm1(double *nm) const; - - /** - * Computes the 1-norm of the matrix or vector g_a. - * This is a collective operation. - * @param nm - matrix/vector 1-norm value - */ - void normInfinity(double *nm) const; - - /** - * Computes the componentwise Median of three arrays g_a, g_b, and g_c, and - * stores the result in this array g_m. The result (m) may replace one of the - * input arrays (a/b/c). This is a collective operation. - * @param g_a, g_b, g_c- global array [input] - */ - void median(const GlobalArray * g_a, const GlobalArray * g_b, - const GlobalArray * g_c) const; - - /** - * Computes the componentwise Median of three patches g_a, g_b, and g_c, and - * stores the result in this patch g_m. The result (m) may replace one of the - * input patches (a/b/c). This is a collective operation. - * @param g_a, g_b, g_c - global array [input] - * @param alo[], ahi[] - g_a patch coordinates [input] - * @param blo[], bhi[] - g_b patch coordinates [input] - * @param clo[], chi[] - g_c patch coordinates [intput] - * @param mlo[], mhi[] - g_m patch coordinates [output] - */ - void medianPatch(const GlobalArray * g_a, int *alo, int *ahi, - const GlobalArray * g_b, int *blo, int *bhi, - const GlobalArray * g_c, int *clo, int *chi, - int *mlo, int *mhi) const; - - GlobalArray& operator=(const GlobalArray &g_a); - int operator==(const GlobalArray &g_a) const; - int operator!=(const GlobalArray &g_a) const; - - private: - int mHandle; /* g_a handle */ -}; - -#endif // _GLOBALARRAY_H diff --git a/cca/ga_cca_classic/Makefile b/cca/ga_cca_classic/Makefile deleted file mode 100644 index cc0689c23..000000000 --- a/cca/ga_cca_classic/Makefile +++ /dev/null @@ -1,118 +0,0 @@ -# CCAFE_HOME= To be set as an Environment variable: -# Eg: export CCAFE_HOME=/msrc/proj/cca/manoj/dccafe/cxx - -# Here we must contend with two kinds of .o file -- -# those built normally and those built for use in -# dynamic libraries. To that end we delete .o files -# built for DL as soon as we've converted them to .so. - -zeroth: first - -EG_ROOT=$(CCA_ROOT)/cxx/eg - -include $(CCAFE_HOME)/Makefile.Rules -include $(CCAFE_HOME)/MakeIncl.CCAFE_Component -include $(EG_ROOT)/ccaComponent/MakeIncl.EGComponent - -#OTHERFLAGS = -D_DBG_ILOOP -D_DBG_PLATE1 - -ifdef LINUX64_QUADRICS - EXTRA_LIBS = -lshmem -lelan3 -lelan -lpthread -endif - -GA_INC = -I$(GA_HOME)/include -GA_LIBS = -L$(GA_HOME)/lib/$(TARGET) -L$(MPI_LIB) -ldl -lc -lglobal\ - -lma -llinalg -larmci -ltcgmsg-mpi $(LIBMPI)\ - -lg2c -lnsl $(EXTRA_LIBS) -lm - -LOCALFLAGS= $(OTHERFLAGS) $(EG_INC) $(GA_INC) -I$(MPI_INCLUDE) -I. -WRAPPERFLAGS= -w -WRAPPERFLAGS= - -SHARED_FLAGS = -fpic - -%.o : %.C - $(CXX) -c $(SHARED_FLAGS) $(CXXFLAGS) $< - -%.o : %.cxx - $(CXX) -c $(SHARED_FLAGS) $(CXXFLAGS) $(LOCALFLAGS) $< - -%.i : %.cxx - -$(RM) $@ - $(CXX) $(CPPFLAGS) $(LOCALFLAGS) $< | egrep -v ^$$ |egrep -v '^# ' > $@ - - -%.class : %.java - $(JAVAC) $(JAVAFLAGS) $< -%.h : %.class - $(JAVAH) $(JAVAHFLAGS) $(<:%.class=%) - -JAVASRC = - -# dl stuff that is not ccaffeine dependent -CXXDLSTRICTSRC= \ -GAServices.cxx - -# dl stuff that is ccaffeine dependent -CXXDLLOOSESRC= - -# static link stuff -CXXDLNONESRC= - -# stuff built with or without a parallel environment. -CXXSRC = $(CXXDLSTRICTSRC) $(CXXDLLOOSESRC) $(CXXDLNONESRC) - -NEWSRC = \ -GlobalArray.cxx \ -overload.cxx \ -GAServices_DADF.cxx \ -GA_DADFArray.cxx \ -DADFAxisInfo.cxx \ -DADFTemplate.cxx \ -DADFDescriptor.cxx \ -DADFRegionInfo.cxx - -CXXOBJ = $(CXXSRC:%.cxx=%.o) -CXXDSSOBJ = $(CXXDLSTRICTSRC:%.cxx=%.so) -CXXDLSOBJ = $(CXXDLLOOSESRC:%.cxx=%.so) -NEWOBJ = $(NEWSRC:%.cxx=%.o) - -$(CXXDSSOBJ) : %$(CCAFE_SHLIB_SUFFIX) : %.cxx allobj - -$(RM) $@ $*.o - #$(CCAFE_HOME)/dc/component/genDLWrapperStrict $* > $*_wrapper.cxx - #$(CCAFE_HOME)/dc/component/genDLIndex $@ create_$* $* > $*.cca - $(CXX) -c $(CXXFLAGS) $(LOCALFLAGS) $(CCAFE_SHLIB_CFLAGS) $< - $(CXX) -c $(CXXFLAGS) $(LOCALFLAGS) $(CCAFE_SHLIB_CFLAGS) \ - $(WRAPPERFLAGS) $*_wrapper.cxx - $(CCAFE_SHLIB_LD) $(CCAFE_SHLIB_LDFLAGS) -o $@ $*.o $*_wrapper.o \ - $(NEWOBJ) $(GA_LIBS) - -$(RM) $*.o $(NEWOBJ) $*_wrapper.o - -LIB= libcomponent.a - -first all: lib dllib - @echo done $(CXXOBJ) $(JAVACLS) $(JNIHDR) - -lib : allobj - ar $(ARFLAGS) $(LIB) $(CXXOBJ) $(NEWOBJ) - $(RANLIB) $(LIB) - -dllib: $(CXXDSSOBJ) $(CXXDLSOBJ) - -allobj : $(CXXOBJ) $(JAVACLS) $(JNIHDR) $(NEWOBJ) - -new: $(NEWOBJ) - -$(CXXOBJ) : $(CXXSRC) - - -clean: - $(RM) -f $(CXXOBJ) $(LIB) $(JAVACLS) $(JNIHDR) $(NEWOBJ) *.so *_wrapper.o *.i *~ - -depend: - $(DEPEND) $(CXXFLAGS) $(LOCALFLAGS) $(CXXSRC) $(NEWSRC) - -depend-sys: - $(DEPENDSYS) $(CXXFLAGS) $(LOCALFLAGS) $(CXXSRC) $(NEWSRC) - -#include Makefile.depends diff --git a/cca/ga_cca_classic/TestComponent/Makefile b/cca/ga_cca_classic/TestComponent/Makefile deleted file mode 100644 index 0ff17b947..000000000 --- a/cca/ga_cca_classic/TestComponent/Makefile +++ /dev/null @@ -1,111 +0,0 @@ -# CCAFE_HOME= To be set as an Environment variable: -# Eg: export CCAFE_HOME=/msrc/proj/cca/manoj/dccafe/cxx - -# Here we must contend with two kinds of .o file -- -# those built normally and those built for use in -# dynamic libraries. To that end we delete .o files -# built for DL as soon as we've converted them to .so. - -zeroth: first - -EG_ROOT=$(CCA_ROOT)/cxx/eg - -include $(CCAFE_HOME)/Makefile.Rules -include $(CCAFE_HOME)/MakeIncl.CCAFE_Component -include $(EG_ROOT)/ccaComponent/MakeIncl.EGComponent - -#OTHERFLAGS = -D_DBG_ILOOP -D_DBG_PLATE1 - -ifdef LINUX64_QUADRICS - EXTRA_LIBS = -lshmem -lelan3 -lelan -lpthread -endif - -GA_INC = -I$(GA_HOME)/include -GA_LIBS = -L$(GA_HOME)/lib/$(TARGET) -L$(MPI_LIB) -ldl -lc -lglobal\ - -lma -llinalg -larmci -ltcgmsg-mpi $(LIBMPI)\ - -lg2c -lnsl $(EXTRA_LIBS) -lm - -LOCALFLAGS= $(OTHERFLAGS) $(EG_INC) $(GA_INC) -I$(MPI_INCLUDE) -I. -WRAPPERFLAGS= -w -WRAPPERFLAGS= - -SHARED_FLAGS = -fpic - -%.o : %.C - $(CXX) -c $(SHARED_FLAGS) $(CXXFLAGS) $< - -%.o : %.cxx - $(CXX) -c $(SHARED_FLAGS) $(CXXFLAGS) $(LOCALFLAGS) $< - -%.i : %.cxx - -$(RM) $@ - $(CXX) $(CPPFLAGS) $(LOCALFLAGS) $< | egrep -v ^$$ |egrep -v '^# ' > $@ - - -%.class : %.java - $(JAVAC) $(JAVAFLAGS) $< -%.h : %.class - $(JAVAH) $(JAVAHFLAGS) $(<:%.class=%) - -JAVASRC = - -# dl stuff that is not ccaffeine dependent -CXXDLSTRICTSRC= \ -TestComponent.cxx - -# dl stuff that is ccaffeine dependent -CXXDLLOOSESRC= - -# static link stuff -CXXDLNONESRC= - -# stuff built with or without a parallel environment. -CXXSRC = $(CXXDLSTRICTSRC) $(CXXDLLOOSESRC) $(CXXDLNONESRC) - -NEWSRC = - - -CXXOBJ = $(CXXSRC:%.cxx=%.o) -CXXDSSOBJ = $(CXXDLSTRICTSRC:%.cxx=%.so) -CXXDLSOBJ = $(CXXDLLOOSESRC:%.cxx=%.so) -NEWOBJ = $(NEWSRC:%.cxx=%.o) - -$(CXXDSSOBJ) : %$(CCAFE_SHLIB_SUFFIX) : %.cxx - -$(RM) $@ $*.o - #$(CCAFE_HOME)/dc/component/genDLWrapperStrict $* > $*_wrapper.cxx - #$(CCAFE_HOME)/dc/component/genDLIndex $@ create_$* $* > $*.cca - $(CXX) -c $(CXXFLAGS) $(LOCALFLAGS) $(CCAFE_SHLIB_CFLAGS) $< - $(CXX) -c $(CXXFLAGS) $(LOCALFLAGS) $(CCAFE_SHLIB_CFLAGS) \ - $(WRAPPERFLAGS) $*_wrapper.cxx - $(CCAFE_SHLIB_LD) $(CCAFE_SHLIB_LDFLAGS) -o $@ $*.o $*_wrapper.o \ - $(NEWOBJ) $(GA_LIBS) - -$(RM) $*.o $(NEWOBJ) $*_wrapper.o - -LIB= libcomponent.a - -first all: lib dllib - @echo done $(CXXOBJ) $(JAVACLS) $(JNIHDR) - -lib : allobj - ar $(ARFLAGS) $(LIB) $(CXXOBJ) $(NEWOBJ) - $(RANLIB) $(LIB) - -dllib: $(CXXDSSOBJ) $(CXXDLSOBJ) - -allobj : $(CXXOBJ) $(JAVACLS) $(JNIHDR) $(NEWOBJ) - -new: $(NEWOBJ) - -$(CXXOBJ) : $(CXXSRC) - - -clean: - $(RM) -f $(CXXOBJ) $(LIB) $(JAVACLS) $(JNIHDR) $(NEWOBJ) *.so *_wrapper.o *.i *~ - -depend: - $(DEPEND) $(CXXFLAGS) $(LOCALFLAGS) $(CXXSRC) $(NEWSRC) - -depend-sys: - $(DEPENDSYS) $(CXXFLAGS) $(LOCALFLAGS) $(CXXSRC) $(NEWSRC) - -#include Makefile.depends diff --git a/cca/ga_cca_classic/TestComponent/TestComponent.cca b/cca/ga_cca_classic/TestComponent/TestComponent.cca deleted file mode 100644 index efb5ba094..000000000 --- a/cca/ga_cca_classic/TestComponent/TestComponent.cca +++ /dev/null @@ -1,23 +0,0 @@ -# generated CCAFFEINE dynamic library index. -# The ! lines should not contain whitespace before the = -# unless they begin with something other than date,builder,location. -# There may be more than 1 library in a .cca file, in which case -# last ! meta data seen wins for following library and component info. -# Note that if you want to change the user's "apparent" class name -# seen in the UI, you can change the name following the constructor function -# name in this file. -# N.B.: -# This is the exceedingly-poor-mans version of some of the info -# we'd want from a repository. We'd also want a list of ports -# that must be known (dependencies), a list of ports provided or -# at least potentially provided, parallel fabric expectations, -# version knowledge, and the usual ton of other metadata. -# Finally, we'd like to get all this info directly from the -# library itself (perhaps by filtering the source or by running -# 'strings' over the libraries. -!date=Fri Jan 31 16:29:59 PST 2003 -!builder=manoj@ -!location=/msrc/home/manoj/GlobalArray/GA-Component/Version5/TestComponent -TestComponent.so -create_TestComponent TestComponent - diff --git a/cca/ga_cca_classic/TestComponent/TestComponent.cxx b/cca/ga_cca_classic/TestComponent/TestComponent.cxx deleted file mode 100644 index a32951989..000000000 --- a/cca/ga_cca_classic/TestComponent/TestComponent.cxx +++ /dev/null @@ -1,277 +0,0 @@ -#include -#include -#include - -#include -#include -#include -#include "jc++/jc++.h" -#include "jc++/util/jc++util.h" -#include "parameters/parametersStar.h" -#include "util/IO.h" - -#include "../gacca.h" -#include "TestComponent.h" - - -using std::cout; -using std::printf; -using std::sqrt; - -#define N 100 -#define DIM 2 -#define GA_DATA_TYPE MT_F_DBL - -#define CHECKERR(err) if (err < 0) cerr<<"Line "<<__LINE__<<": Error # " << err << endl; - -void doWork(GA::GAClassicPort *ga_port); -void doWork_DADF(GA::GAClassicPort * ga_port, - classic::gov::cca::DistArrayTemplFactoryPort * templFactory, - classic::gov::cca::DistArrayDescrFactoryPort * descrFactory); - - -TestComponent::TestComponent() { - svc = 0; -} - -TestComponent::~TestComponent() { - svc = 0; -} - -void -TestComponent::setServices(classic::gov::cca::Services *cc) { - - svc = cc; - - // Contact the PrintfService - classic::gov::cca::PortInfo* pinfo = cc->createPortInfo("pSvc", "gov.cca.JPrintfService", 0); - cc->registerUsesPort(pinfo); - pinfo = 0; - pfp = dynamic_cast(cc->getPort("pSvc")); - CHECKDC(pfp); - if(pfp == 0) { - cc->addProvidesPort(this, cc->createPortInfo("DEAD=NoJPrintf", "classic::gov::cca::GoPort", 0)); - ::printf("!!! No JPrintfService available from framework."); - return; - } - - // register "GA" uses port - pinfo = svc->createPortInfo("ga_classic_port", "GA::GAClassicPort",0); - svc->registerUsesPort(pinfo); - pinfo = 0; - // register "GA" uses port - pinfo = svc->createPortInfo("TemplateFactory", - "DistArrayTemplFactoryPort",0); - svc->registerUsesPort(pinfo); - pinfo = 0; - // register "GA" uses port - pinfo = svc->createPortInfo("DescriptorFactory", - "DistArrayDescrFactoryPort",0); - svc->registerUsesPort(pinfo); - pinfo = 0; - - // Provides "Go" Port - pinfo = svc->createPortInfo("go", "classic::gov::cca::GoPort",0); - svc->addProvidesPort(this, pinfo); - pinfo = 0; -} - - -int -TestComponent::go() { - classic::gov::cca::Port *port = 0, *pTemplate = 0, *pDescr = 0; - - /* get the ga classic port */ - port = svc->getPort("ga_classic_port"); - if (pfp && port == 0) { - pfp->en("TestComponent::go(): ga_clasic_port not apparently connected"); - } - /* get the ga dadf template port */ - pTemplate = svc->getPort("TemplateFactory"); - if (pfp && pTemplate == 0) { - pfp->en("TestComponent::go(): TemplateFactory not apparently connected"); - } - /* get the ga dadf descriptor port */ - pDescr = svc->getPort("DescriptorFactory"); - if (pfp && pDescr == 0) { - pfp->en("TestComponent::go(): DescriptorFactory not apparently connected"); - } - - /* type-casting */ - GA::GAClassicPort *ga_port; - ga_port = dynamic_cast < GA::GAClassicPort *> (port); - if(ga_port == 0) { - if (pfp) { - pfp->en("BSTest::go(): ga_classic_port not castable to correct type!"); - } - return -1; - } - /* type-casting */ - classic::gov::cca::DistArrayTemplFactoryPort * templFactory; - templFactory = dynamic_cast < classic::gov::cca::DistArrayTemplFactoryPort *> (pTemplate); - if(templFactory == 0) { - if (pfp) { - pfp->en("BSTest::go(): TemplateFactory not castable to correct type!"); - } - return -1; - } - /* type-casting */ - classic::gov::cca::DistArrayDescrFactoryPort * descrFactory; - descrFactory = dynamic_cast < classic::gov::cca::DistArrayDescrFactoryPort *> (pTemplate); - if(descrFactory == 0) { - if (pfp) { - pfp->en("BSTest::go(): DescriptorFactory not castable to correct type!"); - } - return -1; - } - - - /****************************************/ - - int me = ga_port->nodeid(); - int nproc = ga_port->nodes(); - int len; char proc_name[MPI_MAX_PROCESSOR_NAME]; - - MPI_Get_processor_name(proc_name, &len); - cout << proc_name << " : Rank = " << me << " : Size = " << nproc << "\n"; - - cout << "\n---------------------------------------------------\n"; - cout << " TESTING GA CLASSIC COMPONENT\n"; - cout << "---------------------------------------------------\n\n"; - doWork(ga_port); - cout << "After doWork()\n"; - - cout << "\n---------------------------------------------------\n"; - cout << " TESTING GA DADF COMPONENT\n"; - cout << "---------------------------------------------------\n\n"; - doWork_DADF(ga_port, templFactory, descrFactory); - cout << "After doWork_DADF()\n"; - - /****************************************/ - - cout << "\nReleasing Port(s) ...\n"; - svc->releasePort("ga_classic_port"); - svc->releasePort("TemplateFactory"); - svc->releasePort("DescriptorFactory"); - - return 0; - -} - -void doWork(GA::GAClassicPort *ga_port) { - - int ONE=1;/* useful constants */ - int n=N, type=MT_F_DBL; - int me=ga_port->nodeid(), nproc=ga_port->nodes(); - int i, row; - int dims[2]={N,N}; - int lo[2], hi[2]; - - /* Note: on all current platforms DoublePrecision == double */ - double buf[N], err, alpha, beta; - - if(me==0)printf("size = %d\n", nproc); - - if(me==0)printf("Creating matrix A\n"); - GA::GlobalArray *g_a = ga_port->createGA(type, 2, dims, "A", NULL); - if(me==0)printf("OK\n"); - - if(me==0)printf("Creating matrix B\n"); - /* create matrix B so that it has dims and distribution of A*/ - GA::GlobalArray *g_b = ga_port->createGA(g_a, "B"); - if(me==0)printf("OK\n"); - - g_a->zero(); /* zero the matrix */ - - if(me==0)printf("Initializing matrix A\n"); - /* fill in matrix A with random values in range 0.. 1 */ - lo[1]=0; hi[1]=n-1; - for(row=me; rowput(lo, hi, buf, &n); - } - - - if(me==0)printf("Symmetrizing matrix A\n"); - g_a->symmetrize(); /* symmetrize the matrix A = 0.5*(A+A') */ - - - /* check if A is symmetric */ - if(me==0)printf("Checking if matrix A is symmetric\n"); - g_a->transpose(g_b); /* B=A' */ - alpha=1.; beta=-1.; - g_b->add(&alpha, g_a, &beta, g_b); /* B= A - B */ - err= g_b->ddot(g_b); - - if(me==0)printf("Error=%lf\n",(double)err); - - if(me==0)printf("\nChecking atomic accumulate \n"); - - g_a->zero(); /* zero the matrix */ - for(i=0; iacc(lo, hi, buf, &ONE, &alpha ); - ga_port->sync(); - - if(me==0){ /* node 0 is checking the result */ - - g_a->get(lo, hi, buf,&ONE); - for(i=0; ierror("failed: column=",i); - printf("OK\n\n"); - - } - - g_a->destroy(); - g_b->destroy(); - -} - -void doWork_DADF(GA::GAClassicPort * ga_port, - classic::gov::cca::DistArrayTemplFactoryPort * templFactory, - classic::gov::cca::DistArrayDescrFactoryPort * descrFactory) { - - int i; - int me=ga_port->nodeid();// nproc=ga_port->nodes(); - int lo[DIM] = {0, 0}; - int hi[DIM] = {N, N}; - int chunk[DIM] = {2,2}; - int topology[DIM] = {2, 2}; - DistArrayTemplate * templ; - DistArray * darr; - - /********* Creating Template ********/ - if(!me) cout << "\nCreating a DADF Template:\n\n"; - templ = templFactory->createTemplate( "one" ); - DistArrayTemplate::DistType dist[DIM] ; - - for(i=0; isetRank(DIM); CHECKERR(ierr); - ierr = templ->setGlobalBounds(lo, hi); CHECKERR(ierr); - ierr = templ->setProcTopology(topology); CHECKERR(ierr); - ierr = templ->setDistType(dist); CHECKERR(ierr); - for(i=0; isetDistParameters(i, chunk[i], i); CHECKERR(ierr); - } - templ->commit(); - templ->printTemplate(); - - /** Create Distributed Array **/ - if(!me) cout << "\n\nCreating a GA Style Distributed array:\n\n"; - darr = descrFactory->createArray("My_Array"); - ierr = darr->setDataType(DistArray::stv_Double); CHECKERR(ierr); - ierr = darr->setTemplate(templ); CHECKERR(ierr); - ierr = darr->setIdentityAlignmentMap(); CHECKERR(ierr); - ierr = darr->commit(); CHECKERR(ierr); - darr->printArrayDistribution(); - - templFactory->destroyTemplate(templ); - descrFactory->destroyArray(darr); -} diff --git a/cca/ga_cca_classic/TestComponent/TestComponent.h b/cca/ga_cca_classic/TestComponent/TestComponent.h deleted file mode 100644 index 2d09890b2..000000000 --- a/cca/ga_cca_classic/TestComponent/TestComponent.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _TESTCOMPONENT_H -#define _TESTCOMPONENT_H - -class TestComponent : public virtual classic::gov::cca::Component, - public virtual classic::gov::cca::GoPort { - -public: - TestComponent(); - ~TestComponent(); - - // Component interface: - virtual void setServices(classic::gov::cca::Services *cc); - - // GoPort interface: - virtual int go(); - -private: - classic::gov::cca::Services *svc; - classic::gov::cca::JPrintfPort *pfp; -}; - - -#endif // _TESTCOMPONENT_H diff --git a/cca/ga_cca_classic/TestComponent/TestComponent_wrapper.cxx b/cca/ga_cca_classic/TestComponent/TestComponent_wrapper.cxx deleted file mode 100644 index d1bcc1d5e..000000000 --- a/cca/ga_cca_classic/TestComponent/TestComponent_wrapper.cxx +++ /dev/null @@ -1,25 +0,0 @@ -// This is a generated file. Do not commit to CVS. -#include -#include -#include -#include "TestComponent.h" - -extern "C" { - -classic::gov::cca::Component *create_TestComponent() { - classic::gov::cca::Component *wanker; - TestComponent *component; - component = new TestComponent(); - wanker = dynamic_cast(component); - return wanker; -} - -char **getComponentList() { - static char *list[2]; - list[0] = "create_TestComponent TestComponent"; - list[1] = 0; - return list; -} - -} -static char id[]="$Id: TestComponent_wrapper.cxx,v 1.1 2003-08-01 00:41:02 manoj Exp $"; diff --git a/cca/ga_cca_classic/gacca.h b/cca/ga_cca_classic/gacca.h deleted file mode 100644 index 6a636e650..000000000 --- a/cca/ga_cca_classic/gacca.h +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef _GAPP_H -#define _GAPP_H - -#if defined(__cplusplus) || defined(c_plusplus) - -#include -#include -#include -#include "ga.h" -#include "macdecls.h" -#include - - -// CCAFFEINE Includes -#include -#include -#include -#include "jc++/jc++.h" -#include "jc++/lang/jc++lang.h" -#include "jc++/util/jc++util.h" -#include "parameters/parametersStar.h" -#include "util/IO.h" - -// DADF Includes -#include "DistArrayTemplFactoryPort.h" -#include "DistArrayDescrFactoryPort.h" - - -#define _GA_USENAMESPACE_ 1 - -#if _GA_USENAMESPACE_ -#define _GA_STATIC_ -#define _GA_EXTERN_ extern -#else -#define _GA_STATIC_ static -#define _GA_EXTERN_ -#endif - -#if _GA_USENAMESPACE_ -namespace GA { -#else -class GA { - public: -#endif - class GAClassicPort; - class GAServices; - -#include "GlobalArray.h" -#include "GAClassicPort.h" -#include "GAServices.h" - - //GAServices SERVICES; - -#if ! _GA_USENAMESPACE_ - private: - GA() { } -#endif -}; - -#endif // _GAPP_H -#endif diff --git a/cca/ga_cca_classic/overload.cxx b/cca/ga_cca_classic/overload.cxx deleted file mode 100644 index 73784fd78..000000000 --- a/cca/ga_cca_classic/overload.cxx +++ /dev/null @@ -1,92 +0,0 @@ -#include "gacca.h" - -#ifdef FALSE -#undef FALSE -#endif -#ifdef TRUE -#undef TRUE -#endif -#define FALSE 0 -#define TRUE 1 - -/** - * More operator overloading stuff (a lot!!) to come. - */ - -GA::GlobalArray& -GA::GlobalArray::operator=(const GA::GlobalArray &g_a) { - - if(this != &g_a) { - GA_Destroy(mHandle); - - mHandle = GA_Duplicate(g_a.mHandle, g_a.inquireName()); - if(!mHandle) GA_Error((char *)" GA creation failed",0); - - GA_Copy(g_a.mHandle, mHandle); - } - return *this; -} - -int -GA::GlobalArray::operator==(const GA::GlobalArray &g_a) const { - - long isEqual = TRUE; - - int i, type1, type2, ndim1, ndim2, dims1[GA_MAX_DIM], dims2[GA_MAX_DIM]; - int alo[GA_MAX_DIM], ahi[GA_MAX_DIM], blo[GA_MAX_DIM], bhi[GA_MAX_DIM]; - - NGA_Inquire(mHandle, &type1, &ndim1, dims1); - NGA_Inquire(g_a.mHandle, &type2, &ndim2, dims2); - if(type1 != type2) isEqual = FALSE; // check type - if(GA_Compare_distr(mHandle, g_a.mHandle)) isEqual = FALSE; - NGA_Distribution(mHandle, GA_Nodeid(), alo, ahi); - NGA_Distribution(g_a.mHandle, GA_Nodeid(), blo, bhi); - if(ahi[0] != bhi[0]) isEqual = FALSE; // check process owns data? - - if(ahi[0] >= 0) { // true => process owns data - void *ptr1 = NULL, *ptr2 = NULL; - int ld1[GA_MAX_DIM]; - int ld2[GA_MAX_DIM]; - int num = 0; - - NGA_Access(mHandle, alo, ahi, &ptr1, ld1); - NGA_Access(g_a.mHandle, blo, bhi, &ptr2, ld2); - - // number of elements I own. - for(i=0; i,$>:-m64> # Make default integers 64-bit for Fortran $<$,$>:-i8> - $<$,$>:-fdefault-integer-8> + $<$,$>:-fdefault-integer-8> ) set( ILP64_COMPILE_OPTIONS "${ILP64_COMPILE_OPTIONS}" CACHE STRING "ILP64 compile options" FORCE ) @@ -19,7 +19,7 @@ foreach (lang C CXX Fortran) if ( CMAKE_Fortran_COMPILER_ID STREQUAL Intel OR CMAKE_Fortran_COMPILER_ID STREQUAL PGI ) list( APPEND ILP64_${lang}_COMPILE_OPTIONS -i8 ) endif() - if ( CMAKE_Fortran_COMPILER_ID STREQUAL GNU OR CMAKE_Fortran_COMPILER_ID STREQUAL Flang ) + if ( CMAKE_Fortran_COMPILER_ID STREQUAL GNU OR CMAKE_Fortran_COMPILER_ID STREQUAL Flang OR CMAKE_Fortran_COMPILER_ID STREQUAL LLVMFlang ) list( APPEND ILP64_${lang}_COMPILE_OPTIONS -fdefault-integer-8 ) endif() endif() diff --git a/cmake/linalg-modules/FindOpenBLAS.cmake b/cmake/linalg-modules/FindOpenBLAS.cmake index c4e3c4b8f..eed630ef5 100644 --- a/cmake/linalg-modules/FindOpenBLAS.cmake +++ b/cmake/linalg-modules/FindOpenBLAS.cmake @@ -4,13 +4,15 @@ if( "ilp64" IN_LIST OpenBLAS_FIND_COMPONENTS AND "lp64" IN_LIST OpenBLAS_FIND_CO endif() if( OpenBLAS_PREFERS_STATIC ) - set( OpenBLAS_LIBRARY_NAME "libopenblas.a" ) + set( OpenBLAS_LP64_LIBRARY_NAME "libopenblas.a" ) + set( OpenBLAS_ILP64_LIBRARY_NAME "libopenblas_64.a" ) else() - set( OpenBLAS_LIBRARY_NAME "openblas" ) + set( OpenBLAS_LP64_LIBRARY_NAME "openblas" ) + set( OpenBLAS_ILP64_LIBRARY_NAME "openblas_64" ) endif() find_library( OpenBLAS_LIBRARIES - NAMES ${OpenBLAS_LIBRARY_NAME} + NAMES ${OpenBLAS_LP64_LIBRARY_NAME} ${OpenBLAS_ILP64_LIBRARY_NAME} HINTS ${OpenBLAS_PREFIX} PATHS ${OpenBLAS_LIBRARY_DIR} ${CMAKE_C_IMPLICIT_LINK_DIRECTORIES} PATH_SUFFIXES lib lib64 lib32 @@ -21,10 +23,10 @@ find_path( OpenBLAS_INCLUDE_DIR NAMES openblas_config.h HINTS ${OpenBLAS_PREFIX} PATHS ${OpenBLAS_INCLUDE_DIR} - PATH_SUFFIXES include + PATH_SUFFIXES include include/openblas include/openblas64 DOC "OpenBLAS header" ) - + #if( OpenBLAS_LIBRARY AND OpenBLAS_PREFERS_STATIC ) # include( CMakeFindDependency ) # find_package( Threads QUIET ) diff --git a/cmake/linalg-modules/FindStandardFortran.cmake b/cmake/linalg-modules/FindStandardFortran.cmake index c58db6aa3..9ab46dd4d 100644 --- a/cmake/linalg-modules/FindStandardFortran.cmake +++ b/cmake/linalg-modules/FindStandardFortran.cmake @@ -20,6 +20,8 @@ if(CMAKE_Fortran_COMPILER_ID MATCHES "GNU") set(STANDARDFORTRAN_LIBS gfortran) elseif(CMAKE_Fortran_COMPILER_ID MATCHES "Intel") set(STANDARDFORTRAN_LIBS ifcore) +elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") + set(STANDARDFORTRAN_LIBS FortranRuntime FortranDecimal) elseif(CMAKE_Fortran_COMPILER_ID MATCHES "Flang") set(STANDARDFORTRAN_LIBS flang flangrti pgmath) #CMAKE_Fortran_COMPILER_ID does not give "ArmFlang" @@ -31,10 +33,10 @@ else() endif() foreach(STANDARDFORTRAN_LIB ${STANDARDFORTRAN_LIBS}) - set(STANDARDFORTRAN_LIB_NAME - lib${STANDARDFORTRAN_LIB}${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(STANDARDFORTRAN_LIB_NAMES + lib${STANDARDFORTRAN_LIB}${CMAKE_SHARED_LIBRARY_SUFFIX} lib${STANDARDFORTRAN_LIB}.a) find_library(${STANDARDFORTRAN_LIB}_LIBRARY - ${STANDARDFORTRAN_LIB_NAME} + ${STANDARDFORTRAN_LIB_NAMES} HINTS ${CMAKE_Fortran_IMPLICIT_LINK_DIRECTORIES} ) list(APPEND STANDARDFORTRAN_LIBRARIES ${${STANDARDFORTRAN_LIB}_LIBRARY}) diff --git a/cmake/linalg-modules/FindTBB.cmake b/cmake/linalg-modules/FindTBB.cmake index 3ee5b0b15..cc965d291 100644 --- a/cmake/linalg-modules/FindTBB.cmake +++ b/cmake/linalg-modules/FindTBB.cmake @@ -290,8 +290,6 @@ if (WIN32 AND MSVC) # for each prefix path, add ia32/64\${COMPILER_PREFIX}\lib to the lib search path foreach (dir IN LISTS TBB_PREFIX_PATH) if (CMAKE_CL_64) - list(APPEND TBB_LIB_SEARCH_PATH ${dir}/ia64/${COMPILER_PREFIX}/lib) - list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/ia64/${COMPILER_PREFIX}) list(APPEND TBB_LIB_SEARCH_PATH ${dir}/intel64/${COMPILER_PREFIX}/lib) list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/intel64/${COMPILER_PREFIX}) else () diff --git a/cmake/linalg-modules/util/CommonFunctions.cmake b/cmake/linalg-modules/util/CommonFunctions.cmake index 953cd1f22..f9f8d696a 100644 --- a/cmake/linalg-modules/util/CommonFunctions.cmake +++ b/cmake/linalg-modules/util/CommonFunctions.cmake @@ -169,7 +169,7 @@ function( append_possibly_missing_libs _linker_test __compile_output _orig_libs set( _tmp_libs ) # Check for missing Fortran symbols - if( ${__compile_output} MATCHES "fortran" OR ${__compile_output} MATCHES "f90_" ) + if( ${__compile_output} MATCHES "fortran" OR ${__compile_output} MATCHES "Fortran" OR ${__compile_output} MATCHES "f90_" ) message( STATUS " * Missing Standard Fortran Libs - Adding to ${_linker_test} linker" ) # Check for Standard Fortran Libraries diff --git a/cmake/mafdecls.fh.in b/cmake/mafdecls.fh.in index 0f4ded0df..e90cf9473 100644 --- a/cmake/mafdecls.fh.in +++ b/cmake/mafdecls.fh.in @@ -124,60 +124,36 @@ ! variables ! -#ifdef HPUX -# define HP_SHARED_COMMON_ -#endif ! common blocks #ifdef INTEL_64ALIGN !DIR$ ATTRIBUTES ALIGN : 64 :: mbc_byte -#endif -#ifdef HP_SHARED_COMMON -*$HP$ shared_common /mbc_byte/ #endif common /mbc_byte/ byte_mb(2) character*1 byte_mb #ifdef INTEL_64ALIGN !DIR$ ATTRIBUTES ALIGN : 64 :: mbc_int -#endif -#ifdef HP_SHARED_COMMON -*$HP$ shared_common /mbc_int/ #endif common /mbc_int/ int_mb(2) integer int_mb -#ifdef HP_SHARED_COMMON -*$HP$ shared_common /mbc_log/ -#endif common /mbc_log/ log_mb(2) logical log_mb #ifdef INTEL_64ALIGN !DIR$ ATTRIBUTES ALIGN : 64 :: mbc_real -#endif -#ifdef HP_SHARED_COMMON -*$HP$ shared_common /mbc_real/ #endif common /mbc_real/ real_mb(2) real real_mb #ifdef INTEL_64ALIGN !DIR$ ATTRIBUTES ALIGN : 64 :: mbc_dbl -#endif -#ifdef HP_SHARED_COMMON -*$HP$ shared_common /mbc_dbl/ #endif common /mbc_dbl/ dbl_mb(2) double precision dbl_mb #ifdef INTEL_64ALIGN !DIR$ ATTRIBUTES ALIGN : 64 :: mbc_scpl -#endif -#ifdef HP_SHARED_COMMON -*$HP$ shared_common /mbc_scpl/ #endif common /mbc_scpl/ scpl_mb(2) complex scpl_mb #ifdef INTEL_64ALIGN !DIR$ ATTRIBUTES ALIGN : 64 :: mbc_dcpl -#endif -#ifdef HP_SHARED_COMMON -*$HP$ shared_common /mbc_dcpl/ #endif common /mbc_dcpl/ dcpl_mb(2) double complex dcpl_mb diff --git a/cmx/CMakeLists.txt b/cmx/CMakeLists.txt index 55f1219d3..3e2ea9c89 100644 --- a/cmx/CMakeLists.txt +++ b/cmx/CMakeLists.txt @@ -424,9 +424,11 @@ set(CMX_DEVICE_HEADERS install (FILES ${CMX_DEVICE_HEADERS} - DESTINATION include + DESTINATION include/ga ) +list (APPEND GA_HEADER_PATHS ${CMAKE_CURRENT_LIST_DIR}/src-common) + # ------------------------------------------------------------- # CMX library installation # ------------------------------------------------------------- @@ -455,9 +457,13 @@ set(ARMCI_DEVICE_HEADERS install (FILES ${ARMCI_DEVICE_HEADERS} - DESTINATION include + DESTINATION include/ga ) +list (APPEND GA_HEADER_PATHS ${CMAKE_CURRENT_LIST_DIR}/src-armci) + +set (GA_HEADER_PATHS ${GA_HEADER_PATHS} PARENT_SCOPE) + # ------------------------------------------------------------- # ARMCI library installation # ------------------------------------------------------------- diff --git a/cmx/src-armci/armci.h b/cmx/src-armci/armci.h index c10fd6120..c28e94870 100644 --- a/cmx/src-armci/armci.h +++ b/cmx/src-armci/armci.h @@ -240,15 +240,6 @@ extern int armci_domain_my_id(armci_domain_t domain); extern int armci_domain_count(armci_domain_t domain); extern int armci_domain_same_id(armci_domain_t domain, int proc); - -/* PVM group - * On CrayT3E: the default group is the global group which is (char *)NULL - * It is the only working group. - * On Workstations: the default group is "mp_working_group". User can set - * the group name by calling the ARMCI_PVM_init (defined - * in message.c) and passing the group name to the library. - */ - extern char *mp_group_name; /*********************stuff for non-blocking API******************************/ diff --git a/cmx/src-armci/testing/testwrap.c b/cmx/src-armci/testing/testwrap.c index a3d8a0c00..2820722e3 100644 --- a/cmx/src-armci/testing/testwrap.c +++ b/cmx/src-armci/testing/testwrap.c @@ -61,12 +61,7 @@ extern void armci_unlockmem(void); #define MAXPROC 128 #define TIMES 100 -#ifdef CRAY -# define ELEMS 800 -#else # define ELEMS 200 -#endif - /***************************** macros ************************/ #define COPY(src, dst, bytes) memcpy((dst),(src),(bytes)) @@ -78,56 +73,6 @@ extern void armci_unlockmem(void); int me, nproc; int work[MAXPROC]; /* work array for propagating addresses */ - - -#ifdef PVM -void pvm_init(int argc, char *argv[]) -{ - int mytid, mygid, ctid[MAXPROC]; - int np, i; - - mytid = pvm_mytid(); - if ((argc != 2) && (argc != 1)) { - goto usage; - } - if (argc == 1) { - np = 1; - } - if (argc == 2) { - if ((np = atoi(argv[1])) < 1) { - goto usage; - } - } - if (np > MAXPROC) { - goto usage; - } - - mygid = pvm_joingroup(MPGROUP); - - if (np > 1) { - if (mygid == 0) { - i = pvm_spawn(argv[0], argv + 1, 0, "", np - 1, ctid); - } - } - - while (pvm_gsize(MPGROUP) < np) { - sleep(1); - } - - /* sync */ - pvm_barrier(MPGROUP, np); - - printf("PVM initialization done!\n"); - - return; - -usage: - fprintf(stderr, "usage: %s \n", argv[0]); - pvm_exit(); - exit(-1); -} -#endif - /*\ generate random range for a section of multidimensional array \*/ void get_range(int ndim, int dims[], int lo[], int hi[]) diff --git a/cmx/src-common/acc.h b/cmx/src-common/acc.h index 6e15a73f8..7e076b21a 100644 --- a/cmx/src-common/acc.h +++ b/cmx/src-common/acc.h @@ -56,7 +56,7 @@ static inline void _scale( MUL_##WHICH(iterator[m], value[m], calc_scale); \ } \ } else -#if HAVE_BLAS +#if 0 //HAVE_BLAS SCALE_BLAS(CMX_ACC_DBL, double, D) SCALE_BLAS(CMX_ACC_FLT, float, S) SCALE(REG, CMX_ACC_INT, int) diff --git a/cmx/src-mpi-pr/cmx_impl.h b/cmx/src-mpi-pr/cmx_impl.h index 37a703921..4c35cc0ae 100644 --- a/cmx/src-mpi-pr/cmx_impl.h +++ b/cmx/src-mpi-pr/cmx_impl.h @@ -12,7 +12,7 @@ #define CMX_STATIC_BUFFER_SIZE (2u*1048576u) #define SHM_NAME_SIZE 31 #define UNLOCKED -1 /* performance or correctness related settings */ -#if defined(__bgq__) || defined(__bgp__) +#if 0 #define ENABLE_UNNAMED_SEM 1 #else #define ENABLE_UNNAMED_SEM 0 diff --git a/cmx/src-mpi-pr/groups.c b/cmx/src-mpi-pr/groups.c index 1af85b980..f0b6ac84a 100644 --- a/cmx/src-mpi-pr/groups.c +++ b/cmx/src-mpi-pr/groups.c @@ -6,16 +6,6 @@ #include -#if defined(__bgp__) -#include -#include -#include -#elif defined(__bgq__) -# include -#elif defined(__CRAYXT) || defined(__CRAYXE) -# include -#endif - #include "cmx.h" #include "cmx_impl.h" #include "groups.h" @@ -599,46 +589,6 @@ void cmx_group_finalize() static long xgethostid() { -#if defined(__bgp__) -#warning BGP - long nodeid; - int matched,midplane,nodecard,computecard; - char rack_row,rack_col; - char location[128]; - char location_clean[128]; - (void) memset(location, '\0', 128); - (void) memset(location_clean, '\0', 128); - _BGP_Personality_t personality; - Kernel_GetPersonality(&personality, sizeof(personality)); - BGP_Personality_getLocationString(&personality, location); - matched = sscanf(location, "R%c%c-M%1d-N%2d-J%2d", - &rack_row, &rack_col, &midplane, &nodecard, &computecard); - assert(matched == 5); - sprintf(location_clean, "%2d%02d%1d%02d%02d", - (int)rack_row, (int)rack_col, midplane, nodecard, computecard); - nodeid = atol(location_clean); -#elif defined(__bgq__) -#warning BGQ - int nodeid; - MPIX_Hardware_t hw; - MPIX_Hardware(&hw); - - nodeid = hw.Coords[0] * hw.Size[1] * hw.Size[2] * hw.Size[3] * hw.Size[4] - + hw.Coords[1] * hw.Size[2] * hw.Size[3] * hw.Size[4] - + hw.Coords[2] * hw.Size[3] * hw.Size[4] - + hw.Coords[3] * hw.Size[4] - + hw.Coords[4]; -#elif defined(__CRAYXT) || defined(__CRAYXE) -#warning CRAY - int nodeid; -# if defined(__CRAYXT) - PMI_Portals_get_nid(g_state.rank, &nodeid); -# elif defined(__CRAYXE) - PMI_Get_nid(g_state.rank, &nodeid); -# endif -#else long nodeid = gethostid(); -#endif - return nodeid; } diff --git a/cmx/testing/test.c b/cmx/testing/test.c index 2c9a158fd..e8a32fd23 100644 --- a/cmx/testing/test.c +++ b/cmx/testing/test.c @@ -51,11 +51,7 @@ #define MAXPROC 128 #define TIMES 100 -#ifdef CRAY -# define ELEMS 800 -#else # define ELEMS 200 -#endif /***************************** macros ************************/ @@ -120,54 +116,6 @@ static double timer() } -#ifdef PVM -void pvm_init(int argc, char *argv[]) -{ - int mytid, mygid, ctid[MAXPROC]; - int np, i; - - mytid = pvm_mytid(); - if ((argc != 2) && (argc != 1)) { - goto usage; - } - if (argc == 1) { - np = 1; - } - if (argc == 2) { - if ((np = atoi(argv[1])) < 1) { - goto usage; - } - } - if (np > MAXPROC) { - goto usage; - } - - mygid = pvm_joingroup(MPGROUP); - - if (np > 1) { - if (mygid == 0) { - i = pvm_spawn(argv[0], argv + 1, 0, "", np - 1, ctid); - } - } - - while (pvm_gsize(MPGROUP) < np) { - sleep(1); - } - - /* sync */ - pvm_barrier(MPGROUP, np); - - printf("PVM initialization done!\n"); - - return; - -usage: - fprintf(stderr, "usage: %s \n", argv[0]); - pvm_exit(); - exit(-1); -} -#endif - /*\ generate random range for a section of multidimensional array \*/ void get_range(int ndim, int dims[], int lo[], int hi[]) diff --git a/comex/CMakeLists.txt b/comex/CMakeLists.txt index f8b0e6a9f..ece560300 100644 --- a/comex/CMakeLists.txt +++ b/comex/CMakeLists.txt @@ -103,6 +103,9 @@ install (FILES DESTINATION include/ga ) +list (APPEND GA_HEADER_PATHS ${CMAKE_CURRENT_LIST_DIR}/src-armci ${CMAKE_CURRENT_LIST_DIR}/src-common) +set (GA_HEADER_PATHS ${GA_HEADER_PATHS} PARENT_SCOPE) + # ------------------------------------------------------------- # ARMCI and COMEX library installation # ------------------------------------------------------------- diff --git a/comex/Makefile.am b/comex/Makefile.am index d5f1ea74e..9f37ec539 100644 --- a/comex/Makefile.am +++ b/comex/Makefile.am @@ -101,13 +101,6 @@ if COMEX_NETWORK_PORTALS4 include $(top_srcdir)/src-portals4/Makefile.inc endif -############################################################################## -# src-dmapp -# -if COMEX_NETWORK_DMAPP -include $(top_srcdir)/src-dmapp/Makefile.inc -endif - ############################################################################## # src-ofa # diff --git a/comex/cmake/config.h.in b/comex/cmake/config.h.in index 65b84665f..17027da37 100644 --- a/comex/cmake/config.h.in +++ b/comex/cmake/config.h.in @@ -33,7 +33,6 @@ #cmakedefine01 HAVE_SYS_WEAK_ALIAS_PRAGMA #cmakedefine NDEBUG -#cmakedefine __CRAYXE #cmakedefine01 ENABLE_SYSV diff --git a/comex/configure.ac b/comex/configure.ac index 71f6f8fd9..fafbabf1a 100644 --- a/comex/configure.ac +++ b/comex/configure.ac @@ -62,7 +62,7 @@ AS_CASE([$enable_mpi_tests], AM_CONDITIONAL([CROSS_COMPILING], [test "x$cross_compiling" = xyes]) -# Establish the underlying network infrastructure (MPI, OFA, DMAPP, etc) +# Establish the underlying network infrastructure (MPI, OFA, etc) COMEX_NETWORK_SETUP # Checks for C header files. diff --git a/comex/m4/comex_ar.m4 b/comex/m4/comex_ar.m4 index 329f43f49..d5b757112 100644 --- a/comex/m4/comex_ar.m4 +++ b/comex/m4/comex_ar.m4 @@ -7,14 +7,9 @@ # # Known archivers: # ar - all known systems -# sxar - special to NEC/NEC64 # AC_DEFUN([COMEX_AR], [ AC_ARG_VAR([AR], [archiver used by libtool (default: ar)]) AC_ARG_VAR([AR_FLAGS], [archiver flags used by libtool (default: cru)]) AC_ARG_VAR([RANLIB], [generates index to archive (default: ranlib)]) -AS_IF([test "x$AR" = x], - [AS_CASE([$comex_cv_target], [NEC|NEC64], [AR=sxar])]) -AS_IF([test "x$RANLIB" = x], - [AS_CASE([$comex_cv_target], [NEC|NEC64], [RANLIB=true])]) ])dnl diff --git a/comex/m4/comex_blas.m4 b/comex/m4/comex_blas.m4 index 10e681bd7..c56213608 100644 --- a/comex/m4/comex_blas.m4 +++ b/comex/m4/comex_blas.m4 @@ -281,26 +281,6 @@ AS_IF([test $comex_blas_ok = no], LIBS="$comex_save_LIBS"]) AC_MSG_RESULT([$comex_blas_ok])]) -# SCSL library (SCSL stands for SGI/Cray Scientific Library) -AS_IF([test $comex_blas_ok = no], - [AC_MSG_CHECKING([for BLAS in SGI/Cray Scientific Library]) - # add -lscs to BLAS_LIBS if missing from LIBS - AS_CASE([$LIBS], [*scs*], [], [BLAS_LIBS="-lscs"]) - LIBS="$BLAS_LIBS $LIBS" - COMEX_RUN_BLAS_TEST() - LIBS="$comex_save_LIBS" - AC_MSG_RESULT([$comex_blas_ok])]) - -# SGIMATH library -AS_IF([test $comex_blas_ok = no], - [AC_MSG_CHECKING([for BLAS in SGIMATH library]) - # add -lcomplib.sgimath to BLAS_LIBS if missing from LIBS - AS_CASE([$LIBS], [*complib.sgimath*], [], [BLAS_LIBS="-lcomplib.sgimath"]) - LIBS="$BLAS_LIBS $LIBS" - COMEX_RUN_BLAS_TEST() - LIBS="$comex_save_LIBS" - AC_MSG_RESULT([$comex_blas_ok])]) - # IBM ESSL library (might require generic BLAS lib, too) AS_IF([test $comex_blas_ok = no], [AC_MSG_CHECKING([for BLAS in IBM ESSL library]) diff --git a/comex/m4/comex_mpi_unwrap.m4 b/comex/m4/comex_mpi_unwrap.m4 index 61d0a700d..1e87b7e78 100644 --- a/comex/m4/comex_mpi_unwrap.m4 +++ b/comex/m4/comex_mpi_unwrap.m4 @@ -76,7 +76,7 @@ AC_LANG_CASE( ], [C++], [AS_CASE([$wrapped], [*_r], [compilers="bgxlC_r xlC_r"], - [*], [compilers="icpc pgCC pathCC sxc++ xlC bgxlC openCC sunCC crayc++ g++ c++ gpp aCC cxx cc++ cl.exe FCC KCC RCC CC"]) + [*], [compilers="icpc pgCC pathCC sxc++ xlC bgxlC openCC sunCC craycxx g++ c++ gpp aCC cxx cc++ cl.exe FCC KCC RCC CC"]) ], [Fortran 77], [AS_CASE([$wrapped], [*_r], [compilers="bgxlf95_r xlf95_r bgxlf90_r xlf90_r bgxlf_r xlf_r"], diff --git a/comex/m4/comex_mpicc.m4 b/comex/m4/comex_mpicc.m4 index 1b96d023c..7d5f4133c 100644 --- a/comex/m4/comex_mpicc.m4 +++ b/comex/m4/comex_mpicc.m4 @@ -4,18 +4,13 @@ # # Known C compilers # cc generic compiler name -# ccc Fujitsu ?? old Cray ?? # cl -# ecc Intel on IA64 ?? # gcc GNU # icc Intel -# bgxlc Intel on BG/P -# bgxlc_r Intel on BG/P, thread safe # xlc Intel # xlc_r Intel, thread safe # pgcc Portland Group # pathcc PathScale -# sxcc NEC SX # fcc Fujitsu # opencc AMD's x86 open64 # suncc Sun's Studio @@ -28,7 +23,6 @@ # hcc # mpxlc_r # mpxlc -# sxmpicc NEC SX # mpifcc Fujitsu # mpgcc # mpcc @@ -37,9 +31,6 @@ # AC_DEFUN([COMEX_PROG_MPICC], [AC_ARG_VAR([MPICC], [MPI C compiler]) -AS_CASE([$comex_cv_target_base], -[BGP], [comex_mpicc_pref=mpixlc_r; comex_cc_pref=bgxlc_r], -[]) # In the case of using MPI wrappers, set CC=MPICC since CC will override # absolutely everything in our list of compilers. # Save CC, just in case. diff --git a/comex/m4/comex_network_setup.m4 b/comex/m4/comex_network_setup.m4 index 13ba4c9b2..9fdb617d4 100644 --- a/comex/m4/comex_network_setup.m4 +++ b/comex/m4/comex_network_setup.m4 @@ -175,35 +175,6 @@ AS_IF([test "x$happy" = xyes], [$2]) ])dnl -# _COMEX_NETWORK_DMAPP([ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) -# ------------------------------------------------------------------ -# TODO when dmapp headers and libraries become available, fix this -AC_DEFUN([_COMEX_NETWORK_DMAPP], [ -AC_MSG_NOTICE([searching for DMAPP...]) -happy=yes -AS_IF([test "x$happy" = xyes], - [AC_CHECK_HEADER([dmapp.h], [], [happy=no])]) -AS_IF([test "x$happy" = xyes], - [AC_SEARCH_LIBS([gethugepagesize], [hugetlbfs], - [AC_DEFINE([HAVE_LIBHUGETLBFS], [1], - [Define to 1 if you have the `hugetlbfs' library.])], - [AC_DEFINE([HAVE_LIBHUGETLBFS], [0], - [Define to 1 if you have the `hugetlbfs' library.])]) - AS_CASE([$ac_cv_search_gethugepagesize], - ["none required"], [], - [no], [], - [# add missing lib to COMEX_NETWORK_LIBS if not there - AS_CASE([$COMEX_NETWORK_LIBS], - [*$ac_cv_search_gethugepagesize*], [], - [COMEX_NETWORK_LIBS="$COMEX_NETWORK_LIBS $ac_cv_search_gethugepagesize"])]) - AC_CHECK_TYPES([dmapp_lock_desc_t], [], [], [[#include ]]) - AC_CHECK_TYPES([dmapp_lock_handle_t], [], [], [[#include ]]) - ]) -AS_IF([test "x$happy" = xyes], - [comex_network=DMAPP; with_dmapp=yes; $1], - [$2]) -])dnl - # _COMEX_NETWORK_OFI([ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) # ------------------------------------------------------------------- AC_DEFUN([_COMEX_NETWORK_OFI], [ @@ -259,7 +230,6 @@ _COMEX_NETWORK_WITH([mpi-pr], [MPI-1 two-sided with progress rank]) _COMEX_NETWORK_WITH([mpi3], [MPI-3 one-sided]) _COMEX_NETWORK_WITH([ofa], [Infiniband OpenIB]) _COMEX_NETWORK_WITH([portals4], [Portals4]) -_COMEX_NETWORK_WITH([dmapp], [Cray DMAPP]) _COMEX_NETWORK_WITH([ofi], [OFI]) # Temporarily add COMEX_NETWORK_CPPFLAGS to CPPFLAGS. comex_save_CPPFLAGS="$CPPFLAGS"; CPPFLAGS="$CPPFLAGS $COMEX_NETWORK_CPPFLAGS" @@ -273,8 +243,6 @@ AS_IF([test "x$enable_autodetect" = xyes], [_COMEX_NETWORK_OFA()]) AS_IF([test "x$comex_network" = x && test "x$with_portals4" != xno], [_COMEX_NETWORK_PORTALS4()]) - AS_IF([test "x$comex_network" = x && test "x$with_dmapp" != xno], - [_COMEX_NETWORK_DMAPP()]) AS_IF([test "x$comex_network" = x && test "x$with_ofi" != xno], [_COMEX_NETWORK_OFI()]) AS_IF([test "x$comex_network" = x], @@ -308,9 +276,6 @@ AS_IF([test "x$enable_autodetect" = xyes], AS_IF([test "x$comex_network" = xPORTALS4], [_COMEX_NETWORK_PORTALS4([], [AC_MSG_ERROR([test for COMEX_NETWORK=PORTALS4 failed])])]) - AS_IF([test "x$comex_network" = xDMAPP], - [_COMEX_NETWORK_DMAPP([], - [AC_MSG_ERROR([test for COMEX_NETWORK=DMAPP failed])])]) AS_IF([test "x$comex_network" = xOFI], [_COMEX_NETWORK_OFI([], [AC_MSG_ERROR([test for COMEX_NETWORK=OFI failed])])]) @@ -324,7 +289,6 @@ AS_IF([test "x$enable_autodetect" = xyes], _COMEX_NETWORK_WARN([mpi3]) _COMEX_NETWORK_WARN([ofa]) _COMEX_NETWORK_WARN([portals4]) - _COMEX_NETWORK_WARN([dmapp]) _COMEX_NETWORK_WARN([ofi]) AC_MSG_ERROR([please select only one comex network])])]) # Remove COMEX_NETWORK_CPPFLAGS from CPPFLAGS. @@ -340,7 +304,6 @@ _COMEX_NETWORK_AM_CONDITIONAL([mpi-pr]) _COMEX_NETWORK_AM_CONDITIONAL([mpi3]) _COMEX_NETWORK_AM_CONDITIONAL([ofa]) _COMEX_NETWORK_AM_CONDITIONAL([portals4]) -_COMEX_NETWORK_AM_CONDITIONAL([dmapp]) _COMEX_NETWORK_AM_CONDITIONAL([ofi]) _COMEX_NETWORK_AC_DEFINE([mpi-ts]) _COMEX_NETWORK_AC_DEFINE([mpi-mt]) @@ -349,7 +312,6 @@ _COMEX_NETWORK_AC_DEFINE([mpi-pr]) _COMEX_NETWORK_AC_DEFINE([mpi3]) _COMEX_NETWORK_AC_DEFINE([ofa]) _COMEX_NETWORK_AC_DEFINE([portals4]) -_COMEX_NETWORK_AC_DEFINE([dmapp]) _COMEX_NETWORK_AC_DEFINE([ofi]) AC_SUBST([COMEX_NETWORK_LDFLAGS]) AC_SUBST([COMEX_NETWORK_LIBS]) diff --git a/comex/src-armci/armci.h b/comex/src-armci/armci.h index da5ed1300..5faf8e905 100644 --- a/comex/src-armci/armci.h +++ b/comex/src-armci/armci.h @@ -241,15 +241,6 @@ extern int armci_domain_my_id(armci_domain_t domain); extern int armci_domain_count(armci_domain_t domain); extern int armci_domain_same_id(armci_domain_t domain, int proc); - -/* PVM group - * On CrayT3E: the default group is the global group which is (char *)NULL - * It is the only working group. - * On Workstations: the default group is "mp_working_group". User can set - * the group name by calling the ARMCI_PVM_init (defined - * in message.c) and passing the group name to the library. - */ - extern char *mp_group_name; /*********************stuff for non-blocking API******************************/ diff --git a/comex/src-common/acc.h b/comex/src-common/acc.h index 4d3433845..7ad7112bf 100644 --- a/comex/src-common/acc.h +++ b/comex/src-common/acc.h @@ -77,7 +77,7 @@ static inline void _scale( MUL_##WHICH(iterator[m], value[m], calc_scale); \ } \ } else -#if HAVE_BLAS +#if 0 // HAVE_BLAS SCALE_BLAS(COMEX_ACC_DBL, double, D) SCALE_BLAS(COMEX_ACC_FLT, float, S) SCALE(REG, COMEX_ACC_INT, int) diff --git a/comex/src-dmapp/Makefile.inc b/comex/src-dmapp/Makefile.inc deleted file mode 100644 index baa7e1d10..000000000 --- a/comex/src-dmapp/Makefile.inc +++ /dev/null @@ -1,10 +0,0 @@ -libcomex_la_SOURCES += src-dmapp/comex.c -libcomex_la_SOURCES += src-dmapp/comex_impl.h -#libcomex_la_SOURCES += src-dmapp/clusterinfo.c -#libcomex_la_SOURCES += src-dmapp/clusterinfo.h -libcomex_la_SOURCES += src-dmapp/groups.c -libcomex_la_SOURCES += src-dmapp/groups.h -libcomex_la_SOURCES += src-dmapp/reg_cache.c -libcomex_la_SOURCES += src-dmapp/reg_cache.h - -AM_CPPFLAGS += -I$(top_srcdir)/src-dmapp diff --git a/comex/src-dmapp/clusterinfo.c b/comex/src-dmapp/clusterinfo.c deleted file mode 100644 index 4c2bc7f3e..000000000 --- a/comex/src-dmapp/clusterinfo.c +++ /dev/null @@ -1,477 +0,0 @@ -/****************************************************************************** - * file: clusterinfo.c - * purpose: Determine cluster info i.e., number of machines and processes - * running on each of them. - * - *******************************************************************************/ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include -#include -#include -#include -#include - -#include "clusterinfo.h" -#include "message.h" - -/* NO_SHMEM enables to simulate cluster environment on a single workstation. - * Must define NO_SHMMAX_SEARCH in shmem.c to prevent depleting shared memory - * due to a gready shmem request by the master process on cluster node 0. */ -#define armci_enable_alpha_hack() 1 - -#define ARMCI_TAG 30000 -#define DEBUG 0 -#define MAX_HOSTNAME 80 -#define CHECK_NODE_NAMES -#define CLUSTER 1 /* until I remove all non CLUSTER code */ - -/* print info on how many cluster nodes detected */ -#define PRINT_CLUSTER_INFO 1 - -static const char *network_protocol="DMAPP"; - -/* stores cluster configuration. - * Initialized before user threads are created and then read-only */ -armci_clus_t *armci_clus_info; -int armci_nclus; -int armci_clus_me; -int armci_master; -int armci_clus_first; -int armci_clus_last; - -#ifdef HITACHI -#include -# define GETHOSTNAME sr_gethostname -ndes_t _armci_group; - -static int sr_gethostname(char *name, int len) -{ - int no; - pid_t ppid; - - if(hmpp_nself (&_armci_group,&no,&ppid,0,NULL) <0) - return -1; - - if(len<6)armci_die("len too small",len); - if(no>1024)armci_die("expected node id <1024",no); - sprintf(name,"n%d",no); - return 0; -} -#elif defined(SGIALTIX) -# define GETHOSTNAME altix_gethostname -static int altix_gethostname(char *name, int len) { - sprintf(name,"altix"); - return 0; -} -#elif defined(CRAY_XT) /* && !defined(PORTALS) */ -#define GETHOSTNAME cnos_gethostname -static int cnos_gethostname(char *name, int len) -{ - int size,rank; - size=PMI_Get_rank(&rank); - sprintf(name,"%d",rank); -} -#else -# define GETHOSTNAME gethostname -#endif - -static char* merge_names(char *name) -{ - int jump = 1, rem, to, from; - int lenmes, lenbuf, curlen, totbuflen= armci_nproc*HOSTNAME_LEN; - int len = strlen(name); - char *work = malloc(totbuflen); - - if(!work)armci_die("armci: merge_names: malloc failed: ",totbuflen); - - strcpy(work, name); - curlen = len+1; - - /* prefix tree merges names in the order of process numbering in log(P)time - * result = name_1//name_2//...//name_P-1 - */ - do { - jump *= 2; rem = armci_me%jump; - if(rem){ - to = armci_me - rem; - armci_msg_snd(ARMCI_TAG, work, curlen, to); - break; - }else{ - from = armci_me + jump/2; - if(from < armci_nproc){ - lenbuf = totbuflen - curlen; - armci_msg_rcv(ARMCI_TAG, work+curlen, lenbuf, &lenmes, from); - curlen += lenmes; - } - } - }while (jump < armci_nproc); - return(work); -} - - -static void process_hostlist(char *names) -{ -#ifdef CLUSTER - - int i, cluster=0; - char *s,*master; - int len, root=0; - - /******** inspect list of machine names to determine locality ********/ - if (armci_me==0){ - - /* first find out how many cluster nodes we got */ - armci_nclus =1; s=master=names; - for(i=1; i < armci_nproc; i++){ - s += strlen(s)+1; - if(strcmp(s,master)){ - /* we found a new machine name on the list */ - master = s; - armci_nclus++; - /*fprintf(stderr,"new name %s len =%d\n",master, strlen(master));*/ - - } - } - - /* allocate memory */ - armci_clus_info = (armci_clus_t*)malloc(armci_nclus*sizeof(armci_clus_t)); - if(!armci_clus_info) - armci_die("malloc failed for clusinfo",armci_nclus); - - /* fill the data structure -- go through the list again */ - s=names; - master="*-"; /* impossible hostname */ - cluster =0; - for(i=0; i < armci_nproc; i++){ - if(strcmp(s,master)){ - /* we found a new machine name on the list */ - master = s; - armci_clus_info[cluster].nslave=1; - armci_clus_info[cluster].master=i; - strcpy(armci_clus_info[cluster].hostname, master); - -#ifdef CHECK_NODE_NAMES - /* need consecutive task id allocated on the same node - * the current test only compares hostnames against first cluster */ - if(cluster) - if(!strcmp(master,armci_clus_info[0].hostname)){ - /* we have seen that hostname before */ - fprintf(stderr, "ARMCI supports block process mapping only\n"); - armci_die("Cannot run: improper task to host mapping!",0); - } -#endif - cluster++; - - } - else{ - /* the process is still on the same host */ - armci_clus_info[cluster-1].nslave++; - } - s += strlen(s)+1; - } - - if(armci_nclus != cluster) - armci_die("inconsistency processing clusterinfo",armci_nclus); - - } - /******** process 0 got all data ********/ - - /* now broadcast locality info struct to all processes - * two steps are needed because of the unknown length of hostname list - */ - len = sizeof(int); - armci_msg_brdcst(&armci_nclus, len, root); - - if(armci_me){ - /* allocate memory */ - armci_clus_info = (armci_clus_t*)malloc(armci_nclus*sizeof(armci_clus_t)); - if(!armci_clus_info) - armci_die("malloc failed for clusinfo",armci_nclus); - } - - len = sizeof(armci_clus_t)*armci_nclus; - armci_msg_brdcst(armci_clus_info, len, root); - - /******** all processes 0 got all data ********/ - - /* now determine current cluster node id by comparing me to master */ - armci_clus_me = armci_nclus-1; - for(i =0; i< armci_nclus-1; i++) - if(armci_me < armci_clus_info[i+1].master){ - armci_clus_me=i; - break; - } -#else - - armci_clus_me=0; - armci_nclus=1; - armci_clus_info = (armci_clus_t*)malloc(armci_nclus*sizeof(armci_clus_t)); - if(!armci_clus_info) - armci_die("malloc failed for clusinfo",armci_nclus); - strcpy(armci_clus_info[0].hostname, names); - armci_clus_info[0].master=0; - armci_clus_info[0].nslave=armci_nproc; -#endif - - /* Starting process ID on my node */ - armci_clus_first = armci_clus_info[armci_clus_me].master; - - /* Last process ID on my node */ - armci_clus_last = armci_clus_first + - armci_clus_info[armci_clus_me].nslave-1; - -} - - -/*\ Substring Replacement: replace needle with nail in a haystack - \*/ -static char *substr_replace(char *haystack, char *needle, char *nail) -{ - char *tmp, *pos, *first; - size_t len=strlen(needle), nlen=strlen(nail),bytes; - size_t left; - - pos = strstr(haystack,needle); - if (pos ==NULL) return NULL; - first= tmp = calloc(strlen(haystack)+nlen-len+1+1,1); - if(first==NULL) return(NULL); - bytes = pos - haystack; - while(bytes){ *tmp = *haystack; tmp++; haystack++; bytes--;} - while(nlen) { *tmp = *nail; tmp++; nail++; nlen--;} - haystack += len; - left = strlen(haystack); - while(left>0){*tmp = *haystack; tmp++; haystack++; left --;} - *tmp='\0'; - return(first); -} - - -/*\ ARMCI_HOSTNAME_REPLACE contains "needle/nail" string to derive new hostname - \*/ -static char *new_hostname(char *host) -{ - char *tmp, *needle, *nail; - if((tmp =getenv("ARMCI_HOSTNAME_REPLACE"))){ - needle = strdup(tmp); - if(needle== NULL) return NULL; - nail = strchr(needle,'/'); - if(nail == NULL) return NULL; - *nail = '\0'; - nail++; - if(nail == (needle+1)){ - char* tmp1 = calloc(strlen(host)+strlen(nail)+1,1); - if(tmp1 == NULL) return NULL; - strcpy(tmp1,host); - strcat(tmp1,nail); - return tmp1; - } - return substr_replace(host,needle,nail); - } else return NULL; -} - - -static void print_clus_info() -{ - int i; - - if(PRINT_CLUSTER_INFO && armci_nclus > 1 && armci_me ==0){ - printf("ARMCI configured for %d cluster nodes. Network protocol is '%s'.\n", - armci_nclus, network_protocol); - fflush(stdout); - } - - if(armci_me==0 && DEBUG) for(i=0;i= MAX_HOSTNAME) - armci_die("armci: hostname too long",strlen(tmp)); - strcpy(name,tmp); - printf("%d using %s hostname\n",armci_me, name); - fflush(stdout); - } - len = strlen(name); -#if ARMCI_ENABLE_GPC_CALLS - /*a simple way to run as many servers as compute processes*/ - enval = getenv("ARMCI_NSERV_EQ_NPROC"); - if(enval != NULL){ - sprintf(name+len,"n%d",getpid()); - len = strlen(name); - printf("\n%s\n",name); - } -#endif - - -#ifdef HOSTNAME_TRUNCATE - { - /* in some cases (e.g.,SP) when name is used to determine - * cluster structure but not to establish communication - * we can truncate hostnames to save memory */ - int i; - limit = HOSTNAME_LEN-1; - for(i=0; i",i+1); - } - if(len>limit)name[limit]='\0'; - len =limit; - } -#else - if(len >= HOSTNAME_LEN-1) - armci_die("armci: gethostname overrun name string length",len); -#endif - -#ifdef NO_SHMEM - if(armci_enable_alpha_hack()) { - name[len]='0'+armci_me; - name[len+1]='\0'; - len++; - } -#endif - - if(DEBUG) - fprintf(stderr,"%d: %s len=%d\n",armci_me, name,(int)strlen(name)); - -#ifdef CLUSTER - merged = merge_names(name); /* create hostname list */ - process_hostlist(merged); /* compute cluster info */ - free(merged); -#else - process_hostlist(name); /* compute cluster info */ -#endif - - armci_master = armci_clus_info[armci_clus_me].master; - -#ifdef NO_SHMEM - if(armci_enable_alpha_hack()) { - int i; - for(i=0;i= armci_nproc) - armci_die("armci_clus_id: out of range",p); - - if (p < armci_clus_first){ - from = 0; - to = armci_clus_me; - } - else { - from = armci_clus_me; - to = armci_nclus; - } - - found = to - 1; - - /* Binary search algorithm to be implemented, - * sequential search for now */ - for(c = from; c < to - 1; c++) - if (p < armci_clus_info[c+1].master) { - found = c; - break; - } - - return found; -} - - -/*\ return number of processes in the domain represented by id; id<0 means my node - \*/ -int armci_domain_nprocs(armci_domain_t domain, int id) -{ - if(id >= armci_nclus) - armci_die2("armci domain error",id,armci_nclus); - /* This is an error condition */ - if(id < 0) { - fprintf(stderr,"[%d] Returned domain is invalid\n", armci_me); - id = armci_clus_me; - } - return armci_clus_info[id].nslave; -} - -/* return number of nodes in diven domain */ -int armci_domain_count(armci_domain_t domain) -{ - return armci_nclus; -} - -/* return domain ID of the specified process */ -int armci_domain_id(armci_domain_t domain, int glob_proc_id) -{ - int id = glob_proc_id; - - if(id <0 || id >= armci_nproc) { - armci_die2("armci domain error",id,armci_nproc); - } - - return armci_clus_id(glob_proc_id); -} - -/* return global ID of a process loc_proc_id in domain identified by id - * armci_domain_nproc(id)< loc_proc_id >=0 - */ - -int armci_domain_glob_proc_id(armci_domain_t domain, int id, int loc_proc_id) -{ - if(id <0 || id >= armci_nclus) - armci_die2("armci domain error",id,armci_nclus); - - if(loc_proc_id<0 || loc_proc_id>= armci_clus_info[id].nslave) - armci_die2("armci domain proc error", - loc_proc_id,armci_clus_info[id].nslave); - - return (armci_clus_info[id].master + loc_proc_id); -} - -/* return ID of domain that the calling process belongs to -*/ -int armci_domain_my_id(armci_domain_t domain) -{ - return armci_clus_me; -} - -/* Check whether the oricess is in the same domain */ -int armci_domain_same_id (armci_domain_t domain, int proc) -{ -#if 0 - int rc = SAMECLUSNODE(proc); - return rc; -#else - assert(0); -#endif -} diff --git a/comex/src-dmapp/clusterinfo.h b/comex/src-dmapp/clusterinfo.h deleted file mode 100644 index a1cc9e13c..000000000 --- a/comex/src-dmapp/clusterinfo.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef CLUSTERINFO_H_ -#define CLUSTERINFO_H_ - -/* consider up to HOSTNAME_LEN characters in host name */ -#define HOSTNAME_LEN 64 - -typedef struct { - int master; - int nslave; - char hostname[HOSTNAME_LEN]; -} armci_clus_t; - -extern armci_clus_t *armci_clus_info; -extern int armci_me; -extern int armci_nproc; -extern int armci_nclus; -extern int armci_clus_me; -extern int armci_master; -extern int armci_clus_first; -extern int armci_clus_last; - -extern int armci_clus_id(int p); -extern void armci_init_clusinfo(); - -#endif /* CLUSTERINFO_H_ */ diff --git a/comex/src-dmapp/comex.c b/comex/src-dmapp/comex.c deleted file mode 100644 index 63dff0a12..000000000 --- a/comex/src-dmapp/comex.c +++ /dev/null @@ -1,1870 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* C and/or system headers */ -#include -#include -#include -#include -#include -#include -#include - -/* 3rd party headers */ -#include -#include - -/* our headers */ -#include "comex.h" -#include "comex_impl.h" -/*#include "clusterinfo.h"*/ -#include "groups.h" -#include "reg_cache.h" -#include "acc.h" - -/* Cray */ -#if HAVE_DMAPP_LOCK_DESC_T && HAVE_DMAPP_LOCK_HANDLE_T -# define HAVE_DMAPP_LOCK 1 -#endif - -#define DEBUG 0 - - -#if HAVE_DMAPP_LOCK -// COMEX_MAX_LOCKS mirrors the default DMAPP_MAX_LOCKS limit -// Larger values of COMEX_MAX_LOCKS will require DMAPP_MAX_LOCKS be set at runtime. -// DMAPP_MAX_LOCKS has a maxium value of 1023 -#define COMEX_MAX_LOCKS 128 -static dmapp_lock_desc_t lock_desc[COMEX_MAX_LOCKS]; -static dmapp_lock_handle_t lock_handle[COMEX_MAX_LOCKS]; -#endif - - -/* exported state */ -local_state l_state; -int comex_me=-1; -int comex_nproc=-1; - -/* static state */ -static int initialized=0; /* for comex_initialized(), 0=false */ -static int total_outstanding=0; -static int max_outstanding_nb=MAX_NB_OUTSTANDING; -static int malloc_is_using_huge_pages=0; /* from env var, 0=false */ -static int comex_is_using_huge_pages=0; /* from env var, 0=false */ -static long hugetlb_default_page_size=0; /* from env var, in bytes */ -static long sc_page_size=0; /* from sysconf, in bytes */ -static long hugepagesize=0; /* from libhugetlbfs, in bytes */ -static long comex_page_size=0; /* page size consensus, in bytes */ -static char skip_lock=0; /* don't acquire or release lock */ -static char skip_sync=0; /* don't sync implicit nb requests */ - -/* static function declarations */ -static void check_envs(void); -static void create_dmapp_locks(void); -static void destroy_dmapp_locks(void); -static void dmapp_alloc_buf(void); -static void dmapp_free_buf(void); -static void dmapp_initialize(void); -static void dmapp_network_lock(int proc); -static void dmapp_network_unlock(int proc); -static void dmapp_terminate(void); -static void increment_total_outstanding(void); -static void wait_and_clear_total_outstanding(void); -static void my_free(void *ptr); -static void* my_malloc(size_t size); -static int my_memalign(void **memptr, size_t alignment, size_t size); -static int comex_get_nbi(void *src, void *dst, int bytes, int proc); -static int comex_get_nb(void *src, void *dst, int bytes, int proc, dmapp_syncid_handle_t *handle); -static int comex_put_nbi(void *src, void *dst, int bytes, int proc); -static int comex_put_nb(void *src, void *dst, int bytes, int proc, dmapp_syncid_handle_t *handle); -static void* _comex_malloc_local(size_t size, dmapp_seg_desc_t *seg); - - -static void* my_malloc(size_t size) -{ - void *memptr=NULL; - -#if DEBUG - if (0 == l_state.rank) { - printf("my_malloc(%lu)\n", (long unsigned)size); - } -#endif - -#if HAVE_LIBHUGETLBFS - if (malloc_is_using_huge_pages) { - memptr = malloc(size); - } - else if (comex_is_using_huge_pages) { - memptr = get_hugepage_region(size, GHR_DEFAULT); - } - else { - memptr = malloc(size); - } -#else - memptr = malloc(size); -#endif - - /* postconditions */ - assert(memptr); - - return memptr; -} - - -static void my_free(void *ptr) -{ -#if DEBUG - if (0 == l_state.rank) { - printf("my_free(%p)\n", ptr); - } -#endif - -#if HAVE_LIBHUGETLBFS - if (malloc_is_using_huge_pages) { - free(ptr); - } - else if (comex_is_using_huge_pages) { - free_hugepage_region(ptr); - } - else { - free(ptr); - } -#else - free(ptr); -#endif -} - - -static int my_memalign(void **memptr, size_t alignment, size_t size) -{ - int status = 0; - -#if DEBUG - if (0 == l_state.rank) { - printf("my_memalign(%lu)\n", (long unsigned)size); - } -#endif - - /* preconditions */ - assert(memptr); - -#if HAVE_LIBHUGETLBFS - if (malloc_is_using_huge_pages) { - status = posix_memalign(memptr, alignment, size); - } - else if (comex_is_using_huge_pages) { - *memptr = get_hugepage_region(size, GHR_DEFAULT); - } - else { - status = posix_memalign(memptr, alignment, size); - } -#else - status = posix_memalign(memptr, alignment, size); -#endif - - /* postconditions */ - assert(*memptr); - - return status; -} - - -static void increment_total_outstanding(void) -{ - ++total_outstanding; - - if (total_outstanding == max_outstanding_nb) { - wait_and_clear_total_outstanding(); - } -} - - -static void wait_and_clear_total_outstanding(void) -{ - int status; - status = dmapp_gsync_wait(); - assert(status == DMAPP_RC_SUCCESS); - total_outstanding = 0; -} - - -/* The blocking implementations should use blocking DMAPP calls */ -int comex_put(void *src, void *dst, int bytes, int proc, comex_group_t group) -{ - int status; - status = comex_put_nbi(src, dst, bytes, proc); - assert(status == DMAPP_RC_SUCCESS); - comex_wait_proc(proc, group); - return COMEX_SUCCESS; -} - - -int comex_get(void *src, void *dst, int bytes, int proc, comex_group_t group) -{ - int status; - status = comex_get_nbi(src, dst, bytes, proc); - assert(status == DMAPP_RC_SUCCESS); - comex_wait_proc(proc, group); - return COMEX_SUCCESS; -} - - -/* The blocking implementations should use blocking DMAPP calls */ -static int comex_put_nbi(void *src, void *dst, int bytes, int proc) -{ - int status = DMAPP_RC_SUCCESS; - int nelems = bytes; - int type = DMAPP_BYTE; - int failure_observed = 0; - reg_entry_t *dst_reg = NULL; - reg_entry_t *src_reg = NULL; - - /* Corner case */ - if (proc == l_state.rank) { - memcpy(dst, src, bytes); - return status; - } - - /* If the number of bytes is even, use Double word datatype, - * DMAPP_BYTE performance is much worse */ - if (0 == bytes%16) { - nelems = bytes/16; - type = DMAPP_DQW; - } - else if (0 == bytes%8) { - nelems = bytes/8; - type = DMAPP_QW; - } - else if (0 == bytes%4) { - nelems = bytes/4; - type = DMAPP_DW; - } - - /* Find the dmapp seg desc */ - dst_reg = reg_cache_find(proc, dst, bytes); - assert(dst_reg); - - src_reg = reg_cache_find(l_state.rank, src, bytes); - - status = dmapp_put_nbi(dst, &(dst_reg->mr), proc, src, nelems, type); - increment_total_outstanding(); - if (status != DMAPP_RC_SUCCESS) { - failure_observed = 1; - } - - /* Fallback */ - if (failure_observed) { - comex_wait_all(COMEX_GROUP_WORLD); - assert(bytes <= l_state.put_buf_len); - memcpy(l_state.put_buf, src, bytes); - status = dmapp_put_nbi(dst, &(dst_reg->mr), - proc, l_state.put_buf, nelems, type); - increment_total_outstanding(); - comex_wait_all(COMEX_GROUP_WORLD); - - /* Fallback must work correctly */ - assert(status == DMAPP_RC_SUCCESS); - } - - return status; -} - - -/* The blocking implementations should use blocking DMAPP calls */ -static int comex_put_nb(void *src, void *dst, int bytes, int proc, dmapp_syncid_handle_t *handle) -{ - int status = DMAPP_RC_SUCCESS; - int nelems = bytes; - int type = DMAPP_BYTE; - int failure_observed = 0; - reg_entry_t *dst_reg = NULL; - reg_entry_t *src_reg = NULL; - - /* If the number of bytes is even, use Double word datatype, - * DMAPP_BYTE performance is much worse */ - if (0 == bytes%16) { - nelems = bytes/16; - type = DMAPP_DQW; - } - else if (0 == bytes%8) { - nelems = bytes/8; - type = DMAPP_QW; - } - else if (0 == bytes%4) { - nelems = bytes/4; - type = DMAPP_DW; - } - - /* Find the dmapp seg desc */ - dst_reg = reg_cache_find(proc, dst, bytes); - assert(dst_reg); - - src_reg = reg_cache_find(l_state.rank, src, bytes); - - status = dmapp_put_nb(dst, &(dst_reg->mr), proc, src, nelems, type, handle); - assert(status == DMAPP_RC_SUCCESS); - - return status; -} - - -static int comex_get_nb(void *src, void *dst, int bytes, int proc, dmapp_syncid_handle_t *handle) -{ - int status = DMAPP_RC_SUCCESS; - int nelems = bytes; - int type = DMAPP_BYTE; - int failure_observed = 0; - reg_entry_t *dst_reg = NULL; - reg_entry_t *src_reg = NULL; - - /* If the number of bytes is even, use Double word datatype, - * DMAPP_BYTE performance is much worse */ - if (0 == bytes%16) { - nelems = bytes/16; - type = DMAPP_DQW; - } - else if (0 == bytes%8) { - nelems = bytes/8; - type = DMAPP_QW; - } - else if (0 == bytes%4) { - nelems = bytes/4; - type = DMAPP_DW; - } - - /* Find the dmapp seg desc */ - dst_reg = reg_cache_find(proc, src, bytes); - assert(dst_reg); - - src_reg = reg_cache_find(l_state.rank, dst, bytes); - - status = dmapp_get_nb(dst, src, &(dst_reg->mr), proc, nelems, type, handle); - assert(status == DMAPP_RC_SUCCESS); - - return COMEX_SUCCESS; -} - - -static int comex_get_nbi(void *src, void *dst, int bytes, int proc) -{ - int status = DMAPP_RC_SUCCESS; - int nelems = bytes; - int type = DMAPP_BYTE; - int failure_observed = 0; - reg_entry_t *dst_reg = NULL; - reg_entry_t *src_reg = NULL; - - /* Corner case */ - if (proc == l_state.rank) { - memcpy(dst, src, bytes); - return status; - } - - /* If the number of bytes is even, use Double word datatype, - * DMAPP_BYTE performance is much worse */ - if (0 == bytes%16) { - nelems = bytes/16; - type = DMAPP_DQW; - } - else if (0 == bytes%8) { - nelems = bytes/8; - type = DMAPP_QW; - } - else if (0 == bytes%4) { - nelems = bytes/4; - type = DMAPP_DW; - } - - /* Find the dmapp seg desc */ - dst_reg = reg_cache_find(proc, src, bytes); - assert(dst_reg); - - src_reg = reg_cache_find(l_state.rank, dst, bytes); - - status = dmapp_get_nbi(dst, src, &(dst_reg->mr), - proc, nelems, type); - increment_total_outstanding(); - if (status != DMAPP_RC_SUCCESS) { - failure_observed = 1; - } - - /* Fallback */ - if (failure_observed) { - comex_wait_all(COMEX_GROUP_WORLD); - assert(bytes <= l_state.get_buf_len); - status = dmapp_get(l_state.get_buf, src, &(dst_reg->mr), - proc, nelems, type); - memcpy(dst, l_state.get_buf, bytes); - } - - /* Original or fallback must work correctly */ - assert(status == DMAPP_RC_SUCCESS); - - return status; -} - - -static void dmapp_network_lock(int proc) -{ - int dmapp_status; - -#if HAVE_DMAPP_LOCK - dmapp_lock_acquire( &lock_desc[0], &(l_state.job.data_seg), proc, 0, &lock_handle[0]); -#else - reg_entry_t *dst_reg= reg_cache_find(proc, - l_state.atomic_lock_buf[proc], sizeof(long)); - - assert(dst_reg); - - do { - dmapp_status = dmapp_acswap_qw(l_state.local_lock_buf, - l_state.atomic_lock_buf[proc], - &(dst_reg->mr), - proc, 0, l_state.rank + 1); - - assert(dmapp_status == DMAPP_RC_SUCCESS); - } - while(*(l_state.local_lock_buf) != 0); -#endif -} - - -static void dmapp_network_unlock(int proc) -{ - int dmapp_status; - -# if HAVE_DMAPP_LOCK - dmapp_lock_release( lock_handle[0], 0 ); -#else - reg_entry_t *dst_reg= reg_cache_find(proc, - l_state.atomic_lock_buf[proc], sizeof(long)); - - assert(dst_reg); - - do { - dmapp_status = dmapp_acswap_qw(l_state.local_lock_buf, - l_state.atomic_lock_buf[proc], - &(dst_reg->mr), - proc, l_state.rank + 1, 0); - assert(dmapp_status == DMAPP_RC_SUCCESS); - } while (*(l_state.local_lock_buf) != (unsigned long)(l_state.rank + 1)); -#endif -} - - -int comex_puts(void *src_ptr, int src_stride_ar[/*stride_levels*/], - void *dst_ptr, int dst_stride_ar[/*stride_levels*/], - int count[/*stride_levels+1*/], int stride_levels, int proc, comex_group_t group) -{ - int i, j; - long src_idx, dst_idx; /* index offset of current block position to ptr */ - int n1dim; /* number of 1 dim block */ - int src_bvalue[7], src_bunit[7]; - int dst_bvalue[7], dst_bunit[7]; - int dmapp_status; - - /* number of n-element of the first dimension */ - n1dim = 1; - for(i=1; i<=stride_levels; i++) { - n1dim *= count[i]; - } - - /* calculate the destination indices */ - src_bvalue[0] = 0; src_bvalue[1] = 0; src_bunit[0] = 1; src_bunit[1] = 1; - dst_bvalue[0] = 0; dst_bvalue[1] = 0; dst_bunit[0] = 1; dst_bunit[1] = 1; - - for(i=2; i<=stride_levels; i++) { - src_bvalue[i] = 0; - dst_bvalue[i] = 0; - src_bunit[i] = src_bunit[i-1] * count[i-1]; - dst_bunit[i] = dst_bunit[i-1] * count[i-1]; - } - - /* index mangling */ - for(i=0; i (count[j]-1)) { - src_bvalue[j] = 0; - } - } - - for(j=1; j<=stride_levels; j++) { - dst_idx += (long) dst_bvalue[j] * (long) dst_stride_ar[j-1]; - if((i+1) % dst_bunit[j] == 0) { - dst_bvalue[j]++; - } - if(dst_bvalue[j] > (count[j]-1)) { - dst_bvalue[j] = 0; - } - } - - dmapp_status = comex_put_nbi((char *)src_ptr + src_idx, - (char *)dst_ptr + dst_idx, count[0], proc); - assert(dmapp_status == DMAPP_RC_SUCCESS); - } - - if (0 == skip_sync) { - comex_wait_proc(proc, group); - } - - return COMEX_SUCCESS; -} - - -int comex_gets(void *src_ptr, int src_stride_ar[/*stride_levels*/], - void *dst_ptr, int dst_stride_ar[/*stride_levels*/], - int count[/*stride_levels+1*/], int stride_levels, int proc, comex_group_t group) -{ - int i, j; - long src_idx, dst_idx; /* index offset of current block position to ptr */ - int n1dim; /* number of 1 dim block */ - int src_bvalue[7], src_bunit[7]; - int dst_bvalue[7], dst_bunit[7]; - int dmapp_status; - - /* number of n-element of the first dimension */ - n1dim = 1; - for(i=1; i<=stride_levels; i++) { - n1dim *= count[i]; - } - - /* calculate the destination indices */ - src_bvalue[0] = 0; src_bvalue[1] = 0; src_bunit[0] = 1; src_bunit[1] = 1; - dst_bvalue[0] = 0; dst_bvalue[1] = 0; dst_bunit[0] = 1; dst_bunit[1] = 1; - - for(i=2; i<=stride_levels; i++) { - src_bvalue[i] = 0; - dst_bvalue[i] = 0; - src_bunit[i] = src_bunit[i-1] * count[i-1]; - dst_bunit[i] = dst_bunit[i-1] * count[i-1]; - } - - for(i=0; i (count[j]-1)) { - src_bvalue[j] = 0; - } - } - - dst_idx = 0; - - for(j=1; j<=stride_levels; j++) { - dst_idx += (long) dst_bvalue[j] * (long) dst_stride_ar[j-1]; - if((i+1) % dst_bunit[j] == 0) { - dst_bvalue[j]++; - } - if(dst_bvalue[j] > (count[j]-1)) { - dst_bvalue[j] = 0; - } - } - - dmapp_status = comex_get_nbi((char *)src_ptr + src_idx, - (char *)dst_ptr + dst_idx, count[0], proc); - assert(dmapp_status == DMAPP_RC_SUCCESS); - } - - if (0 == skip_sync) { - comex_wait_proc(proc, group); - } - - return COMEX_SUCCESS; -} - - -int comex_acc(int datatype, void *scale, - void *src_ptr, - void *dst_ptr, - int bytes, int proc, comex_group_t group) -{ - - comex_accs(datatype, scale, src_ptr, NULL, dst_ptr, - NULL, &bytes, 0, proc, group); - return COMEX_SUCCESS; -} - - -int comex_accs(int datatype, void *scale, - void *src_ptr, int src_stride_ar[/*stride_levels*/], - void *dst_ptr, int dst_stride_ar[/*stride_levels*/], - int count[/*stride_levels+1*/], int stride_levels, int proc, comex_group_t group) -{ - int i, j; - long src_idx, dst_idx; /* index offset of current block position to ptr */ - int n1dim; /* number of 1 dim block */ - int src_bvalue[7], src_bunit[7]; - int dst_bvalue[7], dst_bunit[7]; - int sizetogetput; - void *get_buf; - - /* number of n-element of the first dimension */ - n1dim = 1; - for(i=1; i<=stride_levels; i++) - n1dim *= count[i]; - - /* calculate the destination indices */ - src_bvalue[0] = 0; src_bvalue[1] = 0; src_bunit[0] = 1; src_bunit[1] = 1; - dst_bvalue[0] = 0; dst_bvalue[1] = 0; dst_bunit[0] = 1; dst_bunit[1] = 1; - - for(i=2; i<=stride_levels; i++) - { - src_bvalue[i] = 0; - dst_bvalue[i] = 0; - src_bunit[i] = src_bunit[i-1] * count[i-1]; - dst_bunit[i] = dst_bunit[i-1] * count[i-1]; - } - - sizetogetput = count[0]; - - if (0 == skip_lock) { - // grab the atomics lock - dmapp_network_lock(proc); - } - -#if PIPELINED_ACCUMULATE - if (sizetogetput > l_state.pipe_acc_buf_len) -#endif - { - /* fall back to sequential with newly allocated buffer */ - if (sizetogetput <= l_state.acc_buf_len) { - get_buf = l_state.acc_buf; - } - else { - get_buf = (char *)my_malloc(sizeof(char) * sizetogetput); - } - assert(get_buf); - - for(i=0; i (count[j]-1)) { - src_bvalue[j] = 0; - } - } - - dst_idx = 0; - - for(j=1; j<=stride_levels; j++) { - dst_idx += (long) dst_bvalue[j] * (long) dst_stride_ar[j-1]; - if((i+1) % dst_bunit[j] == 0) { - dst_bvalue[j]++; - } - if(dst_bvalue[j] > (count[j]-1)) { - dst_bvalue[j] = 0; - } - } - - // Get the remote data in a temp buffer - comex_get((char *)dst_ptr + dst_idx, get_buf, sizetogetput, proc, group); - - _acc(datatype, count[0], get_buf, ((char*)src_ptr)+src_idx, scale); - - // Write back - comex_put(get_buf, (char *)dst_ptr + dst_idx, sizetogetput, proc, group); - } - if (sizetogetput > l_state.acc_buf_len) { - // unregister temp buffer ? TODO consider keeping temp buf around - // in case another large request comes along? - free(get_buf); - } - } -#if PIPELINED_ACCUMULATE - else { - printf("pipelined acc\n"); - /* pipelined protocol, sort of */ - long pipe_src_idx[PIPELINED_MAX_BUFFERS]; - long pipe_dst_idx[PIPELINED_MAX_BUFFERS]; - dmapp_syncid_handle_t pipe_handle[PIPELINED_MAX_BUFFERS]; - char pipe_state[PIPELINED_MAX_BUFFERS]; - long pipe_index = 0; - char done = 0; - - for (i=0; i (count[j]-1)) { - src_bvalue[j] = 0; - } - } - - /* calculate the dst_idx */ - dst_idx = 0; - for(j=1; j<=stride_levels; j++) { - dst_idx += (long) dst_bvalue[j] * (long) dst_stride_ar[j-1]; - if((i+1) % dst_bunit[j] == 0) { - dst_bvalue[j]++; - } - if(dst_bvalue[j] > (count[j]-1)) { - dst_bvalue[j] = 0; - } - } - - // Get the remote data in a temp buffer - comex_get_nb((char *)dst_ptr + dst_idx, - l_state.pipe_acc_buf[pipe_index], - sizetogetput, proc, &pipe_handle[pipe_index]); - pipe_dst_idx[pipe_index] = dst_idx; - pipe_src_idx[pipe_index] = src_idx; - pipe_state[pipe_index] = 1; - pipe_index = (pipe_index+1) % PIPELINED_MAX_BUFFERS; - } - - /* no more gets to issue, but we may have outstanding gets/puts */ - do { - done = 1; - for (pipe_index=0; pipe_index l_state.acc_buf_len) - my_free(get_buf); - - return COMEX_SUCCESS; -} - - -int comex_fence_all(comex_group_t group) -{ - comex_wait_all(group); - /* noop for DMAPP */ - return COMEX_SUCCESS; -} - - -int comex_fence_proc(int proc, comex_group_t group) -{ - comex_wait_all(group); - /* noop for DMAPP */ - return COMEX_SUCCESS; -} - - -/* comex_barrier is comex_fence_all + MPI_Barrier */ -int comex_barrier(comex_group_t group) -{ - MPI_Comm comm; - - comex_fence_all(group); - assert(COMEX_SUCCESS == comex_group_comm(group, &comm)); - MPI_Barrier(comm); - - return COMEX_SUCCESS; -} - - -void comex_error(char *msg, int code) -{ - if (0 == l_state.rank) - fprintf(stderr,"Received an Error in Communication\n"); - - MPI_Abort(l_state.world_comm, code); -} - - -static void* _comex_malloc_local(size_t size, dmapp_seg_desc_t *seg) -{ - void *ptr; - int rc; - int status; - - rc = my_memalign(&ptr, comex_page_size, sizeof(char)*size); - assert(0 == rc); - assert(ptr); - - status = dmapp_mem_register(ptr, size, seg); - assert(status == DMAPP_RC_SUCCESS); -#if DEBUG - printf("[%d] _comex_malloc_local ptr=%p size=%zu\n", - l_state.rank, ptr, size); - printf("[%d] _comex_malloc_local seg=%p size=%zu\n", - l_state.rank, seg->addr, seg->len); -#endif -#if 0 - assert(seg->addr == ptr); - assert(seg->len == size); /* @TODO this failed! */ -#endif - reg_cache_insert(l_state.rank, ptr, size, *seg); - - return ptr; -} - - -void *comex_malloc_local(size_t size) -{ - void *ptr = NULL; - dmapp_seg_desc_t seg; - - ptr = _comex_malloc_local(size, &seg); - - return ptr; -} - - -int comex_free_local(void *ptr) -{ - reg_return_t status = RR_FAILURE; - - /* preconditions */ - assert(NULL != ptr); - - /* remove from reg cache */ - status = reg_cache_delete(l_state.rank, ptr); - assert(RR_SUCCESS == status); - - /* free the memory */ - my_free(ptr); - - return COMEX_SUCCESS; -} - - -static void destroy_dmapp_locks(void) -{ -#if DMAPP_LOCK -#else - if (l_state.local_lock_buf) - comex_free_local(l_state.local_lock_buf); - - if (l_state.atomic_lock_buf) - comex_free(l_state.atomic_lock_buf[l_state.rank], COMEX_GROUP_WORLD); -#endif -} - - -static void create_dmapp_locks(void) -{ -#if DMAPP_LOCK - bzero(lock_desc, sizeof(lock_desc)); -#else - l_state.local_lock_buf = comex_malloc_local(sizeof(long)); - assert(l_state.local_lock_buf); - - l_state.atomic_lock_buf = - (unsigned long**)my_malloc(l_state.size * sizeof(unsigned long*)); - assert(l_state.atomic_lock_buf); - - comex_malloc((void**)l_state.atomic_lock_buf, sizeof(long), COMEX_GROUP_WORLD); - - *(long *)(l_state.atomic_lock_buf[l_state.rank]) = 0; - *(long *)(l_state.local_lock_buf) = 0; -#endif - - MPI_Barrier(l_state.world_comm); -} - - -static void dmapp_alloc_buf(void) -{ - int i; - - // FAILURE_BUFSIZE should be some multiple of our page size? - -#if PIPELINED_ACCUMULATE - l_state.pipe_acc_buf = malloc(PIPELINED_MAX_BUFFERS * sizeof(void*)); - l_state.pipe_acc_buf_len = comex_page_size; - for (i=0; i= 200112L || _XOPEN_SOURCE >= 600) -# error posix_memalign *NOT* available -#endif - - /* groups */ - comex_group_init(); - - /* Initialize */ - dmapp_initialize(); - - /* mutexes */ - l_state.mutexes = NULL; - l_state.local_mutex = NULL; - l_state.num_mutexes = NULL; - - /* cluster info */ - /*comex_init_clusinfo();*/ - - /* Synch - Sanity Check */ - MPI_Barrier(l_state.world_comm); - - return COMEX_SUCCESS; -} - - -int comex_init() -{ - return _comex_init(MPI_COMM_WORLD); -} - - -int comex_init_comm(MPI_Comm comm) -{ - return _comex_init(comm); -} - - -int comex_init_args(int *argc, char ***argv) -{ - int rc; - int init_flag; - - MPI_Initialized(&init_flag); - - if(!init_flag) - MPI_Init(argc, argv); - - rc = comex_init(); - return rc; -} - - -int comex_finalize() -{ - /* it's okay to call multiple times -- extra calls are no-ops */ - if (!initialized) { - return COMEX_SUCCESS; - } - - initialized = 0; - - /* Make sure that all outstanding operations are done */ - comex_wait_all(COMEX_GROUP_WORLD); - - dmapp_terminate(); - - /* groups */ - comex_group_finalize(); - - MPI_Barrier(l_state.world_comm); - - // destroy the communicators - MPI_Comm_free(&l_state.world_comm); - - return COMEX_SUCCESS; -} - - -int comex_nbput(void *src, void *dst, int bytes, int proc, comex_group_t group, comex_request_t *hdl) -{ - int rc; - rc = comex_put_nbi(src, dst, bytes, proc); - return (rc == DMAPP_RC_SUCCESS) ? COMEX_SUCCESS : COMEX_FAILURE; -} - - -int comex_nbget(void *src, void *dst, int bytes, int proc, comex_group_t group, comex_request_t *hdl) -{ - int rc; - rc = comex_get_nbi(src, dst, bytes, proc); - return (rc == DMAPP_RC_SUCCESS) ? COMEX_SUCCESS : COMEX_FAILURE; -} - - -int comex_wait_proc(int proc, comex_group_t group) -{ - wait_and_clear_total_outstanding(); - return COMEX_SUCCESS; -} - - -int comex_wait(comex_request_t* hdl) -{ - wait_and_clear_total_outstanding(); - return COMEX_SUCCESS; -} - - -int comex_test(comex_request_t* hdl, int *status) -{ - wait_and_clear_total_outstanding(); - *status = 0; - return COMEX_SUCCESS; -} - - -int comex_wait_all(comex_group_t group) -{ - wait_and_clear_total_outstanding(); - return COMEX_SUCCESS; -} - - -int comex_nbputs( - void *src, int *src_stride, - void *dst, int *dst_stride, - int *count, int stride_levels, - int proc, comex_group_t group, - comex_request_t *hdl) -{ - int rc; - assert(0 == skip_sync); - skip_sync = 1; - rc = comex_puts(src, src_stride, dst, dst_stride, - count, stride_levels, proc, group); - skip_sync = 0; - return rc; -} - - -int comex_nbgets( - void *src, int *src_stride, - void *dst, int *dst_stride, - int *count, int stride_levels, - int proc, comex_group_t group, - comex_request_t *hdl) -{ - int rc; - assert(0 == skip_sync); - skip_sync = 1; - rc = comex_gets(src, src_stride, dst, dst_stride, - count, stride_levels, proc, group); - skip_sync = 0; - return rc; -} - - -int comex_nbaccs( - int datatype, void *scale, - void *src, int *src_stride, - void *dst, int *dst_stride, - int *count, int stride_levels, - int proc, comex_group_t group, - comex_request_t *hdl) -{ - int rc; - rc = comex_accs(datatype, scale, - src, src_stride, dst, dst_stride, - count, stride_levels, proc, group); - return rc; -} - - -/* Vector Calls */ - - -int comex_putv(comex_giov_t *iov, int iov_len, int proc, comex_group_t group) -{ - int status; - int i; - for (i=0; imr), proc, extra); - assert(status == DMAPP_RC_SUCCESS); -#else - long tmp; - dmapp_network_lock(proc); - comex_get(prem, ploc, sizeof(long), proc, group); - tmp = *(long*)ploc + extra; - comex_put(&tmp, prem, sizeof(long), proc, group); - dmapp_network_unlock(proc); -#endif - } - else if (op == COMEX_SWAP) { - /* dmapp doesn't have atomic swap for int */ - int tmp; - dmapp_network_lock(proc); - comex_get(prem, &tmp, sizeof(int), proc, group); - comex_put(ploc, prem, sizeof(int), proc, group); - dmapp_network_unlock(proc); - *(int*)ploc = tmp; - } - else if (op == COMEX_SWAP_LONG) { - /* dmapp has atomic cswap for long, but it's non-blocking */ - long tmp; - dmapp_network_lock(proc); - comex_get(prem, &tmp, sizeof(long), proc, group); - comex_put(ploc, prem, sizeof(long), proc, group); - dmapp_network_unlock(proc); - *(long*)ploc = tmp; - } - else { - assert(0); - } - - return COMEX_SUCCESS; -} - - -/* Mutex Operations */ -int comex_create_mutexes(int num) -{ - int i=0; - - assert(NULL == l_state.mutexes); - assert(NULL == l_state.local_mutex); - assert(NULL == l_state.num_mutexes); - - /* every process knows how many mutexes created on every process */ - l_state.num_mutexes = (unsigned int*)my_malloc(l_state.size * sizeof(unsigned int)); - assert(l_state.num_mutexes); - /* gather the counts */ - MPI_Allgather(&num, 1, MPI_INT, - l_state.num_mutexes, 1, MPI_UNSIGNED, l_state.world_comm); - - /* create the 1 element buffer to hold a remote mutex */ - l_state.local_mutex = comex_malloc_local(sizeof(unsigned long)); - assert(l_state.local_mutex); - /* init the local mutex holder to rank+1, indicating no mutex is held */ - *(unsigned long *)(l_state.local_mutex) = l_state.rank+1; - MPI_Barrier(l_state.world_comm); - - /* create all of the mutexes */ - l_state.mutexes = (unsigned long**)my_malloc(l_state.size * sizeof(unsigned long*)); - assert(l_state.mutexes); - comex_malloc((void**)l_state.mutexes, num*sizeof(unsigned long), COMEX_GROUP_WORLD); - /* init all of my mutexes to 0 */ - for (i=0; imr), - proc, 0, l_state.rank + 1); - assert(dmapp_status == DMAPP_RC_SUCCESS); - } - while(*(l_state.local_mutex) != 0); - - return COMEX_SUCCESS; -} - - -int comex_unlock(int mutex, int proc) -{ - int dmapp_status; - reg_entry_t *dst_reg = NULL; - - /* preconditions */ - assert(0 <= proc && proc < l_state.size); - assert(0 <= mutex && (unsigned int)(mutex) < l_state.num_mutexes[proc]); - - dst_reg = reg_cache_find(proc, &(l_state.mutexes[proc][mutex]), sizeof(unsigned long)); - assert(dst_reg); - - do { - dmapp_status = dmapp_acswap_qw(l_state.local_mutex, - &(l_state.mutexes[proc][mutex]), - &(dst_reg->mr), - proc, l_state.rank + 1, 0); - assert(dmapp_status == DMAPP_RC_SUCCESS); - } - while (*(l_state.local_mutex) != (unsigned long)(l_state.rank + 1)); - - return COMEX_SUCCESS; -} - - -int comex_malloc(void *ptrs[], size_t size, comex_group_t group) -{ - comex_igroup_t *igroup = NULL; - MPI_Comm comm = MPI_COMM_NULL; - int comm_rank = -1; - int comm_size = -1; - int rc = MPI_SUCCESS; - void *src_buf = NULL; - size_t max_size = size; - dmapp_seg_desc_t heap_seg; - dmapp_seg_desc_t *allgather_heap_seg = NULL; - int i = 0; - - /* preconditions */ - assert(ptrs); - - igroup = comex_get_igroup_from_group(group); - comm = igroup->comm; - assert(comm != MPI_COMM_NULL); - MPI_Comm_rank(comm, &comm_rank); - MPI_Comm_size(comm, &comm_size); - - /* achieve consensus on the allocation size */ - rc = MPI_Allreduce(&size, &max_size, 1, MPI_LONG, MPI_MAX, comm); - assert(rc == MPI_SUCCESS); - size = max_size; - assert(size > 0); - - /* allocate and register segment */ - ptrs[comm_rank] = _comex_malloc_local(sizeof(char)*max_size, &heap_seg); - - /* exchange buffer address */ - /* @TODO: Consider using MPI_IN_PLACE? */ - memcpy(&src_buf, &ptrs[comm_rank], sizeof(void *)); - MPI_Allgather(&src_buf, sizeof(void *), MPI_BYTE, ptrs, - sizeof(void *), MPI_BYTE, comm); - - /* allocate receive buffer for exchange of registration info */ - allgather_heap_seg = (dmapp_seg_desc_t *)my_malloc( - sizeof(dmapp_seg_desc_t) * comm_size); - assert(allgather_heap_seg); - - /* exchange registration info */ - MPI_Allgather(&heap_seg, sizeof(dmapp_seg_desc_t), MPI_BYTE, - allgather_heap_seg, sizeof(dmapp_seg_desc_t), MPI_BYTE, comm); - - /* insert this info into registration cache */ - for (i = 0; i < comm_size; ++i) { - int world_rank; - assert(COMEX_SUCCESS == - comex_group_translate_world(group, i, &world_rank)); - if (i == comm_rank) - continue; - reg_cache_insert(world_rank, ptrs[i], size, allgather_heap_seg[i]); - } - - // Free the temporary buffer - my_free(allgather_heap_seg); - - MPI_Barrier(comm); - - return COMEX_SUCCESS; -} - -int comex_malloc_mem_dev(void *ptrs[], size_t size, comex_group_t group, - const char* device) -{ - return comex_malloc(ptrs,size,group); -} - - -int comex_free(void *ptr, comex_group_t group) -{ - comex_igroup_t *igroup = NULL; - MPI_Comm comm = MPI_COMM_NULL; - int comm_rank; - int comm_size; - int i; - long **allgather_ptrs = NULL; - - /* preconditions */ - assert(NULL != ptr); - - igroup = comex_get_igroup_from_group(group); - comm = igroup->comm; - assert(comm != MPI_COMM_NULL); - MPI_Comm_rank(comm, &comm_rank); - MPI_Comm_size(comm, &comm_size); - - /* allocate receive buffer for exchange of pointers */ - allgather_ptrs = (long **)my_malloc(sizeof(void *) * comm_size); - assert(allgather_ptrs); - - /* exchange of pointers */ - MPI_Allgather(&ptr, sizeof(void *), MPI_BYTE, - allgather_ptrs, sizeof(void *), MPI_BYTE, comm); - - /* remove all ptrs from registration cache */ - for (i = 0; i < comm_size; i++) { - int world_rank; - assert(COMEX_SUCCESS == - comex_group_translate_world(group, i, &world_rank)); - if (i == comm_rank) - continue; - reg_cache_delete(world_rank, allgather_ptrs[i]); - } - - /* remove my ptr from reg cache and free ptr */ - comex_free_local(ptr); - my_free(allgather_ptrs); - - /* Is this needed? */ - MPI_Barrier(comm); - - return COMEX_SUCCESS; -} - -int comex_free_dev(void *ptr, comex_group_t group) -{ - return comex_free(ptr, group); -} - - -/* DMAPP Functions */ - - -static void check_envs(void) -{ - char *value; - - /* COMEX_DMAPP_ROUTING - * - * TODO description */ - if ((value = getenv("COMEX_DMAPP_ROUTING")) != NULL){ - l_state.dmapp_routing = (atoi(value)); - } - else { - l_state.dmapp_routing = DMAPP_ROUTING_ADAPTIVE; - } - -#if HAVE_LIBHUGETLBFS - - /* hugepagesize - * - * set the static variable hugepagesize */ - hugepagesize = gethugepagesize(); - - /* HUGETLB_MORECORE - * - * this variable controls whether malloc() will use hugepage memory which - * means when we use malloc() and when the user application calls malloc - * we will be competing for hugepage memory! */ - if ((value = getenv("HUGETLB_MORECORE")) != NULL) { - if (0 == strncasecmp(value, "y", 1)) { - malloc_is_using_huge_pages = 1; - } - else if (0 == strncasecmp(value, "n", 1)) { - malloc_is_using_huge_pages = 0; - } - } - - /* COMEX_USE_HUGEPAGES - * - * COMEX can be built with hugepages and still allow the user to disable - * their use. We assume that if libhugetlbfs is linked in, the user wants - * to use it. This env var is then for the user to disable it, for some - * reason. */ - comex_is_using_huge_pages = 1; /* the default if libhugetlbfs is linked */ - if ((value = getenv("COMEX_USE_HUGEPAGES")) != NULL) { - if (0 == strncasecmp(value, "y", 1)) { - comex_is_using_huge_pages = 1; - } - else if (0 == strncasecmp(value, "n", 1)) { - comex_is_using_huge_pages = 0; - } - } - - /* HUGETLB_DEFAULT_PAGE_SIZE - * - * controls the page size that will be used for hugepages - * we look for this value in case it is specified and we want to allocate - * memory aligned to the same page size */ - if ((value = getenv("HUGETLB_DEFAULT_PAGE_SIZE")) != NULL){ - /* must be one of [128K|512K|2M|8M|16M|64M] */ - if (0 == strncasecmp(value, "128K", 4)) { - hugetlb_default_page_size = 131072; - } - else if (0 == strncasecmp(value, "512K", 4)) { - hugetlb_default_page_size = 524288; - } - else if (0 == strncasecmp(value, "2M", 2)) { - hugetlb_default_page_size = 2097152; - } - else if (0 == strncasecmp(value, "8M", 2)) { - hugetlb_default_page_size = 8388608; - } - else if (0 == strncasecmp(value, "16M", 3)) { - hugetlb_default_page_size = 16777216; - } - else if (0 == strncasecmp(value, "64M", 3)) { - hugetlb_default_page_size = 67108864; - } - else { - assert(0); - } - } - - if (malloc_is_using_huge_pages || comex_is_using_huge_pages) { - comex_page_size = hugepagesize; - } - -#endif /* HAVE_LIBHUGETLBFS */ - - /* get page size for memory allocation */ - sc_page_size = sysconf(_SC_PAGESIZE); - assert(sc_page_size >= 1); - if (0 == comex_page_size) { - comex_page_size = sc_page_size; - } - -//#if DEBUG -#if 1 - if (0 == l_state.rank) { - printf("gethugepagesize()=%ld\n", hugepagesize); - printf("hugetlb_default_page_size=%ld\n", hugetlb_default_page_size); - printf("_SC_PAGESIZE=%ld\n", sc_page_size); - printf("comex_page_size=%ld\n", comex_page_size); - printf("comex_is_using_huge_pages=%d\n", comex_is_using_huge_pages); - printf("malloc_is_using_huge_pages=%d\n", malloc_is_using_huge_pages); - } -#endif -} - - -static void dmapp_initialize(void) -{ - dmapp_return_t status; - dmapp_rma_attrs_ext_t requested_attrs; - dmapp_rma_attrs_ext_t actual_attrs; - - memset(&requested_attrs, 0, sizeof(requested_attrs)); - memset(&actual_attrs, 0, sizeof(actual_attrs)); - - status = dmapp_get_rma_attrs_ext(&requested_attrs); - assert(status == DMAPP_RC_SUCCESS); - - // Check envs - check_envs(); - - /* The maximum number of outstanding non-blocking requests supported. You - * can only specify this flag during initialization. The following is the - * range of valid values to be supplied: [DMAPP_MIN_OUTSTANDING_NB, .., - * DMAPP_MAX_OUTSTANDING_NB] Setting the value to one of the extremes may - * lead to a slowdown. The recommended value is DMAPP_DEF_OUTSTANDING_NB. - * Users can experiment with the value to find the optimal setting for - * their application. */ - requested_attrs.max_outstanding_nb = MAX_NB_OUTSTANDING; - assert(MAX_NB_OUTSTANDING > DMAPP_MIN_OUTSTANDING_NB); - assert(MAX_NB_OUTSTANDING < DMAPP_MAX_OUTSTANDING_NB); - if (0 == l_state.rank) { - if (MAX_NB_OUTSTANDING != DMAPP_DEF_OUTSTANDING_NB) { - printf("MAX_NB_OUTSTANDING=%u != DMAPP_DEF_OUTSTANDING_NB=%u\n", - MAX_NB_OUTSTANDING, DMAPP_DEF_OUTSTANDING_NB); - } - } - - /* The threshold, in bytes, for switching between CPU-based - * mechanisms and CPU offload mechanisms. This value can be - * specified at any time and can use any value. The default setting is - * DMAPP_OFFLOAD_THRESHOLD. Very small or very large settings - * may lead to suboptimal performance. The default value is 4k bytes. - * Consider how to best set this threshold. While a threshold increase - * may increase CPU availability, it may also increase transfer latency - * due to BTE involvement. */ - requested_attrs.offload_threshold = COMEX_DMAPP_OFFLOAD_THRESHOLD; - - /* Specifies the type of routing to be used. Applies to RMA requests with - * PUT semantics and all AMOs. The default is DMAPP_ROUTING_ADAPTIVE. - * The value can be specified at any time. Note that - * DMAPP_ROUTING_IN_ORDER guarantees the requests arrive in order and may - * result in poor performance. Valid settings are: - * - DMAPP_ROUTING_IN_ORDER - * - DMAPP_ROUTING_DETERMINISTIC - * - DMAPP_ROUTING_ADAPTIVE */ - requested_attrs.put_relaxed_ordering = l_state.dmapp_routing; - - /* Specifies the type of routing to be used. Applies to RMA requests with - * GET semantics. The default is DMAPP_ROUTING_ADAPTIVE. The value can be - * specified at any time. Note that DMAPP_ROUTING_IN_ORDER may result in - * poor performance. Valid settings are: - * - DMAPP_ROUTING_IN_ORDER - * - DMAPP_ROUTING_DETERMINISTIC - * - DMAPP_ROUTING_ADAPTIVE */ - requested_attrs.get_relaxed_ordering = l_state.dmapp_routing; - - /* The maximum number of threads that can access DMAPP. You can only use - * this when thread-safety is enabled. The default is 1. You can only - * specify this during initialization and it must be >= 1. */ - requested_attrs.max_concurrency = 1; - - /* Defines the PI ordering registration flags used by DMAPP when - * registering all memory regions with GNI. Applies to the data, symmetric - * heap, and user or dynamically mapped regions. The default is - * DMAPP_PI_RELAXED_ORDERING. - * - * The dmapp_pi_reg_type_t enumeration defines the modes of PI access - * ordering to be used by DMAPP during memory registration with uGNI; - * therefore, these modes apply to the data and symmetric heap and any - * user or dynamically mapped regions. - * - * These modes do not affect GET operations. - * - * Strict ordering ensures that posted and non-posted writes arrive at the - * target in strict order. Default and relaxed ordering impose no ordering - * constraints, therefore if an application requires the global visibility - * of data (for example, after a blocking put or gsync/fence), it must - * perform extra synchronization in the form of a remote GET from the - * target node in order to ensure that written data is globally visible. - * - DMAPP_PI_ORDERING_STRICT Strict PI (P_PASS_PW=0, NP_PASS_PW=0) - * - DMAPP_PI_ORDERING_DEFAULT Default GNI PI (P_PASS_PW=0, NP_PASS_PW=1) - * - DMAPP_PI_ORDERING_RELAXED Relaxed PI ordering (P_PASS_PW=1, NP_PASS_PW=1) */ - requested_attrs.PI_ordering = DMAPP_PI_ORDERING_RELAXED; - - // initialize - status = dmapp_init_ext(&requested_attrs, &actual_attrs); - assert(status == DMAPP_RC_SUCCESS); -#define sanity(field) assert(actual_attrs.field == requested_attrs.field) - sanity(max_outstanding_nb); - sanity(offload_threshold); - sanity(put_relaxed_ordering); - sanity(get_relaxed_ordering); - sanity(max_concurrency); - sanity(PI_ordering); -#undef sanity - - // TODO is this the correct place to set this? - max_outstanding_nb = actual_attrs.max_outstanding_nb; - - status = dmapp_get_jobinfo (&(l_state.job)); - assert(status == DMAPP_RC_SUCCESS); - - // Initialize the reg cache - reg_cache_init(l_state.size); - - // Allocate buffers - dmapp_alloc_buf(); - - // Create locks - create_dmapp_locks(); - - /* Synchronize */ - MPI_Barrier(l_state.world_comm); -} - - -static void dmapp_terminate(void) -{ - int status; - - destroy_dmapp_locks(); - - dmapp_free_buf(); - - reg_cache_destroy(l_state.size); - - status = dmapp_finalize(); - assert(status == DMAPP_RC_SUCCESS); - - MPI_Barrier(l_state.world_comm); -} - - -int comex_initialized() -{ - return initialized; -} diff --git a/comex/src-dmapp/comex_impl.h b/comex/src-dmapp/comex_impl.h deleted file mode 100644 index 9213df745..000000000 --- a/comex/src-dmapp/comex_impl.h +++ /dev/null @@ -1,52 +0,0 @@ -#ifndef COMEX_IMPL_H_ -#define COMEX_IMPL_H_ - -#include -#include - -#define COMEX_DMAPP_OFFLOAD_THRESHOLD 2048 -#define DEFAULT_SYM_HEAP_SIZE 32*1048576 -#define DMAPP_ROUTING 1 -#define MAX_NB_OUTSTANDING 1024 -#define FAILURE_BUFSIZE 1048576 -#define PIPELINED_ACCUMULATE 0 -#define PIPELINED_MAX_BUFFERS 5 -#define PIPELINED_BUFSIZE 1048576 - -typedef struct { - - MPI_Comm world_comm; - int rank; - int size; - - /* DMAPP Specific */ - dmapp_jobinfo_t job; - - /* buffers for locks */ - unsigned long **atomic_lock_buf; /**< internal lock, one per process */ - unsigned long *local_lock_buf; /**< holds value of remote lock locally */ - unsigned long **mutexes; /**< all mutexes */ - unsigned long *local_mutex; /**< store the remote mutex value */ - unsigned int *num_mutexes; /**< how many mutexes on each process */ - - /* fallback buffers when errors are detected */ - void *acc_buf; - int acc_buf_len; - void *put_buf; - int put_buf_len; - void *get_buf; - int get_buf_len; - -#if PIPELINED_ACCUMULATE - /* buffers for pipelining */ - void **pipe_acc_buf; - int pipe_acc_buf_len; -#endif - - /* envs */ - int dmapp_routing; -} local_state; - -extern local_state l_state; - -#endif /* COMEX_IMPL_H_ */ diff --git a/comex/src-dmapp/groups.c b/comex/src-dmapp/groups.c deleted file mode 100644 index bc4a082cd..000000000 --- a/comex/src-dmapp/groups.c +++ /dev/null @@ -1,292 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include -#include - -#include "comex.h" -#include "comex_impl.h" -#include "groups.h" - - -/* the HEAD of the group linked list */ -comex_igroup_t *group_list = NULL; - - -/* static functions implemented in this file */ -static void comex_create_group_and_igroup( - comex_group_t *id, comex_igroup_t **igroup); -static void comex_igroup_finalize(comex_igroup_t *igroup); - - -/** - * Return the comex igroup instance given the group id. - * - * The group linked list is searched sequentially until the given group - * is found. It is an error if this function is called before - * comex_group_init(). An error occurs if the given group is not found. - */ -comex_igroup_t* comex_get_igroup_from_group(comex_group_t id) -{ - comex_igroup_t *current_group_list_item = group_list; - - assert(group_list != NULL); - while (current_group_list_item != NULL) { - if (current_group_list_item->id == id) { - return current_group_list_item; - } - current_group_list_item = current_group_list_item->next; - } - comex_error("comex group lookup failed", -1); - - return NULL; -} - - -/** - * Creates and associates an comex group with an comex igroup. - * - * This does *not* initialize the members of the comex igroup. - */ -static void comex_create_group_and_igroup( - comex_group_t *id, comex_igroup_t **igroup) -{ - comex_igroup_t *new_group_list_item = NULL; - comex_igroup_t *last_group_list_item = NULL; - - /* find the last group in the group linked list */ - last_group_list_item = group_list; - while (last_group_list_item->next != NULL) { - last_group_list_item = last_group_list_item->next; - } - - /* create, init, and insert the new node for the linked list */ - new_group_list_item = malloc(sizeof(comex_igroup_t)); - new_group_list_item->id = last_group_list_item->id + 1; - new_group_list_item->comm = MPI_COMM_NULL; - new_group_list_item->group = MPI_GROUP_NULL; - new_group_list_item->next = NULL; - last_group_list_item->next = new_group_list_item; - - /* return the group id and comex igroup */ - *igroup = new_group_list_item; - *id = new_group_list_item->id; -} - - -int comex_group_rank(comex_group_t group, int *rank) -{ - int status; - - comex_igroup_t *igroup = comex_get_igroup_from_group(group); - status = MPI_Group_rank(igroup->group, rank); - if (status != MPI_SUCCESS) { - comex_error("MPI_Group_rank: Failed ", status); - } - - return COMEX_SUCCESS; -} - - -int comex_group_size(comex_group_t group, int *size) -{ - int status; - - comex_igroup_t *igroup = comex_get_igroup_from_group(group); - status = MPI_Group_size(igroup->group, size); - if (status != MPI_SUCCESS) { - comex_error("MPI_Group_size: Failed ", status); - } - - return COMEX_SUCCESS; -} - - -int comex_group_comm(comex_group_t group, MPI_Comm *comm) -{ - int status; - - comex_igroup_t *igroup = comex_get_igroup_from_group(group); - *comm = igroup->comm; - - return COMEX_SUCCESS; -} - - -int comex_group_translate_world(comex_group_t group, int group_rank, int *world_rank) -{ - if (COMEX_GROUP_WORLD == group) { - *world_rank = group_rank; - } - else { - comex_igroup_t *igroup = comex_get_igroup_from_group(group); - comex_igroup_t *world_igroup = comex_get_igroup_from_group(COMEX_GROUP_WORLD); - int status = MPI_Group_translate_ranks( - igroup->group, 1, &group_rank, world_igroup->group, world_rank); - if (status != MPI_SUCCESS) { - comex_error("MPI_Group_translate_ranks: Failed ", status); - } - } - - return COMEX_SUCCESS; -} - - -/** - * Destroys the given comex igroup. - */ -static void comex_igroup_finalize(comex_igroup_t *igroup) -{ - int status; - - assert(igroup); - - if (igroup->group != MPI_GROUP_NULL) { - status = MPI_Group_free(&igroup->group); - if (status != MPI_SUCCESS) { - comex_error("MPI_Group_free: Failed ", status); - } - } - - if (igroup->comm != MPI_COMM_NULL) { - status = MPI_Comm_free(&igroup->comm); - if (status != MPI_SUCCESS) { - comex_error("MPI_Comm_free: Failed ", status); - } - } -} - - -int comex_group_free(comex_group_t id) -{ - comex_igroup_t *current_group_list_item = group_list; - comex_igroup_t *previous_group_list_item = NULL; - - /* find the group to free */ - while (current_group_list_item != NULL) { - if (current_group_list_item->id == id) { - break; - } - previous_group_list_item = current_group_list_item; - current_group_list_item = current_group_list_item->next; - } - /* make sure we found a group */ - assert(current_group_list_item != NULL); - /* remove the group from the linked list */ - if (previous_group_list_item != NULL) { - previous_group_list_item->next = current_group_list_item->next; - } - /* free the group */ - comex_igroup_finalize(current_group_list_item); - free(current_group_list_item); - - return COMEX_SUCCESS; -} - - -int comex_group_create( - int n, int *pid_list, comex_group_t id_parent, comex_group_t *id_child) -{ - int status; - int grp_me; - comex_igroup_t *igroup_child = NULL; - MPI_Group *group_child = NULL; - MPI_Comm *comm_child = NULL; - comex_igroup_t *igroup_parent = NULL; - MPI_Group *group_parent = NULL; - MPI_Comm *comm_parent = NULL; - - /* create the node in the linked list of groups and */ - /* get the child's MPI_Group and MPI_Comm, to be populated shortly */ - comex_create_group_and_igroup(id_child, &igroup_child); - group_child = &(igroup_child->group); - comm_child = &(igroup_child->comm); - - /* get the parent's MPI_Group and MPI_Comm */ - igroup_parent = comex_get_igroup_from_group(id_parent); - group_parent = &(igroup_parent->group); - comm_parent = &(igroup_parent->comm); - - status = MPI_Group_incl(*group_parent, n, pid_list, group_child); - if (status != MPI_SUCCESS) { - comex_error("MPI_Group_incl: Failed ", status); - } - - { - MPI_Comm comm, comm1, comm2; - int lvl=1, local_ldr_pos; - MPI_Group_rank(*group_child, &grp_me); - if (grp_me == MPI_UNDEFINED) { - /* FIXME: keeping the group around for now */ - return COMEX_SUCCESS; - } - /* SK: sanity check for the following bitwise operations */ - assert(grp_me>=0); - MPI_Comm_dup(MPI_COMM_SELF, &comm); /* FIXME: can be optimized away */ - local_ldr_pos = grp_me; - while(n>lvl) { - int tag=0; - int remote_ldr_pos = local_ldr_pos^lvl; - if (remote_ldr_pos < n) { - int remote_leader = pid_list[remote_ldr_pos]; - MPI_Comm peer_comm = *comm_parent; - int high = (local_ldr_posid = COMEX_GROUP_WORLD; - group_list->next = NULL; - - /* save MPI world group and communicatior in COMEX_GROUP_WORLD */ - group_list->comm = l_state.world_comm; - MPI_Comm_group(group_list->comm, &(group_list->group)); -} - - -void comex_group_finalize() -{ - comex_igroup_t *current_group_list_item = group_list; - comex_igroup_t *previous_group_list_item = NULL; - - /* don't free the world group (the list head) */ - current_group_list_item = current_group_list_item->next; - - while (current_group_list_item != NULL) { - previous_group_list_item = current_group_list_item; - current_group_list_item = current_group_list_item->next; - comex_igroup_finalize(previous_group_list_item); - free(previous_group_list_item); - } - - /* ok, now free the world group, but not the world comm */ - MPI_Group_free(&(group_list->group)); - free(group_list); - group_list = NULL; -} diff --git a/comex/src-dmapp/groups.h b/comex/src-dmapp/groups.h deleted file mode 100644 index df2cc8e60..000000000 --- a/comex/src-dmapp/groups.h +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Private header file for comex groups backed by MPI_comm. - * - * The rest of the comex group functions are defined in the public comex.h. - * - * @author Jeff Daily - */ -#ifndef _COMEX_GROUPS_H_ -#define _COMEX_GROUPS_H_ - -#include - -#include "comex.h" - -typedef struct group_link { - struct group_link *next; - comex_group_t id; - MPI_Comm comm; - MPI_Group group; -} comex_igroup_t; - -extern void comex_group_init(MPI_Comm comm); -extern void comex_group_finalize(); -extern comex_igroup_t* comex_get_igroup_from_group(comex_group_t group); - -#endif /* _COMEX_GROUPS_H_ */ diff --git a/comex/src-dmapp/reg_cache.c b/comex/src-dmapp/reg_cache.c deleted file mode 100644 index 30efded99..000000000 --- a/comex/src-dmapp/reg_cache.c +++ /dev/null @@ -1,774 +0,0 @@ -/** - * Registration cache. - * - * Defensive programming via copious assert statements is encouraged. - */ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -/* C headers */ -#include -#include -#include -#include - -/* 3rd party headers */ -#include -#include - -/* our headers */ -#include "comex.h" -#include "comex_impl.h" -#include "reg_cache.h" - - -/** - * A registered dmapp segment. - * - * dmapp segment registrations *can* return the same segment for instance if - * the user is allocating many small buffers which are smaller than the page - * size used by dmapp. We only keep track of local dmapp memory registrations. - */ -typedef struct _dmapp_entry_t { - dmapp_seg_desc_t mr; /**< dmapp registered memory region */ - int count; /**< ref count */ - struct _dmapp_entry_t *next; /**< next memory region in list */ -} dmapp_entry_t; - - -/* the static members in this module */ -static reg_entry_t **reg_cache = NULL; /**< list of caches (one per process) */ -static int reg_nprocs = 0; /**< number of caches (one per process) */ -static dmapp_entry_t *dmapp_cache = NULL; /**< list of cached dmapp segments */ - - -/* the static functions in this module */ -static dmapp_entry_t *dmapp_cache_find(dmapp_seg_desc_t mr); -static dmapp_entry_t *dmapp_cache_find_intersection(dmapp_seg_desc_t mr); -static dmapp_entry_t *dmapp_cache_insert(dmapp_seg_desc_t mr); -static reg_return_t dmapp_cache_delete(dmapp_seg_desc_t mr); -static reg_return_t seg_cmp(void *reg_addr, size_t reg_len, - void *oth_addr, size_t oth_len, int op); -static reg_return_t seg_intersects(void *reg_addr, size_t reg_len, - void *oth_addr, size_t oth_len); -static reg_return_t seg_contains(void *reg_addr, size_t reg_len, - void *oth_addr, size_t oth_len); -static reg_return_t reg_entry_intersects(reg_entry_t *reg_entry, - void *buf, size_t len); -static reg_return_t reg_entry_contains(reg_entry_t *reg_entry, - void *buf, size_t len); -static reg_return_t dmapp_seg_intersects(dmapp_seg_desc_t first, - dmapp_seg_desc_t second); -static reg_return_t dmapp_seg_contains(dmapp_seg_desc_t first, - dmapp_seg_desc_t second); - - -#define TEST_FOR_INTERSECTION 0 -#define TEST_FOR_CONTAINMENT 1 - - -/** - * Detects whether two memory segments intersect or one contains the other. - * - * @param[in] reg_addr starting address of original segment - * @param[in] reg_len length of original segment - * @param[in] oth_addr starting address of other segment - * @param[in] oth_len length of other segment - * @param[in] op op to perform, either TEST_FOR_INTERSECTION or - * TEST_FOR_CONTAINMENT - * - * @pre NULL != reg_beg - * @pre NULL != oth_beg - * - * @return RR_SUCCESS on success - */ -static reg_return_t -seg_cmp(void *reg_addr, size_t reg_len, void *oth_addr, size_t oth_len, int op) -{ - ptrdiff_t reg_beg = 0; - ptrdiff_t reg_end = 0; - ptrdiff_t oth_beg = 0; - ptrdiff_t oth_end = 0; - int result = 0; - - /* preconditions */ - assert(NULL != reg_addr); - assert(NULL != oth_addr); - - /* casts to ptrdiff_t since arithmetic on void* is undefined */ - reg_beg = (ptrdiff_t)(reg_addr); - reg_end = reg_beg + (ptrdiff_t)(reg_len) - 1; - oth_beg = (ptrdiff_t)(oth_addr); - oth_end = oth_beg + (ptrdiff_t)(oth_len) - 1; - - switch (op) { - case TEST_FOR_INTERSECTION: - result = reg_end >= oth_beg && oth_end >= reg_beg; - break; - case TEST_FOR_CONTAINMENT: - result = reg_beg <= oth_beg && oth_end <= reg_end; - break; - default: - assert(0); - } - - if (result) { - return RR_SUCCESS; - } - else { - return RR_FAILURE; - } -} - - -/** - * Detects whether two memory segments intersect. - * - * @param[in] reg_addr starting address of original segment - * @param[in] reg_len length of original segment - * @param[in] oth_addr starting address of other segment - * @param[in] oth_len length of other segment - * - * @pre NULL != reg_beg - * @pre NULL != oth_beg - * - * @return RR_SUCCESS on success - */ -static reg_return_t -seg_intersects(void *reg_addr, size_t reg_len, void *oth_addr, size_t oth_len) -{ - /* preconditions */ - assert(NULL != reg_addr); - assert(NULL != oth_addr); - - return seg_cmp( - reg_addr, reg_len, - oth_addr, oth_len, - TEST_FOR_INTERSECTION); -} - - -/** - * Detects whether the first memory segment contains the other. - * - * @param[in] reg_addr starting address of original segment - * @param[in] reg_len length of original segment - * @param[in] oth_addr starting address of other segment - * @param[in] oth_len length of other segment - * - * @pre NULL != reg_beg - * @pre NULL != oth_beg - * - * @return RR_SUCCESS on success - */ -static reg_return_t -seg_contains(void *reg_addr, size_t reg_len, void *oth_addr, size_t oth_len) -{ - /* preconditions */ - assert(NULL != reg_addr); - assert(NULL != oth_addr); - - return seg_cmp( - reg_addr, reg_len, - oth_addr, oth_len, - TEST_FOR_CONTAINMENT); -} - - -/** - * Detects whether two memory segments intersect. - * - * @param[in] reg_entry the registration entry - * @param[in] buf starting address for the contiguous memory region - * @param[in] len length of the contiguous memory region - * - * @pre NULL != reg_entry - * @pre NULL != buf - * @pre len >= 0 - * - * @return RR_SUCCESS on success - */ -static reg_return_t -reg_entry_intersects(reg_entry_t *reg_entry, void *buf, size_t len) -{ - /* preconditions */ - assert(NULL != reg_entry); - assert(NULL != buf); - assert(len >= 0); - - return seg_intersects( - reg_entry->buf, reg_entry->len, - buf, len); -} - - -/** - * Detects whether the first memory segment contains the other. - * - * @param[in] reg_entry the registration entry - * @param[in] buf starting address for the contiguous memory region - * @param[in] len length of the contiguous memory region - * - * @pre NULL != reg_entry - * @pre NULL != buf - * @pre len >= 0 - * - * @return RR_SUCCESS on success - */ -static reg_return_t -reg_entry_contains(reg_entry_t *reg_entry, void *buf, size_t len) -{ - /* preconditions */ - assert(NULL != reg_entry); - assert(NULL != buf); - assert(len >= 0); - - return seg_contains( - reg_entry->buf, reg_entry->len, - buf, len); -} - - -/** - * Detects whether two dmapp segments intersect. - * - * @param[in] first the original registration entry - * @param[in] second segment to test against - * - * @return RR_SUCCESS on success - */ -static reg_return_t -dmapp_seg_intersects(dmapp_seg_desc_t first, dmapp_seg_desc_t second) -{ - return seg_intersects(first.addr, first.len, second.addr, second.len); -} - - -/** - * Detects whether the first dmapp segment contains the other. - * - * @param[in] first the original registration entry - * @param[in] second segment to test against - * - * @return RR_SUCCESS on success - */ -static reg_return_t -dmapp_seg_contains(dmapp_seg_desc_t first, dmapp_seg_desc_t second) -{ - return seg_contains(first.addr, first.len, second.addr, second.len); -} - - -/** - * Remove registration cache entry without deregistration. - * - * @param[in] rank the rank where the entry came from - * @param[in] reg_entry the entry - * - * @pre NULL != reg_entry - * @pre 0 <= rank && rank < reg_nprocs - * - * @return RR_SUCCESS on success - */ -static reg_return_t -reg_entry_destroy(int rank, reg_entry_t *reg_entry) -{ - /* preconditions */ - assert(NULL != reg_entry); - assert(0 <= rank && rank < reg_nprocs); - - if (l_state.rank == rank) { - dmapp_cache_delete(reg_entry->mr); - } - - /* free cache entry */ - free(reg_entry); - - return RR_SUCCESS; -} - - -/** - * Create internal data structures for the registration cache. - * - * @param[in] nprocs number of registration caches to create i.e. one per - * process - * - * @pre this function is called once to initialize the internal data - * structures and cannot be called again until reg_cache_destroy() has been - * called - * - * @see reg_cache_destroy() - * - * @return RR_SUCCESS on success - */ -reg_return_t -reg_cache_init(int nprocs) -{ - int i = 0; - - /* preconditions */ - assert(NULL == reg_cache); - assert(0 == reg_nprocs); - assert(NULL == dmapp_cache); - - /* keep the number of caches around for later use */ - reg_nprocs = nprocs; - - /* allocate the registration cache list: */ - reg_cache = (reg_entry_t **)malloc( - sizeof(reg_entry_t *) * reg_nprocs); - assert(reg_cache); - - /* initialize the registration cache list: */ - for (i = 0; i < reg_nprocs; ++i) { - reg_cache[i] = NULL; - } - - return RR_SUCCESS; -} - - -/** - * Deregister and destroy all cache entries and associated buffers. - * - * @pre this function is called once to destroy the internal data structures - * and cannot be called again until reg_cache_init() has been called - * - * @see reg_cache_init() - * - * @return RR_SUCCESS on success - */ -reg_return_t -reg_cache_destroy() -{ - int i = 0; - - /* preconditions */ - assert(NULL != reg_cache); - assert(0 != reg_nprocs); - - for (i = 0; i < reg_nprocs; ++i) { - reg_entry_t *runner = reg_cache[i]; - - while (runner) { - reg_entry_t *previous = runner; /* pointer to previous runner */ - - /* get next runner */ - runner = runner->next; - /* destroy the entry */ - reg_entry_destroy(i, previous); - } - } - - /* free registration cache list */ - free(reg_cache); - reg_cache = NULL; - - /* reset the number of caches */ - reg_nprocs = 0; - - /* by the time all entries are destroyed, dmapp cache should be empty */ - assert(NULL == dmapp_cache); - - return RR_SUCCESS; -} - - -/** - * Locate a registration cache entry which contains the given segment - * completely. - * - * @param[in] rank rank of the process - * @param[in] buf starting address of the buffer - * @parma[in] len length of the buffer - * - * @pre 0 <= rank && rank < reg_nprocs - * @pre reg_cache_init() was previously called - * - * @return the reg cache entry, or NULL on failure - */ -reg_entry_t* -reg_cache_find(int rank, void *buf, size_t len) -{ - reg_entry_t *entry = NULL; - reg_entry_t *runner = NULL; - - /* preconditions */ - assert(NULL != reg_cache); - assert(0 <= rank && rank < reg_nprocs); - - runner = reg_cache[rank]; - - while (runner && NULL == entry) { - if (RR_SUCCESS == reg_entry_contains(runner, buf, len)) { - entry = runner; - } - runner = runner->next; - } - -#ifndef NDEBUG - /* we assert that the found entry was unique */ - while (runner) { - if (RR_SUCCESS == reg_entry_contains(runner, buf, len)) { - assert(0); - } - runner = runner->next; - } -#endif - - return entry; -} - - -/** - * Locate a dmapp segment which contains the given segment completely. - * - * @param[in] mr the dmapp segment - * - * @return the reg cache entry, or NULL on failure - */ -static dmapp_entry_t* -dmapp_cache_find(dmapp_seg_desc_t mr) -{ - dmapp_entry_t *entry = NULL; - dmapp_entry_t *runner = NULL; - - runner = dmapp_cache; - - while (runner && NULL == entry) { - if (RR_SUCCESS == dmapp_seg_contains(runner->mr, mr)) { - entry = runner; - } - runner = runner->next; - } - - /* we assert that the found entry was unique */ - while (runner) { - if (RR_SUCCESS == dmapp_seg_contains(runner->mr, mr)) { - assert(0); - } - runner = runner->next; - } - - return entry; -} - - -/** - * Locate a registration cache entry which intersects the given segment. - * - * @param[in] rank rank of the process - * @param[in] buf starting address of the buffer - * @parma[in] len length of the buffer - * - * @pre 0 <= rank && rank < reg_nprocs - * @pre reg_cache_init() was previously called - * - * @return the reg cache entry, or NULL on failure - */ -reg_entry_t* -reg_cache_find_intersection(int rank, void *buf, size_t len) -{ - reg_entry_t *entry = NULL; - reg_entry_t *runner = NULL; - - /* preconditions */ - assert(NULL != reg_cache); - assert(0 <= rank && rank < reg_nprocs); - - runner = reg_cache[rank]; - - while (runner && NULL == entry) { - if (RR_SUCCESS == reg_entry_intersects(runner, buf, len)) { - entry = runner; - } - runner = runner->next; - } - - /* we assert that the found entry was unique */ - while (runner) { - if (RR_SUCCESS == reg_entry_contains(runner, buf, len)) { - assert(0); - } - runner = runner->next; - } - - return entry; -} - - -/** - * Locate a dmapp segment which intersects the given segment. - * - * @param[in] mr the dmapp segment - * - * @return the reg cache entry, or NULL on failure - */ -static dmapp_entry_t* -dmapp_cache_find_intersection(dmapp_seg_desc_t mr) -{ - dmapp_entry_t *entry = NULL; - dmapp_entry_t *runner = NULL; - - runner = dmapp_cache; - - while (runner && NULL == entry) { - if (RR_SUCCESS == dmapp_seg_intersects(runner->mr, mr)) { - entry = runner; - } - runner = runner->next; - } - - /* we assert that the found entry was unique */ - while (runner) { - if (RR_SUCCESS == dmapp_seg_intersects(runner->mr, mr)) { - assert(0); - } - runner = runner->next; - } - - return entry; -} - - -/** - * Create a new registration entry based on the given members. - * - * @pre 0 <= rank && rank < reg_nprocs - * @pre NULL != buf - * @pre 0 <= len - * @pre reg_cache_init() was previously called - * @pre NULL == reg_cache_find(rank, buf, 0) - * @pre NULL == reg_cache_find_intersection(rank, buf, 0) - * - * @return RR_SUCCESS on success - */ -reg_entry_t* -reg_cache_insert(int rank, void *buf, size_t len, dmapp_seg_desc_t mr) -{ - reg_entry_t *node = NULL; - - /* preconditions */ - assert(NULL != reg_cache); - assert(0 <= rank && rank < reg_nprocs); - assert(NULL != buf); - assert(len >= 0); - assert(NULL == reg_cache_find(rank, buf, len)); - assert(NULL == reg_cache_find_intersection(rank, buf, len)); - - if (rank == l_state.rank) { - dmapp_cache_insert(mr); - } - - /* allocate the new entry */ - node = (reg_entry_t *)malloc(sizeof(reg_entry_t)); - assert(node); - - /* initialize the new entry */ - node->buf = buf; - node->len = len; - node->mr = mr; - node->next = NULL; - - /* push new entry to tail of linked list */ - if (NULL == reg_cache[rank]) { - reg_cache[rank] = node; - } - else { - reg_entry_t *runner = reg_cache[rank]; - while (runner->next) { - runner = runner->next; - } - runner->next = node; - } - - return RR_SUCCESS; -} - - -/** - * Removes the reg cache entry associated with the given rank and buffer. - * - * If this process owns the buffer, it will unregister the buffer, as well. - * - * @param[in] rank - * @param[in] buf - * - * @pre 0 <= rank && rank < reg_nprocs - * @pre NULL != buf - * @pre reg_cache_init() was previously called - * @pre NULL != reg_cache_find(rank, buf, 0) - * - * @return RR_SUCCESS on success - * RR_FAILURE otherwise - */ -reg_return_t -reg_cache_delete(int rank, void *buf) -{ - reg_return_t status = RR_FAILURE; - reg_entry_t *runner = NULL; - reg_entry_t *previous_runner = NULL; - - /* preconditions */ - assert(NULL != reg_cache); - assert(0 <= rank && rank < reg_nprocs); - assert(NULL != buf); - assert(NULL != reg_cache_find(rank, buf, 0)); - - /* this is more restrictive than reg_cache_find() in that we locate - * exactlty the same region starting address */ - runner = reg_cache[rank]; - while (runner) { - if (runner->buf == buf) { - break; - } - previous_runner = runner; - runner = runner->next; - } - /* we should have found an entry */ - if (NULL == runner) { - assert(0); - return RR_FAILURE; - } - - /* pop the entry out of the linked list */ - if (previous_runner) { - previous_runner->next = runner->next; - } - else { - reg_cache[rank] = reg_cache[rank]->next; - } - - status = reg_entry_destroy(rank, runner); - - return status; -} - - -/** - * Increments the ref count of an existing dmapp segement or inserts a new - * entry into the list. - * - * @param[in] mr the dmapp segment - * - * @return - */ -static dmapp_entry_t *dmapp_cache_insert(dmapp_seg_desc_t mr) -{ - dmapp_entry_t *runner = NULL; - dmapp_entry_t *previous_runner = NULL; - - /* this is more restrictive than dmapp_cache_find() in that we locate - * exactlty the same region starting address */ - runner = dmapp_cache; - while (runner) { - if (runner->mr.addr == mr.addr) { - break; - } - previous_runner = runner; - runner = runner->next; - } - - if (runner) { - /* make sure it's an exact match */ - assert(runner->mr.len == mr.len); - /* increment ref count */ - ++(runner->count); -#if DEBUG - printf("[%d] incrementing ref count of (%p,%zu) to %d\n", - l_state.rank, runner->mr.addr, runner->mr.len, runner->count); -#endif - } - else { - runner = malloc(sizeof(dmapp_entry_t)); - runner->mr = mr; - runner->count = 1; - runner->next = NULL; - if (previous_runner) { - previous_runner->next = runner; - } - else { - dmapp_cache = runner; - } -#if DEBUG - printf("[%d] inserting (%p,%zu)\n", - l_state.rank, runner->mr.addr, runner->mr.len); -#endif - } - - return runner; -} - - -/** - * Decrements the ref count and possibly removes the dmapp cache entry - * associated with the given dmapp segment. - * - * @param[in] mr the dmapp segment to possibly remove - * - * @pre NULL != dmapp_cache_find(mr) - * - * @return RR_SUCCESS on success - * RR_FAILURE otherwise - */ -reg_return_t -dmapp_cache_delete(dmapp_seg_desc_t mr) -{ - reg_return_t status = RR_FAILURE; - dmapp_entry_t *runner = NULL; - dmapp_entry_t *previous_runner = NULL; - - /* preconditions */ - assert(NULL != dmapp_cache); - if (NULL == dmapp_cache_find(mr)) { - printf("[%d] dmapp_cache_find(mr) failed, mr=(%p,%zu)\n", - l_state.rank, mr.addr, mr.len); - assert(0); - } - - /* this is more restrictive than dmapp_cache_find() in that we locate - * exactlty the same region starting address */ - runner = dmapp_cache; - while (runner) { - if (runner->mr.addr == mr.addr) { - break; - } - previous_runner = runner; - runner = runner->next; - } - /* we should have found an entry */ - if (NULL == runner) { - assert(0); - return RR_FAILURE; - } - - /* decrement ref count */ - --(runner->count); - assert(runner->count >= 0); -#if DEBUG - printf("[%d] decrementing ref count of (%p,%zu) to %d\n", - l_state.rank, runner->mr.addr, runner->mr.len, runner->count); -#endif - - if (0 == runner->count) { - /* pop the entry out of the linked list */ - if (previous_runner) { - previous_runner->next = runner->next; - } - else { - dmapp_cache = dmapp_cache->next; - } -#if DEBUG - printf("[%d] removing (%p,%zu)\n", - l_state.rank, runner->mr.addr, runner->mr.len); -#endif - dmapp_mem_unregister(&(runner->mr)); - - free(runner); - } - - return status; -} - diff --git a/comex/src-dmapp/reg_cache.h b/comex/src-dmapp/reg_cache.h deleted file mode 100644 index d945634c4..000000000 --- a/comex/src-dmapp/reg_cache.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef _REG_CACHE_H_ -#define _REG_CACHE_H_ - -#include - -/** - * Enumerate the return codes for registration cache functions. - */ -typedef enum _reg_return_t { - RR_SUCCESS=0, /**< success */ - RR_FAILURE /**< non-specific failure */ -} reg_return_t; - -/** - * A registered contiguous memory region. - */ -typedef struct _reg_entry_t { - void *buf; /**< starting address of region */ - size_t len; /**< length of region */ - dmapp_seg_desc_t mr; /**< dmapp registered memory region */ - struct _reg_entry_t *next; /**< next memory region in list */ -} reg_entry_t; - -/* functions - * - * documentation is in the *.c file - */ - -reg_return_t reg_cache_init(int nprocs); -reg_return_t reg_cache_destroy(); -reg_entry_t *reg_cache_find(int rank, void *buf, size_t len); -reg_entry_t *reg_cache_insert(int rank, void *buf, size_t len, dmapp_seg_desc_t mr); -reg_return_t reg_cache_delete(int, void *buf); - -#endif /* _REG_CACHE_H_ */ diff --git a/comex/src-mpi-mt/comex.c b/comex/src-mpi-mt/comex.c index b58da872d..03816c600 100644 --- a/comex/src-mpi-mt/comex.c +++ b/comex/src-mpi-mt/comex.c @@ -77,7 +77,7 @@ typedef struct { typedef struct lock_link { struct lock_link *next; int rank; -} lock_t; +} comex_lock_t; typedef struct { @@ -112,7 +112,7 @@ typedef struct { /* static state */ static int *num_mutexes = NULL; /**< (all) how many mutexes on each process */ static int **mutexes = NULL; /**< (masters) value is rank of lock holder */ -static lock_t ***lq_heads = NULL; /**< array of lock queues */ +static comex_lock_t ***lq_heads = NULL; /**< array of lock queues */ static int initialized = 0; /* for comex_initialized(), 0=false */ static char *fence_array = NULL; static pthread_t progress_thread; @@ -317,7 +317,7 @@ int _comex_init(MPI_Comm comm) COMEX_ASSERT(mutexes); /* create one lock queue for each proc for each mutex */ - lq_heads = (lock_t***)malloc(sizeof(lock_t**) * g_state.size); + lq_heads = (comex_lock_t***)malloc(sizeof(comex_lock_t**) * g_state.size); COMEX_ASSERT(lq_heads); /* start the server (each rank does this) */ @@ -2372,7 +2372,7 @@ STATIC void _mutex_create_handler(header_t *header, int proc) #endif mutexes[proc] = (int*)malloc(sizeof(int) * num); - lq_heads[proc] = (lock_t**)malloc(sizeof(lock_t*) * num); + lq_heads[proc] = (comex_lock_t**)malloc(sizeof(comex_lock_t*) * num); for (i=0; inext = NULL; lock->rank = proc; if (lq_heads[rank][id]) { /* insert at tail */ - lock_t *lq = lq_heads[rank][id]; + comex_lock_t *lq = lq_heads[rank][id]; while (lq->next) { lq = lq->next; } @@ -2460,7 +2460,7 @@ STATIC void _unlock_handler(header_t *header, int proc) if (lq_heads[rank][id]) { /* a lock requester was queued */ /* find the next lock request and update queue */ - lock_t *lock = lq_heads[rank][id]; + comex_lock_t *lock = lq_heads[rank][id]; lq_heads[rank][id] = lq_heads[rank][id]->next; /* update lock */ mutexes[rank][id] = lock->rank; diff --git a/comex/src-mpi-mt/groups.c b/comex/src-mpi-mt/groups.c index 5fe2d17c6..aafe6ee16 100644 --- a/comex/src-mpi-mt/groups.c +++ b/comex/src-mpi-mt/groups.c @@ -9,16 +9,6 @@ #include -#if defined(__bgp__) -#include -#include -#include -#elif defined(__bgq__) -# include -#elif defined(__CRAYXT) || defined(__CRAYXE) -# include -#endif - #include "comex.h" #include "comex_impl.h" #include "groups.h" @@ -472,46 +462,6 @@ void comex_group_finalize() static long xgethostid() { -#if defined(__bgp__) -#warning BGP - long nodeid; - int matched,midplane,nodecard,computecard; - char rack_row,rack_col; - char location[128]; - char location_clean[128]; - (void) memset(location, '\0', 128); - (void) memset(location_clean, '\0', 128); - _BGP_Personality_t personality; - Kernel_GetPersonality(&personality, sizeof(personality)); - BGP_Personality_getLocationString(&personality, location); - matched = sscanf(location, "R%c%c-M%1d-N%2d-J%2d", - &rack_row, &rack_col, &midplane, &nodecard, &computecard); - COMEX_ASSERT(matched == 5); - sprintf(location_clean, "%2d%02d%1d%02d%02d", - (int)rack_row, (int)rack_col, midplane, nodecard, computecard); - nodeid = atol(location_clean); -#elif defined(__bgq__) -#warning BGQ - int nodeid; - MPIX_Hardware_t hw; - MPIX_Hardware(&hw); - - nodeid = hw.Coords[0] * hw.Size[1] * hw.Size[2] * hw.Size[3] * hw.Size[4] - + hw.Coords[1] * hw.Size[2] * hw.Size[3] * hw.Size[4] - + hw.Coords[2] * hw.Size[3] * hw.Size[4] - + hw.Coords[3] * hw.Size[4] - + hw.Coords[4]; -#elif defined(__CRAYXT) || defined(__CRAYXE) -#warning CRAY - int nodeid; -# if defined(__CRAYXT) - PMI_Portals_get_nid(g_state.rank, &nodeid); -# elif defined(__CRAYXE) - PMI_Get_nid(g_state.rank, &nodeid); -# endif -#else long nodeid = gethostid(); -#endif - return nodeid; } diff --git a/comex/src-mpi-mt/groups.h b/comex/src-mpi-mt/groups.h index cc50a687e..9cb067368 100644 --- a/comex/src-mpi-mt/groups.h +++ b/comex/src-mpi-mt/groups.h @@ -44,8 +44,9 @@ extern comex_igroup_t* comex_get_igroup_from_group(comex_group_t group); /* verify that proc is part of group */ #define CHECK_GROUP(GROUP,PROC) do { \ int size; \ + int ierr = comex_group_size(GROUP,&size); \ COMEX_ASSERT(GROUP >= 0); \ - COMEX_ASSERT(COMEX_SUCCESS == comex_group_size(GROUP,&size)); \ + COMEX_ASSERT(COMEX_SUCCESS == ierr); \ COMEX_ASSERT(PROC >= 0); \ COMEX_ASSERT(PROC < size); \ } while(0) diff --git a/comex/src-mpi-pr/comex.c b/comex/src-mpi-pr/comex.c index 633eef364..f485798a4 100644 --- a/comex/src-mpi-pr/comex.c +++ b/comex/src-mpi-pr/comex.c @@ -11,6 +11,9 @@ #include #include #include +#if HAVE_ERRNO_H +# include +#endif #include #include #include @@ -62,6 +65,10 @@ sicm_device_list nill; #define STR(x) XSTR(x) #define MIN(a, b) (((b) < (a)) ? (b) : (a)) +#ifndef HOST_NAME_MAX +#define HOST_NAME_MAX 256 +#endif + /* data structures */ typedef enum { @@ -118,7 +125,7 @@ typedef struct { typedef struct lock_link { struct lock_link *next; int rank; -} lock_t; +} comex_lock_t; typedef struct { @@ -160,7 +167,7 @@ typedef struct { /* static state */ static int *num_mutexes = NULL; /**< (all) how many mutexes on each process */ static int **mutexes = NULL; /**< (masters) value is rank of lock holder */ -static lock_t ***lq_heads = NULL; /**< array of lock queues */ +static comex_lock_t ***lq_heads = NULL; /**< array of lock queues */ static char *sem_name = NULL; /* local semaphore name */ static sem_t **semaphores = NULL; /* semaphores for locking within SMP node */ static int initialized = 0; /* for comex_initialized(), 0=false */ @@ -672,7 +679,7 @@ int _comex_init(MPI_Comm comm) mutexes = (int**)malloc(sizeof(int*) * g_state.size); COMEX_ASSERT(mutexes); /* create one lock queue for each proc for each mutex */ - lq_heads = (lock_t***)malloc(sizeof(lock_t**) * g_state.size); + lq_heads = (comex_lock_t***)malloc(sizeof(comex_lock_t**) * g_state.size); COMEX_ASSERT(lq_heads); /* start the server */ _progress_server(); @@ -4512,7 +4519,7 @@ STATIC void _mutex_create_handler(header_t *header, int proc) #endif mutexes[proc] = (int*)malloc(sizeof(int) * num); - lq_heads[proc] = (lock_t**)malloc(sizeof(lock_t*) * num); + lq_heads[proc] = (comex_lock_t**)malloc(sizeof(comex_lock_t*) * num); for (i=0; inext = NULL; lock->rank = proc; if (lq_heads[rank][id]) { /* insert at tail */ - lock_t *lq = lq_heads[rank][id]; + comex_lock_t *lq = lq_heads[rank][id]; while (lq->next) { lq = lq->next; } @@ -4600,7 +4607,7 @@ STATIC void _unlock_handler(header_t *header, int proc) if (lq_heads[rank][id]) { /* a lock requester was queued */ /* find the next lock request and update queue */ - lock_t *lock = lq_heads[rank][id]; + comex_lock_t *lock = lq_heads[rank][id]; lq_heads[rank][id] = lq_heads[rank][id]->next; /* update lock */ mutexes[rank][id] = lock->rank; @@ -7576,9 +7583,11 @@ STATIC void check_devshm(int fd, size_t size){ #endif } if ( newspace > devshm_fs_left ) { - fprintf(stderr, "[%d] /dev/shm fs has size %ld new shm area has size %ld need to increase /dev/shm by %ld Mbytes\n", - g_state.rank, devshm_fs_left/CONVERT_TO_M, newspace/CONVERT_TO_M, (newspace - devshm_fs_left)/CONVERT_TO_M); - perror("check_devshm: /dev/shm out of space"); + char hostname[HOST_NAME_MAX+1]; + gethostname(hostname, HOST_NAME_MAX+1); + fprintf(stderr, "hostname: %s, [%d] /dev/shm fs has size %ld bytes left, new shm area has size %ld need to increase /dev/shm by %ld Mbytes\n", hostname, g_state.rank, devshm_fs_left/CONVERT_TO_M, newspace/CONVERT_TO_M, (newspace - devshm_fs_left)/CONVERT_TO_M); + + perror("check_devshm: /dev/shm out of space"); // _free_semaphore(); comex_error("check_devshm: /dev/shm out of space", -1); diff --git a/comex/src-mpi-pr/comex_impl.h b/comex/src-mpi-pr/comex_impl.h index 673ab5162..a188d6e2c 100644 --- a/comex/src-mpi-pr/comex_impl.h +++ b/comex/src-mpi-pr/comex_impl.h @@ -15,7 +15,7 @@ #define UNLOCKED -1 /* performance or correctness related settings */ -#if defined(__bgq__) || defined(__bgp__) +#if 0 #define ENABLE_UNNAMED_SEM 1 #else #define ENABLE_UNNAMED_SEM 0 diff --git a/comex/src-mpi-pr/groups.c b/comex/src-mpi-pr/groups.c index 5c199fa8c..236422193 100644 --- a/comex/src-mpi-pr/groups.c +++ b/comex/src-mpi-pr/groups.c @@ -10,16 +10,6 @@ #include -#if defined(__bgp__) -#include -#include -#include -#elif defined(__bgq__) -# include -#elif defined(__CRAYXT) || defined(__CRAYXE) -# include -#endif - #include "comex.h" #include "comex_impl.h" #include "groups.h" @@ -623,46 +613,6 @@ void comex_group_finalize() static long xgethostid() { -#if defined(__bgp__) -#warning BGP - long nodeid; - int matched,midplane,nodecard,computecard; - char rack_row,rack_col; - char location[128]; - char location_clean[128]; - (void) memset(location, '\0', 128); - (void) memset(location_clean, '\0', 128); - _BGP_Personality_t personality; - Kernel_GetPersonality(&personality, sizeof(personality)); - BGP_Personality_getLocationString(&personality, location); - matched = sscanf(location, "R%c%c-M%1d-N%2d-J%2d", - &rack_row, &rack_col, &midplane, &nodecard, &computecard); - assert(matched == 5); - sprintf(location_clean, "%2d%02d%1d%02d%02d", - (int)rack_row, (int)rack_col, midplane, nodecard, computecard); - nodeid = atol(location_clean); -#elif defined(__bgq__) -#warning BGQ - int nodeid; - MPIX_Hardware_t hw; - MPIX_Hardware(&hw); - - nodeid = hw.Coords[0] * hw.Size[1] * hw.Size[2] * hw.Size[3] * hw.Size[4] - + hw.Coords[1] * hw.Size[2] * hw.Size[3] * hw.Size[4] - + hw.Coords[2] * hw.Size[3] * hw.Size[4] - + hw.Coords[3] * hw.Size[4] - + hw.Coords[4]; -#elif defined(__CRAYXT) || defined(__CRAYXE) -#warning CRAY - int nodeid; -# if defined(__CRAYXT) - PMI_Portals_get_nid(g_state.rank, &nodeid); -# elif defined(__CRAYXE) - PMI_Get_nid(g_state.rank, &nodeid); -# endif -#else long nodeid = gethostid(); -#endif - return nodeid; } diff --git a/comex/src-mpi-pr/groups.h b/comex/src-mpi-pr/groups.h index 0114807d6..33339d0eb 100644 --- a/comex/src-mpi-pr/groups.h +++ b/comex/src-mpi-pr/groups.h @@ -46,8 +46,9 @@ extern comex_igroup_t* comex_get_igroup_from_group(comex_group_t group); /* verify that proc is part of group */ #define CHECK_GROUP(GROUP,PROC) do { \ int size; \ + int ierr = comex_group_size(GROUP,&size); \ COMEX_ASSERT(GROUP >= 0); \ - COMEX_ASSERT(COMEX_SUCCESS == comex_group_size(GROUP,&size)); \ + COMEX_ASSERT(COMEX_SUCCESS == ierr); \ COMEX_ASSERT(PROC >= 0); \ COMEX_ASSERT(PROC < size); \ } while(0) diff --git a/comex/src-mpi-pt/comex.c b/comex/src-mpi-pt/comex.c index 8143b30cf..a3497d082 100644 --- a/comex/src-mpi-pt/comex.c +++ b/comex/src-mpi-pt/comex.c @@ -6,6 +6,9 @@ #include #include #include +#if HAVE_ERRNO_H +# include +#endif #include #include #include @@ -91,7 +94,7 @@ typedef struct { typedef struct lock_link { struct lock_link *next; int rank; -} lock_t; +} comex_lock_t; typedef struct { @@ -132,7 +135,7 @@ typedef struct { /* static state */ static int *num_mutexes = NULL; /**< (all) how many mutexes on each process */ static int **mutexes = NULL; /**< (masters) value is rank of lock holder */ -static lock_t ***lq_heads = NULL; /**< array of lock queues */ +static comex_lock_t ***lq_heads = NULL; /**< array of lock queues */ static char *sem_name = NULL; /* local semaphore name */ static sem_t **semaphores = NULL; /* semaphores for locking within SMP node */ static int initialized = 0; /* for comex_initialized(), 0=false */ @@ -382,7 +385,7 @@ int _comex_init(MPI_Comm comm) mutexes = (int**)malloc(sizeof(int*) * g_state.size); COMEX_ASSERT(mutexes); /* create one lock queue for each proc for each mutex */ - lq_heads = (lock_t***)malloc(sizeof(lock_t**) * g_state.size); + lq_heads = (comex_lock_t***)malloc(sizeof(comex_lock_t**) * g_state.size); COMEX_ASSERT(lq_heads); /* start the server */ pthread_create(&progress_thread, NULL, _progress_server, NULL); @@ -3306,7 +3309,7 @@ STATIC void _mutex_create_handler(header_t *header, int proc) #endif mutexes[proc] = (int*)malloc(sizeof(int) * num); - lq_heads[proc] = (lock_t**)malloc(sizeof(lock_t*) * num); + lq_heads[proc] = (comex_lock_t**)malloc(sizeof(comex_lock_t*) * num); for (i=0; inext = NULL; lock->rank = proc; if (lq_heads[rank][id]) { /* insert at tail */ - lock_t *lq = lq_heads[rank][id]; + comex_lock_t *lq = lq_heads[rank][id]; while (lq->next) { lq = lq->next; } @@ -3394,7 +3397,7 @@ STATIC void _unlock_handler(header_t *header, int proc) if (lq_heads[rank][id]) { /* a lock requester was queued */ /* find the next lock request and update queue */ - lock_t *lock = lq_heads[rank][id]; + comex_lock_t *lock = lq_heads[rank][id]; lq_heads[rank][id] = lq_heads[rank][id]->next; /* update lock */ mutexes[rank][id] = lock->rank; diff --git a/comex/src-mpi-pt/comex_impl.h b/comex/src-mpi-pt/comex_impl.h index d0b1cc485..3291522b0 100644 --- a/comex/src-mpi-pt/comex_impl.h +++ b/comex/src-mpi-pt/comex_impl.h @@ -15,7 +15,7 @@ #define UNLOCKED -1 /* performance or correctness related settings */ -#if defined(__bgq__) || defined(__bgp__) +#if 0 #define ENABLE_UNNAMED_SEM 1 #else #define ENABLE_UNNAMED_SEM 0 diff --git a/comex/src-mpi-pt/groups.c b/comex/src-mpi-pt/groups.c index 84dfa617c..37bb6ed7f 100644 --- a/comex/src-mpi-pt/groups.c +++ b/comex/src-mpi-pt/groups.c @@ -9,16 +9,6 @@ #include -#if defined(__bgp__) -#include -#include -#include -#elif defined(__bgq__) -# include -#elif defined(__CRAYXT) || defined(__CRAYXE) -# include -#endif - #include "comex.h" #include "comex_impl.h" #include "groups.h" @@ -508,46 +498,6 @@ void comex_group_finalize() static long xgethostid() { -#if defined(__bgp__) -#warning BGP - long nodeid; - int matched,midplane,nodecard,computecard; - char rack_row,rack_col; - char location[128]; - char location_clean[128]; - (void) memset(location, '\0', 128); - (void) memset(location_clean, '\0', 128); - _BGP_Personality_t personality; - Kernel_GetPersonality(&personality, sizeof(personality)); - BGP_Personality_getLocationString(&personality, location); - matched = sscanf(location, "R%c%c-M%1d-N%2d-J%2d", - &rack_row, &rack_col, &midplane, &nodecard, &computecard); - COMEX_ASSERT(matched == 5); - sprintf(location_clean, "%2d%02d%1d%02d%02d", - (int)rack_row, (int)rack_col, midplane, nodecard, computecard); - nodeid = atol(location_clean); -#elif defined(__bgq__) -#warning BGQ - int nodeid; - MPIX_Hardware_t hw; - MPIX_Hardware(&hw); - - nodeid = hw.Coords[0] * hw.Size[1] * hw.Size[2] * hw.Size[3] * hw.Size[4] - + hw.Coords[1] * hw.Size[2] * hw.Size[3] * hw.Size[4] - + hw.Coords[2] * hw.Size[3] * hw.Size[4] - + hw.Coords[3] * hw.Size[4] - + hw.Coords[4]; -#elif defined(__CRAYXT) || defined(__CRAYXE) -#warning CRAY - int nodeid; -# if defined(__CRAYXT) - PMI_Portals_get_nid(g_state.rank, &nodeid); -# elif defined(__CRAYXE) - PMI_Get_nid(g_state.rank, &nodeid); -# endif -#else long nodeid = gethostid(); -#endif - return nodeid; } diff --git a/comex/src-mpi-pt/groups.h b/comex/src-mpi-pt/groups.h index 55a64b01f..563c6906a 100644 --- a/comex/src-mpi-pt/groups.h +++ b/comex/src-mpi-pt/groups.h @@ -45,8 +45,9 @@ extern comex_igroup_t* comex_get_igroup_from_group(comex_group_t group); /* verify that proc is part of group */ #define CHECK_GROUP(GROUP,PROC) do { \ int size; \ + int ierr = comex_group_size(GROUP,&size); \ COMEX_ASSERT(GROUP >= 0); \ - COMEX_ASSERT(COMEX_SUCCESS == comex_group_size(GROUP,&size)); \ + COMEX_ASSERT(COMEX_SUCCESS == ierr); \ COMEX_ASSERT(PROC >= 0); \ COMEX_ASSERT(PROC < size); \ } while(0) diff --git a/comex/src-mpi/comex.c b/comex/src-mpi/comex.c index edcc98110..0239349f8 100644 --- a/comex/src-mpi/comex.c +++ b/comex/src-mpi/comex.c @@ -1909,13 +1909,13 @@ static void _unlock_request_handler(header_t *header, int proc) static void _lq_push(int rank, int id, char *notify) { - lock_t *lock = NULL; + comex_lock_t *lock = NULL; #if DEBUG printf("[%d] _lq_push rank=%d id=%d\n", l_state.rank, rank, id); #endif - lock = _my_malloc(sizeof(lock_t)); + lock = _my_malloc(sizeof(comex_lock_t)); lock->next = NULL; lock->rank = rank; lock->id = id; @@ -1943,9 +1943,9 @@ static void _lq_push(int rank, int id, char *notify) static int _lq_progress(void) { int needs_progress = 0; - lock_t *lock = NULL; - lock_t *new_lock_head = NULL; - lock_t *new_lock_tail = NULL; + comex_lock_t *lock = NULL; + comex_lock_t *new_lock_head = NULL; + comex_lock_t *new_lock_tail = NULL; #if DEBUG if (l_state.num_mutexes > 0) { @@ -1960,7 +1960,7 @@ static int _lq_progress(void) lock = l_state.lq_head; while (lock) { if (l_state.mutexes[lock->id] < 0) { - lock_t *last = NULL; + comex_lock_t *last = NULL; header_t *header = NULL; l_state.mutexes[lock->id] = lock->rank; diff --git a/comex/src-mpi/comex_impl.h b/comex/src-mpi/comex_impl.h index 7500514a2..9406a1167 100644 --- a/comex/src-mpi/comex_impl.h +++ b/comex/src-mpi/comex_impl.h @@ -61,7 +61,7 @@ typedef struct lock_link { int rank; int id; void *notify_address; -} lock_t; +} comex_lock_t; typedef struct { MPI_Comm world_comm; @@ -86,8 +86,8 @@ typedef struct { barrier_t *bq_tail; /* a queue for lock requests */ - lock_t *lq_head; - lock_t *lq_tail; + comex_lock_t *lq_head; + comex_lock_t *lq_tail; } local_state; diff --git a/comex/src-mpi/groups.h b/comex/src-mpi/groups.h index 9065e69da..d00cf0a75 100644 --- a/comex/src-mpi/groups.h +++ b/comex/src-mpi/groups.h @@ -29,7 +29,8 @@ extern comex_igroup_t* comex_get_igroup_from_group(comex_group_t group); * change group to world group */ #define CHECK_GROUP(GROUP,PROC) do { \ int size; \ - assert(COMEX_SUCCESS == comex_group_size(GROUP,&size)); \ + int ierr = comex_group_size(GROUP,&size); \ + assert(COMEX_SUCCESS == ierr); \ assert(PROC >= 0); \ assert(PROC < size); \ if (COMEX_GROUP_WORLD != GROUP) { \ diff --git a/comex/src-ofi/comex.c b/comex/src-ofi/comex.c index 3dc5a4197..556e5c620 100644 --- a/comex/src-ofi/comex.c +++ b/comex/src-ofi/comex.c @@ -49,7 +49,7 @@ do { \ while ((_request)->state != rs_complete) \ { \ - poll(0); \ + myofi_poll(0); \ PAUSE(); \ } \ reset_request((_request)); \ @@ -98,7 +98,7 @@ static uint32_t reply_tag = 0; static fastlock_t mutex_lock; static fastlock_t acc_lock; static fastlock_t poll_lock; -static int poll(int* items_processed); +static int myofi_poll(int* items_processed); static pthread_t tid = 0; static int comex_acc_native( @@ -644,7 +644,7 @@ static void* progress_thread_func(void* __data) { while (!progress_thread_complete) { - poll(0); + myofi_poll(0); PAUSE(); } return 0; @@ -1137,7 +1137,7 @@ do \ } \ } while (0) -static int poll(int* items_processed) +static int myofi_poll(int* items_processed) { ssize_t ret = 0; int locked = 0; @@ -1765,7 +1765,7 @@ int comex_wait_proc(int proc, comex_group_t group) request->proc == proc && request->group == group && request->state == rs_progress) { - COMEX_CHKANDJUMP(poll(0), "failed to poll"); + COMEX_CHKANDJUMP(myofi_poll(0), "failed to poll"); PAUSE(); } } @@ -1778,7 +1778,7 @@ int comex_wait_proc(int proc, comex_group_t group) request_t* request = cache->request + i; while (request->proc == proc && request->state == rs_progress) { - COMEX_CHKANDJUMP(poll(0), "failed to poll"); + COMEX_CHKANDJUMP(myofi_poll(0), "failed to poll"); PAUSE(); } } @@ -1808,7 +1808,7 @@ static inline int wait_request(request_t* request) while (request->state == rs_progress) { - COMEX_CHKANDJUMP(poll(0), "failed to poll"); + COMEX_CHKANDJUMP(myofi_poll(0), "failed to poll"); PAUSE(); } @@ -1856,7 +1856,7 @@ int comex_test(comex_request_t* handle, int* status) /* process all CQ items in queue till request in 'progress' state * or queue is not empty (items_processed is not 0) */ while (request->state == rs_progress && items_processed) - COMEX_CHKANDJUMP(poll(&items_processed), "failed to poll"); + COMEX_CHKANDJUMP(myofi_poll(&items_processed), "failed to poll"); *status = (request->state == rs_progress); @@ -1881,7 +1881,7 @@ int comex_wait_all(comex_group_t group) while (!(request->flags & rf_no_group_wait) && request->group == group && request->state == rs_progress) { - COMEX_CHKANDJUMP(poll(0), "failed to poll"); + COMEX_CHKANDJUMP(myofi_poll(0), "failed to poll"); PAUSE(); } } @@ -1895,7 +1895,7 @@ int comex_wait_all(comex_group_t group) while (!(request->flags & rf_no_group_wait) && request->state == rs_progress) { - COMEX_CHKANDJUMP(poll(0), "failed to poll"); + COMEX_CHKANDJUMP(myofi_poll(0), "failed to poll"); PAUSE(); } } diff --git a/comex/src-ofi/comex_impl.h b/comex/src-ofi/comex_impl.h index 23e266111..d594ca796 100644 --- a/comex/src-ofi/comex_impl.h +++ b/comex/src-ofi/comex_impl.h @@ -155,7 +155,7 @@ if (likely(_ret == 0)) break; \ if (_ret != -FI_EAGAIN) \ OFI_CHKANDJUMP(_ret, __VA_ARGS__); \ - poll(0); \ + myofi_poll(0); \ } while (_ret == -FI_EAGAIN); \ } while (0) diff --git a/comex/testing/test.c b/comex/testing/test.c index 4cdbbf75f..ad5e3582a 100644 --- a/comex/testing/test.c +++ b/comex/testing/test.c @@ -53,11 +53,7 @@ #define MAXPROC 128 #define TIMES 100 -#ifdef CRAY -# define ELEMS 800 -#else # define ELEMS 200 -#endif /***************************** macros ************************/ @@ -121,55 +117,6 @@ static double timer() return tv.tv_sec * 1000000.0 + tv.tv_usec; } - -#ifdef PVM -void pvm_init(int argc, char *argv[]) -{ - int mytid, mygid, ctid[MAXPROC]; - int np, i; - - mytid = pvm_mytid(); - if ((argc != 2) && (argc != 1)) { - goto usage; - } - if (argc == 1) { - np = 1; - } - if (argc == 2) { - if ((np = atoi(argv[1])) < 1) { - goto usage; - } - } - if (np > MAXPROC) { - goto usage; - } - - mygid = pvm_joingroup(MPGROUP); - - if (np > 1) { - if (mygid == 0) { - i = pvm_spawn(argv[0], argv + 1, 0, "", np - 1, ctid); - } - } - - while (pvm_gsize(MPGROUP) < np) { - sleep(1); - } - - /* sync */ - pvm_barrier(MPGROUP, np); - - printf("PVM initialization done!\n"); - - return; - -usage: - fprintf(stderr, "usage: %s \n", argv[0]); - pvm_exit(); - exit(-1); -} -#endif - /*\ generate random range for a section of multidimensional array \*/ void get_range(int ndim, int dims[], int lo[], int hi[]) diff --git a/configure.ac b/configure.ac index 58049ebbd..7ab91f6d1 100644 --- a/configure.ac +++ b/configure.ac @@ -300,7 +300,6 @@ AC_MSG_NOTICE AC_MSG_NOTICE([Assembler]) AC_MSG_NOTICE -GA_AS AM_PROG_AS ######################################### @@ -529,7 +528,6 @@ ARMCI_SETUP # Set up MA. MA_ENABLE_CUDA_MEM -MA_ENABLE_ARMCI_MEM_OPTION MA_LONG_DOUBLE_TYPEDEF MA_STATS MA_VERIFY @@ -577,8 +575,6 @@ AC_SUBST([NPROCS]) AC_ARG_VAR([MPIEXEC], [how to run parallel tests if built with MPI e.g. "mpiexec -np %NP%"]) AS_CASE([$ga_msg_comms], - [TCGMSG], [TCGEXEC="`pwd`/armci/parallel.x"], - [TCGMSG5], [TCGEXEC=], [MPI|TCGMSGMPI], [AS_IF([test "x$MPIEXEC" = x], [AC_PATH_PROGS([MPIEXEC], [mpirun mpiexec]) diff --git a/ga++/CMakeLists.txt b/ga++/CMakeLists.txt index 79e8595ab..28765dcac 100644 --- a/ga++/CMakeLists.txt +++ b/ga++/CMakeLists.txt @@ -50,6 +50,9 @@ install (FILES DESTINATION include/ga ) +list (APPEND GA_HEADER_PATHS ${CMAKE_CURRENT_LIST_DIR}/src) +set (GA_HEADER_PATHS ${GA_HEADER_PATHS} PARENT_SCOPE) + # ------------------------------------------------------------- # GA++ library installation # ------------------------------------------------------------- @@ -95,4 +98,4 @@ if(ENABLE_TESTS) gapp_add_parallel_test(ga++/testc_cpp ${PROJECT_SOURCE_DIR}/ga++/testing/testc.cc) gapp_add_parallel_test(ga++/testmult_cpp ${PROJECT_SOURCE_DIR}/ga++/testing/testmult.cc) gapp_add_parallel_test(ga++/thread-safe_cpp ${PROJECT_SOURCE_DIR}/ga++/testing/thread-safe.cc) -endif() \ No newline at end of file +endif() diff --git a/gaf2c/CMakeLists.txt b/gaf2c/CMakeLists.txt index d4863278d..8088be536 100644 --- a/gaf2c/CMakeLists.txt +++ b/gaf2c/CMakeLists.txt @@ -44,6 +44,9 @@ install (FILES DESTINATION include/ga ) +list (APPEND GA_HEADER_PATHS ${CMAKE_CURRENT_BINARY_DIR}) +set (GA_HEADER_PATHS ${GA_HEADER_PATHS} PARENT_SCOPE) + # ------------------------------------------------------------- # GAF2C library installation # ------------------------------------------------------------- @@ -67,5 +70,5 @@ target_include_directories(gaf2c BEFORE PRIVATE ) if (ENABLE_FORTRAN) - add_dependencies(gaf2c GenerateConfigFH) + add_dependencies(gaf2c GenerateConfigFH GenerateF2C_CH) endif() diff --git a/global/README b/global/README index ee83da474..43bec1835 100644 --- a/global/README +++ b/global/README @@ -57,13 +57,6 @@ TEST PROGRAM NOTES or under control of your favorite debugger. - On MPP like T3D, IBM SP, and Intel machines, use appropriate system - command to specify the number of processors, load and run the - programs. For example, to run on four processors of the Cray T3D - use:: - - mppexec testing/test.x -npes 4 - DOCUMENTATION ============= The documentation is located in doc/ and on the web at: diff --git a/global/examples/md_cluster/tstats.F b/global/examples/md_cluster/tstats.F index d5ef32376..a729be0f6 100644 --- a/global/examples/md_cluster/tstats.F +++ b/global/examples/md_cluster/tstats.F @@ -11,8 +11,6 @@ function wraptime() c It is designed to minimize the effort required to c convert the timing statistics in going from one c machine to another. -c -c SGI c wraptime = dble(etime(tarray)) #else diff --git a/global/src/CMakeLists.txt b/global/src/CMakeLists.txt index f2f8d1f17..f5c9f8047 100644 --- a/global/src/CMakeLists.txt +++ b/global/src/CMakeLists.txt @@ -161,3 +161,7 @@ install(FILES ${GA_FORTRAN_INTERFACE_H_FILES} DESTINATION include/ga ) + +list (APPEND GA_HEADER_PATHS ${CMAKE_CURRENT_LIST_DIR} ${CMAKE_CURRENT_BINARY_DIR}) +set (GA_HEADER_PATHS ${GA_HEADER_PATHS} PARENT_SCOPE) + diff --git a/global/src/base.c b/global/src/base.c index 80f660ba1..e77bc8f0e 100644 --- a/global/src/base.c +++ b/global/src/base.c @@ -38,6 +38,9 @@ #if HAVE_STRING_H # include #endif +#if HAVE_STRINGS_H +# include +#endif #if HAVE_STDLIB_H # include #endif @@ -76,7 +79,7 @@ static int calc_maplen(int handle); /*#define CHECK_MA yes */ /*uncomment line below to verify if MA base address is alligned wrt datatype*/ -#if !(defined(LINUX) || defined(CRAY) || defined(CYGWIN)) +#if !(defined(LINUX) || defined(CYGWIN)) #define CHECK_MA_ALGN 1 #endif @@ -168,7 +171,6 @@ int ga_spare_procs; #define ga_ComputeIndexM(_index, _ndim, _subscript, _dims) \ { \ Integer _i, _factor=1; \ - __CRAYX1_PRAGMA("_CRI novector"); \ for(_i=0,*(_index)=0; _i<_ndim; _i++){ \ *(_index) += _subscript[_i]*_factor; \ if(_i<_ndim-1)_factor *= _dims[_i]; \ @@ -181,7 +183,6 @@ int ga_spare_procs; #define ga_UpdateSubscriptM(_ndim, _subscript, _lo, _hi, _dims)\ { \ Integer _i; \ - __CRAYX1_PRAGMA("_CRI novector"); \ for(_i=0; _i<_ndim; _i++){ \ if(_subscript[_i] < _hi[_i]) { _subscript[_i]++; break;} \ _subscript[_i] = _lo[_i]; \ @@ -195,7 +196,6 @@ int ga_spare_procs; { \ Integer _i; \ *_elems = 1; \ - __CRAYX1_PRAGMA("_CRI novector"); \ for(_i=0; _i<_ndim; _i++){ \ *_elems *= _hi[_i]-_lo[_i] +1; \ _subscript[_i] = _lo[_i]; \ @@ -669,7 +669,6 @@ void pnga_initialize_ltd(Integer mem_limit) {\ int _d;\ if(ndim<1||ndim>MAXDIM) pnga_error("unsupported number of dimensions",ndim);\ - __CRAYX1_PRAGMA("_CRI novector"); \ for(_d=0; _dGA[ga_handle].dims[d]) ||(lo[d]>hi[d]))return FALSE; @@ -4555,9 +4552,7 @@ logical pnga_locate_nnodes( Integer g_a, if (GA[ga_handle].distr_type == REGULAR) { /* find "processor coordinates" for the top left corner and store them * in ProcT */ -#ifdef __crayx1 -#pragma _CRI novector -#endif + for(d = 0, dpos = 0; d< GA[ga_handle].ndim; d++){ findblock(GA[ga_handle].mapc + dpos, GA[ga_handle].nblock[d], GA[ga_handle].scale[d], lo[d], &procT[d]); @@ -4566,9 +4561,7 @@ logical pnga_locate_nnodes( Integer g_a, /* find "processor coordinates" for the right bottom corner and store * them in procB */ -#ifdef __crayx1 -#pragma _CRI novector -#endif + for(d = 0, dpos = 0; d< GA[ga_handle].ndim; d++){ findblock(GA[ga_handle].mapc + dpos, GA[ga_handle].nblock[d], GA[ga_handle].scale[d], hi[d], &procB[d]); @@ -4626,10 +4619,6 @@ logical pnga_locate_nnodes( Integer g_a, } return(TRUE); } -#ifdef __crayx1 -#pragma _CRI inline nga_locate_nnodes_ -#endif - /** * Locate individual patches and their owner of specified patch of a @@ -4676,9 +4665,7 @@ logical pnga_locate_region( Integer g_a, ga_check_handleM(g_a, "nga_locate_region"); ga_handle = GA_OFFSET + g_a; -#ifdef __crayx1 -#pragma _CRI novector -#endif + for(d = 0; d< GA[ga_handle].ndim; d++) if((lo[d]<1 || hi[d]>GA[ga_handle].dims[d]) ||(lo[d]>hi[d]))return FALSE; @@ -4687,9 +4674,7 @@ logical pnga_locate_region( Integer g_a, if (GA[ga_handle].distr_type == REGULAR) { /* find "processor coordinates" for the top left corner and store them * in ProcT */ -#ifdef __crayx1 -#pragma _CRI novector -#endif + for(d = 0, dpos = 0; d< GA[ga_handle].ndim; d++){ findblock(GA[ga_handle].mapc + dpos, GA[ga_handle].nblock[d], GA[ga_handle].scale[d], lo[d], &procT[d]); @@ -4698,9 +4683,7 @@ logical pnga_locate_region( Integer g_a, /* find "processor coordinates" for the right bottom corner and store * them in procB */ -#ifdef __crayx1 -#pragma _CRI novector -#endif + for(d = 0, dpos = 0; d< GA[ga_handle].ndim; d++){ findblock(GA[ga_handle].mapc + dpos, GA[ga_handle].nblock[d], GA[ga_handle].scale[d], hi[d], &procB[d]); @@ -4728,14 +4711,9 @@ logical pnga_locate_region( Integer g_a, offset = *np *(ndim*2); /* location in map to put patch range */ -#ifdef __crayx1 -#pragma _CRI novector -#endif for(d = 0; d< ndim; d++) map[d + offset ] = lo[d] < _lo[d] ? _lo[d] : lo[d]; -#ifdef __crayx1 -#pragma _CRI novector -#endif + for(d = 0; d< ndim; d++) map[ndim + d + offset ] = hi[d] > _hi[d] ? _hi[d] : hi[d]; @@ -4762,9 +4740,7 @@ logical pnga_locate_region( Integer g_a, /* find "processor coordinates" for the right bottom corner and store * them in procB */ -#ifdef __crayx1 -#pragma _CRI novector -#endif + for(d = 0, dpos = 0; d< GA[ga_handle].ndim; d++){ findblock(GA[ga_handle].mapc + dpos, GA[ga_handle].num_blocks[d], GA[ga_handle].scale[d], hi[d], &procB[d]); @@ -4793,14 +4769,10 @@ logical pnga_locate_region( Integer g_a, offset = *np *(ndim*2); /* location in map to put patch range */ -#ifdef __crayx1 -#pragma _CRI novector -#endif + for(d = 0; d< ndim; d++) map[d + offset ] = lo[d] < _lo[d] ? _lo[d] : lo[d]; -#ifdef __crayx1 -#pragma _CRI novector -#endif + for(d = 0; d< ndim; d++) map[ndim + d + offset ] = hi[d] > _hi[d] ? _hi[d] : hi[d]; @@ -4897,9 +4869,6 @@ logical pnga_locate_region( Integer g_a, } return(TRUE); } -#ifdef __crayx1 -#pragma _CRI inline pnga_locate_region -#endif /** * Returns the processor grid for the global array diff --git a/global/src/base.h b/global/src/base.h index a85286595..fc640b12f 100644 --- a/global/src/base.h +++ b/global/src/base.h @@ -19,12 +19,6 @@ extern Integer GA_Debug_flag; #define FNAM 31 /* length of array names */ #define CACHE_SIZE 512 /* size of the cache inside GA DS*/ -#ifdef __crayx1 -#define __CRAYX1_PRAGMA _Pragma -#else -#define __CRAYX1_PRAGMA(_pragf) -#endif - enum data_distribution {REGULAR, BLOCK_CYCLIC, SCALAPACK, TILED, TILED_IRREG}; typedef int ARMCI_Datatype; @@ -160,14 +154,12 @@ extern MPI_Comm GA_MPI_World_comm_dup; Integer _loc, _nb, _d, _index, _dim=ndim,_dimstart=0, _dimpos; \ for(_nb=1, _d=0; _d<_dim; _d++)_nb *= (Integer)nblock[_d]; \ if((Integer)proc > _nb - 1 || proc<0){ \ - __CRAYX1_PRAGMA("_CRI novector"); \ - for(_d=0; _d<_dim; _d++){ \ + for(_d=0; _d<_dim; _d++){ \ lo[_d] = (Integer)0; \ hi[_d] = (Integer)-1;} \ } \ else{ \ _index = proc; \ - __CRAYX1_PRAGMA("_CRI novector"); \ for(_d=0; _d<_dim; _d++){ \ _loc = _index% (Integer)nblock[_d]; \ _index /= (Integer)nblock[_d]; \ @@ -329,7 +321,6 @@ extern MPI_Comm GA_MPI_World_comm_dup; #define gam_setstride(ndim, size, ld, ldrem, stride_rem, stride_loc){\ int _i; \ stride_rem[0]= stride_loc[0] = (int)size; \ - __CRAYX1_PRAGMA("_CRI novector"); \ for(_i=0;_i hi[_d]){ \ char err_string[ERR_STR_LEN]; \ diff --git a/global/src/collect.c b/global/src/collect.c index 121016811..32b564e1d 100644 --- a/global/src/collect.c +++ b/global/src/collect.c @@ -19,10 +19,6 @@ /* can handle ga_brdcst/igop/dgop via ARMCI or native message-passing library * uncomment line below to use the ARMCI version */ -#ifndef NEC -#define ARMCI_COLLECTIVES -#endif - #ifdef MSG_COMMS_MPI # include extern MPI_Comm ARMCI_COMM_WORLD; @@ -54,7 +50,6 @@ void pnga_msg_brdcst(Integer type, void *buffer, Integer len, Integer root) buffer_ptr=(char *)buffer+istart; if (istart+len_small > len) len_small=((long)(len - istart)); /* printf("%ld step %d of %d len= %d total=%ld istart= %ld\n",GAme,(i+1),nsteps,len_small,len,istart); */ -#ifdef ARMCI_COLLECTIVES p_grp = (int)pnga_pgroup_get_default(); if (p_grp > 0) { # ifdef MSG_COMMS_MPI @@ -64,14 +59,7 @@ void pnga_msg_brdcst(Integer type, void *buffer, Integer len, Integer root) } else { armci_msg_bcast(buffer_ptr, (int)len_small, (int)root); } -#else -# ifdef MSG_COMMS_MPI - MPI_Bcast(buffer_ptr, (int)len_small, MPI_CHAR, (int)root, ARMCI_COMM_WORLD); -# else - tcg_brdcst(type, buffer_ptr, len_small, root); -# endif -#endif - istart+=len_small; + istart+=len_small; } } @@ -183,7 +171,7 @@ void pnga_pgroup_gop(Integer p_grp, Integer type, void *x, Integer n, char *op) { _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/ if (p_grp > 0) { -#if defined(ARMCI_COLLECTIVES) && defined(MSG_COMMS_MPI) +#if defined(MSG_COMMS_MPI) int group = (int)p_grp; switch (type){ case C_INT: @@ -229,7 +217,7 @@ void pnga_gop(Integer type, void *x, Integer n, char *op) if (p_grp > 0) { pnga_pgroup_gop(p_grp, type, x, n, op); } else { -#if defined(ARMCI_COLLECTIVES) || defined(MSG_COMMS_MPI) +#if defined(MSG_COMMS_MPI) switch (type){ case C_INT: armci_msg_igop((int*)x, n, op); diff --git a/global/src/diag.fh b/global/src/diag.fh index bce85d0f0..8a97744f7 100644 --- a/global/src/diag.fh +++ b/global/src/diag.fh @@ -4,16 +4,7 @@ c problem size: c - first group are MPPs c - second group are (multiprocessor) workstations c -#if defined(KSR)||defined(CRAY_T3D)||defined(NX)||defined(SP1)||defined(SP)||defined(LAPI) -# define SIZE1 200 -# define SIZE2 400 -# define SIZE3 800 -# define SIZE4 2000 -# define PROC1 4 -# define PROC2 16 -# define PROC3 64 -# define PROC4 256 -#else + # define SIZE1 400 # define SIZE2 800 # define SIZE3 1200 @@ -22,4 +13,3 @@ c # define PROC2 8 # define PROC3 12 # define PROC4 16 -#endif diff --git a/global/src/ga_dgemmf.F b/global/src/ga_dgemmf.F deleted file mode 100644 index da436477b..000000000 --- a/global/src/ga_dgemmf.F +++ /dev/null @@ -1,261 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.fh" -#endif -#define xx_dgemm dgemm - subroutine ga_dgemm(transa, transb, m, n, k, alpha, g_a, - $ g_b, beta, g_c) -C$Id: ga_dgemm.F,v 1.29 2000/11/04 01:46:31 d3h325 Exp $ - implicit none - Character*1 transa, transb - Integer m, n, k - Double precision alpha, beta - Integer g_a, g_b, g_c -#include "mafdecls.fh" -#include "global.fh" -c -c GA_DGEMM performs one of the matrix-matrix operations: -c C := alpha*op( A )*op( B ) + beta*C, -c where op( X ) is one of -c op( X ) = X or op( X ) = X`, -c -c alpha and beta are scalars, and A, B and C are matrices, with op( A ) -c an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. -c -c On entry, TRANSA specifies the form of op( A ) to be used in -c the matrix multiplication as follows: -c transa = 'N' or 'n', op( A ) = A. -c transa = 'T' or 't', op( A ) = A`. -c -c M - On entry, M specifies the number of rows of the matrix -c op( A ) and of the matrix C. M must be at least zero. -c N - On entry, N specifies the number of columns of the matrix -c op( B ) and the number of columns of the matrix C. N must be -c at least zero. -c K - On entry, K specifies the number of columns of the matrix -c op( A ) and the number of rows of the matrix op( B ). K must -c be at least zero. -c - integer ilo, ihi, jlo, jhi, klo, khi, ichunk, jchunk, kchunk - integer idim, jdim, kdim, adim, bdim, cdim, ijk, me, nproc - integer l_a, k_a, l_b, k_b - logical status -C - Logical Get_New_B ! Allow reuse of B patch when possible -C - Double Precision Chunk_cube - Integer Min_Tasks, Max_Chunk, Mem_Avail - integer l_mxn,k_mxn,i0,i1,j0,j1,ldc,adrc - integer an1, an2, bn1, bn2, cn1, cn2 - integer ilor, ihir,jlor,jhir,klor,khir,itipo,ijmax - double precision t0,t1,gflop - external MPI_Wtime - double precision MPI_Wtime - Parameter ( Min_Tasks = 10) ! Minimum acceptable tasks per node -c -C Set defaults -- platform dependent -#ifdef GATIME - t0=MPI_Wtime() -#endif - ichunk = 512 - jchunk = 512 - kchunk = 512 -C - me = ga_nodeid() - nproc = ga_nnodes() -c if(me.eq.0) -c W write(6,*) ' transa, transb ', transa, transb -C -C Make an estimate of how large patches can be and still insure -C enough tasks per processor that loads will be reasonably balanced. -C -C Patches per dimension are M/chunk, N/chunk, K/chunk so total tasks -C is roughly (K*M*N)/(chunk**3). Assume all chunk sizes are the -C same and solve for the one that provides the minimum acceptable -C number of tasks. -C -C Find out how much memory we can grab. It will be used in -C three chunks, and the result includes only the first one. -C - Mem_Avail = MA_Inquire_Avail( MT_DBL ) - $ - 2 * MA_SizeOf_Overhead( MT_DBL ) - Mem_Avail = 0.9 * Mem_Avail ! Do not use every last drop! -c Call GA_IGOp(42, Mem_Avail, 1, 'min') -C - -c - if (beta .eq. 0.0d0) then - call ga_zero(g_c) - else - call ga_scale(g_c, beta) - endif -c - call ga_distribution(g_c, - . ga_nodeid(), i0, i1, j0, j1) - call ga_inquire(g_a, - . itipo, an1, an2) - call ga_inquire(g_b, - . itipo, bn1, bn2) - call ga_inquire(g_c, - . itipo, cn1, cn2) - if (i0.gt.0 .and. i0.le.i1) then - ilo=i0 - ihi=i1 - idim = ihi - ilo + 1 - jlo=j0 - jhi=j1 - jdim = jhi - jlo + 1 -#if 0 - write(6,'(I4,A,4I6)') ga_nodeid(),' IJ ',i0,i1,j0,j1 - if(ga_nodeid().eq.0) call ffflush(6) - if(ga_nodeid().eq.0) call ffflush(0) -#endif - ijmax=max(idim,jdim) - KChunk = Int((DBLE(Mem_Avail/(2*ijmax)))) - kchunk=min(kchunk,ijmax) - status = .true. - status = ma_push_get(MT_DBL, idim*kchunk, 'ga_dgemm:a', l_a,k_a) - $ .and. status - status = ma_push_get(MT_DBL, kchunk*jdim, 'ga_dgemm:b', l_b,k_b) - $ .and. status - if (.not. status) call ga_error('ga_dgemm: insufficent memory?', - A idim*kchunk+kchunk*jdim) - call ga_access(g_c, i0, i1, j0, j1, adrc, ldc) - ijk = 0 - do klo = 1, k, kchunk - khi = min(k, klo+kchunk-1) - kdim = khi - klo + 1 -C -C Each pass through the outer two loops means we need a -C different patch of B. -C - Get_New_B = .TRUE. -C - cdim = idim - if (transa.eq.'n' .or. transa.eq.'N') then - ilor=min(an1,ilo) - ihir=min(an1,ihi) - klor=min(an2,klo) - khir=min(an2,khi) - kdim=khir-klor+1 - idim=ihir-ilor+1 - adim = idim - cdim = idim - call ga_get(g_a, ilor, ihir, klor, khir, - $ dbl_mb(k_a), adim) - else - klor=min(an1,klo) - khir=min(an1,khi) - ilor=min(an2,ilo) - ihir=min(an2,ihi) - kdim=khir-klor+1 - idim=ihir-ilor+1 - adim = kdim - cdim=idim - call ga_get(g_a, klor, khir, ilor, ihir, - $ dbl_mb(k_a), adim) - endif -C -C Avoid rereading B if it is the same patch as last time. -C - If ( Get_New_B ) then - if (transb.eq.'n' .or. transb.eq.'N') then - klor=min(bn1,klo) - khir=min(bn1,khi) - jlor=min(bn2,jlo) - jhir=min(bn2,jhi) - kdim=khir-klor+1 - idim=ihir-ilor+1 - bdim = kdim - call ga_get(g_b, klor, khir, jlor, jhir, - $ dbl_mb(k_b), bdim) - else - jlor=min(bn1,jlo) - jhir=min(bn1,jhi) - klor=min(bn2,klo) - khir=min(bn2,khi) - kdim=khir-klor+1 - jdim=jhir-jlor+1 - bdim = jdim - call ga_get(g_b, jlor, jhir, klor, khir, - $ dbl_mb(k_b), bdim) - endif - Get_New_B = .FALSE. ! Until J or K change again - EndIf -C - call xx_dgemm(transa, transb, idim, jdim, kdim, - $ alpha, dbl_mb(k_a), adim, dbl_mb(k_b), bdim, - $ 1.0d0, dbl_mb(adrc), cdim) - enddo - status = ma_chop_stack(l_a) - if (.not. status)call ga_error('ga_dgemm: pop of stack failed', 1) - endif - call ga_release_update(g_c, i0, i1, j0, j1) - call ga_sync() -#ifdef GATIME - if(ga_nodeid().eq.0) then - call ffflush(6) - t1=MPI_Wtime()-t0 - gflop=2d0*n*m*k/(t1*1d9) - write(6,'(I4,A,3F14.6)') - G ga_nodeid(),' dgemm done in ',t1, - , gflop,gflop/ga_nnodes() - endif -#endif -c - end - - subroutine lga_acc(out, dim1,dim2, - I ilo, ihi, jlo, jhi, buf, - $ ld, alpha) - implicit none - integer dim1,dim2 - integer ilo, ihi, jlo, jhi,ld - integer i,j - double precision alpha - double precision out(1:dim1,1:dim2) - double precision buf(1:ld, 1:*) - - do j=jlo,jhi - do i=ilo,ihi - out(i, j) = out(i, j) + - + alpha*buf(i-ilo+1, j-jlo+1) - enddo - enddo - return - end - subroutine lga_accbrd(g_c,m,n,out) - implicit none -#include "global.fh" -#include "mafdecls.fh" - integer m,n - double precision out(1:m,1:n) - integer g_c -c - integer ilo,ihi,jlo,jhi,numi,numj,k_in,l_in,i,j - logical status -c - call ga_distribution(g_c, - . ga_nodeid(), ilo, ihi, jlo, jhi) - if (ilo.gt.0 .and. ilo.le.ihi) then - numi = ihi-ilo+1 - numj = jhi-jlo+1 - if (numi.gt.0 .and. numj.gt.0) then - if (.not.MA_Push_Get(MT_Dbl,numi*numj,'MxN',l_in,k_in)) - & call ga_error('dft_scf: cannot allocate eval',0) - call ga_get(g_c,ilo,ihi,jlo,jhi, - . dbl_mb(k_in),numi) - do j=jlo,jhi - do i=ilo,ihi - call daxpy(numi,1d0,dbl_mb(k_in+(j-jlo)*numi),1, - 1 out,1) - enddo - enddo - call ga_put(g_c,ilo,ihi,jlo,jhi, - . out(ilo,jlo),m) - status = ma_pop_stack(l_in) - if (.not. status)call ga_error('gad: pop of stack failed', - 3 numi*numj) - endif - endif - return - end diff --git a/global/src/ga_iterator.h b/global/src/ga_iterator.h index 4ef4aeafe..c66fbb416 100644 --- a/global/src/ga_iterator.h +++ b/global/src/ga_iterator.h @@ -117,7 +117,6 @@ typedef struct { #define gam_setstride(ndim, size, ld, ldrem, stride_rem, stride_loc){\ int _i; \ stride_rem[0]= stride_loc[0] = (int)size; \ - __CRAYX1_PRAGMA("_CRI novector"); \ for(_i=0;_i # define sleep(x) Sleep(1000*(x)) @@ -128,11 +124,7 @@ extern char ***_ga_argv; #define PERIODIC_ACC 3 #define FLUSH_CACHE -#ifdef CRAY_T3D -# define ALLIGN_SIZE 32 -#else -# define ALLIGN_SIZE 128 -#endif +#define ALLIGN_SIZE 128 #define allign__(n, SIZE) (((n)%SIZE) ? (n)+SIZE - (n)%SIZE: (n)) #define allign_size(n) allign__((long)(n), ALLIGN_SIZE) diff --git a/global/src/iterator.c b/global/src/iterator.c index 9dd664b64..896dd5583 100644 --- a/global/src/iterator.c +++ b/global/src/iterator.c @@ -115,7 +115,6 @@ ga_ownsM(g_handle, proc, _lo, _hi); \ gaCheckSubscriptM(subscript, _lo, _hi, GA[g_handle].ndim); \ if(_last==0) ld[0]=_hi[0]- _lo[0]+1+2*(Integer)GA[g_handle].width[0]; \ - __CRAYX1_PRAGMA("_CRI shortloop"); \ for(_d=0; _d < _last; _d++) { \ _w = (Integer)GA[g_handle].width[_d]; \ _offset += (subscript[_d]-_lo[_d]+_w) * _factor; \ @@ -147,7 +146,6 @@ void gam_LocationF(int proc, Integer g_handle, Integer subscript[], ga_ownsM(g_handle, proc, _lo, _hi); gaCheckSubscriptM(subscript, _lo, _hi, GA[g_handle].ndim); if(_last==0) ld[0]=_hi[0]- _lo[0]+1+2*(Integer)GA[g_handle].width[0]; - __CRAYX1_PRAGMA("_CRI shortloop"); for(_d=0; _d < _last; _d++) { _w = (Integer)GA[g_handle].width[_d]; _offset += (subscript[_d]-_lo[_d]+_w) * _factor; diff --git a/global/src/matmul.c b/global/src/matmul.c index e1eceff0f..95313e772 100644 --- a/global/src/matmul.c +++ b/global/src/matmul.c @@ -1161,7 +1161,7 @@ static void check_result(cond, transa, transb, alpha, beta, atype, m_t=m; n_t=n; k_t=k; adim_t=adim; bdim_t=bdim; cdim_t=cdim; -#if (defined(CRAY) || defined(WIN32)) && !NOFORT +#if defined(WIN32) && !NOFORT pnga_error("check_result: Serial dgemms not defined", 0L); #else switch(atype) { @@ -1405,9 +1405,6 @@ void pnga_matmul(transa, transb, alpha, beta, CONTIG_CHUNKS_OPT_FLAG = UNSET; DIRECT_ACCESS_OPT_FLAG = UNSET; } -# if defined(__crayx1) || defined(NEC) - use_NB_matmul = UNSET; -# endif } /* if block cyclic, then use regular algorithm. This is turned on for now diff --git a/global/src/onesided.c b/global/src/onesided.c index 0e1501e24..36cc28e8b 100644 --- a/global/src/onesided.c +++ b/global/src/onesided.c @@ -39,6 +39,9 @@ #if HAVE_STRING_H # include #endif +#if HAVE_STRINGS_H +# include +#endif #if HAVE_STDLIB_H # include #endif @@ -70,8 +73,6 @@ #define INVALID_MA_HANDLE -1 #define NEAR_INT(x) (x)< 0.0 ? ceil( (x) - 0.5) : floor((x) + 0.5) -#define BYTE_ADDRESSABLE_MEMORY - #ifdef PROFILE_OLD #include "ga_profile.h" #endif @@ -295,7 +296,6 @@ Integer _lo[MAXDIM], _hi[MAXDIM], _pinv, _p_handle; \ ga_ownsM(g_handle, proc, _lo, _hi); \ gaCheckSubscriptM(subscript, _lo, _hi, GA[g_handle].ndim); \ if(_last==0) ld[0]=_hi[0]- _lo[0]+1+2*(Integer)GA[g_handle].width[0]; \ - __CRAYX1_PRAGMA("_CRI shortloop"); \ for(_d=0; _d < _last; _d++) { \ _w = (Integer)GA[g_handle].width[_d]; \ _offset += (subscript[_d]-_lo[_d]+_w) * _factor; \ @@ -330,7 +330,6 @@ Integer _mloc = p* ndim *2;\ #define gam_ComputePatchIndex(ndim, lo, plo, dims, pidx){ \ Integer _d, _factor; \ *pidx = plo[0] -lo[0]; \ - __CRAYX1_PRAGMA("_CRI shortloop"); \ for(_d= 0,_factor=1; _d< ndim -1; _d++){ \ _factor *= (dims[_d]); \ *pidx += _factor * (plo[_d+1]-lo[_d+1]); \ @@ -528,9 +527,6 @@ static void ngai_gets(char *loc_base_ptr, char *prem,int *stride_rem, char *pbuf /** * A common routine called by both non-blocking and blocking GA put calls. */ -#ifdef __crayx1 -#pragma _CRI inline pnga_locate_region -#endif void ngai_put_common(Integer g_a, Integer *lo, Integer *hi, @@ -546,7 +542,7 @@ void ngai_put_common(Integer g_a, int num_loops=2; /* 1st loop for remote procs; 2nd loop for local procs */ Integer n_rstrctd; Integer *rank_rstrctd; -#if defined(__crayx1) || defined(DISABLE_NBOPT) +#if defined(DISABLE_NBOPT) #else Integer ga_nbhandle; int counter=0; @@ -582,7 +578,7 @@ void ngai_put_common(Integer g_a, #endif if(nbhandle)ga_init_nbhandle(nbhandle); -#if !defined(__crayx1) && !defined(DISABLE_NBOPT) +#if !defined(DISABLE_NBOPT) else ga_init_nbhandle(&ga_nbhandle); #endif @@ -591,9 +587,8 @@ void ngai_put_common(Integer g_a, ENABLE_PROFILE_PUT); #endif -#if !defined(__crayx1) && !defined(DISABLE_NBOPT) +#if !defined(DISABLE_NBOPT) for(loop=0; loop ${CMAKE_BINARY_DIR}/${testname}.F + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/global/testing + COMMENT "Generating ${testname} test" + ) + endfunction() + + generate_ngatests(nga-onesided) + generate_ngatests(nga-patch) + generate_ngatests(nga-periodic) + generate_ngatests(nga-scatter) + generate_ngatests(ngatest) + generate_ngatests(nga-util) + + set (NGATEST_GEN_SRCS + ${CMAKE_BINARY_DIR}/nga-onesided.F + ${CMAKE_BINARY_DIR}/nga-patch.F + ${CMAKE_BINARY_DIR}/nga-periodic.F + ${CMAKE_BINARY_DIR}/nga-scatter.F + ${CMAKE_BINARY_DIR}/ngatest.F + ${CMAKE_BINARY_DIR}/nga-util.F + ) + add_custom_target( + GenerateNGATests ALL + DEPENDS ${NGATEST_GEN_SRCS} + ) + set_source_files_properties( + ${NGATEST_GEN_SRCS} + PROPERTIES GENERATED TRUE + ) + + ga_add_parallel_test (nga-onesided "${CMAKE_BINARY_DIR}/nga-onesided.F ffflush.F util.c" ${TEST_NPROCS_2} Fortran) + ga_add_parallel_test (nga-patch "${CMAKE_BINARY_DIR}/nga-patch.F ffflush.F util.c" ${TEST_NPROCS_1} Fortran) + ga_add_parallel_test (nga-periodic "${CMAKE_BINARY_DIR}/nga-periodic.F ffflush.F util.c" Fortran ) + ga_add_parallel_test (nga-scatter "${CMAKE_BINARY_DIR}/nga-scatter.F ffflush.F util.c" Fortran ) + ga_add_parallel_test (ngatest "${CMAKE_BINARY_DIR}/ngatest.F ffflush.F util.c" ${TEST_NPROCS_1} Fortran) + ga_add_parallel_test (nga-util "${CMAKE_BINARY_DIR}/nga-util.F ffflush.F util.c" Fortran ) + + add_dependencies(nga-onesided.x GenerateNGATests) + add_dependencies(nga-patch.x GenerateNGATests) + add_dependencies(nga-periodic.x GenerateNGATests) + add_dependencies(nga-scatter.x GenerateNGATests) + add_dependencies(ngatest.x GenerateNGATests) + add_dependencies(nga-util.x GenerateNGATests) + endif() diff --git a/global/testing/blktest.F b/global/testing/blktest.F index 9e9698ab3..5af371199 100644 --- a/global/testing/blktest.F +++ b/global/testing/blktest.F @@ -2,15 +2,8 @@ # include "config.fh" #endif c vector boxes lack arithmetic precision -#ifdef CRAY_YMP -# define THRESH 1d-10 -#elif defined(FUJITSU) -# define THRESH 1d-12 -# define THRESHF 1e-5 -#else # define THRESH 1d-13 # define THRESHF 1e-5 -#endif #define MISMATCH(x,y) abs(x-y)/max(1d0,abs(x)).gt.THRESH #define MISMATCHF(x,y) abs(x-y)/max(1.0,abs(x)).gt.THRESHF diff --git a/global/testing/d2test.F b/global/testing/d2test.F index 28c1ffc31..cb13cc16a 100644 --- a/global/testing/d2test.F +++ b/global/testing/d2test.F @@ -2,15 +2,8 @@ # include "config.fh" #endif c vector boxes lack arithmetic precision -#ifdef CRAY_YMP -# define THRESH 1d-10 -#elif defined(FUJITSU) -# define THRESH 1d-12 -# define THRESHF 1e-5 -#else # define THRESH 1d-13 # define THRESHF 1e-5 -#endif #define MISMATCH(x,y) abs(x-y)/max(1d0,abs(x)).gt.THRESH #define MISMATCHF(x,y) abs(x-y)/max(1.0,abs(x)).gt.THRESHF diff --git a/global/testing/ffflush.F b/global/testing/ffflush.F index 8d2b9cab3..4c8ba8cda 100644 --- a/global/testing/ffflush.F +++ b/global/testing/ffflush.F @@ -5,13 +5,7 @@ subroutine ffflush(unit) integer unit c -#ifdef CRAY -* if(unit.eq.6)then -* call flush(101) -* else -* call flush(unit) -* endif -#elif HAVE_F77_FLUSH +#if HAVE_F77_FLUSH call F77_FLUSH(unit) #endif c diff --git a/global/testing/field-test.F b/global/testing/field-test.F index d9ddf9c8c..2f258e4ce 100644 --- a/global/testing/field-test.F +++ b/global/testing/field-test.F @@ -3,13 +3,8 @@ #endif c $Id: test.F,v 1.64.2.11 2007-04-06 22:37:35 d3g293 Exp $ c vector boxes lack arithmetic precision -#if defined(FUJITSU) -# define THRESH 1d-12 -# define THRESHF 1e-5 -#else # define THRESH 1d-13 # define THRESHF 2e-5 -#endif #define MISMATCH(x,y) abs(x-y)/max(1d0,abs(x)).gt.THRESH #define MISMATCHF(x,y) abs(x-y)/max(1.0,abs(x)).gt.THRESHF diff --git a/global/testing/g2test.F b/global/testing/g2test.F index 5b8bd4c36..b654a29c0 100644 --- a/global/testing/g2test.F +++ b/global/testing/g2test.F @@ -2,15 +2,8 @@ # include "config.fh" #endif c vector boxes lack arithmetic precision -#ifdef CRAY_YMP -# define THRESH 1d-10 -#elif defined(FUJITSU) -# define THRESH 1d-12 -# define THRESHF 1e-5 -#else # define THRESH 1d-13 # define THRESHF 1e-5 -#endif #define MISMATCH(x,y) abs(x-y)/max(1d0,abs(x)).gt.THRESH #define MISMATCHF(x,y) abs(x-y)/max(1.0,abs(x)).gt.THRESHF diff --git a/global/testing/g3test.F b/global/testing/g3test.F index ab879732f..3fa30388b 100644 --- a/global/testing/g3test.F +++ b/global/testing/g3test.F @@ -2,15 +2,8 @@ # include "config.fh" #endif c vector boxes lack arithmetic precision -#ifdef CRAY_YMP -# define THRESH 1d-10 -#elif defined(FUJITSU) -# define THRESH 1d-12 -# define THRESHF 1e-5 -#else # define THRESH 1d-13 # define THRESHF 1e-5 -#endif #define MISMATCH(x,y) abs(x-y)/max(1d0,abs(x)).gt.THRESH #define MISMATCHF(x,y) abs(x-y)/max(1.0,abs(x)).gt.THRESHF diff --git a/global/testing/ghosts.F b/global/testing/ghosts.F index f78b42f7d..198130d18 100644 --- a/global/testing/ghosts.F +++ b/global/testing/ghosts.F @@ -3,16 +3,8 @@ #endif c $Id: ghosts.F,v 1.1.2.1 2007-05-07 19:02:02 d3g293 Exp $ c vector boxes lack arithmetic precision -#ifdef CRAY_YMP -# define THRESH 1d-10 -# define THRESHF 1e-5 -#elif defined(FUJITSU) -# define THRESH 1d-12 -# define THRESHF 1e-5 -#else # define THRESH 1d-13 # define THRESHF 1e-5 -#endif #define MISMATCH(x,y) abs(x-y)/max(1d0,abs(x)).gt.THRESH #define MISMATCHF(x,y) abs(x-y)/max(1.0,abs(x)).gt.THRESHF diff --git a/global/testing/ipc.clean.c b/global/testing/ipc.clean.c index 771ba2275..2b2c9efc8 100644 --- a/global/testing/ipc.clean.c +++ b/global/testing/ipc.clean.c @@ -112,135 +112,7 @@ fprintf(stderr,"%s %d\n",str, code); exit(0); } -#ifdef ALLIANT - -#include -extern char *valloc(); - -char *CreateSharedRegion(id, size) - long *size, *id; -{ - struct timeval tp; - struct timezone tzp; - char *temp; - int status; - - /* Have to round up to a multiple of page size before allocating - on a page boundary */ - *size = ( (*size + (PAGE_SIZE -1)) / PAGE_SIZE ) * PAGE_SIZE; - - if ( (temp = valloc((unsigned) *size)) == (char *) NULL) - Error("CreateSharedRegion: failed in valloc", (long) 0); - - /* Now have to get a unique id ... try using time of day in centi-sec */ - if ( (status = gettimeofday(&tp, &tzp)) != 0) - Error("CreateSharedRegion: error from gettimeofday", (long) status); - - *id = (tp.tv_sec + 10000*tp.tv_usec) & 0xffffff; - - /* Now make the region */ - if ( (status = create_shared_region(*id, temp, *size, 0)) != 0) - Error("CreateSharedRegion: error from create_shared_region", (long) status); - - return temp; -} - - -long DetachSharedRegion( id, size, addr) - long id, size; - char *addr; -{ - return detach_shared_region( id, addr, size); -} - -long DeleteSharedRegion(id) - long id; -{ - return delete_shared_region(id); -} - -char *AttachSharedRegion(id, size) - long id, size; -{ - char *temp; - int status; - - if (size != (((size + (PAGE_SIZE -1)) / PAGE_SIZE) * PAGE_SIZE)) - Error("AttachSharedRegion: input size is not multiple of PAGE_SIZE", - (long) size); - if ( (temp = valloc((unsigned) size)) == (char *) NULL) - Error("AttachSharedRegion: failed in valloc", (long) 0); - - /* Now try to attach */ - if ( (status = attach_shared_region(id, temp, size)) != 0) - Error("AttachSharedRegion: error from attach_shared_region", - (long) status); - - return temp; -} - -#endif -#if defined(SEQUENT) || defined(ENCORE) - -#ifdef SEQUENT -#define SHMALLOC shmalloc -#define SHFREE shfree -#endif -#ifdef ENCORE -#define SHMALLOC share_malloc -#define SHFREE share_free -#endif - -extern char *SHMALLOC(); -extern int SHFREE(); - -#define MAX_ADDR 20 -static int next_id = 0; /* Keep track of id */ -static char *shaddr[MAX_ADDR]; /* Keep track of addresses */ - -char *CreateSharedRegion(id, size) - long *size, *id; -{ - char *temp; - - if (next_id >= MAX_ADDR) - Error("CreateSharedRegion: too many shared regions", (long) next_id); - - if ( (temp = SHMALLOC((unsigned) *size)) == (char *) NULL) - Error("CreateSharedRegion: failed in SHMALLOC", (long) *size); - - *id = next_id++; - shaddr[*id] = temp; - - return temp; -} - -long DetachSharedRegion( id, size, addr) - long id, size; - char *addr; -{ - /* This needs improving to make more robust */ - return SHFREE(addr); -} - -long DeleteSharedRegion(id) - long id; -{ - /* This needs improving to make more robust */ - return SHFREE(shaddr[id]); -} - -char *AttachSharedRegion(id, size) - long id, size; -{ - Error("AttachSharedRegion: cannot do this on SEQUENT or BALANCE", (long) -1); -} - - -#endif - /* Bizarre sequent has sysv semaphores but proprietary shmem */ - /* Encore has sysv shmem but is limited to total of 16384bytes! */ -#if defined(SYSV) && !defined(SEQUENT) && !defined(ENCORE) +#if defined(SYSV) #include #include @@ -249,10 +121,6 @@ char *AttachSharedRegion(id, size) #include -#ifdef SUN -extern char *shmat(); -#endif - char *CreateSharedRegion(id, size) long *size, *id; { @@ -300,125 +168,7 @@ char *AttachSharedRegion(id, size) } #endif -#if defined(CONVEX) || defined(APOLLO) - -#include -#include -#include -#include - -extern char *strdup(); -extern char *mktemp(); - -#define MAX_ID 20 -static struct id_list_struct { - char *addr; /* pointer to shmem region */ - unsigned size; /* size of region */ - char *filename; /* associated file name */ - int fd; /* file descriptor */ - int status; /* = 1 if in use */ -} id_list[MAX_ID]; - -static int next_id = 0; -static char template[] = "/tmp/SHMEM.XXXXXX"; - -char *CreateSharedRegion(id, size) - long *size, *id; -{ - char *temp; - - if (next_id == MAX_ID) - Error("CreateSharedRegion: MAX_ID exceeded ", MAX_ID); - *id = next_id; - -#ifdef APOLLO - id_list[*id].fd = -1; -#else - if ( (temp = strdup(template)) == (char *) NULL) - Error("CreateSharedRegion: failed to get space for filename", 0); - -/* Generate scratch file to identify region ... need to know this - name to attach to the region so need to establish some policy - before AttachtoSharedRegion can work */ - - id_list[*id].filename = mktemp(temp); - if ( (id_list[*id].fd = open(id_list[*id].filename, - O_RDWR|O_CREAT, 0666)) < 0) - Error("CreateSharedRegion: failed to open temporary file",0); -#endif - - id_list[*id].addr = mmap((caddr_t) 0, (unsigned *) size, - PROT_READ|PROT_WRITE, - MAP_ANON|MAP_SHARED, id_list[*id].fd, 0); -#ifdef APOLLO - if (id_list[*id].addr == (char *) 0) - Error("CreateSharedRegion: mmap failed",-1); -#else - if (id_list[*id].addr == (char *) -1) - Error("CreateSharedRegion: mmap failed",-1); -#endif - - id_list[*id].size = *size; - id_list[*id].status = 1; - next_id++; - return id_list[*id].addr; -} - -long DetachSharedRegion( id, size, addr) - long id, size; - char *addr; -{ - if ( (id < 0) || (id > next_id)) - return (long) -1; - - if (id_list[id].status != 1) - return (long) -1; - - id_list[id].status = 0; - - return (long) munmap(id_list[id].addr, 0); -} - -long DeleteSharedRegion(id) - long id; -{ - if ( (id < 0) || (id > next_id) ) - return (long) -1; - - if (id_list[id].status != 1) - return (long) -1; - - (void) DetachSharedRegion(id, 0, (char *) 0); - - if (id_list[id].fd >= 0) { - (void) close(id_list[id].fd); - (void) unlink(id_list[id].filename); - } - return (long) 0; -} - -char *AttachSharedRegion(id, size) - long id, size; -{ - Error("AttachSharedRegion: need mods for this to work on CONVEX", (long) -1); -} - -long DeleteSharedAll() -{ - long id; - long status = 0; - - for (id=0; id - extern void pvm_init(int argc, char *argv[]); - extern double armci_timer(); -# ifdef CRAY -# define MPGROUP (char *)NULL -# define MP_INIT(argc,argv) -# else -# define MPGROUP "mp_working_group" -# define MP_INIT(argc,argv) pvm_init(argc, argv) -# endif -# define MP_FINALIZE() pvm_exit() -# define MP_BARRIER() pvm_barrier(MPGROUP,-1) -# define MP_MYID(pid) *(pid) = pvm_getinst(MPGROUP,pvm_mytid()) -# define MP_PROCS(pproc) *(pproc) = (int)pvm_gsize(MPGROUP) -# define MP_TIMER armci_timer -# define MP_ASSERT(code) code -#elif defined(MSG_COMMS_TCGMSG) || defined(MSG_COMMS_TCGMSG5) || defined(MSG_COMMS_TCGMSGMPI) +#if defined(MSG_COMMS_TCGMSGMPI) # include # define MP_BARRIER() tcg_synch(30000) # define MP_INIT(argc,argv) tcg_pbegin((argc),(argv)) @@ -24,20 +7,11 @@ # define MP_PROCS(pproc) *(pproc) = (int)tcg_nnodes() # define MP_TIMER tcg_time # define MP_ASSERT(code) code -#elif defined(BGML) - extern double armci_timer(); -# define MP_BARRIER() armci_msg_barrier() -# define MP_FINALIZE() -# define MP_INIT(argc,argv) -# define MP_MYID(pid) *(pid)=armci_msg_me() -# define MP_PROCS(pproc) *(pproc)=armci_msg_nproc() -# define MP_TIMER armci_timer -# define MP_ASSERT(code) code #else # include # define MP_BARRIER() MPI_Barrier(MPI_COMM_WORLD) # define MP_FINALIZE() MPI_Finalize() -# if defined(DCMF) || defined(MPI_MT) || defined(MPI_PT) +# if defined(MPI_MT) || defined(MPI_PT) static inline int MPI_INIT_THREAD(int *argc, char ***argv) { int status; int provided; diff --git a/global/testing/mulmatpatch.F b/global/testing/mulmatpatch.F index 57495d95c..0e6ae10e5 100644 --- a/global/testing/mulmatpatch.F +++ b/global/testing/mulmatpatch.F @@ -2,18 +2,9 @@ # include "config.fh" #endif c $Id: mulmatpatch.F,v 1.6 2005-11-23 10:25:18 manoj Exp $ -#if (defined(CRAY) && !defined(__crayx1)) || defined(KSR) -# define xgemm SGEMM -# define ygemm CGEMM -#else -# define xgemm TEST_DGEMM -# define ygemm TEST_ZGEMM -#endif -#if defined(FUJITSU) || defined(CRAY_YMP) -# define THRESH 1.0d-10 -#else +#define xgemm TEST_DGEMM +#define ygemm TEST_ZGEMM # define THRESH 1.0d-20 -#endif #define MISMATCH(x,y) abs(x-y)/max(1,abs(x)).gt.THRESH #define NMAX 10 c diff --git a/global/testing/mulmatpatchc.c b/global/testing/mulmatpatchc.c index 7100b0784..b99a2f16b 100644 --- a/global/testing/mulmatpatchc.c +++ b/global/testing/mulmatpatchc.c @@ -12,11 +12,7 @@ #include "mp3.h" #include "xgemm.h" -#if defined(FUJITSU) || defined(CRAY_YMP) -# define THRESH 1.0e-10 -#else # define THRESH 1.0e-20 -#endif #define ABS(x) ((x) >= 0.0 ? (x) : -(x)) #define MAX(x,y) ((x) >= (y) ? (x) : (y)) #define MISMATCH(x,y) (ABS((x)-(y)) / MAX(1.0,ABS((x)))) > THRESH diff --git a/global/testing/nb2test.F b/global/testing/nb2test.F index 7a8fc891e..5b2ce0036 100644 --- a/global/testing/nb2test.F +++ b/global/testing/nb2test.F @@ -2,15 +2,8 @@ # include "config.fh" #endif c vector boxes lack arithmetic precision -#ifdef CRAY_YMP -# define THRESH 1d-10 -#elif defined(FUJITSU) -# define THRESH 1d-12 -# define THRESHF 1e-5 -#else # define THRESH 1d-13 # define THRESHF 1e-5 -#endif #define MISMATCH(x,y) abs(x-y)/max(1d0,abs(x)).gt.THRESH #define MISMATCHF(x,y) abs(x-y)/max(1.0,abs(x)).gt.THRESHF diff --git a/global/testing/ngatest_src/generated/nga-onesided.F b/global/testing/ngatest_src/generated/nga-onesided.F deleted file mode 100644 index 9b6b36174..000000000 --- a/global/testing/ngatest_src/generated/nga-onesided.F +++ /dev/null @@ -1,14601 +0,0 @@ - - -#if HAVE_CONFIG_H -# include "config.fh" -#endif -#define MAXLOOP 100 - - program test - implicit none -#include "mafdecls.fh" -#include "global.fh" - integer stack, heap -c -c*** Intitialize a message passing library -c -#include "mp3.fh" -c -c*** Intitialize the GA package - call ga_initialize() -c if(ga_nodeid().eq.0)print *,ga_nnodes(),' nodes' -c - if(ga_uses_ma()) then - stack = 200000 -c stack = 100000 - else - stack = 60000 -c stack = 20000 - endif - heap = stack - if (.not. ma_init(MT_DBL, heap, stack)) - $ call ga_error("ma init failed",heap+stack) -c -c -c test GA_FILL - -c -c test NGA_GET - - call testit_NGA_GET_int1() - - call testit_NGA_GET_dbl1() - - call testit_NGA_GET_dcpl1() - - call testit_NGA_GET_int2() - - call testit_NGA_GET_dbl2() - - call testit_NGA_GET_dcpl2() - - call testit_NGA_GET_int3() - - call testit_NGA_GET_dbl3() - - call testit_NGA_GET_dcpl3() - - call testit_NGA_GET_int4() - - call testit_NGA_GET_dbl4() - - call testit_NGA_GET_dcpl4() - - call testit_NGA_GET_int5() - - call testit_NGA_GET_dbl5() - - call testit_NGA_GET_dcpl5() - - call testit_NGA_GET_int6() - - call testit_NGA_GET_dbl6() - - call testit_NGA_GET_dcpl6() - - call testit_NGA_GET_int7() - - call testit_NGA_GET_dbl7() - - call testit_NGA_GET_dcpl7() - -c -c test NGA_PUT - - call testit_NGA_PUT_int1() - - call testit_NGA_PUT_dbl1() - - call testit_NGA_PUT_dcpl1() - - call testit_NGA_PUT_int2() - - call testit_NGA_PUT_dbl2() - - call testit_NGA_PUT_dcpl2() - - call testit_NGA_PUT_int3() - - call testit_NGA_PUT_dbl3() - - call testit_NGA_PUT_dcpl3() - - call testit_NGA_PUT_int4() - - call testit_NGA_PUT_dbl4() - - call testit_NGA_PUT_dcpl4() - - call testit_NGA_PUT_int5() - - call testit_NGA_PUT_dbl5() - - call testit_NGA_PUT_dcpl5() - - call testit_NGA_PUT_int6() - - call testit_NGA_PUT_dbl6() - - call testit_NGA_PUT_dcpl6() - - call testit_NGA_PUT_int7() - - call testit_NGA_PUT_dbl7() - - call testit_NGA_PUT_dcpl7() - -c -c test NGA_ACC - - call testit_NGA_ACC_int1() - - call testit_NGA_ACC_dbl1() - - call testit_NGA_ACC_dcpl1() - - call testit_NGA_ACC_int2() - - call testit_NGA_ACC_dbl2() - - call testit_NGA_ACC_dcpl2() - - call testit_NGA_ACC_int3() - - call testit_NGA_ACC_dbl3() - - call testit_NGA_ACC_dcpl3() - - call testit_NGA_ACC_int4() - - call testit_NGA_ACC_dbl4() - - call testit_NGA_ACC_dcpl4() - - call testit_NGA_ACC_int5() - - call testit_NGA_ACC_dbl5() - - call testit_NGA_ACC_dcpl5() - - call testit_NGA_ACC_int6() - - call testit_NGA_ACC_dbl6() - - call testit_NGA_ACC_dcpl6() - - call testit_NGA_ACC_int7() - - call testit_NGA_ACC_dbl7() - - call testit_NGA_ACC_dcpl7() - -c -c test NGA_PERIODIC_GET - -c -c test NGA_PERIODIC_PUT - -c -c test NGA_PERIODIC_ACC - -c -c test NGA_FILL_PATCH - -c -c test NGA_COPY_PATCH - -c -c test NGA_SCALE_PATCH - -c -c test NGA_ADD_PATCH - -c -c test NGA_DOT_PATCH - -c -c test NGA_SCATTER - - call testit_NGA_SCATTER_int1() - - call testit_NGA_SCATTER_dbl1() - - call testit_NGA_SCATTER_dcpl1() - - call testit_NGA_SCATTER_int2() - - call testit_NGA_SCATTER_dbl2() - - call testit_NGA_SCATTER_dcpl2() - - call testit_NGA_SCATTER_int3() - - call testit_NGA_SCATTER_dbl3() - - call testit_NGA_SCATTER_dcpl3() - - call testit_NGA_SCATTER_int4() - - call testit_NGA_SCATTER_dbl4() - - call testit_NGA_SCATTER_dcpl4() - - call testit_NGA_SCATTER_int5() - - call testit_NGA_SCATTER_dbl5() - - call testit_NGA_SCATTER_dcpl5() - - call testit_NGA_SCATTER_int6() - - call testit_NGA_SCATTER_dbl6() - - call testit_NGA_SCATTER_dcpl6() - - call testit_NGA_SCATTER_int7() - - call testit_NGA_SCATTER_dbl7() - - call testit_NGA_SCATTER_dcpl7() - -c -c test NGA_SCATTER_ACC - - call testit_NGA_SCATTER_ACC_int1() - - call testit_NGA_SCATTER_ACC_dbl1() - - call testit_NGA_SCATTER_ACC_dcpl1() - - call testit_NGA_SCATTER_ACC_int2() - - call testit_NGA_SCATTER_ACC_dbl2() - - call testit_NGA_SCATTER_ACC_dcpl2() - - call testit_NGA_SCATTER_ACC_int3() - - call testit_NGA_SCATTER_ACC_dbl3() - - call testit_NGA_SCATTER_ACC_dcpl3() - - call testit_NGA_SCATTER_ACC_int4() - - call testit_NGA_SCATTER_ACC_dbl4() - - call testit_NGA_SCATTER_ACC_dcpl4() - - call testit_NGA_SCATTER_ACC_int5() - - call testit_NGA_SCATTER_ACC_dbl5() - - call testit_NGA_SCATTER_ACC_dcpl5() - - call testit_NGA_SCATTER_ACC_int6() - - call testit_NGA_SCATTER_ACC_dbl6() - - call testit_NGA_SCATTER_ACC_dcpl6() - - call testit_NGA_SCATTER_ACC_int7() - - call testit_NGA_SCATTER_ACC_dbl7() - - call testit_NGA_SCATTER_ACC_dcpl7() - -c -c test NGA_GATHER - - call testit_NGA_GATHER_int1() - - call testit_NGA_GATHER_dbl1() - - call testit_NGA_GATHER_dcpl1() - - call testit_NGA_GATHER_int2() - - call testit_NGA_GATHER_dbl2() - - call testit_NGA_GATHER_dcpl2() - - call testit_NGA_GATHER_int3() - - call testit_NGA_GATHER_dbl3() - - call testit_NGA_GATHER_dcpl3() - - call testit_NGA_GATHER_int4() - - call testit_NGA_GATHER_dbl4() - - call testit_NGA_GATHER_dcpl4() - - call testit_NGA_GATHER_int5() - - call testit_NGA_GATHER_dbl5() - - call testit_NGA_GATHER_dcpl5() - - call testit_NGA_GATHER_int6() - - call testit_NGA_GATHER_dbl6() - - call testit_NGA_GATHER_dcpl6() - - call testit_NGA_GATHER_int7() - - call testit_NGA_GATHER_dbl7() - - call testit_NGA_GATHER_dcpl7() - - - -c - if(ga_nodeid().eq.0) print *, 'All tests successful' -c - call ga_terminate() - call MP_FINALIZE() - end - -c----------------- - - - - - - subroutine testit_NGA_PUT_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_int(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1)),ld) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1)),ld) - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer c(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - integer val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = int(drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_int(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_int(total, - $ int(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1)),ld) -c - call compare_patches_int(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) -c -c compare the results - if(a(lo(1)) .ne. - $ b(lo(1))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - integer v(m) - integer d(ndim, m) - double precision drand - integer alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=int(drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) - a(d(1,i))= - $ a(d(1,i)) - $ *alpha - $ +b(lo(1)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) -c -c compare the results - if(abs(a(lo(1)) - - $ b(lo(1))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1)), - $ 'b=', - $ b(lo(1)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GATHER_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - subroutine testit_NGA_PUT_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_int(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),ld) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),ld) - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer c(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - integer val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = int(drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_int(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_int(total, - $ int(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2)),ld) -c - call compare_patches_int(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) -c -c compare the results - if(a(lo(1),lo(2)) .ne. - $ b(lo(1),lo(2))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - integer v(m) - integer d(ndim, m) - double precision drand - integer alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=int(drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) - a(d(1,i),d(2,i))= - $ a(d(1,i),d(2,i)) - $ *alpha - $ +b(lo(1),lo(2)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2)) - - $ b(lo(1),lo(2))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2)), - $ 'b=', - $ b(lo(1),lo(2)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GATHER_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - subroutine testit_NGA_PUT_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_int(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld) - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer c(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - integer val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = int(drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_int(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_int(total, - $ int(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3)),ld) -c - call compare_patches_int(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3)) .ne. - $ b(lo(1),lo(2),lo(3))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - integer v(m) - integer d(ndim, m) - double precision drand - integer alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=int(drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) - a(d(1,i),d(2,i),d(3,i))= - $ a(d(1,i),d(2,i),d(3,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3)) - - $ b(lo(1),lo(2),lo(3))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3)), - $ 'b=', - $ b(lo(1),lo(2),lo(3)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GATHER_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - subroutine testit_NGA_PUT_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_int(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer c(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - integer val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = int(drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_int(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_int(total, - $ int(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4)),ld) -c - call compare_patches_int(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - integer v(m) - integer d(ndim, m) - double precision drand - integer alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=int(drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4)) - - $ b(lo(1),lo(2),lo(3),lo(4))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GATHER_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - subroutine testit_NGA_PUT_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_int(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer c(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - integer val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = int(drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_int(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_int(total, - $ int(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c - call compare_patches_int(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - integer v(m) - integer d(ndim, m) - double precision drand - integer alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=int(drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4),lo(5)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4),lo(5)) - - $ b(lo(1),lo(2),lo(3),lo(4),lo(5))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GATHER_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - subroutine testit_NGA_PUT_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_int(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer c(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - integer val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = int(drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_int(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_int(total, - $ int(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c - call compare_patches_int(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - integer v(m) - integer d(ndim, m) - double precision drand - integer alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=int(drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)) - - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GATHER_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - subroutine testit_NGA_PUT_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_int(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer c(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - integer val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = int(drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_int(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_int(total, - $ int(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c - call compare_patches_int(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - integer v(m) - integer d(ndim, m) - double precision drand - integer alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=int(drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)) - - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GATHER_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - subroutine testit_NGA_PUT_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dbl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1)),ld) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1)),ld) - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - double precision c(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double precision val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = drand(0) * me*2+1 * 2 - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dbl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dbl(total, - $ dble(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1)),ld) -c - call compare_patches_dbl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) -c -c compare the results - if(a(lo(1)) .ne. - $ b(lo(1))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double precision v(m) - integer d(ndim, m) - double precision drand - double precision alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=drand(0) * me*2+1 * 2 -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) - a(d(1,i))= - $ a(d(1,i)) - $ *alpha - $ +b(lo(1)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) -c -c compare the results - if(abs(a(lo(1)) - - $ b(lo(1))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1)), - $ 'b=', - $ b(lo(1)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GATHER_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - subroutine testit_NGA_PUT_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dbl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),ld) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),ld) - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - double precision c(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double precision val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = drand(0) * me*2+1 * 2 - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dbl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dbl(total, - $ dble(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2)),ld) -c - call compare_patches_dbl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) -c -c compare the results - if(a(lo(1),lo(2)) .ne. - $ b(lo(1),lo(2))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double precision v(m) - integer d(ndim, m) - double precision drand - double precision alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=drand(0) * me*2+1 * 2 -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) - a(d(1,i),d(2,i))= - $ a(d(1,i),d(2,i)) - $ *alpha - $ +b(lo(1),lo(2)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2)) - - $ b(lo(1),lo(2))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2)), - $ 'b=', - $ b(lo(1),lo(2)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GATHER_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - subroutine testit_NGA_PUT_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dbl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld) - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - double precision c(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double precision val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = drand(0) * me*2+1 * 2 - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dbl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dbl(total, - $ dble(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3)),ld) -c - call compare_patches_dbl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3)) .ne. - $ b(lo(1),lo(2),lo(3))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double precision v(m) - integer d(ndim, m) - double precision drand - double precision alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=drand(0) * me*2+1 * 2 -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) - a(d(1,i),d(2,i),d(3,i))= - $ a(d(1,i),d(2,i),d(3,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3)) - - $ b(lo(1),lo(2),lo(3))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3)), - $ 'b=', - $ b(lo(1),lo(2),lo(3)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GATHER_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - subroutine testit_NGA_PUT_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dbl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - double precision c(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double precision val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = drand(0) * me*2+1 * 2 - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dbl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dbl(total, - $ dble(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4)),ld) -c - call compare_patches_dbl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i),d(4,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double precision v(m) - integer d(ndim, m) - double precision drand - double precision alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=drand(0) * me*2+1 * 2 -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i),d(4,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4)) - - $ b(lo(1),lo(2),lo(3),lo(4))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GATHER_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - subroutine testit_NGA_PUT_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dbl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - double precision c(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double precision val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = drand(0) * me*2+1 * 2 - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dbl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dbl(total, - $ dble(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c - call compare_patches_dbl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double precision v(m) - integer d(ndim, m) - double precision drand - double precision alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=drand(0) * me*2+1 * 2 -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4),lo(5)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4),lo(5)) - - $ b(lo(1),lo(2),lo(3),lo(4),lo(5))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GATHER_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - subroutine testit_NGA_PUT_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dbl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - double precision c(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double precision val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = drand(0) * me*2+1 * 2 - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dbl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dbl(total, - $ dble(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c - call compare_patches_dbl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double precision v(m) - integer d(ndim, m) - double precision drand - double precision alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=drand(0) * me*2+1 * 2 -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)) - - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GATHER_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - subroutine testit_NGA_PUT_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dbl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - double precision c(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double precision val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = drand(0) * me*2+1 * 2 - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dbl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dbl(total, - $ dble(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c - call compare_patches_dbl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double precision v(m) - integer d(ndim, m) - double precision drand - double precision alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=drand(0) * me*2+1 * 2 -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)) - - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GATHER_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - subroutine testit_NGA_PUT_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dcpl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1)),ld) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(234),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1)),ld) - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - double complex c(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double complex val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dcpl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dcpl(total, - $ dcmplx(dble(1),dble(0)),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1)),ld) -c - call compare_patches_dcpl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) -c -c compare the results - if(a(lo(1)) .ne. - $ b(lo(1))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double complex v(m) - integer d(ndim, m) - double precision drand - double complex alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) - a(d(1,i))= - $ a(d(1,i)) - $ *alpha - $ +b(lo(1)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) -c -c compare the results - if(abs(a(lo(1)) - - $ b(lo(1))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1)), - $ 'b=', - $ b(lo(1)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GATHER_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - subroutine testit_NGA_PUT_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dcpl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),ld) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(234),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),ld) - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - double complex c(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double complex val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dcpl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dcpl(total, - $ dcmplx(dble(1),dble(0)),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2)),ld) -c - call compare_patches_dcpl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) -c -c compare the results - if(a(lo(1),lo(2)) .ne. - $ b(lo(1),lo(2))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double complex v(m) - integer d(ndim, m) - double precision drand - double complex alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) - a(d(1,i),d(2,i))= - $ a(d(1,i),d(2,i)) - $ *alpha - $ +b(lo(1),lo(2)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2)) - - $ b(lo(1),lo(2))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2)), - $ 'b=', - $ b(lo(1),lo(2)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GATHER_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - subroutine testit_NGA_PUT_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dcpl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(234),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld) - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - double complex c(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double complex val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dcpl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dcpl(total, - $ dcmplx(dble(1),dble(0)),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3)),ld) -c - call compare_patches_dcpl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3)) .ne. - $ b(lo(1),lo(2),lo(3))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double complex v(m) - integer d(ndim, m) - double precision drand - double complex alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) - a(d(1,i),d(2,i),d(3,i))= - $ a(d(1,i),d(2,i),d(3,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3)) - - $ b(lo(1),lo(2),lo(3))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3)), - $ 'b=', - $ b(lo(1),lo(2),lo(3)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GATHER_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - subroutine testit_NGA_PUT_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dcpl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(234),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - double complex c(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double complex val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dcpl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dcpl(total, - $ dcmplx(dble(1),dble(0)),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4)),ld) -c - call compare_patches_dcpl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double complex v(m) - integer d(ndim, m) - double precision drand - double complex alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4)) - - $ b(lo(1),lo(2),lo(3),lo(4))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GATHER_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - subroutine testit_NGA_PUT_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dcpl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(234),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - double complex c(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double complex val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dcpl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dcpl(total, - $ dcmplx(dble(1),dble(0)),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c - call compare_patches_dcpl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double complex v(m) - integer d(ndim, m) - double precision drand - double complex alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4),lo(5)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4),lo(5)) - - $ b(lo(1),lo(2),lo(3),lo(4),lo(5))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GATHER_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - subroutine testit_NGA_PUT_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dcpl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(234),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - double complex c(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double complex val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dcpl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dcpl(total, - $ dcmplx(dble(1),dble(0)),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c - call compare_patches_dcpl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double complex v(m) - integer d(ndim, m) - double precision drand - double complex alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)) - - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GATHER_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - subroutine testit_NGA_PUT_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dcpl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(234),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - double complex c(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double complex val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dcpl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dcpl(total, - $ dcmplx(dble(1),dble(0)),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c - call compare_patches_dcpl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double complex v(m) - integer d(ndim, m) - double precision drand - double complex alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)) - - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GATHER_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - -c----------------------- -c Utility functions - - subroutine random_range(lo,hi,lop,hip,ndim) - implicit none - integer lo(1),hi(1),lop(1),hip(1),ndim - integer i, range, swap, val,iran - double precision drand - iran(range) = int(drand(0)*dble(range)) + 1 -c iran(range) = range/2 - do i = 1, ndim - range = hi(i)-lo(i)+1 - val = iran(range) - lop(i) = lo(i) + val - val = iran(range) - hip(i) = hi(i) - val - if(hip(i) .lt. lop(i))then - swap =hip(i) - hip(i)=lop(i) - lop(i)=swap - endif - hip(i)=MIN(hip(i),hi(i)) - lop(i)=MAX(lop(i),lo(i)) - enddo - end -c - -c - subroutine random_range_outbound(lo,hi,lop,hip,ndim) - implicit none - integer lo(1),hi(1),lop(1),hip(1),ndim - integer i, range, val,iran - double precision drand - iran(range) = int(drand(0)*dble(range)) + 1 -c iran(range) = range/2 - do i = 1, ndim - range = 2*(hi(i)-lo(i)+1) - val = iran(range) - lop(i) = lo(i) + val - range = hi(i)-lo(i)+1 - val = iran(range) - hip(i) = lop(i) + range - val -c - hip(i) = hip(i)-hi(i) - lop(i) = lop(i)-hi(i) - enddo - end -c -c - integer function count_elems(lo,hi,ndim) - implicit none - integer lo(1),hi(1),ndim,elems,i - elems=1 - do i=1,ndim - elems = elems*(hi(i)-lo(i)+1) - enddo - count_elems = elems - end -c - -c get the next available nindex in the range of lo and hi - integer function next_index(ind,total,ndim,lo,hi,dims) - implicit none - integer ind,total,ndim,lo(ndim),hi(ndim),dims(ndim) - integer i - integer indx(8),nindex -c - nindex = ind + 1 - 200 call conv_1ton(ndim,dims,nindex,indx) -c -c test if indx(i) is in the range of lo(i) and hi(i) - do i=1,ndim - if((indx(i).lt.lo(i)).or.(indx(i).gt.hi(i))) then - nindex = nindex + 1 - if(nindex.gt.total) then - next_index = 0 - goto 300 - else - goto 200 - endif - endif - enddo -c - next_index = nindex - 300 end - -c testing if the indices are unique - integer function unique(ind,ndim,m,n) - implicit none - integer ndim,m,n - integer ind(ndim,m) - integer i,j,marker -c - unique = 1 - do i = 1, n-1 - marker = 0 - do j = 1, ndim - if(ind(j,n).eq.ind(j,i)) marker = marker + 1 - enddo -c - if(marker.eq.ndim) unique = 0 - enddo -c - end - -c - subroutine prnt_rng(me,lo,hi,ndim) - implicit none - integer me,ndim - integer lo(ndim),hi(ndim) - integer i -c - print *, me,': array section [',(lo(i),':',hi(i),i=1,ndim),']' -c - end - -c divide the space into equal size patches according to nproc -c and calculate my lo and hi - subroutine my_space(me,nproc,ndim,total,dims,lo,hi) - implicit none - integer me,nproc,ndim,total - integer dims(ndim),lo(ndim),hi(ndim) - integer div,lop,hip,i -c - div = total/nproc -c - lop = div * me + 1 -c - if(me.eq.(nproc-1)) then - hip = total - else - hip = div * (me+1) - endif -c - call conv_1ton(ndim,dims,lop,lo) - call conv_1ton(ndim,dims,hip,hi) -c -c swap the indices if the lo if larger thant hi - do i = 1,ndim - if(lo(i).gt.hi(i)) then - if(i.eq.ndim) call ga_error('bye',0) - lo(i) = 1 - lo(i+1) = lo(i+1) + 1 - endif - enddo - end - -c convert the index from one dimension to n dimension - subroutine conv_1ton(ndim,dims,ind1,indn) - implicit none - integer ndim - integer dims(ndim) - integer ind1,indn(ndim) - integer range(8),remainder,i -c - remainder = ind1 -c get the range of each dimension - do i=1,ndim - if(i.eq.1) then - range(i) = dims(i) - else - range(i) = range(i-1) * dims(i) - endif - enddo -c -c get the indices in each dimension - do i = ndim,1,-1 - if(i.ne.1) then - indn(i) = remainder/range(i-1) - remainder = remainder - indn(i)*range(i-1) - if(remainder.eq.0) then - remainder = range(i-1) - else - indn(i) = indn(i) + 1 - endif - else - indn(i) = remainder - endif - enddo -c - end - - -c fill array with random numbers - subroutine fill_array_int(a,n,val) - implicit none - integer n - integer a(n),val - integer i - do i= 1, n - a(i) = val - enddo - end - -c initialize the array with random numbers - subroutine init_array_int(a,n) - implicit none - integer n - integer a(n) - double precision drand - integer i - do i= 1, n - a(i) = int(drand(0) * i * 2) - enddo - end - -c if the elements do match, stop the program - subroutine compare_patches_int(eps, - $ total1,array1,lo1,hi1,ndim1,dims1, - $ total2,array2,lo2,hi2,ndim2,dims2) - implicit none - double precision eps - integer ndim1,ndim2,total1,total2 - integer array1(total1),array2(total2) - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer next_index - integer index1,index2 - double precision diff,maxval -c -c initialize index1 and index2, searching from zeros - index1 = 0 - index2 = 0 -c compare corresponding elements in each array - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - do while((index1.ne.0).and.(index2.ne.0)) - diff = abs(array1(index1) - array2(index2)) - maxval = max(abs(array1(index1)), abs(array2(index2))) - if((maxval.eq.0).or.(maxval.lt.eps)) maxval = 1 - if(eps .lt. abs(diff)/maxval) then - print *, 'Error: Comparison failed!' - print *, array1(index1), array2(index2) - call ga_error('bye',0) - endif - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - enddo -c -c at this point both index1 and index2 should be 0 - if((index1.ne.0).or.(index2.ne.0)) then - print *, 'Error: # of elems dont match' - call ga_error('bye',0) - endif -c - end - -c do patch = patch + buf * alpha - subroutine scale_patch_int(total, - $ alpha,arr1,lo1,hi1,ndim1,dims1,beta,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer arr1(total),arr2(total) - integer alpha, beta - integer next_index - integer ind1, ind2 -c - ind1 = 0 - ind2 = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - arr1(ind1) = arr1(ind1)*alpha + arr2(ind2)*beta - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - end - -c transpose an array - subroutine transpose_int(a1,a2,total,ndim,dims) - implicit none - integer ndim,total - integer dims(ndim) - integer a1(total),a2(total) - integer i, j - integer idx - integer bv(8), bunit(8) -c - bv(1)=0 - bunit(1)=1 - do i=2, ndim - bv(i) = 0 - bunit(i) = bunit(i-1) * dims(i-1) - enddo -c - do i=1, total - idx = 1 - do j=1, ndim - idx = idx + bv(j) * bunit(ndim-j+1) - if(mod(i,bunit(j)).eq.0) bv(j) = bv(j) + 1 - if(bv(j).ge.dims(j)) bv(j) = 0 - enddo -c print *, 'i = ',i, 'idx = ',idx - a2(idx) = a1(i) - enddo -c - do i=1, total - a1(i) = a2(i) - enddo -c - end - -c do patch = patch + buf * alpha - integer function dot_patch_int( - $ total,arr1,lo1,hi1,ndim1,dims1,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer arr1(total),arr2(total) - integer next_index - integer ind1, ind2 - integer res -c - ind1 = 0 - ind2 = 0 - res = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - res = res + arr1(ind1)*arr2(ind2) - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - dot_patch_int = res -c - end - - -c fill array with random numbers - subroutine fill_array_dbl(a,n,val) - implicit none - integer n - double precision a(n),val - integer i - do i= 1, n - a(i) = val - enddo - end - -c initialize the array with random numbers - subroutine init_array_dbl(a,n) - implicit none - integer n - double precision a(n) - double precision drand - integer i - do i= 1, n - a(i) = drand(0) * i * 2 - enddo - end - -c if the elements do match, stop the program - subroutine compare_patches_dbl(eps, - $ total1,array1,lo1,hi1,ndim1,dims1, - $ total2,array2,lo2,hi2,ndim2,dims2) - implicit none - double precision eps - integer ndim1,ndim2,total1,total2 - double precision array1(total1),array2(total2) - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer next_index - integer index1,index2 - double precision diff,maxval -c -c initialize index1 and index2, searching from zeros - index1 = 0 - index2 = 0 -c compare corresponding elements in each array - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - do while((index1.ne.0).and.(index2.ne.0)) - diff = abs(array1(index1) - array2(index2)) - maxval = max(abs(array1(index1)), abs(array2(index2))) - if((maxval.eq.0).or.(maxval.lt.eps)) maxval = 1 - if(eps .lt. abs(diff)/maxval) then - print *, 'Error: Comparison failed!' - print *, array1(index1), array2(index2) - call ga_error('bye',0) - endif - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - enddo -c -c at this point both index1 and index2 should be 0 - if((index1.ne.0).or.(index2.ne.0)) then - print *, 'Error: # of elems dont match' - call ga_error('bye',0) - endif -c - end - -c do patch = patch + buf * alpha - subroutine scale_patch_dbl(total, - $ alpha,arr1,lo1,hi1,ndim1,dims1,beta,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double precision arr1(total),arr2(total) - double precision alpha, beta - integer next_index - integer ind1, ind2 -c - ind1 = 0 - ind2 = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - arr1(ind1) = arr1(ind1)*alpha + arr2(ind2)*beta - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - end - -c transpose an array - subroutine transpose_dbl(a1,a2,total,ndim,dims) - implicit none - integer ndim,total - integer dims(ndim) - double precision a1(total),a2(total) - integer i, j - integer idx - integer bv(8), bunit(8) -c - bv(1)=0 - bunit(1)=1 - do i=2, ndim - bv(i) = 0 - bunit(i) = bunit(i-1) * dims(i-1) - enddo -c - do i=1, total - idx = 1 - do j=1, ndim - idx = idx + bv(j) * bunit(ndim-j+1) - if(mod(i,bunit(j)).eq.0) bv(j) = bv(j) + 1 - if(bv(j).ge.dims(j)) bv(j) = 0 - enddo -c print *, 'i = ',i, 'idx = ',idx - a2(idx) = a1(i) - enddo -c - do i=1, total - a1(i) = a2(i) - enddo -c - end - -c do patch = patch + buf * alpha - double precision function dot_patch_dbl( - $ total,arr1,lo1,hi1,ndim1,dims1,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double precision arr1(total),arr2(total) - integer next_index - integer ind1, ind2 - double precision res -c - ind1 = 0 - ind2 = 0 - res = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - res = res + arr1(ind1)*arr2(ind2) - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - dot_patch_dbl = res -c - end - - -c fill array with random numbers - subroutine fill_array_dcpl(a,n,val) - implicit none - integer n - double complex a(n),val - integer i - do i= 1, n - a(i) = val - enddo - end - -c initialize the array with random numbers - subroutine init_array_dcpl(a,n) - implicit none - integer n - double complex a(n) - double precision drand - integer i - do i= 1, n - a(i) = dcmplx(drand(0) * i * 2, - $ -drand(0) * i * 2) - enddo - end - -c if the elements do match, stop the program - subroutine compare_patches_dcpl(eps, - $ total1,array1,lo1,hi1,ndim1,dims1, - $ total2,array2,lo2,hi2,ndim2,dims2) - implicit none - double precision eps - integer ndim1,ndim2,total1,total2 - double complex array1(total1),array2(total2) - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer next_index - integer index1,index2 - double precision diff,maxval -c -c initialize index1 and index2, searching from zeros - index1 = 0 - index2 = 0 -c compare corresponding elements in each array - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - do while((index1.ne.0).and.(index2.ne.0)) - diff = abs(array1(index1) - array2(index2)) - maxval = max(abs(array1(index1)), abs(array2(index2))) - if((maxval.eq.0).or.(maxval.lt.eps)) maxval = 1 - if(eps .lt. abs(diff)/maxval) then - print *, 'Error: Comparison failed!' - print *, array1(index1), array2(index2) - call ga_error('bye',0) - endif - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - enddo -c -c at this point both index1 and index2 should be 0 - if((index1.ne.0).or.(index2.ne.0)) then - print *, 'Error: # of elems dont match' - call ga_error('bye',0) - endif -c - end - -c do patch = patch + buf * alpha - subroutine scale_patch_dcpl(total, - $ alpha,arr1,lo1,hi1,ndim1,dims1,beta,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double complex arr1(total),arr2(total) - double complex alpha, beta - integer next_index - integer ind1, ind2 -c - ind1 = 0 - ind2 = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - arr1(ind1) = arr1(ind1)*alpha + arr2(ind2)*beta - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - end - -c transpose an array - subroutine transpose_dcpl(a1,a2,total,ndim,dims) - implicit none - integer ndim,total - integer dims(ndim) - double complex a1(total),a2(total) - integer i, j - integer idx - integer bv(8), bunit(8) -c - bv(1)=0 - bunit(1)=1 - do i=2, ndim - bv(i) = 0 - bunit(i) = bunit(i-1) * dims(i-1) - enddo -c - do i=1, total - idx = 1 - do j=1, ndim - idx = idx + bv(j) * bunit(ndim-j+1) - if(mod(i,bunit(j)).eq.0) bv(j) = bv(j) + 1 - if(bv(j).ge.dims(j)) bv(j) = 0 - enddo -c print *, 'i = ',i, 'idx = ',idx - a2(idx) = a1(i) - enddo -c - do i=1, total - a1(i) = a2(i) - enddo -c - end - -c do patch = patch + buf * alpha - double complex function dot_patch_dcpl( - $ total,arr1,lo1,hi1,ndim1,dims1,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double complex arr1(total),arr2(total) - integer next_index - integer ind1, ind2 - double complex res -c - ind1 = 0 - ind2 = 0 - res = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - res = res + arr1(ind1)*arr2(ind2) - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - dot_patch_dcpl = res -c - end - - diff --git a/global/testing/ngatest_src/generated/nga-patch.F b/global/testing/ngatest_src/generated/nga-patch.F deleted file mode 100644 index e27a53275..000000000 --- a/global/testing/ngatest_src/generated/nga-patch.F +++ /dev/null @@ -1,15996 +0,0 @@ - - -#if HAVE_CONFIG_H -# include "config.fh" -#endif -#define MAXLOOP 100 - - program test - implicit none -#include "mafdecls.fh" -#include "global.fh" - integer stack, heap -c -c*** Intitialize a message passing library -c -#include "mp3.fh" -c -c*** Intitialize the GA package - call ga_initialize() -c if(ga_nodeid().eq.0)print *,ga_nnodes(),' nodes' -c - if(ga_uses_ma()) then - stack = 200000 -c stack = 100000 - else - stack = 60000 -c stack = 20000 - endif - heap = stack - if (.not. ma_init(MT_DBL, heap, stack)) - $ call ga_error("ma init failed",heap+stack) -c -c -c test GA_FILL - -c -c test NGA_GET - -c -c test NGA_PUT - -c -c test NGA_ACC - -c -c test NGA_PERIODIC_GET - -c -c test NGA_PERIODIC_PUT - -c -c test NGA_PERIODIC_ACC - -c -c test NGA_FILL_PATCH - - call testit_NGA_FILL_PATCH_int1() - - call testit_NGA_FILL_PATCH_dbl1() - - call testit_NGA_FILL_PATCH_dcpl1() - - call testit_NGA_FILL_PATCH_int2() - - call testit_NGA_FILL_PATCH_dbl2() - - call testit_NGA_FILL_PATCH_dcpl2() - - call testit_NGA_FILL_PATCH_int3() - - call testit_NGA_FILL_PATCH_dbl3() - - call testit_NGA_FILL_PATCH_dcpl3() - - call testit_NGA_FILL_PATCH_int4() - - call testit_NGA_FILL_PATCH_dbl4() - - call testit_NGA_FILL_PATCH_dcpl4() - - call testit_NGA_FILL_PATCH_int5() - - call testit_NGA_FILL_PATCH_dbl5() - - call testit_NGA_FILL_PATCH_dcpl5() - - call testit_NGA_FILL_PATCH_int6() - - call testit_NGA_FILL_PATCH_dbl6() - - call testit_NGA_FILL_PATCH_dcpl6() - - call testit_NGA_FILL_PATCH_int7() - - call testit_NGA_FILL_PATCH_dbl7() - - call testit_NGA_FILL_PATCH_dcpl7() - -c -c test NGA_COPY_PATCH - - call testit_NGA_COPY_PATCH_int1() - - call testit_NGA_COPY_PATCH_dbl1() - - call testit_NGA_COPY_PATCH_dcpl1() - - call testit_NGA_COPY_PATCH_int2() - - call testit_NGA_COPY_PATCH_dbl2() - - call testit_NGA_COPY_PATCH_dcpl2() - - call testit_NGA_COPY_PATCH_int3() - - call testit_NGA_COPY_PATCH_dbl3() - - call testit_NGA_COPY_PATCH_dcpl3() - - call testit_NGA_COPY_PATCH_int4() - - call testit_NGA_COPY_PATCH_dbl4() - - call testit_NGA_COPY_PATCH_dcpl4() - - call testit_NGA_COPY_PATCH_int5() - - call testit_NGA_COPY_PATCH_dbl5() - - call testit_NGA_COPY_PATCH_dcpl5() - - call testit_NGA_COPY_PATCH_int6() - - call testit_NGA_COPY_PATCH_dbl6() - - call testit_NGA_COPY_PATCH_dcpl6() - - call testit_NGA_COPY_PATCH_int7() - - call testit_NGA_COPY_PATCH_dbl7() - - call testit_NGA_COPY_PATCH_dcpl7() - -c -c test NGA_SCALE_PATCH - - call testit_NGA_SCALE_PATCH_int1() - - call testit_NGA_SCALE_PATCH_dbl1() - - call testit_NGA_SCALE_PATCH_dcpl1() - - call testit_NGA_SCALE_PATCH_int2() - - call testit_NGA_SCALE_PATCH_dbl2() - - call testit_NGA_SCALE_PATCH_dcpl2() - - call testit_NGA_SCALE_PATCH_int3() - - call testit_NGA_SCALE_PATCH_dbl3() - - call testit_NGA_SCALE_PATCH_dcpl3() - - call testit_NGA_SCALE_PATCH_int4() - - call testit_NGA_SCALE_PATCH_dbl4() - - call testit_NGA_SCALE_PATCH_dcpl4() - - call testit_NGA_SCALE_PATCH_int5() - - call testit_NGA_SCALE_PATCH_dbl5() - - call testit_NGA_SCALE_PATCH_dcpl5() - - call testit_NGA_SCALE_PATCH_int6() - - call testit_NGA_SCALE_PATCH_dbl6() - - call testit_NGA_SCALE_PATCH_dcpl6() - - call testit_NGA_SCALE_PATCH_int7() - - call testit_NGA_SCALE_PATCH_dbl7() - - call testit_NGA_SCALE_PATCH_dcpl7() - -c -c test NGA_ADD_PATCH - -c -c test NGA_DOT_PATCH - - call testit_NGA_DOT_PATCH_int1() - - call testit_NGA_DOT_PATCH_dbl1() - - call testit_NGA_DOT_PATCH_dcpl1() - - call testit_NGA_DOT_PATCH_int2() - - call testit_NGA_DOT_PATCH_dbl2() - - call testit_NGA_DOT_PATCH_dcpl2() - - call testit_NGA_DOT_PATCH_int3() - - call testit_NGA_DOT_PATCH_dbl3() - - call testit_NGA_DOT_PATCH_dcpl3() - - call testit_NGA_DOT_PATCH_int4() - - call testit_NGA_DOT_PATCH_dbl4() - - call testit_NGA_DOT_PATCH_dcpl4() - - call testit_NGA_DOT_PATCH_int5() - - call testit_NGA_DOT_PATCH_dbl5() - - call testit_NGA_DOT_PATCH_dcpl5() - - call testit_NGA_DOT_PATCH_int6() - - call testit_NGA_DOT_PATCH_dbl6() - - call testit_NGA_DOT_PATCH_dcpl6() - - call testit_NGA_DOT_PATCH_int7() - - call testit_NGA_DOT_PATCH_dbl7() - - call testit_NGA_DOT_PATCH_dcpl7() - -c -c test NGA_SCATTER - -c -c test NGA_SCATTER_ACC - -c -c test NGA_GATHER - - - -c - if(ga_nodeid().eq.0) print *, 'All tests successful' -c - call ga_terminate() - call MP_FINALIZE() - end - -c----------------- - - - - - - - - - - - - - - - subroutine testit_NGA_FILL_PATCH_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_int(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1)),dims) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_int(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = int(drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1)),dims) -c -c check the result - call scale_patch_int(total, - $ val,a,lo,hi,ndim,dims, - $ int(0),b,lo,hi,ndim,dims) - - call compare_patches_int(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer c(n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - integer alpha, beta - integer dot_patch_int -c for different array dimensions - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_idot_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1)),dims) - call init_array_int(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - - -c--- - status = ga_destroy(g_a) - end - - - - - - - - - - - - - subroutine testit_NGA_FILL_PATCH_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_int(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),dims) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - integer d(n) - integer dndim - parameter (dndim = 2-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_int(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = int(drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),dims) -c -c check the result - call scale_patch_int(total, - $ val,a,lo,hi,ndim,dims, - $ int(0),b,lo,hi,ndim,dims) - - call compare_patches_int(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer c(n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - integer alpha, beta - integer dot_patch_int -c for different array dimensions - - integer d(n) - integer dndim - parameter (dndim = 2-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_idot_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) - call init_array_int(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - - - - - - - - subroutine testit_NGA_FILL_PATCH_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_int(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),dims) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - integer d(n,n) - integer dndim - parameter (dndim = 3-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_int(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = int(drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),dims) -c -c check the result - call scale_patch_int(total, - $ val,a,lo,hi,ndim,dims, - $ int(0),b,lo,hi,ndim,dims) - - call compare_patches_int(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer c(n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - integer alpha, beta - integer dot_patch_int -c for different array dimensions - - integer d(n,n) - integer dndim - parameter (dndim = 3-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_idot_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) - call init_array_int(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - - - - - - - - subroutine testit_NGA_FILL_PATCH_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_int(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),dims) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - integer d(n,n,n) - integer dndim - parameter (dndim = 4-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_int(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = int(drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),dims) -c -c check the result - call scale_patch_int(total, - $ val,a,lo,hi,ndim,dims, - $ int(0),b,lo,hi,ndim,dims) - - call compare_patches_int(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer c(n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - integer alpha, beta - integer dot_patch_int -c for different array dimensions - - integer d(n,n,n) - integer dndim - parameter (dndim = 4-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_idot_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) - call init_array_int(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - - - - - - - - subroutine testit_NGA_FILL_PATCH_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_int(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - integer d(n,n,n,n) - integer dndim - parameter (dndim = 5-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_int(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = int(drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c -c check the result - call scale_patch_int(total, - $ val,a,lo,hi,ndim,dims, - $ int(0),b,lo,hi,ndim,dims) - - call compare_patches_int(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer c(n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - integer alpha, beta - integer dot_patch_int -c for different array dimensions - - integer d(n,n,n,n) - integer dndim - parameter (dndim = 5-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_idot_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) - call init_array_int(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - - - - - - - - subroutine testit_NGA_FILL_PATCH_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_int(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - integer d(n,n,n,n,n) - integer dndim - parameter (dndim = 6-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_int(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = int(drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c -c check the result - call scale_patch_int(total, - $ val,a,lo,hi,ndim,dims, - $ int(0),b,lo,hi,ndim,dims) - - call compare_patches_int(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer c(n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - integer alpha, beta - integer dot_patch_int -c for different array dimensions - - integer d(n,n,n,n,n) - integer dndim - parameter (dndim = 6-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_idot_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) - call init_array_int(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - - - - - - - - subroutine testit_NGA_FILL_PATCH_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_int(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - integer d(n,n,n,n,n,n) - integer dndim - parameter (dndim = 7-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6),alo(7)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6),blo(7)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6),blo(7)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6),blo(7)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_int(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5),dlo(6)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5),dlo(6)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6),alo(7)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = int(drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c -c check the result - call scale_patch_int(total, - $ val,a,lo,hi,ndim,dims, - $ int(0),b,lo,hi,ndim,dims) - - call compare_patches_int(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer c(n,n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - integer alpha, beta - integer dot_patch_int -c for different array dimensions - - integer d(n,n,n,n,n,n) - integer dndim - parameter (dndim = 7-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_idot_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) - call init_array_int(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5),dlo(6)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - - - - - - - - - - subroutine testit_NGA_FILL_PATCH_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dbl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dbl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = drand(0) * 1 * 2 -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1)),dims) -c -c check the result - call scale_patch_dbl(total, - $ val,a,lo,hi,ndim,dims, - $ dble(0),b,lo,hi,ndim,dims) - - call compare_patches_dbl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - double precision c(n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double precision alpha, beta - double precision dot_patch_dbl -c for different array dimensions - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_ddot_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1)),dims) - call init_array_dbl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - - -c--- - status = ga_destroy(g_a) - end - - - - - - - - - - - - - subroutine testit_NGA_FILL_PATCH_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dbl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double precision d(n) - integer dndim - parameter (dndim = 2-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dbl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = drand(0) * 1 * 2 -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),dims) -c -c check the result - call scale_patch_dbl(total, - $ val,a,lo,hi,ndim,dims, - $ dble(0),b,lo,hi,ndim,dims) - - call compare_patches_dbl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - double precision c(n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double precision alpha, beta - double precision dot_patch_dbl -c for different array dimensions - - double precision d(n) - integer dndim - parameter (dndim = 2-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_ddot_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) - call init_array_dbl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - - - - - - - - subroutine testit_NGA_FILL_PATCH_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dbl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double precision d(n,n) - integer dndim - parameter (dndim = 3-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dbl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = drand(0) * 1 * 2 -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),dims) -c -c check the result - call scale_patch_dbl(total, - $ val,a,lo,hi,ndim,dims, - $ dble(0),b,lo,hi,ndim,dims) - - call compare_patches_dbl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - double precision c(n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double precision alpha, beta - double precision dot_patch_dbl -c for different array dimensions - - double precision d(n,n) - integer dndim - parameter (dndim = 3-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_ddot_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) - call init_array_dbl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - - - - - - - - subroutine testit_NGA_FILL_PATCH_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dbl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double precision d(n,n,n) - integer dndim - parameter (dndim = 4-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dbl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = drand(0) * 1 * 2 -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),dims) -c -c check the result - call scale_patch_dbl(total, - $ val,a,lo,hi,ndim,dims, - $ dble(0),b,lo,hi,ndim,dims) - - call compare_patches_dbl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - double precision c(n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double precision alpha, beta - double precision dot_patch_dbl -c for different array dimensions - - double precision d(n,n,n) - integer dndim - parameter (dndim = 4-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_ddot_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) - call init_array_dbl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - - - - - - - - subroutine testit_NGA_FILL_PATCH_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dbl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double precision d(n,n,n,n) - integer dndim - parameter (dndim = 5-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dbl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = drand(0) * 1 * 2 -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c -c check the result - call scale_patch_dbl(total, - $ val,a,lo,hi,ndim,dims, - $ dble(0),b,lo,hi,ndim,dims) - - call compare_patches_dbl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - double precision c(n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double precision alpha, beta - double precision dot_patch_dbl -c for different array dimensions - - double precision d(n,n,n,n) - integer dndim - parameter (dndim = 5-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_ddot_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) - call init_array_dbl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - - - - - - - - subroutine testit_NGA_FILL_PATCH_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dbl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double precision d(n,n,n,n,n) - integer dndim - parameter (dndim = 6-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dbl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = drand(0) * 1 * 2 -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c -c check the result - call scale_patch_dbl(total, - $ val,a,lo,hi,ndim,dims, - $ dble(0),b,lo,hi,ndim,dims) - - call compare_patches_dbl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - double precision c(n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double precision alpha, beta - double precision dot_patch_dbl -c for different array dimensions - - double precision d(n,n,n,n,n) - integer dndim - parameter (dndim = 6-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_ddot_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) - call init_array_dbl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - - - - - - - - subroutine testit_NGA_FILL_PATCH_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dbl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double precision d(n,n,n,n,n,n) - integer dndim - parameter (dndim = 7-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6),alo(7)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6),blo(7)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6),blo(7)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6),blo(7)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dbl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5),dlo(6)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5),dlo(6)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6),alo(7)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = drand(0) * 1 * 2 -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c -c check the result - call scale_patch_dbl(total, - $ val,a,lo,hi,ndim,dims, - $ dble(0),b,lo,hi,ndim,dims) - - call compare_patches_dbl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - double precision c(n,n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double precision alpha, beta - double precision dot_patch_dbl -c for different array dimensions - - double precision d(n,n,n,n,n,n) - integer dndim - parameter (dndim = 7-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_ddot_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) - call init_array_dbl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5),dlo(6)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - - - - - - - - - - subroutine testit_NGA_FILL_PATCH_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(1234),dble(0)) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dcpl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dcpl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = dcmplx(drand(0) * 1 * 2, - $ -drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1)),dims) -c -c check the result - call scale_patch_dcpl(total, - $ val,a,lo,hi,ndim,dims, - $ dcmplx(dble(0),dble(0)),b,lo,hi,ndim,dims) - - call compare_patches_dcpl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - double complex c(n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double complex alpha, beta - double complex dot_patch_dcpl -c for different array dimensions - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_zdot_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1)),dims) - call init_array_dcpl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - - -c--- - status = ga_destroy(g_a) - end - - - - - - - - - - - - - subroutine testit_NGA_FILL_PATCH_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(1234),dble(0)) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dcpl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double complex d(n) - integer dndim - parameter (dndim = 2-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dcpl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = dcmplx(drand(0) * 1 * 2, - $ -drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),dims) -c -c check the result - call scale_patch_dcpl(total, - $ val,a,lo,hi,ndim,dims, - $ dcmplx(dble(0),dble(0)),b,lo,hi,ndim,dims) - - call compare_patches_dcpl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - double complex c(n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double complex alpha, beta - double complex dot_patch_dcpl -c for different array dimensions - - double complex d(n) - integer dndim - parameter (dndim = 2-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_zdot_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) - call init_array_dcpl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - - - - - - - - subroutine testit_NGA_FILL_PATCH_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(1234),dble(0)) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dcpl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double complex d(n,n) - integer dndim - parameter (dndim = 3-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dcpl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = dcmplx(drand(0) * 1 * 2, - $ -drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),dims) -c -c check the result - call scale_patch_dcpl(total, - $ val,a,lo,hi,ndim,dims, - $ dcmplx(dble(0),dble(0)),b,lo,hi,ndim,dims) - - call compare_patches_dcpl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - double complex c(n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double complex alpha, beta - double complex dot_patch_dcpl -c for different array dimensions - - double complex d(n,n) - integer dndim - parameter (dndim = 3-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_zdot_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) - call init_array_dcpl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - - - - - - - - subroutine testit_NGA_FILL_PATCH_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(1234),dble(0)) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dcpl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double complex d(n,n,n) - integer dndim - parameter (dndim = 4-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dcpl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = dcmplx(drand(0) * 1 * 2, - $ -drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),dims) -c -c check the result - call scale_patch_dcpl(total, - $ val,a,lo,hi,ndim,dims, - $ dcmplx(dble(0),dble(0)),b,lo,hi,ndim,dims) - - call compare_patches_dcpl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - double complex c(n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double complex alpha, beta - double complex dot_patch_dcpl -c for different array dimensions - - double complex d(n,n,n) - integer dndim - parameter (dndim = 4-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_zdot_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) - call init_array_dcpl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - - - - - - - - subroutine testit_NGA_FILL_PATCH_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(1234),dble(0)) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dcpl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double complex d(n,n,n,n) - integer dndim - parameter (dndim = 5-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dcpl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = dcmplx(drand(0) * 1 * 2, - $ -drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c -c check the result - call scale_patch_dcpl(total, - $ val,a,lo,hi,ndim,dims, - $ dcmplx(dble(0),dble(0)),b,lo,hi,ndim,dims) - - call compare_patches_dcpl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - double complex c(n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double complex alpha, beta - double complex dot_patch_dcpl -c for different array dimensions - - double complex d(n,n,n,n) - integer dndim - parameter (dndim = 5-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_zdot_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) - call init_array_dcpl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - - - - - - - - subroutine testit_NGA_FILL_PATCH_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(1234),dble(0)) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dcpl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double complex d(n,n,n,n,n) - integer dndim - parameter (dndim = 6-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dcpl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = dcmplx(drand(0) * 1 * 2, - $ -drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c -c check the result - call scale_patch_dcpl(total, - $ val,a,lo,hi,ndim,dims, - $ dcmplx(dble(0),dble(0)),b,lo,hi,ndim,dims) - - call compare_patches_dcpl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - double complex c(n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double complex alpha, beta - double complex dot_patch_dcpl -c for different array dimensions - - double complex d(n,n,n,n,n) - integer dndim - parameter (dndim = 6-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_zdot_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) - call init_array_dcpl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - - - - - - - - subroutine testit_NGA_FILL_PATCH_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(1234),dble(0)) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dcpl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double complex d(n,n,n,n,n,n) - integer dndim - parameter (dndim = 7-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6),alo(7)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6),blo(7)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6),blo(7)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6),blo(7)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dcpl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5),dlo(6)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5),dlo(6)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6),alo(7)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = dcmplx(drand(0) * 1 * 2, - $ -drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c -c check the result - call scale_patch_dcpl(total, - $ val,a,lo,hi,ndim,dims, - $ dcmplx(dble(0),dble(0)),b,lo,hi,ndim,dims) - - call compare_patches_dcpl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - double complex c(n,n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double complex alpha, beta - double complex dot_patch_dcpl -c for different array dimensions - - double complex d(n,n,n,n,n,n) - integer dndim - parameter (dndim = 7-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_zdot_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) - call init_array_dcpl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5),dlo(6)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - -c----------------------- -c Utility functions - - subroutine random_range(lo,hi,lop,hip,ndim) - implicit none - integer lo(1),hi(1),lop(1),hip(1),ndim - integer i, range, swap, val,iran - double precision drand - iran(range) = int(drand(0)*dble(range)) + 1 -c iran(range) = range/2 - do i = 1, ndim - range = hi(i)-lo(i)+1 - val = iran(range) - lop(i) = lo(i) + val - val = iran(range) - hip(i) = hi(i) - val - if(hip(i) .lt. lop(i))then - swap =hip(i) - hip(i)=lop(i) - lop(i)=swap - endif - hip(i)=MIN(hip(i),hi(i)) - lop(i)=MAX(lop(i),lo(i)) - enddo - end -c - -c - subroutine random_range_outbound(lo,hi,lop,hip,ndim) - implicit none - integer lo(1),hi(1),lop(1),hip(1),ndim - integer i, range, val,iran - double precision drand - iran(range) = int(drand(0)*dble(range)) + 1 -c iran(range) = range/2 - do i = 1, ndim - range = 2*(hi(i)-lo(i)+1) - val = iran(range) - lop(i) = lo(i) + val - range = hi(i)-lo(i)+1 - val = iran(range) - hip(i) = lop(i) + range - val -c - hip(i) = hip(i)-hi(i) - lop(i) = lop(i)-hi(i) - enddo - end -c -c - integer function count_elems(lo,hi,ndim) - implicit none - integer lo(1),hi(1),ndim,elems,i - elems=1 - do i=1,ndim - elems = elems*(hi(i)-lo(i)+1) - enddo - count_elems = elems - end -c - -c get the next available nindex in the range of lo and hi - integer function next_index(ind,total,ndim,lo,hi,dims) - implicit none - integer ind,total,ndim,lo(ndim),hi(ndim),dims(ndim) - integer i - integer indx(8),nindex -c - nindex = ind + 1 - 200 call conv_1ton(ndim,dims,nindex,indx) -c -c test if indx(i) is in the range of lo(i) and hi(i) - do i=1,ndim - if((indx(i).lt.lo(i)).or.(indx(i).gt.hi(i))) then - nindex = nindex + 1 - if(nindex.gt.total) then - next_index = 0 - goto 300 - else - goto 200 - endif - endif - enddo -c - next_index = nindex - 300 end - -c testing if the indices are unique - integer function unique(ind,ndim,m,n) - implicit none - integer ndim,m,n - integer ind(ndim,m) - integer i,j,marker -c - unique = 1 - do i = 1, n-1 - marker = 0 - do j = 1, ndim - if(ind(j,n).eq.ind(j,i)) marker = marker + 1 - enddo -c - if(marker.eq.ndim) unique = 0 - enddo -c - end - -c - subroutine prnt_rng(me,lo,hi,ndim) - implicit none - integer me,ndim - integer lo(ndim),hi(ndim) - integer i -c - print *, me,': array section [',(lo(i),':',hi(i),i=1,ndim),']' -c - end - -c divide the space into equal size patches according to nproc -c and calculate my lo and hi - subroutine my_space(me,nproc,ndim,total,dims,lo,hi) - implicit none - integer me,nproc,ndim,total - integer dims(ndim),lo(ndim),hi(ndim) - integer div,lop,hip,i -c - div = total/nproc -c - lop = div * me + 1 -c - if(me.eq.(nproc-1)) then - hip = total - else - hip = div * (me+1) - endif -c - call conv_1ton(ndim,dims,lop,lo) - call conv_1ton(ndim,dims,hip,hi) -c -c swap the indices if the lo if larger thant hi - do i = 1,ndim - if(lo(i).gt.hi(i)) then - if(i.eq.ndim) call ga_error('bye',0) - lo(i) = 1 - lo(i+1) = lo(i+1) + 1 - endif - enddo - end - -c convert the index from one dimension to n dimension - subroutine conv_1ton(ndim,dims,ind1,indn) - implicit none - integer ndim - integer dims(ndim) - integer ind1,indn(ndim) - integer range(8),remainder,i -c - remainder = ind1 -c get the range of each dimension - do i=1,ndim - if(i.eq.1) then - range(i) = dims(i) - else - range(i) = range(i-1) * dims(i) - endif - enddo -c -c get the indices in each dimension - do i = ndim,1,-1 - if(i.ne.1) then - indn(i) = remainder/range(i-1) - remainder = remainder - indn(i)*range(i-1) - if(remainder.eq.0) then - remainder = range(i-1) - else - indn(i) = indn(i) + 1 - endif - else - indn(i) = remainder - endif - enddo -c - end - - -c fill array with random numbers - subroutine fill_array_int(a,n,val) - implicit none - integer n - integer a(n),val - integer i - do i= 1, n - a(i) = val - enddo - end - -c initialize the array with random numbers - subroutine init_array_int(a,n) - implicit none - integer n - integer a(n) - double precision drand - integer i - do i= 1, n - a(i) = int(drand(0) * i * 2) - enddo - end - -c if the elements do match, stop the program - subroutine compare_patches_int(eps, - $ total1,array1,lo1,hi1,ndim1,dims1, - $ total2,array2,lo2,hi2,ndim2,dims2) - implicit none - double precision eps - integer ndim1,ndim2,total1,total2 - integer array1(total1),array2(total2) - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer next_index - integer index1,index2 - double precision diff,maxval -c -c initialize index1 and index2, searching from zeros - index1 = 0 - index2 = 0 -c compare corresponding elements in each array - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - do while((index1.ne.0).and.(index2.ne.0)) - diff = abs(array1(index1) - array2(index2)) - maxval = max(abs(array1(index1)), abs(array2(index2))) - if((maxval.eq.0).or.(maxval.lt.eps)) maxval = 1 - if(eps .lt. abs(diff)/maxval) then - print *, 'Error: Comparison failed!' - print *, array1(index1), array2(index2) - call ga_error('bye',0) - endif - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - enddo -c -c at this point both index1 and index2 should be 0 - if((index1.ne.0).or.(index2.ne.0)) then - print *, 'Error: # of elems dont match' - call ga_error('bye',0) - endif -c - end - -c do patch = patch + buf * alpha - subroutine scale_patch_int(total, - $ alpha,arr1,lo1,hi1,ndim1,dims1,beta,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer arr1(total),arr2(total) - integer alpha, beta - integer next_index - integer ind1, ind2 -c - ind1 = 0 - ind2 = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - arr1(ind1) = arr1(ind1)*alpha + arr2(ind2)*beta - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - end - -c transpose an array - subroutine transpose_int(a1,a2,total,ndim,dims) - implicit none - integer ndim,total - integer dims(ndim) - integer a1(total),a2(total) - integer i, j - integer idx - integer bv(8), bunit(8) -c - bv(1)=0 - bunit(1)=1 - do i=2, ndim - bv(i) = 0 - bunit(i) = bunit(i-1) * dims(i-1) - enddo -c - do i=1, total - idx = 1 - do j=1, ndim - idx = idx + bv(j) * bunit(ndim-j+1) - if(mod(i,bunit(j)).eq.0) bv(j) = bv(j) + 1 - if(bv(j).ge.dims(j)) bv(j) = 0 - enddo -c print *, 'i = ',i, 'idx = ',idx - a2(idx) = a1(i) - enddo -c - do i=1, total - a1(i) = a2(i) - enddo -c - end - -c do patch = patch + buf * alpha - integer function dot_patch_int( - $ total,arr1,lo1,hi1,ndim1,dims1,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer arr1(total),arr2(total) - integer next_index - integer ind1, ind2 - integer res -c - ind1 = 0 - ind2 = 0 - res = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - res = res + arr1(ind1)*arr2(ind2) - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - dot_patch_int = res -c - end - - -c fill array with random numbers - subroutine fill_array_dbl(a,n,val) - implicit none - integer n - double precision a(n),val - integer i - do i= 1, n - a(i) = val - enddo - end - -c initialize the array with random numbers - subroutine init_array_dbl(a,n) - implicit none - integer n - double precision a(n) - double precision drand - integer i - do i= 1, n - a(i) = drand(0) * i * 2 - enddo - end - -c if the elements do match, stop the program - subroutine compare_patches_dbl(eps, - $ total1,array1,lo1,hi1,ndim1,dims1, - $ total2,array2,lo2,hi2,ndim2,dims2) - implicit none - double precision eps - integer ndim1,ndim2,total1,total2 - double precision array1(total1),array2(total2) - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer next_index - integer index1,index2 - double precision diff,maxval -c -c initialize index1 and index2, searching from zeros - index1 = 0 - index2 = 0 -c compare corresponding elements in each array - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - do while((index1.ne.0).and.(index2.ne.0)) - diff = abs(array1(index1) - array2(index2)) - maxval = max(abs(array1(index1)), abs(array2(index2))) - if((maxval.eq.0).or.(maxval.lt.eps)) maxval = 1 - if(eps .lt. abs(diff)/maxval) then - print *, 'Error: Comparison failed!' - print *, array1(index1), array2(index2) - call ga_error('bye',0) - endif - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - enddo -c -c at this point both index1 and index2 should be 0 - if((index1.ne.0).or.(index2.ne.0)) then - print *, 'Error: # of elems dont match' - call ga_error('bye',0) - endif -c - end - -c do patch = patch + buf * alpha - subroutine scale_patch_dbl(total, - $ alpha,arr1,lo1,hi1,ndim1,dims1,beta,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double precision arr1(total),arr2(total) - double precision alpha, beta - integer next_index - integer ind1, ind2 -c - ind1 = 0 - ind2 = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - arr1(ind1) = arr1(ind1)*alpha + arr2(ind2)*beta - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - end - -c transpose an array - subroutine transpose_dbl(a1,a2,total,ndim,dims) - implicit none - integer ndim,total - integer dims(ndim) - double precision a1(total),a2(total) - integer i, j - integer idx - integer bv(8), bunit(8) -c - bv(1)=0 - bunit(1)=1 - do i=2, ndim - bv(i) = 0 - bunit(i) = bunit(i-1) * dims(i-1) - enddo -c - do i=1, total - idx = 1 - do j=1, ndim - idx = idx + bv(j) * bunit(ndim-j+1) - if(mod(i,bunit(j)).eq.0) bv(j) = bv(j) + 1 - if(bv(j).ge.dims(j)) bv(j) = 0 - enddo -c print *, 'i = ',i, 'idx = ',idx - a2(idx) = a1(i) - enddo -c - do i=1, total - a1(i) = a2(i) - enddo -c - end - -c do patch = patch + buf * alpha - double precision function dot_patch_dbl( - $ total,arr1,lo1,hi1,ndim1,dims1,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double precision arr1(total),arr2(total) - integer next_index - integer ind1, ind2 - double precision res -c - ind1 = 0 - ind2 = 0 - res = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - res = res + arr1(ind1)*arr2(ind2) - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - dot_patch_dbl = res -c - end - - -c fill array with random numbers - subroutine fill_array_dcpl(a,n,val) - implicit none - integer n - double complex a(n),val - integer i - do i= 1, n - a(i) = val - enddo - end - -c initialize the array with random numbers - subroutine init_array_dcpl(a,n) - implicit none - integer n - double complex a(n) - double precision drand - integer i - do i= 1, n - a(i) = dcmplx(drand(0) * i * 2, - $ -drand(0) * i * 2) - enddo - end - -c if the elements do match, stop the program - subroutine compare_patches_dcpl(eps, - $ total1,array1,lo1,hi1,ndim1,dims1, - $ total2,array2,lo2,hi2,ndim2,dims2) - implicit none - double precision eps - integer ndim1,ndim2,total1,total2 - double complex array1(total1),array2(total2) - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer next_index - integer index1,index2 - double precision diff,maxval -c -c initialize index1 and index2, searching from zeros - index1 = 0 - index2 = 0 -c compare corresponding elements in each array - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - do while((index1.ne.0).and.(index2.ne.0)) - diff = abs(array1(index1) - array2(index2)) - maxval = max(abs(array1(index1)), abs(array2(index2))) - if((maxval.eq.0).or.(maxval.lt.eps)) maxval = 1 - if(eps .lt. abs(diff)/maxval) then - print *, 'Error: Comparison failed!' - print *, array1(index1), array2(index2) - call ga_error('bye',0) - endif - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - enddo -c -c at this point both index1 and index2 should be 0 - if((index1.ne.0).or.(index2.ne.0)) then - print *, 'Error: # of elems dont match' - call ga_error('bye',0) - endif -c - end - -c do patch = patch + buf * alpha - subroutine scale_patch_dcpl(total, - $ alpha,arr1,lo1,hi1,ndim1,dims1,beta,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double complex arr1(total),arr2(total) - double complex alpha, beta - integer next_index - integer ind1, ind2 -c - ind1 = 0 - ind2 = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - arr1(ind1) = arr1(ind1)*alpha + arr2(ind2)*beta - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - end - -c transpose an array - subroutine transpose_dcpl(a1,a2,total,ndim,dims) - implicit none - integer ndim,total - integer dims(ndim) - double complex a1(total),a2(total) - integer i, j - integer idx - integer bv(8), bunit(8) -c - bv(1)=0 - bunit(1)=1 - do i=2, ndim - bv(i) = 0 - bunit(i) = bunit(i-1) * dims(i-1) - enddo -c - do i=1, total - idx = 1 - do j=1, ndim - idx = idx + bv(j) * bunit(ndim-j+1) - if(mod(i,bunit(j)).eq.0) bv(j) = bv(j) + 1 - if(bv(j).ge.dims(j)) bv(j) = 0 - enddo -c print *, 'i = ',i, 'idx = ',idx - a2(idx) = a1(i) - enddo -c - do i=1, total - a1(i) = a2(i) - enddo -c - end - -c do patch = patch + buf * alpha - double complex function dot_patch_dcpl( - $ total,arr1,lo1,hi1,ndim1,dims1,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double complex arr1(total),arr2(total) - integer next_index - integer ind1, ind2 - double complex res -c - ind1 = 0 - ind2 = 0 - res = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - res = res + arr1(ind1)*arr2(ind2) - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - dot_patch_dcpl = res -c - end - - diff --git a/global/testing/ngatest_src/generated/nga-periodic.F b/global/testing/ngatest_src/generated/nga-periodic.F deleted file mode 100644 index 17cabc35f..000000000 --- a/global/testing/ngatest_src/generated/nga-periodic.F +++ /dev/null @@ -1,7608 +0,0 @@ - - -#if HAVE_CONFIG_H -# include "config.fh" -#endif -#define MAXLOOP 100 - - program test - implicit none -#include "mafdecls.fh" -#include "global.fh" - integer stack, heap -c -c*** Intitialize a message passing library -c -#include "mp3.fh" -c -c*** Intitialize the GA package - call ga_initialize() -c if(ga_nodeid().eq.0)print *,ga_nnodes(),' nodes' -c - if(ga_uses_ma()) then - stack = 200000 -c stack = 100000 - else - stack = 60000 -c stack = 20000 - endif - heap = stack - if (.not. ma_init(MT_DBL, heap, stack)) - $ call ga_error("ma init failed",heap+stack) -c -c -c test GA_FILL - -c -c test NGA_GET - -c -c test NGA_PUT - -c -c test NGA_ACC - -c -c test NGA_PERIODIC_GET - - call testit_NGA_PERIODIC_GET_int1() - - call testit_NGA_PERIODIC_GET_dbl1() - - call testit_NGA_PERIODIC_GET_dcpl1() - - call testit_NGA_PERIODIC_GET_int2() - - call testit_NGA_PERIODIC_GET_dbl2() - - call testit_NGA_PERIODIC_GET_dcpl2() - - call testit_NGA_PERIODIC_GET_int3() - - call testit_NGA_PERIODIC_GET_dbl3() - - call testit_NGA_PERIODIC_GET_dcpl3() - - call testit_NGA_PERIODIC_GET_int4() - - call testit_NGA_PERIODIC_GET_dbl4() - - call testit_NGA_PERIODIC_GET_dcpl4() - - call testit_NGA_PERIODIC_GET_int5() - - call testit_NGA_PERIODIC_GET_dbl5() - - call testit_NGA_PERIODIC_GET_dcpl5() - - call testit_NGA_PERIODIC_GET_int6() - - call testit_NGA_PERIODIC_GET_dbl6() - - call testit_NGA_PERIODIC_GET_dcpl6() - - call testit_NGA_PERIODIC_GET_int7() - - call testit_NGA_PERIODIC_GET_dbl7() - - call testit_NGA_PERIODIC_GET_dcpl7() - -c -c test NGA_PERIODIC_PUT - - call testit_NGA_PERIODIC_PUT_int1() - - call testit_NGA_PERIODIC_PUT_dbl1() - - call testit_NGA_PERIODIC_PUT_dcpl1() - - call testit_NGA_PERIODIC_PUT_int2() - - call testit_NGA_PERIODIC_PUT_dbl2() - - call testit_NGA_PERIODIC_PUT_dcpl2() - - call testit_NGA_PERIODIC_PUT_int3() - - call testit_NGA_PERIODIC_PUT_dbl3() - - call testit_NGA_PERIODIC_PUT_dcpl3() - - call testit_NGA_PERIODIC_PUT_int4() - - call testit_NGA_PERIODIC_PUT_dbl4() - - call testit_NGA_PERIODIC_PUT_dcpl4() - - call testit_NGA_PERIODIC_PUT_int5() - - call testit_NGA_PERIODIC_PUT_dbl5() - - call testit_NGA_PERIODIC_PUT_dcpl5() - - call testit_NGA_PERIODIC_PUT_int6() - - call testit_NGA_PERIODIC_PUT_dbl6() - - call testit_NGA_PERIODIC_PUT_dcpl6() - - call testit_NGA_PERIODIC_PUT_int7() - - call testit_NGA_PERIODIC_PUT_dbl7() - - call testit_NGA_PERIODIC_PUT_dcpl7() - -c -c test NGA_PERIODIC_ACC - - call testit_NGA_PERIODIC_ACC_int1() - - call testit_NGA_PERIODIC_ACC_dbl1() - - call testit_NGA_PERIODIC_ACC_dcpl1() - - call testit_NGA_PERIODIC_ACC_int2() - - call testit_NGA_PERIODIC_ACC_dbl2() - - call testit_NGA_PERIODIC_ACC_dcpl2() - - call testit_NGA_PERIODIC_ACC_int3() - - call testit_NGA_PERIODIC_ACC_dbl3() - - call testit_NGA_PERIODIC_ACC_dcpl3() - - call testit_NGA_PERIODIC_ACC_int4() - - call testit_NGA_PERIODIC_ACC_dbl4() - - call testit_NGA_PERIODIC_ACC_dcpl4() - - call testit_NGA_PERIODIC_ACC_int5() - - call testit_NGA_PERIODIC_ACC_dbl5() - - call testit_NGA_PERIODIC_ACC_dcpl5() - - call testit_NGA_PERIODIC_ACC_int6() - - call testit_NGA_PERIODIC_ACC_dbl6() - - call testit_NGA_PERIODIC_ACC_dcpl6() - - call testit_NGA_PERIODIC_ACC_int7() - - call testit_NGA_PERIODIC_ACC_dbl7() - - call testit_NGA_PERIODIC_ACC_dcpl7() - -c -c test NGA_FILL_PATCH - -c -c test NGA_COPY_PATCH - -c -c test NGA_SCALE_PATCH - -c -c test NGA_ADD_PATCH - -c -c test NGA_DOT_PATCH - -c -c test NGA_SCATTER - -c -c test NGA_SCATTER_ACC - -c -c test NGA_GATHER - - - -c - if(ga_nodeid().eq.0) print *, 'All tests successful' -c - call ga_terminate() - call MP_FINALIZE() - end - -c----------------- - - - - - - - - - - - - subroutine testit_NGA_PERIODIC_PUT_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 20000) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_PUT ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_put' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_int(a,total) -c -c initialize array a - call ga_zero(g_a) - call ga_sync() -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() - call nga_periodic_put(g_a,lop,hip, - $ a(lo(1)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_int(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_GET_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 20000) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_GET ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_get' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c -c initialize array a - val = int(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - do loop = 1, MAXLOOP - call random_range_outbound(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1)),ld) - call ga_sync() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_int(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_ACC_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 20000) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer c(n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer alpha - double precision drand - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_ACC ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_int(a,total) - call init_array_int(b,total) -c -c initialize array g_a - call ga_fill(g_a,int(123)) - call ga_sync() -c - alpha = int(drand(0) * me*2+1 * 2) -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() -c keep a copy of the original patch - call nga_periodic_put(g_a,lop,hip, - $ b(lo(1)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_acc(g_a,lop,hip, - $ a(lo(1)),ld,alpha) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ c(lo(1)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo -c scale the local copy of array - call scale_patch_int(total, - $ int(1),b,blo,bhi,ndim,dims, - $ alpha,a,blo,bhi,ndim,dims) - call compare_patches_int(1d-2, - $ total,b,blo,bhi,ndim,dims,total,c,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - subroutine testit_NGA_PERIODIC_PUT_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 140) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_PUT ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_put' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_int(a,total) -c -c initialize array a - call ga_zero(g_a) - call ga_sync() -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() - call nga_periodic_put(g_a,lop,hip, - $ a(lo(1),lo(2)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_int(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_GET_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 140) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_GET ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_get' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c -c initialize array a - val = int(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - do loop = 1, MAXLOOP - call random_range_outbound(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2)),ld) - call ga_sync() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_int(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_ACC_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 140) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer c(n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer alpha - double precision drand - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_ACC ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_int(a,total) - call init_array_int(b,total) -c -c initialize array g_a - call ga_fill(g_a,int(123)) - call ga_sync() -c - alpha = int(drand(0) * me*2+1 * 2) -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() -c keep a copy of the original patch - call nga_periodic_put(g_a,lop,hip, - $ b(lo(1),lo(2)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_acc(g_a,lop,hip, - $ a(lo(1),lo(2)),ld,alpha) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ c(lo(1),lo(2)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo -c scale the local copy of array - call scale_patch_int(total, - $ int(1),b,blo,bhi,ndim,dims, - $ alpha,a,blo,bhi,ndim,dims) - call compare_patches_int(1d-2, - $ total,b,blo,bhi,ndim,dims,total,c,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - subroutine testit_NGA_PERIODIC_PUT_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 27) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_PUT ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_put' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_int(a,total) -c -c initialize array a - call ga_zero(g_a) - call ga_sync() -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() - call nga_periodic_put(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_int(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_GET_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 27) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_GET ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_get' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c -c initialize array a - val = int(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - do loop = 1, MAXLOOP - call random_range_outbound(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3)),ld) - call ga_sync() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_int(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_ACC_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 27) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer c(n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer alpha - double precision drand - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_ACC ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_int(a,total) - call init_array_int(b,total) -c -c initialize array g_a - call ga_fill(g_a,int(123)) - call ga_sync() -c - alpha = int(drand(0) * me*2+1 * 2) -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() -c keep a copy of the original patch - call nga_periodic_put(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_acc(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3)),ld,alpha) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ c(lo(1),lo(2),lo(3)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo -c scale the local copy of array - call scale_patch_int(total, - $ int(1),b,blo,bhi,ndim,dims, - $ alpha,a,blo,bhi,ndim,dims) - call compare_patches_int(1d-2, - $ total,b,blo,bhi,ndim,dims,total,c,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - subroutine testit_NGA_PERIODIC_PUT_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 11) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_PUT ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_put' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_int(a,total) -c -c initialize array a - call ga_zero(g_a) - call ga_sync() -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() - call nga_periodic_put(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_int(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_GET_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 11) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_GET ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_get' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c -c initialize array a - val = int(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - do loop = 1, MAXLOOP - call random_range_outbound(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - call ga_sync() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_int(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_ACC_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 11) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer c(n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer alpha - double precision drand - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_ACC ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_int(a,total) - call init_array_int(b,total) -c -c initialize array g_a - call ga_fill(g_a,int(123)) - call ga_sync() -c - alpha = int(drand(0) * me*2+1 * 2) -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() -c keep a copy of the original patch - call nga_periodic_put(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_acc(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4)),ld,alpha) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ c(lo(1),lo(2),lo(3),lo(4)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo -c scale the local copy of array - call scale_patch_int(total, - $ int(1),b,blo,bhi,ndim,dims, - $ alpha,a,blo,bhi,ndim,dims) - call compare_patches_int(1d-2, - $ total,b,blo,bhi,ndim,dims,total,c,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - subroutine testit_NGA_PERIODIC_PUT_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 7) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_PUT ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_put' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_int(a,total) -c -c initialize array a - call ga_zero(g_a) - call ga_sync() -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() - call nga_periodic_put(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_int(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_GET_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 7) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_GET ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_get' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c -c initialize array a - val = int(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - do loop = 1, MAXLOOP - call random_range_outbound(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - call ga_sync() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_int(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_ACC_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 7) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer c(n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer alpha - double precision drand - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_ACC ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_int(a,total) - call init_array_int(b,total) -c -c initialize array g_a - call ga_fill(g_a,int(123)) - call ga_sync() -c - alpha = int(drand(0) * me*2+1 * 2) -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() -c keep a copy of the original patch - call nga_periodic_put(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_acc(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),ld,alpha) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo -c scale the local copy of array - call scale_patch_int(total, - $ int(1),b,blo,bhi,ndim,dims, - $ alpha,a,blo,bhi,ndim,dims) - call compare_patches_int(1d-2, - $ total,b,blo,bhi,ndim,dims,total,c,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - subroutine testit_NGA_PERIODIC_PUT_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 5) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_PUT ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_put' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_int(a,total) -c -c initialize array a - call ga_zero(g_a) - call ga_sync() -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() - call nga_periodic_put(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_int(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_GET_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 5) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_GET ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_get' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c -c initialize array a - val = int(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - do loop = 1, MAXLOOP - call random_range_outbound(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - call ga_sync() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_int(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_ACC_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 5) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer c(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer alpha - double precision drand - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_ACC ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_int(a,total) - call init_array_int(b,total) -c -c initialize array g_a - call ga_fill(g_a,int(123)) - call ga_sync() -c - alpha = int(drand(0) * me*2+1 * 2) -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() -c keep a copy of the original patch - call nga_periodic_put(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_acc(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld,alpha) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo -c scale the local copy of array - call scale_patch_int(total, - $ int(1),b,blo,bhi,ndim,dims, - $ alpha,a,blo,bhi,ndim,dims) - call compare_patches_int(1d-2, - $ total,b,blo,bhi,ndim,dims,total,c,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - subroutine testit_NGA_PERIODIC_PUT_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 4) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_PUT ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_put' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_int(a,total) -c -c initialize array a - call ga_zero(g_a) - call ga_sync() -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() - call nga_periodic_put(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_int(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_GET_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 4) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_GET ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_get' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c -c initialize array a - val = int(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - do loop = 1, MAXLOOP - call random_range_outbound(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - call ga_sync() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_int(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_ACC_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 4) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer c(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer alpha - double precision drand - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_ACC ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_int(a,total) - call init_array_int(b,total) -c -c initialize array g_a - call ga_fill(g_a,int(123)) - call ga_sync() -c - alpha = int(drand(0) * me*2+1 * 2) -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() -c keep a copy of the original patch - call nga_periodic_put(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_acc(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld,alpha) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo -c scale the local copy of array - call scale_patch_int(total, - $ int(1),b,blo,bhi,ndim,dims, - $ alpha,a,blo,bhi,ndim,dims) - call compare_patches_int(1d-2, - $ total,b,blo,bhi,ndim,dims,total,c,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - subroutine testit_NGA_PERIODIC_PUT_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 20000) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_PUT ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_put' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dbl(a,total) -c -c initialize array a - call ga_zero(g_a) - call ga_sync() -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() - call nga_periodic_put(g_a,lop,hip, - $ a(lo(1)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dbl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_GET_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 20000) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_GET ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_get' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c -c initialize array a - val = dble(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - do loop = 1, MAXLOOP - call random_range_outbound(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1)),ld) - call ga_sync() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dbl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_ACC_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 20000) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - double precision c(n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double precision alpha - double precision drand - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_ACC ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dbl(a,total) - call init_array_dbl(b,total) -c -c initialize array g_a - call ga_fill(g_a,dble(123)) - call ga_sync() -c - alpha = drand(0) * me*2+1 * 2 -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() -c keep a copy of the original patch - call nga_periodic_put(g_a,lop,hip, - $ b(lo(1)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_acc(g_a,lop,hip, - $ a(lo(1)),ld,alpha) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ c(lo(1)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo -c scale the local copy of array - call scale_patch_dbl(total, - $ dble(1),b,blo,bhi,ndim,dims, - $ alpha,a,blo,bhi,ndim,dims) - call compare_patches_dbl(1d-2, - $ total,b,blo,bhi,ndim,dims,total,c,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - subroutine testit_NGA_PERIODIC_PUT_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 140) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_PUT ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_put' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dbl(a,total) -c -c initialize array a - call ga_zero(g_a) - call ga_sync() -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() - call nga_periodic_put(g_a,lop,hip, - $ a(lo(1),lo(2)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dbl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_GET_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 140) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_GET ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_get' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c -c initialize array a - val = dble(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - do loop = 1, MAXLOOP - call random_range_outbound(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2)),ld) - call ga_sync() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dbl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_ACC_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 140) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - double precision c(n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double precision alpha - double precision drand - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_ACC ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dbl(a,total) - call init_array_dbl(b,total) -c -c initialize array g_a - call ga_fill(g_a,dble(123)) - call ga_sync() -c - alpha = drand(0) * me*2+1 * 2 -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() -c keep a copy of the original patch - call nga_periodic_put(g_a,lop,hip, - $ b(lo(1),lo(2)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_acc(g_a,lop,hip, - $ a(lo(1),lo(2)),ld,alpha) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ c(lo(1),lo(2)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo -c scale the local copy of array - call scale_patch_dbl(total, - $ dble(1),b,blo,bhi,ndim,dims, - $ alpha,a,blo,bhi,ndim,dims) - call compare_patches_dbl(1d-2, - $ total,b,blo,bhi,ndim,dims,total,c,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - subroutine testit_NGA_PERIODIC_PUT_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 27) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_PUT ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_put' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dbl(a,total) -c -c initialize array a - call ga_zero(g_a) - call ga_sync() -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() - call nga_periodic_put(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dbl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_GET_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 27) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_GET ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_get' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c -c initialize array a - val = dble(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - do loop = 1, MAXLOOP - call random_range_outbound(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3)),ld) - call ga_sync() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dbl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_ACC_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 27) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - double precision c(n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double precision alpha - double precision drand - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_ACC ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dbl(a,total) - call init_array_dbl(b,total) -c -c initialize array g_a - call ga_fill(g_a,dble(123)) - call ga_sync() -c - alpha = drand(0) * me*2+1 * 2 -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() -c keep a copy of the original patch - call nga_periodic_put(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_acc(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3)),ld,alpha) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ c(lo(1),lo(2),lo(3)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo -c scale the local copy of array - call scale_patch_dbl(total, - $ dble(1),b,blo,bhi,ndim,dims, - $ alpha,a,blo,bhi,ndim,dims) - call compare_patches_dbl(1d-2, - $ total,b,blo,bhi,ndim,dims,total,c,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - subroutine testit_NGA_PERIODIC_PUT_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 11) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_PUT ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_put' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dbl(a,total) -c -c initialize array a - call ga_zero(g_a) - call ga_sync() -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() - call nga_periodic_put(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dbl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_GET_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 11) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_GET ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_get' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c -c initialize array a - val = dble(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - do loop = 1, MAXLOOP - call random_range_outbound(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - call ga_sync() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dbl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_ACC_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 11) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - double precision c(n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double precision alpha - double precision drand - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_ACC ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dbl(a,total) - call init_array_dbl(b,total) -c -c initialize array g_a - call ga_fill(g_a,dble(123)) - call ga_sync() -c - alpha = drand(0) * me*2+1 * 2 -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() -c keep a copy of the original patch - call nga_periodic_put(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_acc(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4)),ld,alpha) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ c(lo(1),lo(2),lo(3),lo(4)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo -c scale the local copy of array - call scale_patch_dbl(total, - $ dble(1),b,blo,bhi,ndim,dims, - $ alpha,a,blo,bhi,ndim,dims) - call compare_patches_dbl(1d-2, - $ total,b,blo,bhi,ndim,dims,total,c,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - subroutine testit_NGA_PERIODIC_PUT_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 7) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_PUT ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_put' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dbl(a,total) -c -c initialize array a - call ga_zero(g_a) - call ga_sync() -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() - call nga_periodic_put(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dbl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_GET_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 7) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_GET ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_get' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c -c initialize array a - val = dble(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - do loop = 1, MAXLOOP - call random_range_outbound(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - call ga_sync() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dbl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_ACC_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 7) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - double precision c(n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double precision alpha - double precision drand - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_ACC ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dbl(a,total) - call init_array_dbl(b,total) -c -c initialize array g_a - call ga_fill(g_a,dble(123)) - call ga_sync() -c - alpha = drand(0) * me*2+1 * 2 -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() -c keep a copy of the original patch - call nga_periodic_put(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_acc(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),ld,alpha) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo -c scale the local copy of array - call scale_patch_dbl(total, - $ dble(1),b,blo,bhi,ndim,dims, - $ alpha,a,blo,bhi,ndim,dims) - call compare_patches_dbl(1d-2, - $ total,b,blo,bhi,ndim,dims,total,c,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - subroutine testit_NGA_PERIODIC_PUT_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 5) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_PUT ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_put' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dbl(a,total) -c -c initialize array a - call ga_zero(g_a) - call ga_sync() -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() - call nga_periodic_put(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dbl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_GET_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 5) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_GET ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_get' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c -c initialize array a - val = dble(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - do loop = 1, MAXLOOP - call random_range_outbound(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - call ga_sync() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dbl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_ACC_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 5) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - double precision c(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double precision alpha - double precision drand - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_ACC ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dbl(a,total) - call init_array_dbl(b,total) -c -c initialize array g_a - call ga_fill(g_a,dble(123)) - call ga_sync() -c - alpha = drand(0) * me*2+1 * 2 -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() -c keep a copy of the original patch - call nga_periodic_put(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_acc(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld,alpha) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo -c scale the local copy of array - call scale_patch_dbl(total, - $ dble(1),b,blo,bhi,ndim,dims, - $ alpha,a,blo,bhi,ndim,dims) - call compare_patches_dbl(1d-2, - $ total,b,blo,bhi,ndim,dims,total,c,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - subroutine testit_NGA_PERIODIC_PUT_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 4) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_PUT ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_put' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dbl(a,total) -c -c initialize array a - call ga_zero(g_a) - call ga_sync() -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() - call nga_periodic_put(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dbl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_GET_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 4) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_GET ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_get' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c -c initialize array a - val = dble(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - do loop = 1, MAXLOOP - call random_range_outbound(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - call ga_sync() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dbl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_ACC_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 4) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - double precision c(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double precision alpha - double precision drand - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_ACC ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dbl(a,total) - call init_array_dbl(b,total) -c -c initialize array g_a - call ga_fill(g_a,dble(123)) - call ga_sync() -c - alpha = drand(0) * me*2+1 * 2 -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() -c keep a copy of the original patch - call nga_periodic_put(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_acc(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld,alpha) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo -c scale the local copy of array - call scale_patch_dbl(total, - $ dble(1),b,blo,bhi,ndim,dims, - $ alpha,a,blo,bhi,ndim,dims) - call compare_patches_dbl(1d-2, - $ total,b,blo,bhi,ndim,dims,total,c,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - subroutine testit_NGA_PERIODIC_PUT_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 20000) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_PUT ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_put' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dcpl(a,total) -c -c initialize array a - call ga_zero(g_a) - call ga_sync() -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() - call nga_periodic_put(g_a,lop,hip, - $ a(lo(1)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dcpl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_GET_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 20000) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_GET ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_get' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c -c initialize array a - val = dcmplx(dble(234),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - do loop = 1, MAXLOOP - call random_range_outbound(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1)),ld) - call ga_sync() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dcpl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_ACC_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 20000) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - double complex c(n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double complex alpha - double precision drand - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_ACC ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dcpl(a,total) - call init_array_dcpl(b,total) -c -c initialize array g_a - call ga_fill(g_a,dcmplx(dble(123),dble(0))) - call ga_sync() -c - alpha = dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() -c keep a copy of the original patch - call nga_periodic_put(g_a,lop,hip, - $ b(lo(1)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_acc(g_a,lop,hip, - $ a(lo(1)),ld,alpha) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ c(lo(1)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo -c scale the local copy of array - call scale_patch_dcpl(total, - $ dcmplx(dble(1),dble(0)),b,blo,bhi,ndim,dims, - $ alpha,a,blo,bhi,ndim,dims) - call compare_patches_dcpl(1d-2, - $ total,b,blo,bhi,ndim,dims,total,c,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - subroutine testit_NGA_PERIODIC_PUT_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 140) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_PUT ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_put' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dcpl(a,total) -c -c initialize array a - call ga_zero(g_a) - call ga_sync() -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() - call nga_periodic_put(g_a,lop,hip, - $ a(lo(1),lo(2)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dcpl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_GET_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 140) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_GET ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_get' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c -c initialize array a - val = dcmplx(dble(234),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - do loop = 1, MAXLOOP - call random_range_outbound(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2)),ld) - call ga_sync() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dcpl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_ACC_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 140) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - double complex c(n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double complex alpha - double precision drand - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_ACC ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dcpl(a,total) - call init_array_dcpl(b,total) -c -c initialize array g_a - call ga_fill(g_a,dcmplx(dble(123),dble(0))) - call ga_sync() -c - alpha = dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() -c keep a copy of the original patch - call nga_periodic_put(g_a,lop,hip, - $ b(lo(1),lo(2)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_acc(g_a,lop,hip, - $ a(lo(1),lo(2)),ld,alpha) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ c(lo(1),lo(2)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo -c scale the local copy of array - call scale_patch_dcpl(total, - $ dcmplx(dble(1),dble(0)),b,blo,bhi,ndim,dims, - $ alpha,a,blo,bhi,ndim,dims) - call compare_patches_dcpl(1d-2, - $ total,b,blo,bhi,ndim,dims,total,c,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - subroutine testit_NGA_PERIODIC_PUT_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 27) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_PUT ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_put' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dcpl(a,total) -c -c initialize array a - call ga_zero(g_a) - call ga_sync() -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() - call nga_periodic_put(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dcpl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_GET_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 27) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_GET ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_get' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c -c initialize array a - val = dcmplx(dble(234),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - do loop = 1, MAXLOOP - call random_range_outbound(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3)),ld) - call ga_sync() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dcpl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_ACC_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 27) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - double complex c(n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double complex alpha - double precision drand - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_ACC ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dcpl(a,total) - call init_array_dcpl(b,total) -c -c initialize array g_a - call ga_fill(g_a,dcmplx(dble(123),dble(0))) - call ga_sync() -c - alpha = dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() -c keep a copy of the original patch - call nga_periodic_put(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_acc(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3)),ld,alpha) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ c(lo(1),lo(2),lo(3)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo -c scale the local copy of array - call scale_patch_dcpl(total, - $ dcmplx(dble(1),dble(0)),b,blo,bhi,ndim,dims, - $ alpha,a,blo,bhi,ndim,dims) - call compare_patches_dcpl(1d-2, - $ total,b,blo,bhi,ndim,dims,total,c,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - subroutine testit_NGA_PERIODIC_PUT_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 11) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_PUT ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_put' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dcpl(a,total) -c -c initialize array a - call ga_zero(g_a) - call ga_sync() -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() - call nga_periodic_put(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dcpl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_GET_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 11) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_GET ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_get' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c -c initialize array a - val = dcmplx(dble(234),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - do loop = 1, MAXLOOP - call random_range_outbound(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - call ga_sync() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dcpl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_ACC_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 11) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - double complex c(n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double complex alpha - double precision drand - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_ACC ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dcpl(a,total) - call init_array_dcpl(b,total) -c -c initialize array g_a - call ga_fill(g_a,dcmplx(dble(123),dble(0))) - call ga_sync() -c - alpha = dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() -c keep a copy of the original patch - call nga_periodic_put(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_acc(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4)),ld,alpha) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ c(lo(1),lo(2),lo(3),lo(4)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo -c scale the local copy of array - call scale_patch_dcpl(total, - $ dcmplx(dble(1),dble(0)),b,blo,bhi,ndim,dims, - $ alpha,a,blo,bhi,ndim,dims) - call compare_patches_dcpl(1d-2, - $ total,b,blo,bhi,ndim,dims,total,c,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - subroutine testit_NGA_PERIODIC_PUT_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 7) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_PUT ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_put' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dcpl(a,total) -c -c initialize array a - call ga_zero(g_a) - call ga_sync() -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() - call nga_periodic_put(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dcpl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_GET_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 7) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_GET ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_get' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c -c initialize array a - val = dcmplx(dble(234),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - do loop = 1, MAXLOOP - call random_range_outbound(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - call ga_sync() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dcpl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_ACC_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 7) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - double complex c(n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double complex alpha - double precision drand - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_ACC ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dcpl(a,total) - call init_array_dcpl(b,total) -c -c initialize array g_a - call ga_fill(g_a,dcmplx(dble(123),dble(0))) - call ga_sync() -c - alpha = dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() -c keep a copy of the original patch - call nga_periodic_put(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_acc(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),ld,alpha) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo -c scale the local copy of array - call scale_patch_dcpl(total, - $ dcmplx(dble(1),dble(0)),b,blo,bhi,ndim,dims, - $ alpha,a,blo,bhi,ndim,dims) - call compare_patches_dcpl(1d-2, - $ total,b,blo,bhi,ndim,dims,total,c,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - subroutine testit_NGA_PERIODIC_PUT_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 5) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_PUT ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_put' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dcpl(a,total) -c -c initialize array a - call ga_zero(g_a) - call ga_sync() -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() - call nga_periodic_put(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dcpl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_GET_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 5) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_GET ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_get' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c -c initialize array a - val = dcmplx(dble(234),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - do loop = 1, MAXLOOP - call random_range_outbound(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - call ga_sync() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dcpl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_ACC_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 5) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - double complex c(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double complex alpha - double precision drand - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_ACC ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dcpl(a,total) - call init_array_dcpl(b,total) -c -c initialize array g_a - call ga_fill(g_a,dcmplx(dble(123),dble(0))) - call ga_sync() -c - alpha = dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() -c keep a copy of the original patch - call nga_periodic_put(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_acc(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld,alpha) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo -c scale the local copy of array - call scale_patch_dcpl(total, - $ dcmplx(dble(1),dble(0)),b,blo,bhi,ndim,dims, - $ alpha,a,blo,bhi,ndim,dims) - call compare_patches_dcpl(1d-2, - $ total,b,blo,bhi,ndim,dims,total,c,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - subroutine testit_NGA_PERIODIC_PUT_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 4) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_PUT ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_put' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dcpl(a,total) -c -c initialize array a - call ga_zero(g_a) - call ga_sync() -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() - call nga_periodic_put(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dcpl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_GET_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 4) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_GET ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_get' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c -c initialize array a - val = dcmplx(dble(234),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - do loop = 1, MAXLOOP - call random_range_outbound(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call nga_periodic_get(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - call ga_sync() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo - call compare_patches_dcpl(0d0, - $ total,a,blo,bhi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PERIODIC_ACC_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n - integer ndim - parameter (n = 4) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - double complex c(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),lop(ndim),hip(ndim) - integer blo(ndim),bhi(ndim) - integer dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total,loop - double complex alpha - double precision drand - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------ NGA_PERIODIC_ACC ---------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_periodic_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - total = 1 - do i = 1,ndim - lo(i) = 1 - hi(i) = n - total = total*dims(i) - enddo -c - call init_array_dcpl(a,total) - call init_array_dcpl(b,total) -c -c initialize array g_a - call ga_fill(g_a,dcmplx(dble(123),dble(0))) - call ga_sync() -c - alpha = dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) -c - do loop = 1, MAXLOOP - if(mod(loop,nproc).eq.me) then - call random_range_outbound(lo,hi,lop,hip,ndim) - if(Mod(loop,10).eq.0) then - call print_range(loop,lop,hip,ndim) - endif - call ga_init_fence() -c keep a copy of the original patch - call nga_periodic_put(g_a,lop,hip, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - call ga_fence() - call ga_init_fence() - call nga_periodic_acc(g_a,lop,hip, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld,alpha) - call ga_fence() - call ga_init_fence() - call nga_periodic_get(g_a,lop,hip, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - call ga_fence() -c -c check the result - do i=1,ndim - blo(i) = 1 - bhi(i) = hip(i)-lop(i)+1 - enddo -c scale the local copy of array - call scale_patch_dcpl(total, - $ dcmplx(dble(1),dble(0)),b,blo,bhi,ndim,dims, - $ alpha,a,blo,bhi,ndim,dims) - call compare_patches_dcpl(1d-2, - $ total,b,blo,bhi,ndim,dims,total,c,blo,bhi,ndim,dims) - endif - call ga_sync() - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - -c----------------------- -c Utility functions - - subroutine random_range(lo,hi,lop,hip,ndim) - implicit none - integer lo(1),hi(1),lop(1),hip(1),ndim - integer i, range, swap, val,iran - double precision drand - iran(range) = int(drand(0)*dble(range)) + 1 -c iran(range) = range/2 - do i = 1, ndim - range = hi(i)-lo(i)+1 - val = iran(range) - lop(i) = lo(i) + val - val = iran(range) - hip(i) = hi(i) - val - if(hip(i) .lt. lop(i))then - swap =hip(i) - hip(i)=lop(i) - lop(i)=swap - endif - hip(i)=MIN(hip(i),hi(i)) - lop(i)=MAX(lop(i),lo(i)) - enddo - end -c - -c - subroutine random_range_outbound(lo,hi,lop,hip,ndim) - implicit none - integer lo(1),hi(1),lop(1),hip(1),ndim - integer i, range, val,iran - double precision drand - iran(range) = int(drand(0)*dble(range)) + 1 -c iran(range) = range/2 - do i = 1, ndim - range = 2*(hi(i)-lo(i)+1) - val = iran(range) - lop(i) = lo(i) + val - range = hi(i)-lo(i)+1 - val = iran(range) - hip(i) = lop(i) + range - val -c - hip(i) = hip(i)-hi(i) - lop(i) = lop(i)-hi(i) - enddo - end -c -c - integer function count_elems(lo,hi,ndim) - implicit none - integer lo(1),hi(1),ndim,elems,i - elems=1 - do i=1,ndim - elems = elems*(hi(i)-lo(i)+1) - enddo - count_elems = elems - end -c - -c get the next available nindex in the range of lo and hi - integer function next_index(ind,total,ndim,lo,hi,dims) - implicit none - integer ind,total,ndim,lo(ndim),hi(ndim),dims(ndim) - integer i - integer indx(8),nindex -c - nindex = ind + 1 - 200 call conv_1ton(ndim,dims,nindex,indx) -c -c test if indx(i) is in the range of lo(i) and hi(i) - do i=1,ndim - if((indx(i).lt.lo(i)).or.(indx(i).gt.hi(i))) then - nindex = nindex + 1 - if(nindex.gt.total) then - next_index = 0 - goto 300 - else - goto 200 - endif - endif - enddo -c - next_index = nindex - 300 end - -c testing if the indices are unique - integer function unique(ind,ndim,m,n) - implicit none - integer ndim,m,n - integer ind(ndim,m) - integer i,j,marker -c - unique = 1 - do i = 1, n-1 - marker = 0 - do j = 1, ndim - if(ind(j,n).eq.ind(j,i)) marker = marker + 1 - enddo -c - if(marker.eq.ndim) unique = 0 - enddo -c - end - -c - subroutine prnt_rng(me,lo,hi,ndim) - implicit none - integer me,ndim - integer lo(ndim),hi(ndim) - integer i -c - print *, me,': array section [',(lo(i),':',hi(i),i=1,ndim),']' -c - end - -c divide the space into equal size patches according to nproc -c and calculate my lo and hi - subroutine my_space(me,nproc,ndim,total,dims,lo,hi) - implicit none - integer me,nproc,ndim,total - integer dims(ndim),lo(ndim),hi(ndim) - integer div,lop,hip,i -c - div = total/nproc -c - lop = div * me + 1 -c - if(me.eq.(nproc-1)) then - hip = total - else - hip = div * (me+1) - endif -c - call conv_1ton(ndim,dims,lop,lo) - call conv_1ton(ndim,dims,hip,hi) -c -c swap the indices if the lo if larger thant hi - do i = 1,ndim - if(lo(i).gt.hi(i)) then - if(i.eq.ndim) call ga_error('bye',0) - lo(i) = 1 - lo(i+1) = lo(i+1) + 1 - endif - enddo - end - -c convert the index from one dimension to n dimension - subroutine conv_1ton(ndim,dims,ind1,indn) - implicit none - integer ndim - integer dims(ndim) - integer ind1,indn(ndim) - integer range(8),remainder,i -c - remainder = ind1 -c get the range of each dimension - do i=1,ndim - if(i.eq.1) then - range(i) = dims(i) - else - range(i) = range(i-1) * dims(i) - endif - enddo -c -c get the indices in each dimension - do i = ndim,1,-1 - if(i.ne.1) then - indn(i) = remainder/range(i-1) - remainder = remainder - indn(i)*range(i-1) - if(remainder.eq.0) then - remainder = range(i-1) - else - indn(i) = indn(i) + 1 - endif - else - indn(i) = remainder - endif - enddo -c - end - - -c fill array with random numbers - subroutine fill_array_int(a,n,val) - implicit none - integer n - integer a(n),val - integer i - do i= 1, n - a(i) = val - enddo - end - -c initialize the array with random numbers - subroutine init_array_int(a,n) - implicit none - integer n - integer a(n) - double precision drand - integer i - do i= 1, n - a(i) = int(drand(0) * i * 2) - enddo - end - -c if the elements do match, stop the program - subroutine compare_patches_int(eps, - $ total1,array1,lo1,hi1,ndim1,dims1, - $ total2,array2,lo2,hi2,ndim2,dims2) - implicit none - double precision eps - integer ndim1,ndim2,total1,total2 - integer array1(total1),array2(total2) - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer next_index - integer index1,index2 - double precision diff,maxval -c -c initialize index1 and index2, searching from zeros - index1 = 0 - index2 = 0 -c compare corresponding elements in each array - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - do while((index1.ne.0).and.(index2.ne.0)) - diff = abs(array1(index1) - array2(index2)) - maxval = max(abs(array1(index1)), abs(array2(index2))) - if((maxval.eq.0).or.(maxval.lt.eps)) maxval = 1 - if(eps .lt. abs(diff)/maxval) then - print *, 'Error: Comparison failed!' - print *, array1(index1), array2(index2) - call ga_error('bye',0) - endif - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - enddo -c -c at this point both index1 and index2 should be 0 - if((index1.ne.0).or.(index2.ne.0)) then - print *, 'Error: # of elems dont match' - call ga_error('bye',0) - endif -c - end - -c do patch = patch + buf * alpha - subroutine scale_patch_int(total, - $ alpha,arr1,lo1,hi1,ndim1,dims1,beta,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer arr1(total),arr2(total) - integer alpha, beta - integer next_index - integer ind1, ind2 -c - ind1 = 0 - ind2 = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - arr1(ind1) = arr1(ind1)*alpha + arr2(ind2)*beta - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - end - -c transpose an array - subroutine transpose_int(a1,a2,total,ndim,dims) - implicit none - integer ndim,total - integer dims(ndim) - integer a1(total),a2(total) - integer i, j - integer idx - integer bv(8), bunit(8) -c - bv(1)=0 - bunit(1)=1 - do i=2, ndim - bv(i) = 0 - bunit(i) = bunit(i-1) * dims(i-1) - enddo -c - do i=1, total - idx = 1 - do j=1, ndim - idx = idx + bv(j) * bunit(ndim-j+1) - if(mod(i,bunit(j)).eq.0) bv(j) = bv(j) + 1 - if(bv(j).ge.dims(j)) bv(j) = 0 - enddo -c print *, 'i = ',i, 'idx = ',idx - a2(idx) = a1(i) - enddo -c - do i=1, total - a1(i) = a2(i) - enddo -c - end - -c do patch = patch + buf * alpha - integer function dot_patch_int( - $ total,arr1,lo1,hi1,ndim1,dims1,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer arr1(total),arr2(total) - integer next_index - integer ind1, ind2 - integer res -c - ind1 = 0 - ind2 = 0 - res = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - res = res + arr1(ind1)*arr2(ind2) - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - dot_patch_int = res -c - end - - -c fill array with random numbers - subroutine fill_array_dbl(a,n,val) - implicit none - integer n - double precision a(n),val - integer i - do i= 1, n - a(i) = val - enddo - end - -c initialize the array with random numbers - subroutine init_array_dbl(a,n) - implicit none - integer n - double precision a(n) - double precision drand - integer i - do i= 1, n - a(i) = drand(0) * i * 2 - enddo - end - -c if the elements do match, stop the program - subroutine compare_patches_dbl(eps, - $ total1,array1,lo1,hi1,ndim1,dims1, - $ total2,array2,lo2,hi2,ndim2,dims2) - implicit none - double precision eps - integer ndim1,ndim2,total1,total2 - double precision array1(total1),array2(total2) - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer next_index - integer index1,index2 - double precision diff,maxval -c -c initialize index1 and index2, searching from zeros - index1 = 0 - index2 = 0 -c compare corresponding elements in each array - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - do while((index1.ne.0).and.(index2.ne.0)) - diff = abs(array1(index1) - array2(index2)) - maxval = max(abs(array1(index1)), abs(array2(index2))) - if((maxval.eq.0).or.(maxval.lt.eps)) maxval = 1 - if(eps .lt. abs(diff)/maxval) then - print *, 'Error: Comparison failed!' - print *, array1(index1), array2(index2) - call ga_error('bye',0) - endif - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - enddo -c -c at this point both index1 and index2 should be 0 - if((index1.ne.0).or.(index2.ne.0)) then - print *, 'Error: # of elems dont match' - call ga_error('bye',0) - endif -c - end - -c do patch = patch + buf * alpha - subroutine scale_patch_dbl(total, - $ alpha,arr1,lo1,hi1,ndim1,dims1,beta,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double precision arr1(total),arr2(total) - double precision alpha, beta - integer next_index - integer ind1, ind2 -c - ind1 = 0 - ind2 = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - arr1(ind1) = arr1(ind1)*alpha + arr2(ind2)*beta - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - end - -c transpose an array - subroutine transpose_dbl(a1,a2,total,ndim,dims) - implicit none - integer ndim,total - integer dims(ndim) - double precision a1(total),a2(total) - integer i, j - integer idx - integer bv(8), bunit(8) -c - bv(1)=0 - bunit(1)=1 - do i=2, ndim - bv(i) = 0 - bunit(i) = bunit(i-1) * dims(i-1) - enddo -c - do i=1, total - idx = 1 - do j=1, ndim - idx = idx + bv(j) * bunit(ndim-j+1) - if(mod(i,bunit(j)).eq.0) bv(j) = bv(j) + 1 - if(bv(j).ge.dims(j)) bv(j) = 0 - enddo -c print *, 'i = ',i, 'idx = ',idx - a2(idx) = a1(i) - enddo -c - do i=1, total - a1(i) = a2(i) - enddo -c - end - -c do patch = patch + buf * alpha - double precision function dot_patch_dbl( - $ total,arr1,lo1,hi1,ndim1,dims1,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double precision arr1(total),arr2(total) - integer next_index - integer ind1, ind2 - double precision res -c - ind1 = 0 - ind2 = 0 - res = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - res = res + arr1(ind1)*arr2(ind2) - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - dot_patch_dbl = res -c - end - - -c fill array with random numbers - subroutine fill_array_dcpl(a,n,val) - implicit none - integer n - double complex a(n),val - integer i - do i= 1, n - a(i) = val - enddo - end - -c initialize the array with random numbers - subroutine init_array_dcpl(a,n) - implicit none - integer n - double complex a(n) - double precision drand - integer i - do i= 1, n - a(i) = dcmplx(drand(0) * i * 2, - $ -drand(0) * i * 2) - enddo - end - -c if the elements do match, stop the program - subroutine compare_patches_dcpl(eps, - $ total1,array1,lo1,hi1,ndim1,dims1, - $ total2,array2,lo2,hi2,ndim2,dims2) - implicit none - double precision eps - integer ndim1,ndim2,total1,total2 - double complex array1(total1),array2(total2) - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer next_index - integer index1,index2 - double precision diff,maxval -c -c initialize index1 and index2, searching from zeros - index1 = 0 - index2 = 0 -c compare corresponding elements in each array - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - do while((index1.ne.0).and.(index2.ne.0)) - diff = abs(array1(index1) - array2(index2)) - maxval = max(abs(array1(index1)), abs(array2(index2))) - if((maxval.eq.0).or.(maxval.lt.eps)) maxval = 1 - if(eps .lt. abs(diff)/maxval) then - print *, 'Error: Comparison failed!' - print *, array1(index1), array2(index2) - call ga_error('bye',0) - endif - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - enddo -c -c at this point both index1 and index2 should be 0 - if((index1.ne.0).or.(index2.ne.0)) then - print *, 'Error: # of elems dont match' - call ga_error('bye',0) - endif -c - end - -c do patch = patch + buf * alpha - subroutine scale_patch_dcpl(total, - $ alpha,arr1,lo1,hi1,ndim1,dims1,beta,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double complex arr1(total),arr2(total) - double complex alpha, beta - integer next_index - integer ind1, ind2 -c - ind1 = 0 - ind2 = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - arr1(ind1) = arr1(ind1)*alpha + arr2(ind2)*beta - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - end - -c transpose an array - subroutine transpose_dcpl(a1,a2,total,ndim,dims) - implicit none - integer ndim,total - integer dims(ndim) - double complex a1(total),a2(total) - integer i, j - integer idx - integer bv(8), bunit(8) -c - bv(1)=0 - bunit(1)=1 - do i=2, ndim - bv(i) = 0 - bunit(i) = bunit(i-1) * dims(i-1) - enddo -c - do i=1, total - idx = 1 - do j=1, ndim - idx = idx + bv(j) * bunit(ndim-j+1) - if(mod(i,bunit(j)).eq.0) bv(j) = bv(j) + 1 - if(bv(j).ge.dims(j)) bv(j) = 0 - enddo -c print *, 'i = ',i, 'idx = ',idx - a2(idx) = a1(i) - enddo -c - do i=1, total - a1(i) = a2(i) - enddo -c - end - -c do patch = patch + buf * alpha - double complex function dot_patch_dcpl( - $ total,arr1,lo1,hi1,ndim1,dims1,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double complex arr1(total),arr2(total) - integer next_index - integer ind1, ind2 - double complex res -c - ind1 = 0 - ind2 = 0 - res = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - res = res + arr1(ind1)*arr2(ind2) - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - dot_patch_dcpl = res -c - end - - diff --git a/global/testing/ngatest_src/generated/nga-scatter.F b/global/testing/ngatest_src/generated/nga-scatter.F deleted file mode 100644 index 0e30b9e5c..000000000 --- a/global/testing/ngatest_src/generated/nga-scatter.F +++ /dev/null @@ -1,6131 +0,0 @@ - - -#if HAVE_CONFIG_H -# include "config.fh" -#endif -#define MAXLOOP 100 - - program test - implicit none -#include "mafdecls.fh" -#include "global.fh" - integer stack, heap -c -c*** Intitialize a message passing library -c -#include "mp3.fh" -c -c*** Intitialize the GA package - call ga_initialize() -c if(ga_nodeid().eq.0)print *,ga_nnodes(),' nodes' -c - if(ga_uses_ma()) then - stack = 200000 -c stack = 100000 - else - stack = 60000 -c stack = 20000 - endif - heap = stack - if (.not. ma_init(MT_DBL, heap, stack)) - $ call ga_error("ma init failed",heap+stack) -c -c -c test GA_FILL - -c -c test NGA_GET - -c -c test NGA_PUT - -c -c test NGA_ACC - -c -c test NGA_PERIODIC_GET - -c -c test NGA_PERIODIC_PUT - -c -c test NGA_PERIODIC_ACC - -c -c test NGA_FILL_PATCH - -c -c test NGA_COPY_PATCH - -c -c test NGA_SCALE_PATCH - -c -c test NGA_ADD_PATCH - -c -c test NGA_DOT_PATCH - -c -c test NGA_SCATTER - - call testit_NGA_SCATTER_int1() - - call testit_NGA_SCATTER_dbl1() - - call testit_NGA_SCATTER_dcpl1() - - call testit_NGA_SCATTER_int2() - - call testit_NGA_SCATTER_dbl2() - - call testit_NGA_SCATTER_dcpl2() - - call testit_NGA_SCATTER_int3() - - call testit_NGA_SCATTER_dbl3() - - call testit_NGA_SCATTER_dcpl3() - - call testit_NGA_SCATTER_int4() - - call testit_NGA_SCATTER_dbl4() - - call testit_NGA_SCATTER_dcpl4() - - call testit_NGA_SCATTER_int5() - - call testit_NGA_SCATTER_dbl5() - - call testit_NGA_SCATTER_dcpl5() - - call testit_NGA_SCATTER_int6() - - call testit_NGA_SCATTER_dbl6() - - call testit_NGA_SCATTER_dcpl6() - - call testit_NGA_SCATTER_int7() - - call testit_NGA_SCATTER_dbl7() - - call testit_NGA_SCATTER_dcpl7() - -c -c test NGA_SCATTER_ACC - - call testit_NGA_SCATTER_ACC_int1() - - call testit_NGA_SCATTER_ACC_dbl1() - - call testit_NGA_SCATTER_ACC_dcpl1() - - call testit_NGA_SCATTER_ACC_int2() - - call testit_NGA_SCATTER_ACC_dbl2() - - call testit_NGA_SCATTER_ACC_dcpl2() - - call testit_NGA_SCATTER_ACC_int3() - - call testit_NGA_SCATTER_ACC_dbl3() - - call testit_NGA_SCATTER_ACC_dcpl3() - - call testit_NGA_SCATTER_ACC_int4() - - call testit_NGA_SCATTER_ACC_dbl4() - - call testit_NGA_SCATTER_ACC_dcpl4() - - call testit_NGA_SCATTER_ACC_int5() - - call testit_NGA_SCATTER_ACC_dbl5() - - call testit_NGA_SCATTER_ACC_dcpl5() - - call testit_NGA_SCATTER_ACC_int6() - - call testit_NGA_SCATTER_ACC_dbl6() - - call testit_NGA_SCATTER_ACC_dcpl6() - - call testit_NGA_SCATTER_ACC_int7() - - call testit_NGA_SCATTER_ACC_dbl7() - - call testit_NGA_SCATTER_ACC_dcpl7() - -c -c test NGA_GATHER - - - -c - if(ga_nodeid().eq.0) print *, 'All tests successful' -c - call ga_terminate() - call MP_FINALIZE() - end - -c----------------- - - - - - - - - - subroutine testit_NGA_SCATTER_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) -c -c compare the results - if(a(lo(1)) .ne. - $ b(lo(1))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - integer v(m) - integer d(ndim, m) - double precision drand - integer alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=int(drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) - a(d(1,i))= - $ a(d(1,i)) - $ *alpha - $ +b(lo(1)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) -c -c compare the results - if(abs(a(lo(1)) - - $ b(lo(1))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1)), - $ 'b=', - $ b(lo(1)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - subroutine testit_NGA_SCATTER_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) -c -c compare the results - if(a(lo(1),lo(2)) .ne. - $ b(lo(1),lo(2))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - integer v(m) - integer d(ndim, m) - double precision drand - integer alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=int(drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) - a(d(1,i),d(2,i))= - $ a(d(1,i),d(2,i)) - $ *alpha - $ +b(lo(1),lo(2)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2)) - - $ b(lo(1),lo(2))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2)), - $ 'b=', - $ b(lo(1),lo(2)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - subroutine testit_NGA_SCATTER_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3)) .ne. - $ b(lo(1),lo(2),lo(3))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - integer v(m) - integer d(ndim, m) - double precision drand - integer alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=int(drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) - a(d(1,i),d(2,i),d(3,i))= - $ a(d(1,i),d(2,i),d(3,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3)) - - $ b(lo(1),lo(2),lo(3))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3)), - $ 'b=', - $ b(lo(1),lo(2),lo(3)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - subroutine testit_NGA_SCATTER_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - integer v(m) - integer d(ndim, m) - double precision drand - integer alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=int(drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4)) - - $ b(lo(1),lo(2),lo(3),lo(4))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - subroutine testit_NGA_SCATTER_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - integer v(m) - integer d(ndim, m) - double precision drand - integer alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=int(drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4),lo(5)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4),lo(5)) - - $ b(lo(1),lo(2),lo(3),lo(4),lo(5))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - subroutine testit_NGA_SCATTER_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - integer v(m) - integer d(ndim, m) - double precision drand - integer alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=int(drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)) - - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - subroutine testit_NGA_SCATTER_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - integer v(m) - integer d(ndim, m) - double precision drand - integer alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=int(drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)) - - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - - subroutine testit_NGA_SCATTER_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) -c -c compare the results - if(a(lo(1)) .ne. - $ b(lo(1))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double precision v(m) - integer d(ndim, m) - double precision drand - double precision alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=drand(0) * me*2+1 * 2 -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) - a(d(1,i))= - $ a(d(1,i)) - $ *alpha - $ +b(lo(1)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) -c -c compare the results - if(abs(a(lo(1)) - - $ b(lo(1))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1)), - $ 'b=', - $ b(lo(1)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - subroutine testit_NGA_SCATTER_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) -c -c compare the results - if(a(lo(1),lo(2)) .ne. - $ b(lo(1),lo(2))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double precision v(m) - integer d(ndim, m) - double precision drand - double precision alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=drand(0) * me*2+1 * 2 -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) - a(d(1,i),d(2,i))= - $ a(d(1,i),d(2,i)) - $ *alpha - $ +b(lo(1),lo(2)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2)) - - $ b(lo(1),lo(2))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2)), - $ 'b=', - $ b(lo(1),lo(2)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - subroutine testit_NGA_SCATTER_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3)) .ne. - $ b(lo(1),lo(2),lo(3))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double precision v(m) - integer d(ndim, m) - double precision drand - double precision alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=drand(0) * me*2+1 * 2 -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) - a(d(1,i),d(2,i),d(3,i))= - $ a(d(1,i),d(2,i),d(3,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3)) - - $ b(lo(1),lo(2),lo(3))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3)), - $ 'b=', - $ b(lo(1),lo(2),lo(3)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - subroutine testit_NGA_SCATTER_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i),d(4,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double precision v(m) - integer d(ndim, m) - double precision drand - double precision alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=drand(0) * me*2+1 * 2 -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i),d(4,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4)) - - $ b(lo(1),lo(2),lo(3),lo(4))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - subroutine testit_NGA_SCATTER_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double precision v(m) - integer d(ndim, m) - double precision drand - double precision alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=drand(0) * me*2+1 * 2 -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4),lo(5)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4),lo(5)) - - $ b(lo(1),lo(2),lo(3),lo(4),lo(5))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - subroutine testit_NGA_SCATTER_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double precision v(m) - integer d(ndim, m) - double precision drand - double precision alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=drand(0) * me*2+1 * 2 -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)) - - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - subroutine testit_NGA_SCATTER_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double precision v(m) - integer d(ndim, m) - double precision drand - double precision alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=drand(0) * me*2+1 * 2 -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)) - - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - - subroutine testit_NGA_SCATTER_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) -c -c compare the results - if(a(lo(1)) .ne. - $ b(lo(1))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double complex v(m) - integer d(ndim, m) - double precision drand - double complex alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) - a(d(1,i))= - $ a(d(1,i)) - $ *alpha - $ +b(lo(1)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) -c -c compare the results - if(abs(a(lo(1)) - - $ b(lo(1))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1)), - $ 'b=', - $ b(lo(1)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - subroutine testit_NGA_SCATTER_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) -c -c compare the results - if(a(lo(1),lo(2)) .ne. - $ b(lo(1),lo(2))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double complex v(m) - integer d(ndim, m) - double precision drand - double complex alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) - a(d(1,i),d(2,i))= - $ a(d(1,i),d(2,i)) - $ *alpha - $ +b(lo(1),lo(2)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2)) - - $ b(lo(1),lo(2))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2)), - $ 'b=', - $ b(lo(1),lo(2)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - subroutine testit_NGA_SCATTER_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3)) .ne. - $ b(lo(1),lo(2),lo(3))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double complex v(m) - integer d(ndim, m) - double precision drand - double complex alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) - a(d(1,i),d(2,i),d(3,i))= - $ a(d(1,i),d(2,i),d(3,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3)) - - $ b(lo(1),lo(2),lo(3))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3)), - $ 'b=', - $ b(lo(1),lo(2),lo(3)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - subroutine testit_NGA_SCATTER_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double complex v(m) - integer d(ndim, m) - double precision drand - double complex alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4)) - - $ b(lo(1),lo(2),lo(3),lo(4))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - subroutine testit_NGA_SCATTER_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double complex v(m) - integer d(ndim, m) - double precision drand - double complex alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4),lo(5)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4),lo(5)) - - $ b(lo(1),lo(2),lo(3),lo(4),lo(5))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - subroutine testit_NGA_SCATTER_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double complex v(m) - integer d(ndim, m) - double precision drand - double complex alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)) - - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - subroutine testit_NGA_SCATTER_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_ACC_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop, idx - double complex v(m) - integer d(ndim, m) - double precision drand - double complex alpha - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------- NGA_SCATTER_ACC ----------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - alpha=dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) -c -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) -c generate random number - idx = int(drand(0)*(real(total/nproc))) - $ +me*total/nproc - if(idx .eq. 0) idx = idx+1 -c -c convert to ndim - call conv_1ton(ndim,dims,idx,lo) -c - do j=1,ndim - d(j,i) = lo(j) - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i))=v(i) - enddo -c -c the result should be - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i))= - $ a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i)) - $ *alpha - $ +b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)) - enddo -c - call ga_sync() -c -c scatter the v to the global array - call nga_scatter_acc(g_a, v, d, m, alpha) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c -c compare the results - if(abs(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)) - - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7))) - $ .gt. 1d-5) then - print *,'a=', - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)), - $ 'b=', - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)), - $ 'i=',i,'alpha=',alpha,'v=',v(i) - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - -c----------------------- -c Utility functions - - subroutine random_range(lo,hi,lop,hip,ndim) - implicit none - integer lo(1),hi(1),lop(1),hip(1),ndim - integer i, range, swap, val,iran - double precision drand - iran(range) = int(drand(0)*dble(range)) + 1 -c iran(range) = range/2 - do i = 1, ndim - range = hi(i)-lo(i)+1 - val = iran(range) - lop(i) = lo(i) + val - val = iran(range) - hip(i) = hi(i) - val - if(hip(i) .lt. lop(i))then - swap =hip(i) - hip(i)=lop(i) - lop(i)=swap - endif - hip(i)=MIN(hip(i),hi(i)) - lop(i)=MAX(lop(i),lo(i)) - enddo - end -c - -c - subroutine random_range_outbound(lo,hi,lop,hip,ndim) - implicit none - integer lo(1),hi(1),lop(1),hip(1),ndim - integer i, range, val,iran - double precision drand - iran(range) = int(drand(0)*dble(range)) + 1 -c iran(range) = range/2 - do i = 1, ndim - range = 2*(hi(i)-lo(i)+1) - val = iran(range) - lop(i) = lo(i) + val - range = hi(i)-lo(i)+1 - val = iran(range) - hip(i) = lop(i) + range - val -c - hip(i) = hip(i)-hi(i) - lop(i) = lop(i)-hi(i) - enddo - end -c -c - integer function count_elems(lo,hi,ndim) - implicit none - integer lo(1),hi(1),ndim,elems,i - elems=1 - do i=1,ndim - elems = elems*(hi(i)-lo(i)+1) - enddo - count_elems = elems - end -c - -c get the next available nindex in the range of lo and hi - integer function next_index(ind,total,ndim,lo,hi,dims) - implicit none - integer ind,total,ndim,lo(ndim),hi(ndim),dims(ndim) - integer i - integer indx(8),nindex -c - nindex = ind + 1 - 200 call conv_1ton(ndim,dims,nindex,indx) -c -c test if indx(i) is in the range of lo(i) and hi(i) - do i=1,ndim - if((indx(i).lt.lo(i)).or.(indx(i).gt.hi(i))) then - nindex = nindex + 1 - if(nindex.gt.total) then - next_index = 0 - goto 300 - else - goto 200 - endif - endif - enddo -c - next_index = nindex - 300 end - -c testing if the indices are unique - integer function unique(ind,ndim,m,n) - implicit none - integer ndim,m,n - integer ind(ndim,m) - integer i,j,marker -c - unique = 1 - do i = 1, n-1 - marker = 0 - do j = 1, ndim - if(ind(j,n).eq.ind(j,i)) marker = marker + 1 - enddo -c - if(marker.eq.ndim) unique = 0 - enddo -c - end - -c - subroutine prnt_rng(me,lo,hi,ndim) - implicit none - integer me,ndim - integer lo(ndim),hi(ndim) - integer i -c - print *, me,': array section [',(lo(i),':',hi(i),i=1,ndim),']' -c - end - -c divide the space into equal size patches according to nproc -c and calculate my lo and hi - subroutine my_space(me,nproc,ndim,total,dims,lo,hi) - implicit none - integer me,nproc,ndim,total - integer dims(ndim),lo(ndim),hi(ndim) - integer div,lop,hip,i -c - div = total/nproc -c - lop = div * me + 1 -c - if(me.eq.(nproc-1)) then - hip = total - else - hip = div * (me+1) - endif -c - call conv_1ton(ndim,dims,lop,lo) - call conv_1ton(ndim,dims,hip,hi) -c -c swap the indices if the lo if larger thant hi - do i = 1,ndim - if(lo(i).gt.hi(i)) then - if(i.eq.ndim) call ga_error('bye',0) - lo(i) = 1 - lo(i+1) = lo(i+1) + 1 - endif - enddo - end - -c convert the index from one dimension to n dimension - subroutine conv_1ton(ndim,dims,ind1,indn) - implicit none - integer ndim - integer dims(ndim) - integer ind1,indn(ndim) - integer range(8),remainder,i -c - remainder = ind1 -c get the range of each dimension - do i=1,ndim - if(i.eq.1) then - range(i) = dims(i) - else - range(i) = range(i-1) * dims(i) - endif - enddo -c -c get the indices in each dimension - do i = ndim,1,-1 - if(i.ne.1) then - indn(i) = remainder/range(i-1) - remainder = remainder - indn(i)*range(i-1) - if(remainder.eq.0) then - remainder = range(i-1) - else - indn(i) = indn(i) + 1 - endif - else - indn(i) = remainder - endif - enddo -c - end - - -c fill array with random numbers - subroutine fill_array_int(a,n,val) - implicit none - integer n - integer a(n),val - integer i - do i= 1, n - a(i) = val - enddo - end - -c initialize the array with random numbers - subroutine init_array_int(a,n) - implicit none - integer n - integer a(n) - double precision drand - integer i - do i= 1, n - a(i) = int(drand(0) * i * 2) - enddo - end - -c if the elements do match, stop the program - subroutine compare_patches_int(eps, - $ total1,array1,lo1,hi1,ndim1,dims1, - $ total2,array2,lo2,hi2,ndim2,dims2) - implicit none - double precision eps - integer ndim1,ndim2,total1,total2 - integer array1(total1),array2(total2) - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer next_index - integer index1,index2 - double precision diff,maxval -c -c initialize index1 and index2, searching from zeros - index1 = 0 - index2 = 0 -c compare corresponding elements in each array - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - do while((index1.ne.0).and.(index2.ne.0)) - diff = abs(array1(index1) - array2(index2)) - maxval = max(abs(array1(index1)), abs(array2(index2))) - if((maxval.eq.0).or.(maxval.lt.eps)) maxval = 1 - if(eps .lt. abs(diff)/maxval) then - print *, 'Error: Comparison failed!' - print *, array1(index1), array2(index2) - call ga_error('bye',0) - endif - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - enddo -c -c at this point both index1 and index2 should be 0 - if((index1.ne.0).or.(index2.ne.0)) then - print *, 'Error: # of elems dont match' - call ga_error('bye',0) - endif -c - end - -c do patch = patch + buf * alpha - subroutine scale_patch_int(total, - $ alpha,arr1,lo1,hi1,ndim1,dims1,beta,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer arr1(total),arr2(total) - integer alpha, beta - integer next_index - integer ind1, ind2 -c - ind1 = 0 - ind2 = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - arr1(ind1) = arr1(ind1)*alpha + arr2(ind2)*beta - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - end - -c transpose an array - subroutine transpose_int(a1,a2,total,ndim,dims) - implicit none - integer ndim,total - integer dims(ndim) - integer a1(total),a2(total) - integer i, j - integer idx - integer bv(8), bunit(8) -c - bv(1)=0 - bunit(1)=1 - do i=2, ndim - bv(i) = 0 - bunit(i) = bunit(i-1) * dims(i-1) - enddo -c - do i=1, total - idx = 1 - do j=1, ndim - idx = idx + bv(j) * bunit(ndim-j+1) - if(mod(i,bunit(j)).eq.0) bv(j) = bv(j) + 1 - if(bv(j).ge.dims(j)) bv(j) = 0 - enddo -c print *, 'i = ',i, 'idx = ',idx - a2(idx) = a1(i) - enddo -c - do i=1, total - a1(i) = a2(i) - enddo -c - end - -c do patch = patch + buf * alpha - integer function dot_patch_int( - $ total,arr1,lo1,hi1,ndim1,dims1,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer arr1(total),arr2(total) - integer next_index - integer ind1, ind2 - integer res -c - ind1 = 0 - ind2 = 0 - res = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - res = res + arr1(ind1)*arr2(ind2) - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - dot_patch_int = res -c - end - - -c fill array with random numbers - subroutine fill_array_dbl(a,n,val) - implicit none - integer n - double precision a(n),val - integer i - do i= 1, n - a(i) = val - enddo - end - -c initialize the array with random numbers - subroutine init_array_dbl(a,n) - implicit none - integer n - double precision a(n) - double precision drand - integer i - do i= 1, n - a(i) = drand(0) * i * 2 - enddo - end - -c if the elements do match, stop the program - subroutine compare_patches_dbl(eps, - $ total1,array1,lo1,hi1,ndim1,dims1, - $ total2,array2,lo2,hi2,ndim2,dims2) - implicit none - double precision eps - integer ndim1,ndim2,total1,total2 - double precision array1(total1),array2(total2) - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer next_index - integer index1,index2 - double precision diff,maxval -c -c initialize index1 and index2, searching from zeros - index1 = 0 - index2 = 0 -c compare corresponding elements in each array - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - do while((index1.ne.0).and.(index2.ne.0)) - diff = abs(array1(index1) - array2(index2)) - maxval = max(abs(array1(index1)), abs(array2(index2))) - if((maxval.eq.0).or.(maxval.lt.eps)) maxval = 1 - if(eps .lt. abs(diff)/maxval) then - print *, 'Error: Comparison failed!' - print *, array1(index1), array2(index2) - call ga_error('bye',0) - endif - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - enddo -c -c at this point both index1 and index2 should be 0 - if((index1.ne.0).or.(index2.ne.0)) then - print *, 'Error: # of elems dont match' - call ga_error('bye',0) - endif -c - end - -c do patch = patch + buf * alpha - subroutine scale_patch_dbl(total, - $ alpha,arr1,lo1,hi1,ndim1,dims1,beta,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double precision arr1(total),arr2(total) - double precision alpha, beta - integer next_index - integer ind1, ind2 -c - ind1 = 0 - ind2 = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - arr1(ind1) = arr1(ind1)*alpha + arr2(ind2)*beta - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - end - -c transpose an array - subroutine transpose_dbl(a1,a2,total,ndim,dims) - implicit none - integer ndim,total - integer dims(ndim) - double precision a1(total),a2(total) - integer i, j - integer idx - integer bv(8), bunit(8) -c - bv(1)=0 - bunit(1)=1 - do i=2, ndim - bv(i) = 0 - bunit(i) = bunit(i-1) * dims(i-1) - enddo -c - do i=1, total - idx = 1 - do j=1, ndim - idx = idx + bv(j) * bunit(ndim-j+1) - if(mod(i,bunit(j)).eq.0) bv(j) = bv(j) + 1 - if(bv(j).ge.dims(j)) bv(j) = 0 - enddo -c print *, 'i = ',i, 'idx = ',idx - a2(idx) = a1(i) - enddo -c - do i=1, total - a1(i) = a2(i) - enddo -c - end - -c do patch = patch + buf * alpha - double precision function dot_patch_dbl( - $ total,arr1,lo1,hi1,ndim1,dims1,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double precision arr1(total),arr2(total) - integer next_index - integer ind1, ind2 - double precision res -c - ind1 = 0 - ind2 = 0 - res = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - res = res + arr1(ind1)*arr2(ind2) - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - dot_patch_dbl = res -c - end - - -c fill array with random numbers - subroutine fill_array_dcpl(a,n,val) - implicit none - integer n - double complex a(n),val - integer i - do i= 1, n - a(i) = val - enddo - end - -c initialize the array with random numbers - subroutine init_array_dcpl(a,n) - implicit none - integer n - double complex a(n) - double precision drand - integer i - do i= 1, n - a(i) = dcmplx(drand(0) * i * 2, - $ -drand(0) * i * 2) - enddo - end - -c if the elements do match, stop the program - subroutine compare_patches_dcpl(eps, - $ total1,array1,lo1,hi1,ndim1,dims1, - $ total2,array2,lo2,hi2,ndim2,dims2) - implicit none - double precision eps - integer ndim1,ndim2,total1,total2 - double complex array1(total1),array2(total2) - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer next_index - integer index1,index2 - double precision diff,maxval -c -c initialize index1 and index2, searching from zeros - index1 = 0 - index2 = 0 -c compare corresponding elements in each array - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - do while((index1.ne.0).and.(index2.ne.0)) - diff = abs(array1(index1) - array2(index2)) - maxval = max(abs(array1(index1)), abs(array2(index2))) - if((maxval.eq.0).or.(maxval.lt.eps)) maxval = 1 - if(eps .lt. abs(diff)/maxval) then - print *, 'Error: Comparison failed!' - print *, array1(index1), array2(index2) - call ga_error('bye',0) - endif - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - enddo -c -c at this point both index1 and index2 should be 0 - if((index1.ne.0).or.(index2.ne.0)) then - print *, 'Error: # of elems dont match' - call ga_error('bye',0) - endif -c - end - -c do patch = patch + buf * alpha - subroutine scale_patch_dcpl(total, - $ alpha,arr1,lo1,hi1,ndim1,dims1,beta,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double complex arr1(total),arr2(total) - double complex alpha, beta - integer next_index - integer ind1, ind2 -c - ind1 = 0 - ind2 = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - arr1(ind1) = arr1(ind1)*alpha + arr2(ind2)*beta - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - end - -c transpose an array - subroutine transpose_dcpl(a1,a2,total,ndim,dims) - implicit none - integer ndim,total - integer dims(ndim) - double complex a1(total),a2(total) - integer i, j - integer idx - integer bv(8), bunit(8) -c - bv(1)=0 - bunit(1)=1 - do i=2, ndim - bv(i) = 0 - bunit(i) = bunit(i-1) * dims(i-1) - enddo -c - do i=1, total - idx = 1 - do j=1, ndim - idx = idx + bv(j) * bunit(ndim-j+1) - if(mod(i,bunit(j)).eq.0) bv(j) = bv(j) + 1 - if(bv(j).ge.dims(j)) bv(j) = 0 - enddo -c print *, 'i = ',i, 'idx = ',idx - a2(idx) = a1(i) - enddo -c - do i=1, total - a1(i) = a2(i) - enddo -c - end - -c do patch = patch + buf * alpha - double complex function dot_patch_dcpl( - $ total,arr1,lo1,hi1,ndim1,dims1,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double complex arr1(total),arr2(total) - integer next_index - integer ind1, ind2 - double complex res -c - ind1 = 0 - ind2 = 0 - res = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - res = res + arr1(ind1)*arr2(ind2) - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - dot_patch_dcpl = res -c - end - - diff --git a/global/testing/ngatest_src/generated/nga-util.F b/global/testing/ngatest_src/generated/nga-util.F deleted file mode 100644 index d4b59acaf..000000000 --- a/global/testing/ngatest_src/generated/nga-util.F +++ /dev/null @@ -1,2750 +0,0 @@ - - -#if HAVE_CONFIG_H -# include "config.fh" -#endif -#define MAXLOOP 100 - - program test - implicit none -#include "mafdecls.fh" -#include "global.fh" - integer stack, heap -c -c*** Intitialize a message passing library -c -#include "mp3.fh" -c -c*** Intitialize the GA package - call ga_initialize() -c if(ga_nodeid().eq.0)print *,ga_nnodes(),' nodes' -c - if(ga_uses_ma()) then - stack = 200000 -c stack = 100000 - else - stack = 60000 -c stack = 20000 - endif - heap = stack - if (.not. ma_init(MT_DBL, heap, stack)) - $ call ga_error("ma init failed",heap+stack) -c -c -c test GA_FILL - - call testit_GA_FILL_int1() - - call testit_GA_FILL_dbl1() - - call testit_GA_FILL_dcpl1() - - call testit_GA_FILL_int2() - - call testit_GA_FILL_dbl2() - - call testit_GA_FILL_dcpl2() - - call testit_GA_FILL_int3() - - call testit_GA_FILL_dbl3() - - call testit_GA_FILL_dcpl3() - - call testit_GA_FILL_int4() - - call testit_GA_FILL_dbl4() - - call testit_GA_FILL_dcpl4() - - call testit_GA_FILL_int5() - - call testit_GA_FILL_dbl5() - - call testit_GA_FILL_dcpl5() - - call testit_GA_FILL_int6() - - call testit_GA_FILL_dbl6() - - call testit_GA_FILL_dcpl6() - - call testit_GA_FILL_int7() - - call testit_GA_FILL_dbl7() - - call testit_GA_FILL_dcpl7() - -c -c test NGA_GET - -c -c test NGA_PUT - -c -c test NGA_ACC - -c -c test NGA_PERIODIC_GET - -c -c test NGA_PERIODIC_PUT - -c -c test NGA_PERIODIC_ACC - -c -c test NGA_FILL_PATCH - -c -c test NGA_COPY_PATCH - -c -c test NGA_SCALE_PATCH - -c -c test NGA_ADD_PATCH - -c -c test NGA_DOT_PATCH - -c -c test NGA_SCATTER - -c -c test NGA_SCATTER_ACC - -c -c test NGA_GATHER - - - -c - if(ga_nodeid().eq.0) print *, 'All tests successful' -c - call ga_terminate() - call MP_FINALIZE() - end - -c----------------- - - - - - subroutine testit_GA_FILL_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = int(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1)),ld) - - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - subroutine testit_GA_FILL_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = int(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),ld) - - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - subroutine testit_GA_FILL_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = int(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld) - - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - subroutine testit_GA_FILL_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = int(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - subroutine testit_GA_FILL_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = int(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - subroutine testit_GA_FILL_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = int(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - subroutine testit_GA_FILL_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = int(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - - - subroutine testit_GA_FILL_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dble(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1)),ld) - - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - subroutine testit_GA_FILL_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dble(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),ld) - - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - subroutine testit_GA_FILL_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dble(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld) - - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - subroutine testit_GA_FILL_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dble(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - subroutine testit_GA_FILL_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dble(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - subroutine testit_GA_FILL_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dble(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - subroutine testit_GA_FILL_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dble(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - - - subroutine testit_GA_FILL_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dcmplx(dble(456),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1)),ld) - - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - subroutine testit_GA_FILL_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dcmplx(dble(456),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),ld) - - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - subroutine testit_GA_FILL_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dcmplx(dble(456),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld) - - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - subroutine testit_GA_FILL_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dcmplx(dble(456),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - subroutine testit_GA_FILL_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dcmplx(dble(456),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - subroutine testit_GA_FILL_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dcmplx(dble(456),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - - - subroutine testit_GA_FILL_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dcmplx(dble(456),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - - - - - - - - - - -c----------------------- -c Utility functions - - subroutine random_range(lo,hi,lop,hip,ndim) - implicit none - integer lo(1),hi(1),lop(1),hip(1),ndim - integer i, range, swap, val,iran - double precision drand - iran(range) = int(drand(0)*dble(range)) + 1 -c iran(range) = range/2 - do i = 1, ndim - range = hi(i)-lo(i)+1 - val = iran(range) - lop(i) = lo(i) + val - val = iran(range) - hip(i) = hi(i) - val - if(hip(i) .lt. lop(i))then - swap =hip(i) - hip(i)=lop(i) - lop(i)=swap - endif - hip(i)=MIN(hip(i),hi(i)) - lop(i)=MAX(lop(i),lo(i)) - enddo - end -c - -c - subroutine random_range_outbound(lo,hi,lop,hip,ndim) - implicit none - integer lo(1),hi(1),lop(1),hip(1),ndim - integer i, range, val,iran - double precision drand - iran(range) = int(drand(0)*dble(range)) + 1 -c iran(range) = range/2 - do i = 1, ndim - range = 2*(hi(i)-lo(i)+1) - val = iran(range) - lop(i) = lo(i) + val - range = hi(i)-lo(i)+1 - val = iran(range) - hip(i) = lop(i) + range - val -c - hip(i) = hip(i)-hi(i) - lop(i) = lop(i)-hi(i) - enddo - end -c -c - integer function count_elems(lo,hi,ndim) - implicit none - integer lo(1),hi(1),ndim,elems,i - elems=1 - do i=1,ndim - elems = elems*(hi(i)-lo(i)+1) - enddo - count_elems = elems - end -c - -c get the next available nindex in the range of lo and hi - integer function next_index(ind,total,ndim,lo,hi,dims) - implicit none - integer ind,total,ndim,lo(ndim),hi(ndim),dims(ndim) - integer i - integer indx(8),nindex -c - nindex = ind + 1 - 200 call conv_1ton(ndim,dims,nindex,indx) -c -c test if indx(i) is in the range of lo(i) and hi(i) - do i=1,ndim - if((indx(i).lt.lo(i)).or.(indx(i).gt.hi(i))) then - nindex = nindex + 1 - if(nindex.gt.total) then - next_index = 0 - goto 300 - else - goto 200 - endif - endif - enddo -c - next_index = nindex - 300 end - -c testing if the indices are unique - integer function unique(ind,ndim,m,n) - implicit none - integer ndim,m,n - integer ind(ndim,m) - integer i,j,marker -c - unique = 1 - do i = 1, n-1 - marker = 0 - do j = 1, ndim - if(ind(j,n).eq.ind(j,i)) marker = marker + 1 - enddo -c - if(marker.eq.ndim) unique = 0 - enddo -c - end - -c - subroutine prnt_rng(me,lo,hi,ndim) - implicit none - integer me,ndim - integer lo(ndim),hi(ndim) - integer i -c - print *, me,': array section [',(lo(i),':',hi(i),i=1,ndim),']' -c - end - -c divide the space into equal size patches according to nproc -c and calculate my lo and hi - subroutine my_space(me,nproc,ndim,total,dims,lo,hi) - implicit none - integer me,nproc,ndim,total - integer dims(ndim),lo(ndim),hi(ndim) - integer div,lop,hip,i -c - div = total/nproc -c - lop = div * me + 1 -c - if(me.eq.(nproc-1)) then - hip = total - else - hip = div * (me+1) - endif -c - call conv_1ton(ndim,dims,lop,lo) - call conv_1ton(ndim,dims,hip,hi) -c -c swap the indices if the lo if larger thant hi - do i = 1,ndim - if(lo(i).gt.hi(i)) then - if(i.eq.ndim) call ga_error('bye',0) - lo(i) = 1 - lo(i+1) = lo(i+1) + 1 - endif - enddo - end - -c convert the index from one dimension to n dimension - subroutine conv_1ton(ndim,dims,ind1,indn) - implicit none - integer ndim - integer dims(ndim) - integer ind1,indn(ndim) - integer range(8),remainder,i -c - remainder = ind1 -c get the range of each dimension - do i=1,ndim - if(i.eq.1) then - range(i) = dims(i) - else - range(i) = range(i-1) * dims(i) - endif - enddo -c -c get the indices in each dimension - do i = ndim,1,-1 - if(i.ne.1) then - indn(i) = remainder/range(i-1) - remainder = remainder - indn(i)*range(i-1) - if(remainder.eq.0) then - remainder = range(i-1) - else - indn(i) = indn(i) + 1 - endif - else - indn(i) = remainder - endif - enddo -c - end - - -c fill array with random numbers - subroutine fill_array_int(a,n,val) - implicit none - integer n - integer a(n),val - integer i - do i= 1, n - a(i) = val - enddo - end - -c initialize the array with random numbers - subroutine init_array_int(a,n) - implicit none - integer n - integer a(n) - double precision drand - integer i - do i= 1, n - a(i) = int(drand(0) * i * 2) - enddo - end - -c if the elements do match, stop the program - subroutine compare_patches_int(eps, - $ total1,array1,lo1,hi1,ndim1,dims1, - $ total2,array2,lo2,hi2,ndim2,dims2) - implicit none - double precision eps - integer ndim1,ndim2,total1,total2 - integer array1(total1),array2(total2) - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer next_index - integer index1,index2 - double precision diff,maxval -c -c initialize index1 and index2, searching from zeros - index1 = 0 - index2 = 0 -c compare corresponding elements in each array - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - do while((index1.ne.0).and.(index2.ne.0)) - diff = abs(array1(index1) - array2(index2)) - maxval = max(abs(array1(index1)), abs(array2(index2))) - if((maxval.eq.0).or.(maxval.lt.eps)) maxval = 1 - if(eps .lt. abs(diff)/maxval) then - print *, 'Error: Comparison failed!' - print *, array1(index1), array2(index2) - call ga_error('bye',0) - endif - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - enddo -c -c at this point both index1 and index2 should be 0 - if((index1.ne.0).or.(index2.ne.0)) then - print *, 'Error: # of elems dont match' - call ga_error('bye',0) - endif -c - end - -c do patch = patch + buf * alpha - subroutine scale_patch_int(total, - $ alpha,arr1,lo1,hi1,ndim1,dims1,beta,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer arr1(total),arr2(total) - integer alpha, beta - integer next_index - integer ind1, ind2 -c - ind1 = 0 - ind2 = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - arr1(ind1) = arr1(ind1)*alpha + arr2(ind2)*beta - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - end - -c transpose an array - subroutine transpose_int(a1,a2,total,ndim,dims) - implicit none - integer ndim,total - integer dims(ndim) - integer a1(total),a2(total) - integer i, j - integer idx - integer bv(8), bunit(8) -c - bv(1)=0 - bunit(1)=1 - do i=2, ndim - bv(i) = 0 - bunit(i) = bunit(i-1) * dims(i-1) - enddo -c - do i=1, total - idx = 1 - do j=1, ndim - idx = idx + bv(j) * bunit(ndim-j+1) - if(mod(i,bunit(j)).eq.0) bv(j) = bv(j) + 1 - if(bv(j).ge.dims(j)) bv(j) = 0 - enddo -c print *, 'i = ',i, 'idx = ',idx - a2(idx) = a1(i) - enddo -c - do i=1, total - a1(i) = a2(i) - enddo -c - end - -c do patch = patch + buf * alpha - integer function dot_patch_int( - $ total,arr1,lo1,hi1,ndim1,dims1,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer arr1(total),arr2(total) - integer next_index - integer ind1, ind2 - integer res -c - ind1 = 0 - ind2 = 0 - res = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - res = res + arr1(ind1)*arr2(ind2) - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - dot_patch_int = res -c - end - - -c fill array with random numbers - subroutine fill_array_dbl(a,n,val) - implicit none - integer n - double precision a(n),val - integer i - do i= 1, n - a(i) = val - enddo - end - -c initialize the array with random numbers - subroutine init_array_dbl(a,n) - implicit none - integer n - double precision a(n) - double precision drand - integer i - do i= 1, n - a(i) = drand(0) * i * 2 - enddo - end - -c if the elements do match, stop the program - subroutine compare_patches_dbl(eps, - $ total1,array1,lo1,hi1,ndim1,dims1, - $ total2,array2,lo2,hi2,ndim2,dims2) - implicit none - double precision eps - integer ndim1,ndim2,total1,total2 - double precision array1(total1),array2(total2) - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer next_index - integer index1,index2 - double precision diff,maxval -c -c initialize index1 and index2, searching from zeros - index1 = 0 - index2 = 0 -c compare corresponding elements in each array - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - do while((index1.ne.0).and.(index2.ne.0)) - diff = abs(array1(index1) - array2(index2)) - maxval = max(abs(array1(index1)), abs(array2(index2))) - if((maxval.eq.0).or.(maxval.lt.eps)) maxval = 1 - if(eps .lt. abs(diff)/maxval) then - print *, 'Error: Comparison failed!' - print *, array1(index1), array2(index2) - call ga_error('bye',0) - endif - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - enddo -c -c at this point both index1 and index2 should be 0 - if((index1.ne.0).or.(index2.ne.0)) then - print *, 'Error: # of elems dont match' - call ga_error('bye',0) - endif -c - end - -c do patch = patch + buf * alpha - subroutine scale_patch_dbl(total, - $ alpha,arr1,lo1,hi1,ndim1,dims1,beta,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double precision arr1(total),arr2(total) - double precision alpha, beta - integer next_index - integer ind1, ind2 -c - ind1 = 0 - ind2 = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - arr1(ind1) = arr1(ind1)*alpha + arr2(ind2)*beta - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - end - -c transpose an array - subroutine transpose_dbl(a1,a2,total,ndim,dims) - implicit none - integer ndim,total - integer dims(ndim) - double precision a1(total),a2(total) - integer i, j - integer idx - integer bv(8), bunit(8) -c - bv(1)=0 - bunit(1)=1 - do i=2, ndim - bv(i) = 0 - bunit(i) = bunit(i-1) * dims(i-1) - enddo -c - do i=1, total - idx = 1 - do j=1, ndim - idx = idx + bv(j) * bunit(ndim-j+1) - if(mod(i,bunit(j)).eq.0) bv(j) = bv(j) + 1 - if(bv(j).ge.dims(j)) bv(j) = 0 - enddo -c print *, 'i = ',i, 'idx = ',idx - a2(idx) = a1(i) - enddo -c - do i=1, total - a1(i) = a2(i) - enddo -c - end - -c do patch = patch + buf * alpha - double precision function dot_patch_dbl( - $ total,arr1,lo1,hi1,ndim1,dims1,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double precision arr1(total),arr2(total) - integer next_index - integer ind1, ind2 - double precision res -c - ind1 = 0 - ind2 = 0 - res = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - res = res + arr1(ind1)*arr2(ind2) - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - dot_patch_dbl = res -c - end - - -c fill array with random numbers - subroutine fill_array_dcpl(a,n,val) - implicit none - integer n - double complex a(n),val - integer i - do i= 1, n - a(i) = val - enddo - end - -c initialize the array with random numbers - subroutine init_array_dcpl(a,n) - implicit none - integer n - double complex a(n) - double precision drand - integer i - do i= 1, n - a(i) = dcmplx(drand(0) * i * 2, - $ -drand(0) * i * 2) - enddo - end - -c if the elements do match, stop the program - subroutine compare_patches_dcpl(eps, - $ total1,array1,lo1,hi1,ndim1,dims1, - $ total2,array2,lo2,hi2,ndim2,dims2) - implicit none - double precision eps - integer ndim1,ndim2,total1,total2 - double complex array1(total1),array2(total2) - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer next_index - integer index1,index2 - double precision diff,maxval -c -c initialize index1 and index2, searching from zeros - index1 = 0 - index2 = 0 -c compare corresponding elements in each array - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - do while((index1.ne.0).and.(index2.ne.0)) - diff = abs(array1(index1) - array2(index2)) - maxval = max(abs(array1(index1)), abs(array2(index2))) - if((maxval.eq.0).or.(maxval.lt.eps)) maxval = 1 - if(eps .lt. abs(diff)/maxval) then - print *, 'Error: Comparison failed!' - print *, array1(index1), array2(index2) - call ga_error('bye',0) - endif - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - enddo -c -c at this point both index1 and index2 should be 0 - if((index1.ne.0).or.(index2.ne.0)) then - print *, 'Error: # of elems dont match' - call ga_error('bye',0) - endif -c - end - -c do patch = patch + buf * alpha - subroutine scale_patch_dcpl(total, - $ alpha,arr1,lo1,hi1,ndim1,dims1,beta,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double complex arr1(total),arr2(total) - double complex alpha, beta - integer next_index - integer ind1, ind2 -c - ind1 = 0 - ind2 = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - arr1(ind1) = arr1(ind1)*alpha + arr2(ind2)*beta - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - end - -c transpose an array - subroutine transpose_dcpl(a1,a2,total,ndim,dims) - implicit none - integer ndim,total - integer dims(ndim) - double complex a1(total),a2(total) - integer i, j - integer idx - integer bv(8), bunit(8) -c - bv(1)=0 - bunit(1)=1 - do i=2, ndim - bv(i) = 0 - bunit(i) = bunit(i-1) * dims(i-1) - enddo -c - do i=1, total - idx = 1 - do j=1, ndim - idx = idx + bv(j) * bunit(ndim-j+1) - if(mod(i,bunit(j)).eq.0) bv(j) = bv(j) + 1 - if(bv(j).ge.dims(j)) bv(j) = 0 - enddo -c print *, 'i = ',i, 'idx = ',idx - a2(idx) = a1(i) - enddo -c - do i=1, total - a1(i) = a2(i) - enddo -c - end - -c do patch = patch + buf * alpha - double complex function dot_patch_dcpl( - $ total,arr1,lo1,hi1,ndim1,dims1,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double complex arr1(total),arr2(total) - integer next_index - integer ind1, ind2 - double complex res -c - ind1 = 0 - ind2 = 0 - res = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - res = res + arr1(ind1)*arr2(ind2) - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - dot_patch_dcpl = res -c - end - - diff --git a/global/testing/ngatest_src/generated/ngatest.F b/global/testing/ngatest_src/generated/ngatest.F deleted file mode 100644 index f36b842b2..000000000 --- a/global/testing/ngatest_src/generated/ngatest.F +++ /dev/null @@ -1,28358 +0,0 @@ - - -#if HAVE_CONFIG_H -# include "config.fh" -#endif -#define MAXLOOP 100 - - program test - implicit none -#include "mafdecls.fh" -#include "global.fh" - integer stack, heap -c -c*** Intitialize a message passing library -c -#include "mp3.fh" -c -c*** Intitialize the GA package - call ga_initialize() -c if(ga_nodeid().eq.0)print *,ga_nnodes(),' nodes' -c - if(ga_uses_ma()) then - stack = 200000 -c stack = 100000 - else - stack = 60000 -c stack = 20000 - endif - heap = stack - if (.not. ma_init(MT_DBL, heap, stack)) - $ call ga_error("ma init failed",heap+stack) -c -c -c test GA_FILL - - call testit_GA_FILL_int1() - - call testit_GA_FILL_dbl1() - - call testit_GA_FILL_dcpl1() - - call testit_GA_FILL_int2() - - call testit_GA_FILL_dbl2() - - call testit_GA_FILL_dcpl2() - - call testit_GA_FILL_int3() - - call testit_GA_FILL_dbl3() - - call testit_GA_FILL_dcpl3() - - call testit_GA_FILL_int4() - - call testit_GA_FILL_dbl4() - - call testit_GA_FILL_dcpl4() - - call testit_GA_FILL_int5() - - call testit_GA_FILL_dbl5() - - call testit_GA_FILL_dcpl5() - - call testit_GA_FILL_int6() - - call testit_GA_FILL_dbl6() - - call testit_GA_FILL_dcpl6() - - call testit_GA_FILL_int7() - - call testit_GA_FILL_dbl7() - - call testit_GA_FILL_dcpl7() - -c -c test NGA_GET - - call testit_NGA_GET_int1() - - call testit_NGA_GET_dbl1() - - call testit_NGA_GET_dcpl1() - - call testit_NGA_GET_int2() - - call testit_NGA_GET_dbl2() - - call testit_NGA_GET_dcpl2() - - call testit_NGA_GET_int3() - - call testit_NGA_GET_dbl3() - - call testit_NGA_GET_dcpl3() - - call testit_NGA_GET_int4() - - call testit_NGA_GET_dbl4() - - call testit_NGA_GET_dcpl4() - - call testit_NGA_GET_int5() - - call testit_NGA_GET_dbl5() - - call testit_NGA_GET_dcpl5() - - call testit_NGA_GET_int6() - - call testit_NGA_GET_dbl6() - - call testit_NGA_GET_dcpl6() - - call testit_NGA_GET_int7() - - call testit_NGA_GET_dbl7() - - call testit_NGA_GET_dcpl7() - -c -c test NGA_PUT - - call testit_NGA_PUT_int1() - - call testit_NGA_PUT_dbl1() - - call testit_NGA_PUT_dcpl1() - - call testit_NGA_PUT_int2() - - call testit_NGA_PUT_dbl2() - - call testit_NGA_PUT_dcpl2() - - call testit_NGA_PUT_int3() - - call testit_NGA_PUT_dbl3() - - call testit_NGA_PUT_dcpl3() - - call testit_NGA_PUT_int4() - - call testit_NGA_PUT_dbl4() - - call testit_NGA_PUT_dcpl4() - - call testit_NGA_PUT_int5() - - call testit_NGA_PUT_dbl5() - - call testit_NGA_PUT_dcpl5() - - call testit_NGA_PUT_int6() - - call testit_NGA_PUT_dbl6() - - call testit_NGA_PUT_dcpl6() - - call testit_NGA_PUT_int7() - - call testit_NGA_PUT_dbl7() - - call testit_NGA_PUT_dcpl7() - -c -c test NGA_ACC - - call testit_NGA_ACC_int1() - - call testit_NGA_ACC_dbl1() - - call testit_NGA_ACC_dcpl1() - - call testit_NGA_ACC_int2() - - call testit_NGA_ACC_dbl2() - - call testit_NGA_ACC_dcpl2() - - call testit_NGA_ACC_int3() - - call testit_NGA_ACC_dbl3() - - call testit_NGA_ACC_dcpl3() - - call testit_NGA_ACC_int4() - - call testit_NGA_ACC_dbl4() - - call testit_NGA_ACC_dcpl4() - - call testit_NGA_ACC_int5() - - call testit_NGA_ACC_dbl5() - - call testit_NGA_ACC_dcpl5() - - call testit_NGA_ACC_int6() - - call testit_NGA_ACC_dbl6() - - call testit_NGA_ACC_dcpl6() - - call testit_NGA_ACC_int7() - - call testit_NGA_ACC_dbl7() - - call testit_NGA_ACC_dcpl7() - -c -c test NGA_PERIODIC_GET - -c -c test NGA_PERIODIC_PUT - -c -c test NGA_PERIODIC_ACC - -c -c test NGA_FILL_PATCH - - call testit_NGA_FILL_PATCH_int1() - - call testit_NGA_FILL_PATCH_dbl1() - - call testit_NGA_FILL_PATCH_dcpl1() - - call testit_NGA_FILL_PATCH_int2() - - call testit_NGA_FILL_PATCH_dbl2() - - call testit_NGA_FILL_PATCH_dcpl2() - - call testit_NGA_FILL_PATCH_int3() - - call testit_NGA_FILL_PATCH_dbl3() - - call testit_NGA_FILL_PATCH_dcpl3() - - call testit_NGA_FILL_PATCH_int4() - - call testit_NGA_FILL_PATCH_dbl4() - - call testit_NGA_FILL_PATCH_dcpl4() - - call testit_NGA_FILL_PATCH_int5() - - call testit_NGA_FILL_PATCH_dbl5() - - call testit_NGA_FILL_PATCH_dcpl5() - - call testit_NGA_FILL_PATCH_int6() - - call testit_NGA_FILL_PATCH_dbl6() - - call testit_NGA_FILL_PATCH_dcpl6() - - call testit_NGA_FILL_PATCH_int7() - - call testit_NGA_FILL_PATCH_dbl7() - - call testit_NGA_FILL_PATCH_dcpl7() - -c -c test NGA_COPY_PATCH - - call testit_NGA_COPY_PATCH_int1() - - call testit_NGA_COPY_PATCH_dbl1() - - call testit_NGA_COPY_PATCH_dcpl1() - - call testit_NGA_COPY_PATCH_int2() - - call testit_NGA_COPY_PATCH_dbl2() - - call testit_NGA_COPY_PATCH_dcpl2() - - call testit_NGA_COPY_PATCH_int3() - - call testit_NGA_COPY_PATCH_dbl3() - - call testit_NGA_COPY_PATCH_dcpl3() - - call testit_NGA_COPY_PATCH_int4() - - call testit_NGA_COPY_PATCH_dbl4() - - call testit_NGA_COPY_PATCH_dcpl4() - - call testit_NGA_COPY_PATCH_int5() - - call testit_NGA_COPY_PATCH_dbl5() - - call testit_NGA_COPY_PATCH_dcpl5() - - call testit_NGA_COPY_PATCH_int6() - - call testit_NGA_COPY_PATCH_dbl6() - - call testit_NGA_COPY_PATCH_dcpl6() - - call testit_NGA_COPY_PATCH_int7() - - call testit_NGA_COPY_PATCH_dbl7() - - call testit_NGA_COPY_PATCH_dcpl7() - -c -c test NGA_SCALE_PATCH - - call testit_NGA_SCALE_PATCH_int1() - - call testit_NGA_SCALE_PATCH_dbl1() - - call testit_NGA_SCALE_PATCH_dcpl1() - - call testit_NGA_SCALE_PATCH_int2() - - call testit_NGA_SCALE_PATCH_dbl2() - - call testit_NGA_SCALE_PATCH_dcpl2() - - call testit_NGA_SCALE_PATCH_int3() - - call testit_NGA_SCALE_PATCH_dbl3() - - call testit_NGA_SCALE_PATCH_dcpl3() - - call testit_NGA_SCALE_PATCH_int4() - - call testit_NGA_SCALE_PATCH_dbl4() - - call testit_NGA_SCALE_PATCH_dcpl4() - - call testit_NGA_SCALE_PATCH_int5() - - call testit_NGA_SCALE_PATCH_dbl5() - - call testit_NGA_SCALE_PATCH_dcpl5() - - call testit_NGA_SCALE_PATCH_int6() - - call testit_NGA_SCALE_PATCH_dbl6() - - call testit_NGA_SCALE_PATCH_dcpl6() - - call testit_NGA_SCALE_PATCH_int7() - - call testit_NGA_SCALE_PATCH_dbl7() - - call testit_NGA_SCALE_PATCH_dcpl7() - -c -c test NGA_ADD_PATCH - -c -c test NGA_DOT_PATCH - - call testit_NGA_DOT_PATCH_int1() - - call testit_NGA_DOT_PATCH_dbl1() - - call testit_NGA_DOT_PATCH_dcpl1() - - call testit_NGA_DOT_PATCH_int2() - - call testit_NGA_DOT_PATCH_dbl2() - - call testit_NGA_DOT_PATCH_dcpl2() - - call testit_NGA_DOT_PATCH_int3() - - call testit_NGA_DOT_PATCH_dbl3() - - call testit_NGA_DOT_PATCH_dcpl3() - - call testit_NGA_DOT_PATCH_int4() - - call testit_NGA_DOT_PATCH_dbl4() - - call testit_NGA_DOT_PATCH_dcpl4() - - call testit_NGA_DOT_PATCH_int5() - - call testit_NGA_DOT_PATCH_dbl5() - - call testit_NGA_DOT_PATCH_dcpl5() - - call testit_NGA_DOT_PATCH_int6() - - call testit_NGA_DOT_PATCH_dbl6() - - call testit_NGA_DOT_PATCH_dcpl6() - - call testit_NGA_DOT_PATCH_int7() - - call testit_NGA_DOT_PATCH_dbl7() - - call testit_NGA_DOT_PATCH_dcpl7() - -c -c test NGA_SCATTER - - call testit_NGA_SCATTER_int1() - - call testit_NGA_SCATTER_dbl1() - - call testit_NGA_SCATTER_dcpl1() - - call testit_NGA_SCATTER_int2() - - call testit_NGA_SCATTER_dbl2() - - call testit_NGA_SCATTER_dcpl2() - - call testit_NGA_SCATTER_int3() - - call testit_NGA_SCATTER_dbl3() - - call testit_NGA_SCATTER_dcpl3() - - call testit_NGA_SCATTER_int4() - - call testit_NGA_SCATTER_dbl4() - - call testit_NGA_SCATTER_dcpl4() - - call testit_NGA_SCATTER_int5() - - call testit_NGA_SCATTER_dbl5() - - call testit_NGA_SCATTER_dcpl5() - - call testit_NGA_SCATTER_int6() - - call testit_NGA_SCATTER_dbl6() - - call testit_NGA_SCATTER_dcpl6() - - call testit_NGA_SCATTER_int7() - - call testit_NGA_SCATTER_dbl7() - - call testit_NGA_SCATTER_dcpl7() - -c -c test NGA_SCATTER_ACC - -c -c test NGA_GATHER - - call testit_NGA_GATHER_int1() - - call testit_NGA_GATHER_dbl1() - - call testit_NGA_GATHER_dcpl1() - - call testit_NGA_GATHER_int2() - - call testit_NGA_GATHER_dbl2() - - call testit_NGA_GATHER_dcpl2() - - call testit_NGA_GATHER_int3() - - call testit_NGA_GATHER_dbl3() - - call testit_NGA_GATHER_dcpl3() - - call testit_NGA_GATHER_int4() - - call testit_NGA_GATHER_dbl4() - - call testit_NGA_GATHER_dcpl4() - - call testit_NGA_GATHER_int5() - - call testit_NGA_GATHER_dbl5() - - call testit_NGA_GATHER_dcpl5() - - call testit_NGA_GATHER_int6() - - call testit_NGA_GATHER_dbl6() - - call testit_NGA_GATHER_dcpl6() - - call testit_NGA_GATHER_int7() - - call testit_NGA_GATHER_dbl7() - - call testit_NGA_GATHER_dcpl7() - - - -c - if(ga_nodeid().eq.0) print *, 'All tests successful' -c - call ga_terminate() - call MP_FINALIZE() - end - -c----------------- - - - - - subroutine testit_GA_FILL_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = int(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1)),ld) - - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PUT_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_int(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1)),ld) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1)),ld) - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer c(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - integer val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = int(drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_int(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_int(total, - $ int(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1)),ld) -c - call compare_patches_int(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) -c -c compare the results - if(a(lo(1)) .ne. - $ b(lo(1))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_GATHER_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_FILL_PATCH_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_int(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1)),dims) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_int(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = int(drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1)),dims) -c -c check the result - call scale_patch_int(total, - $ val,a,lo,hi,ndim,dims, - $ int(0),b,lo,hi,ndim,dims) - - call compare_patches_int(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_int1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - integer a(n) - integer b(n) - integer c(n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - integer alpha, beta - integer dot_patch_int -c for different array dimensions - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_idot_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1)),dims) - call init_array_int(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - - -c--- - status = ga_destroy(g_a) - end - - - subroutine testit_GA_FILL_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = int(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),ld) - - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PUT_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_int(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),ld) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),ld) - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer c(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - integer val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = int(drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_int(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_int(total, - $ int(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2)),ld) -c - call compare_patches_int(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) -c -c compare the results - if(a(lo(1),lo(2)) .ne. - $ b(lo(1),lo(2))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_GATHER_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_FILL_PATCH_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_int(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),dims) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - integer d(n) - integer dndim - parameter (dndim = 2-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_int(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = int(drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),dims) -c -c check the result - call scale_patch_int(total, - $ val,a,lo,hi,ndim,dims, - $ int(0),b,lo,hi,ndim,dims) - - call compare_patches_int(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_int2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - integer a(n,n) - integer b(n,n) - integer c(n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - integer alpha, beta - integer dot_patch_int -c for different array dimensions - - integer d(n) - integer dndim - parameter (dndim = 2-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_idot_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) - call init_array_int(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - subroutine testit_GA_FILL_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = int(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld) - - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PUT_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_int(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld) - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer c(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - integer val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = int(drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_int(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_int(total, - $ int(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3)),ld) -c - call compare_patches_int(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3)) .ne. - $ b(lo(1),lo(2),lo(3))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_GATHER_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_FILL_PATCH_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_int(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),dims) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - integer d(n,n) - integer dndim - parameter (dndim = 3-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_int(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = int(drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),dims) -c -c check the result - call scale_patch_int(total, - $ val,a,lo,hi,ndim,dims, - $ int(0),b,lo,hi,ndim,dims) - - call compare_patches_int(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_int3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - integer a(n,n,n) - integer b(n,n,n) - integer c(n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - integer alpha, beta - integer dot_patch_int -c for different array dimensions - - integer d(n,n) - integer dndim - parameter (dndim = 3-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_idot_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) - call init_array_int(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - subroutine testit_GA_FILL_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = int(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PUT_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_int(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer c(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - integer val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = int(drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_int(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_int(total, - $ int(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4)),ld) -c - call compare_patches_int(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_GATHER_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_FILL_PATCH_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_int(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),dims) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - integer d(n,n,n) - integer dndim - parameter (dndim = 4-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_int(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = int(drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),dims) -c -c check the result - call scale_patch_int(total, - $ val,a,lo,hi,ndim,dims, - $ int(0),b,lo,hi,ndim,dims) - - call compare_patches_int(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_int4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - integer a(n,n,n,n) - integer b(n,n,n,n) - integer c(n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - integer alpha, beta - integer dot_patch_int -c for different array dimensions - - integer d(n,n,n) - integer dndim - parameter (dndim = 4-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_idot_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) - call init_array_int(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - subroutine testit_GA_FILL_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = int(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PUT_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_int(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer c(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - integer val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = int(drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_int(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_int(total, - $ int(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c - call compare_patches_int(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_GATHER_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_FILL_PATCH_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_int(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - integer d(n,n,n,n) - integer dndim - parameter (dndim = 5-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_int(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = int(drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c -c check the result - call scale_patch_int(total, - $ val,a,lo,hi,ndim,dims, - $ int(0),b,lo,hi,ndim,dims) - - call compare_patches_int(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_int5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - integer a(n,n,n,n,n) - integer b(n,n,n,n,n) - integer c(n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - integer alpha, beta - integer dot_patch_int -c for different array dimensions - - integer d(n,n,n,n) - integer dndim - parameter (dndim = 5-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_idot_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) - call init_array_int(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - subroutine testit_GA_FILL_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = int(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PUT_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_int(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer c(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - integer val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = int(drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_int(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_int(total, - $ int(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c - call compare_patches_int(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_GATHER_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_FILL_PATCH_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_int(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - integer d(n,n,n,n,n) - integer dndim - parameter (dndim = 6-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_int(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = int(drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c -c check the result - call scale_patch_int(total, - $ val,a,lo,hi,ndim,dims, - $ int(0),b,lo,hi,ndim,dims) - - call compare_patches_int(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_int6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - integer a(n,n,n,n,n,n) - integer b(n,n,n,n,n,n) - integer c(n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - integer alpha, beta - integer dot_patch_int -c for different array dimensions - - integer d(n,n,n,n,n) - integer dndim - parameter (dndim = 6-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_idot_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) - call init_array_int(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - subroutine testit_GA_FILL_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = int(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PUT_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_int(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_int(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer c(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - integer val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = int(drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_int(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_int(total, - $ int(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c - call compare_patches_int(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = int(drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_GATHER_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - integer v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_FILL_PATCH_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = int(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_int(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c - call compare_patches_int(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - integer d(n,n,n,n,n,n) - integer dndim - parameter (dndim = 7-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6),alo(7)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6),blo(7)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6),blo(7)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6),blo(7)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_int(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5),dlo(6)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5),dlo(6)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6),alo(7)),dims) -c - call compare_patches_int(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - integer val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = int(drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c -c check the result - call scale_patch_int(total, - $ val,a,lo,hi,ndim,dims, - $ int(0),b,lo,hi,ndim,dims) - - call compare_patches_int(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_int7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - integer a(n,n,n,n,n,n,n) - integer b(n,n,n,n,n,n,n) - integer c(n,n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - integer alpha, beta - integer dot_patch_int -c for different array dimensions - - integer d(n,n,n,n,n,n) - integer dndim - parameter (dndim = 7-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_INT, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_INT, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_idot_patch' - print *, ' - Data Type: integer' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) - call init_array_int(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_int(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_INT, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_int(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) - call init_array_int(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5),dlo(6)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_idot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_int(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - subroutine testit_GA_FILL_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dble(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1)),ld) - - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PUT_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dbl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1)),ld) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1)),ld) - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - double precision c(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double precision val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = drand(0) * me*2+1 * 2 - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dbl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dbl(total, - $ dble(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1)),ld) -c - call compare_patches_dbl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) -c -c compare the results - if(a(lo(1)) .ne. - $ b(lo(1))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_GATHER_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_FILL_PATCH_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dbl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dbl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = drand(0) * 1 * 2 -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1)),dims) -c -c check the result - call scale_patch_dbl(total, - $ val,a,lo,hi,ndim,dims, - $ dble(0),b,lo,hi,ndim,dims) - - call compare_patches_dbl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dbl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double precision a(n) - double precision b(n) - double precision c(n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double precision alpha, beta - double precision dot_patch_dbl -c for different array dimensions - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_ddot_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1)),dims) - call init_array_dbl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - - -c--- - status = ga_destroy(g_a) - end - - - subroutine testit_GA_FILL_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dble(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),ld) - - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PUT_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dbl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),ld) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),ld) - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - double precision c(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double precision val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = drand(0) * me*2+1 * 2 - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dbl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dbl(total, - $ dble(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2)),ld) -c - call compare_patches_dbl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) -c -c compare the results - if(a(lo(1),lo(2)) .ne. - $ b(lo(1),lo(2))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_GATHER_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_FILL_PATCH_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dbl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double precision d(n) - integer dndim - parameter (dndim = 2-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dbl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = drand(0) * 1 * 2 -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),dims) -c -c check the result - call scale_patch_dbl(total, - $ val,a,lo,hi,ndim,dims, - $ dble(0),b,lo,hi,ndim,dims) - - call compare_patches_dbl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dbl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double precision a(n,n) - double precision b(n,n) - double precision c(n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double precision alpha, beta - double precision dot_patch_dbl -c for different array dimensions - - double precision d(n) - integer dndim - parameter (dndim = 2-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_ddot_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) - call init_array_dbl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - subroutine testit_GA_FILL_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dble(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld) - - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PUT_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dbl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld) - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - double precision c(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double precision val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = drand(0) * me*2+1 * 2 - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dbl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dbl(total, - $ dble(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3)),ld) -c - call compare_patches_dbl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3)) .ne. - $ b(lo(1),lo(2),lo(3))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_GATHER_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_FILL_PATCH_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dbl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double precision d(n,n) - integer dndim - parameter (dndim = 3-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dbl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = drand(0) * 1 * 2 -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),dims) -c -c check the result - call scale_patch_dbl(total, - $ val,a,lo,hi,ndim,dims, - $ dble(0),b,lo,hi,ndim,dims) - - call compare_patches_dbl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dbl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double precision a(n,n,n) - double precision b(n,n,n) - double precision c(n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double precision alpha, beta - double precision dot_patch_dbl -c for different array dimensions - - double precision d(n,n) - integer dndim - parameter (dndim = 3-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_ddot_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) - call init_array_dbl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - subroutine testit_GA_FILL_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dble(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PUT_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dbl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - double precision c(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double precision val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = drand(0) * me*2+1 * 2 - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dbl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dbl(total, - $ dble(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4)),ld) -c - call compare_patches_dbl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i),d(4,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_GATHER_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_FILL_PATCH_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dbl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double precision d(n,n,n) - integer dndim - parameter (dndim = 4-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dbl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = drand(0) * 1 * 2 -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),dims) -c -c check the result - call scale_patch_dbl(total, - $ val,a,lo,hi,ndim,dims, - $ dble(0),b,lo,hi,ndim,dims) - - call compare_patches_dbl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dbl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double precision a(n,n,n,n) - double precision b(n,n,n,n) - double precision c(n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double precision alpha, beta - double precision dot_patch_dbl -c for different array dimensions - - double precision d(n,n,n) - integer dndim - parameter (dndim = 4-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_ddot_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) - call init_array_dbl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - subroutine testit_GA_FILL_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dble(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PUT_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dbl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - double precision c(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double precision val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = drand(0) * me*2+1 * 2 - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dbl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dbl(total, - $ dble(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c - call compare_patches_dbl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_GATHER_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_FILL_PATCH_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dbl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double precision d(n,n,n,n) - integer dndim - parameter (dndim = 5-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dbl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = drand(0) * 1 * 2 -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c -c check the result - call scale_patch_dbl(total, - $ val,a,lo,hi,ndim,dims, - $ dble(0),b,lo,hi,ndim,dims) - - call compare_patches_dbl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dbl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double precision a(n,n,n,n,n) - double precision b(n,n,n,n,n) - double precision c(n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double precision alpha, beta - double precision dot_patch_dbl -c for different array dimensions - - double precision d(n,n,n,n) - integer dndim - parameter (dndim = 5-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_ddot_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) - call init_array_dbl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - subroutine testit_GA_FILL_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dble(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PUT_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dbl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - double precision c(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double precision val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = drand(0) * me*2+1 * 2 - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dbl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dbl(total, - $ dble(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c - call compare_patches_dbl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_GATHER_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_FILL_PATCH_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dbl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double precision d(n,n,n,n,n) - integer dndim - parameter (dndim = 6-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dbl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = drand(0) * 1 * 2 -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c -c check the result - call scale_patch_dbl(total, - $ val,a,lo,hi,ndim,dims, - $ dble(0),b,lo,hi,ndim,dims) - - call compare_patches_dbl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dbl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double precision a(n,n,n,n,n,n) - double precision b(n,n,n,n,n,n) - double precision c(n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double precision alpha, beta - double precision dot_patch_dbl -c for different array dimensions - - double precision d(n,n,n,n,n) - integer dndim - parameter (dndim = 6-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_ddot_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) - call init_array_dbl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - subroutine testit_GA_FILL_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dble(456) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PUT_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dbl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(234) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dbl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - double precision c(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double precision val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = drand(0) * me*2+1 * 2 - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dbl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dbl(total, - $ dble(1),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c - call compare_patches_dbl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = drand(0) * * 2 - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_GATHER_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double precision v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_FILL_PATCH_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dble(1234) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dbl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double precision d(n,n,n,n,n,n) - integer dndim - parameter (dndim = 7-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6),alo(7)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6),blo(7)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6),blo(7)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6),blo(7)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dbl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5),dlo(6)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5),dlo(6)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6),alo(7)),dims) -c - call compare_patches_dbl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double precision val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = drand(0) * 1 * 2 -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c -c check the result - call scale_patch_dbl(total, - $ val,a,lo,hi,ndim,dims, - $ dble(0),b,lo,hi,ndim,dims) - - call compare_patches_dbl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dbl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double precision a(n,n,n,n,n,n,n) - double precision b(n,n,n,n,n,n,n) - double precision c(n,n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double precision alpha, beta - double precision dot_patch_dbl -c for different array dimensions - - double precision d(n,n,n,n,n,n) - integer dndim - parameter (dndim = 7-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DBL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DBL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_ddot_patch' - print *, ' - Data Type: double precision' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) - call init_array_dbl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dbl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DBL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dbl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) - call init_array_dbl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5),dlo(6)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_ddot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dbl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - subroutine testit_GA_FILL_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dcmplx(dble(456),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1)),ld) - - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PUT_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dcpl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1)),ld) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(234),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1)),ld) - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - double complex c(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double complex val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dcpl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dcpl(total, - $ dcmplx(dble(1),dble(0)),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1)),ld) -c - call compare_patches_dcpl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) -c -c compare the results - if(a(lo(1)) .ne. - $ b(lo(1))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_GATHER_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_FILL_PATCH_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(1234),dble(0)) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dcpl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dcpl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = dcmplx(drand(0) * 1 * 2, - $ -drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1)),dims) -c -c check the result - call scale_patch_dcpl(total, - $ val,a,lo,hi,ndim,dims, - $ dcmplx(dble(0),dble(0)),b,lo,hi,ndim,dims) - - call compare_patches_dcpl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dcpl1() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 20000) - parameter (m = (20000**1)/100) - parameter (ndim = 1) - double complex a(n) - double complex b(n) - double complex c(n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double complex alpha, beta - double complex dot_patch_dcpl -c for different array dimensions - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_zdot_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 1' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1)),dims) - call init_array_dcpl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - - -c--- - status = ga_destroy(g_a) - end - - - subroutine testit_GA_FILL_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dcmplx(dble(456),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),ld) - - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PUT_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dcpl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),ld) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(234),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),ld) - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - double complex c(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double complex val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dcpl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dcpl(total, - $ dcmplx(dble(1),dble(0)),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2)),ld) -c - call compare_patches_dcpl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) -c -c compare the results - if(a(lo(1),lo(2)) .ne. - $ b(lo(1),lo(2))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_GATHER_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_FILL_PATCH_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(1234),dble(0)) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dcpl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double complex d(n) - integer dndim - parameter (dndim = 2-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dcpl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = dcmplx(drand(0) * 1 * 2, - $ -drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2)),dims) -c -c check the result - call scale_patch_dcpl(total, - $ val,a,lo,hi,ndim,dims, - $ dcmplx(dble(0),dble(0)),b,lo,hi,ndim,dims) - - call compare_patches_dcpl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dcpl2() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 140) - parameter (m = (140**2)/100) - parameter (ndim = 2) - double complex a(n,n) - double complex b(n,n) - double complex c(n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double complex alpha, beta - double complex dot_patch_dcpl -c for different array dimensions - - double complex d(n) - integer dndim - parameter (dndim = 2-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_zdot_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 2' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) - call init_array_dcpl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2)),dims) - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - subroutine testit_GA_FILL_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dcmplx(dble(456),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld) - - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PUT_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dcpl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(234),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld) - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - double complex c(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double complex val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dcpl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dcpl(total, - $ dcmplx(dble(1),dble(0)),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3)),ld) -c - call compare_patches_dcpl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3)) .ne. - $ b(lo(1),lo(2),lo(3))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_GATHER_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_FILL_PATCH_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(1234),dble(0)) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dcpl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double complex d(n,n) - integer dndim - parameter (dndim = 3-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dcpl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = dcmplx(drand(0) * 1 * 2, - $ -drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3)),dims) -c -c check the result - call scale_patch_dcpl(total, - $ val,a,lo,hi,ndim,dims, - $ dcmplx(dble(0),dble(0)),b,lo,hi,ndim,dims) - - call compare_patches_dcpl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dcpl3() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 27) - parameter (m = (27**3)/100) - parameter (ndim = 3) - double complex a(n,n,n) - double complex b(n,n,n) - double complex c(n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double complex alpha, beta - double complex dot_patch_dcpl -c for different array dimensions - - double complex d(n,n) - integer dndim - parameter (dndim = 3-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_zdot_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 3' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) - call init_array_dcpl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3)),dims) - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - subroutine testit_GA_FILL_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dcmplx(dble(456),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PUT_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dcpl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(234),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - double complex c(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double complex val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dcpl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dcpl(total, - $ dcmplx(dble(1),dble(0)),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4)),ld) -c - call compare_patches_dcpl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_GATHER_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_FILL_PATCH_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(1234),dble(0)) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dcpl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double complex d(n,n,n) - integer dndim - parameter (dndim = 4-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dcpl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = dcmplx(drand(0) * 1 * 2, - $ -drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4)),dims) -c -c check the result - call scale_patch_dcpl(total, - $ val,a,lo,hi,ndim,dims, - $ dcmplx(dble(0),dble(0)),b,lo,hi,ndim,dims) - - call compare_patches_dcpl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dcpl4() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 11) - parameter (m = (11**4)/100) - parameter (ndim = 4) - double complex a(n,n,n,n) - double complex b(n,n,n,n) - double complex c(n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double complex alpha, beta - double complex dot_patch_dcpl -c for different array dimensions - - double complex d(n,n,n) - integer dndim - parameter (dndim = 4-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_zdot_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 4' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) - call init_array_dcpl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4)),dims) - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - subroutine testit_GA_FILL_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dcmplx(dble(456),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PUT_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dcpl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(234),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - double complex c(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double complex val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dcpl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dcpl(total, - $ dcmplx(dble(1),dble(0)),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c - call compare_patches_dcpl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_GATHER_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_FILL_PATCH_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(1234),dble(0)) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dcpl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double complex d(n,n,n,n) - integer dndim - parameter (dndim = 5-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dcpl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = dcmplx(drand(0) * 1 * 2, - $ -drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5)),dims) -c -c check the result - call scale_patch_dcpl(total, - $ val,a,lo,hi,ndim,dims, - $ dcmplx(dble(0),dble(0)),b,lo,hi,ndim,dims) - - call compare_patches_dcpl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dcpl5() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 7) - parameter (m = (7**5)/100) - parameter (ndim = 5) - double complex a(n,n,n,n,n) - double complex b(n,n,n,n,n) - double complex c(n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double complex alpha, beta - double complex dot_patch_dcpl -c for different array dimensions - - double complex d(n,n,n,n) - integer dndim - parameter (dndim = 5-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_zdot_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 5' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) - call init_array_dcpl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5)),dims) - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - subroutine testit_GA_FILL_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dcmplx(dble(456),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PUT_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dcpl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(234),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - double complex c(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double complex val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dcpl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dcpl(total, - $ dcmplx(dble(1),dble(0)),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c - call compare_patches_dcpl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_GATHER_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_FILL_PATCH_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(1234),dble(0)) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dcpl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double complex d(n,n,n,n,n) - integer dndim - parameter (dndim = 6-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dcpl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = dcmplx(drand(0) * 1 * 2, - $ -drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6)),dims) -c -c check the result - call scale_patch_dcpl(total, - $ val,a,lo,hi,ndim,dims, - $ dcmplx(dble(0),dble(0)),b,lo,hi,ndim,dims) - - call compare_patches_dcpl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dcpl6() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 5) - parameter (m = (5**6)/100) - parameter (ndim = 6) - double complex a(n,n,n,n,n,n) - double complex b(n,n,n,n,n,n) - double complex c(n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double complex alpha, beta - double complex dot_patch_dcpl -c for different array dimensions - - double complex d(n,n,n,n,n) - integer dndim - parameter (dndim = 6-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_zdot_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 6' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) - call init_array_dcpl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6)),dims) - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - subroutine testit_GA_FILL_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim), dims(ndim), ld(ndim) - integer g_a - integer chunk(ndim) - integer i,total - integer elems, count_elems - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c-------------------------------GA_FILL ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING ga_fill' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call nga_distribution(g_a, me, lo,hi) - elems = count_elems(lo,hi,ndim) -c - val = dcmplx(dble(456),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - if(elems.gt.0) then - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - endif - - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_PUT_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim),hip(ndim) - integer elems, count_elems - integer nproc, me - integer proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_PUT ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_put' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a, proc, lo,hi) - elems = count_elems(lo,hi,ndim) - call init_array_dcpl(a,total) -c - call ga_sync() - if(elems.gt.0) then - call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - do loop = 1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lop,hip,ndim) - endif - call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),ld) - enddo - - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - else -c so that the random_range can be call the same number of times -c in other words, drand can generate the same number for the -c collective operations - do loop=1, MAXLOOP - call random_range(lo,hi,lop,hip,ndim) - enddo - endif -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_GET_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, total, loop - integer lop(ndim), hip(ndim) - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GET ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_get' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(234),dble(0)) - call ga_fill(g_a,val) - call ga_sync() -c - call fill_array_dcpl(a,total,val) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0 .and. Mod(loop,10).eq.0)then - call print_range(loop,lo,hi,ndim) - endif -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_ACC_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - double complex c(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer lop(ndim),hip(ndim) - integer chunk(ndim) - integer i, total, loop - double precision drand - double complex val, alpha - integer nproc, me, proc - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_ACC ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_acc' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - call ga_sync() - val = dcmplx(drand(0) * me*2+1 * 2, - $ -drand(0) * me*2+1 * 2) - call ga_fill(g_a,val) - call ga_sync() - proc = nproc-1 -me ! access other process memory - call nga_distribution(g_a,proc,lop,hip) -c - call init_array_dcpl(b,total) -c - do loop = 1, MAXLOOP - call random_range(lop,hip,lo,hi,ndim) - if(Mod(loop,10).eq.0)then - if(Mod((loop/10),ndim).eq.me) then - call print_range(loop,lo,hi,ndim) - endif - endif - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c generate the alpha - alpha = val -c - call ga_sync() -c keep a copy of the original patch - if(hi(1).ne.-1) call nga_acc(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld,alpha) -c - call ga_sync() -c - if(hi(1).ne.-1) then -c scale the local copy of array - call scale_patch_dcpl(total, - $ dcmplx(dble(1),dble(0)),a,lo,hi,ndim,dims, - $ alpha,b,lo,hi,ndim,dims) -c -c get the patch from the global array - call nga_get(g_a,lo,hi, - $ c(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c - call compare_patches_dcpl(1d-2,total, - $ a,lo,hi,ndim,dims,total,c,lo,hi,ndim,dims) - endif - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_SCATTER_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - integer lo(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_SCATTER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scatter' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array and the array containing values - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo -c - v(i) = dcmplx(drand(0) * * 2, - $ -drand(0) * * 2) - a(d(1,i),d(2,i),d(3,i),d(4,i),d(5,i),d(6,i),d(7,i))=v(i) - enddo -c -c scatter the v to the global array - call nga_scatter(g_a, v, d, m) - call ga_sync() -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c -c compare the results - if(a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)) .ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_GATHER_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim),ld(ndim) - integer g_a - integer chunk(ndim) - integer i, j, total, loop - integer elems, count_elems - double complex v(m) - integer d(ndim, m) - double precision drand - integer unique,unique_index - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - ld(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c------------------------------- NGA_GATHER ---------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_gather' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lo, hi) - elems = count_elems(lo, hi, ndim) - if(elems.gt.0) call nga_put(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c - do loop = 1, MAXLOOP - call ga_sync() -c initialize the index array - do i = 1, m -c generate indices - unique_index = 0 - do while(unique_index.eq.0) - do j=1,ndim - d(j,i) = int(drand(0)*real(n)) + 1 - enddo -c - unique_index = unique(d,ndim,m,i) - enddo - enddo -c -c gather from global array - call nga_gather(g_a, v, d, m) -c -c collect each elements and compare - do i = 1, m - do j=1, ndim - lo(j) = d(j,i) - enddo - call nga_get(g_a,lo,lo, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),ld) -c -c compare the results - if(v(i).ne. - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7))) then - call ga_error('bye', 0) - endif - enddo - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_FILL_PATCH_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - integer lo(ndim),hi(ndim),dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_FILL_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_fill_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c - val = dcmplx(dble(1234),dble(0)) -c initialize the lo and hi - do i=1, ndim - lo(i) = 2 - hi(i) = n - 1 - enddo -c - call nga_fill_patch(g_a, lo, hi, val) -c -c check the result - call fill_array_dcpl(a,total,val) - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - subroutine testit_NGA_COPY_PATCH_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) -c for different array dimensions - - double complex d(n,n,n,n,n,n) - integer dndim - parameter (dndim = 7-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_COPY_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_copy_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c -c keep a copy of the origian array - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6),alo(7)),dims) -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6),blo(7)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' standard copy patch: OK' - print *, ' ' - call ffflush(6) - endif -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6),blo(7)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping without transpose: OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(a,b,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(ndim-i+1) + 1 - bhi(i) = ahi(ndim-i+1) + 1 - enddo - if(me.eq.0)then - call copy_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ '-->', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - call nga_copy_patch('t', g_a, alo, ahi, g_b, blo, bhi) -c - call nga_get(g_b,blo,bhi, - $ b(blo(1),blo(2),blo(3),blo(4),blo(5),blo(6),blo(7)),dims) -c -c adjust index of array a - do i=1,ndim - tlo(i) = alo(ndim-i+1) - thi(i) = ahi(ndim-i+1) - enddo - call compare_patches_dcpl(0d0,total, - $ a,tlo,thi,ndim,dims,total,b,blo,bhi,ndim,dims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' reshaping transposed: OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing copy patch on different dimensions' -c - call ga_sync() -c -c initialize g_b - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5),dlo(6)),ddims) -c -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - call ga_sync() -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call copy_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *, loop,': copy [',(alo(i),':',ahi(i),i=1,ndim), -c$$$ $ ']','-->','[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - call nga_copy_patch('n', g_a, alo, ahi, g_b, dlo, dhi) -c - call nga_get(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5),dlo(6)),ddims) - call nga_get(g_a,alo,ahi, - $ a(alo(1),alo(2),alo(3),alo(4),alo(5),alo(6),alo(7)),dims) -c - call compare_patches_dcpl(0d0,total, - $ a,alo,ahi,ndim,dims,total,d,dlo,dhi,dndim,ddims) - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' copy patches on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - - - - - - subroutine testit_NGA_SCALE_PATCH_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - integer dims(ndim) - integer g_a - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim) - integer lo(ndim), hi(ndim) - double precision drand - double complex val - integer nproc, me - logical status -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c - call ga_sync() -c -c--------------------------- NGA_SCALE_PATCH ------------------------- - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_scale_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo -c - do loop=1, 10 - call random_range(lop,hip,lo,hi,ndim) - if(me.eq.0) - $ call print_range(loop,lo,hi,ndim) -c$$$ $ print *, loop,': scale [',(lo(i),':',hi(i), i=1,ndim),']' -c the random number to scale - val = dcmplx(drand(0) * 1 * 2, - $ -drand(0) * 1 * 2) -c -c keep a copy of the origian array - call nga_get(g_a,lo,hi, - $ a(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c - call nga_scale_patch(g_a,lo,hi,val) -c - call nga_get(g_a,lo,hi, - $ b(lo(1),lo(2),lo(3),lo(4),lo(5),lo(6),lo(7)),dims) -c -c check the result - call scale_patch_dcpl(total, - $ val,a,lo,hi,ndim,dims, - $ dcmplx(dble(0),dble(0)),b,lo,hi,ndim,dims) - - call compare_patches_dcpl(1d-10,total, - $ a,lo,hi,ndim,dims,total,b,lo,hi,ndim,dims) -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, 'OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status= ga_destroy(g_a) - end - - - - subroutine testit_NGA_DOT_PATCH_dcpl7() - implicit none -#include "mafdecls.fh" -#include "global.fh" -c - integer n,m - integer ndim - parameter (n = 4) - parameter (m = (4**7)/100) - parameter (ndim = 7) - double complex a(n,n,n,n,n,n,n) - double complex b(n,n,n,n,n,n,n) - double complex c(n,n,n,n,n,n,n) - integer dims(ndim) - integer g_a, g_b - integer chunk(ndim) - integer i, total - integer elems, count_elems - integer loop - integer lop(ndim), hip(ndim), hipl(ndim) - integer alo(ndim), ahi(ndim) - integer blo(ndim), bhi(ndim) - integer tlo(ndim), thi(ndim) - double complex alpha, beta - double complex dot_patch_dcpl -c for different array dimensions - - double complex d(n,n,n,n,n,n) - integer dndim - parameter (dndim = 7-1) - integer ddims(dndim),dlo(dndim),dhi(dndim),dtotal - -c - integer nproc, me - logical status - integer repeat -c - nproc = ga_nnodes() - me = ga_nodeid() -c -c---------------------- initialize the GA ----------------------- -c initialize the chunk, dims, ld, and calculate the number -c of elements - total=1 - do i = 1,ndim - chunk(i) = 0 - dims(i) = n - total = total * dims(i) - enddo -c -c*** Create global arrays - if (.not. nga_create(MT_DCPL, ndim, dims, 'a', chunk, g_a)) - $ call ga_error(' ga_create failed ',1) -c -c test the same distribution and different distribution seperately - do repeat=1,2 - if(repeat.eq.1) then - status = ga_duplicate(g_a, g_b, 'a_duplicated') - if(.not.ga_compare_distr(g_a, g_b)) - $ call ga_error("g_b distribution different",0) -c - else - do i = 1,ndim - if(mod(i,2).eq.0) chunk(i) = n - enddo - if (.not. nga_create(MT_DCPL, ndim, dims, 'b', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) - endif -c - call ga_sync() -c -c---------------------------NGA_DOT_PATCH ------------------------- -c - if(repeat.eq.1) then - if(me.eq.0)then - print *, ' ' - print *, 'TESTING nga_zdot_patch' - print *, ' - Data Type: double complex' - print *, ' - Dimension: 7' - print *, ' - Running on',nproc,'processes (processors)' - call ffflush(6) - endif - if(me.eq.0) print *, 'Testing with the same distributions' - else - if(me.eq.0) print *, 'Testing with different distributions' - endif -c -c initialize GA - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) - call init_array_dcpl(b,total) - call nga_distribution(g_b, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_b,lop,hip, - $ b(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) -c - call ga_sync() - do i = 1,ndim - lop(i) = 1 - hipl(i) = n-1 - hip(i) = n - enddo -c -c--- - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'n',blo,bhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,blo,bhi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' without transpose OK' - print *, ' ' - call ffflush(6) - endif -c--- -c prepare array a, make it transposed - call transpose_dcpl(b,c,total,ndim,dims) -c - do loop=1, 10 - call random_range(lop,hipl,alo,ahi,ndim) - do i=1, ndim - blo(i) = alo(i) + 1 - bhi(i) = ahi(i) + 1 - enddo - if(me.eq.0)then - call dot_range(loop,alo,ahi,ndim,blo,bhi,ndim) -c$$$ print *, loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(blo(i),':',bhi(i), i=1,ndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi,g_b,'t',blo,bhi) -c -c adjust index of array a - do i=1,ndim - tlo(i) = blo(ndim-i+1) - thi(i) = bhi(ndim-i+1) - enddo -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,b,tlo,thi,ndim,dims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' with transpose OK' - print *, ' ' - call ffflush(6) - endif -c--------------------------- -c - status = ga_destroy(g_b) - enddo -c -c----------------------------------------------------------------- - - -c testing copy on differet dimensions - dtotal = 1 - do i = 1,dndim - ddims(i) = n - dtotal = dtotal * ddims(i) - enddo -c - if (.not. nga_create(MT_DCPL, dndim, ddims, 'd', chunk, g_b)) - $ call ga_error(' ga_create failed ',1) -c - if(me.eq.0) - $ print *, 'Testing dot patch on different dimensions' -c -c initialize GAs - call init_array_dcpl(a,total) - call nga_distribution(g_a, me, lop, hip) - elems = count_elems(lop, hip, ndim) - if(elems.gt.0) call nga_put(g_a,lop,hip, - $ a(lop(1),lop(2),lop(3),lop(4),lop(5),lop(6),lop(7)),dims) - call init_array_dcpl(d,dtotal) - call nga_distribution(g_b, me, dlo, dhi) - elems = count_elems(dlo, dhi, dndim) - if(elems.gt.0) call nga_put(g_b,dlo,dhi, - $ d(dlo(1),dlo(2),dlo(3),dlo(4),dlo(5),dlo(6)),ddims) -c - call ga_sync() -c -c calculate the maximum range of g_a that can fit into g_b - do i = 1,ndim - lop(i) = 1 - hip(i) = n - enddo - hip(dndim) = 1 -c - do loop=1, 10 - call random_range(lop,hip,alo,ahi,ndim) -c - do i=1, dndim - dlo(i) = alo(dndim-i+1) - dhi(i) = ahi(dndim-i+1) - enddo - dlo(1) = alo(ndim) - dhi(1) = ahi(ndim) -c - if(me.eq.0) then - call dot_range(loop,alo,ahi,ndim,dlo,dhi,dndim) -c$$$ print *,loop,'dot: [',(alo(i),':',ahi(i), i=1,ndim),']', -c$$$ $ ',', '[',(dlo(i),':',dhi(i), i=1,dndim),']' - endif -c - alpha=nga_zdot_patch(g_a,'n',alo,ahi, - $ g_b,'n',dlo,dhi) -c -c the result should be - beta = dot_patch_dcpl(total, - $ a,alo,ahi,ndim,dims,d,dlo,dhi,dndim,ddims) -c - if(ABS(beta-alpha).gt.1d-6*ABS(alpha)) then - print *,me, ' error ', beta, alpha - call ga_error('exiting ...',0) - endif -c - enddo -c - call ga_sync() - if(me.eq.0)then - print *, ' dot patch on different dimensions: OK' - print *, ' ' - call ffflush(6) - endif -c - status = ga_destroy(g_b) - - -c--- - status = ga_destroy(g_a) - end - -c----------------------- -c Utility functions - - subroutine random_range(lo,hi,lop,hip,ndim) - implicit none - integer lo(1),hi(1),lop(1),hip(1),ndim - integer i, range, swap, val,iran - double precision drand - iran(range) = int(drand(0)*dble(range)) + 1 -c iran(range) = range/2 - do i = 1, ndim - range = hi(i)-lo(i)+1 - val = iran(range) - lop(i) = lo(i) + val - val = iran(range) - hip(i) = hi(i) - val - if(hip(i) .lt. lop(i))then - swap =hip(i) - hip(i)=lop(i) - lop(i)=swap - endif - hip(i)=MIN(hip(i),hi(i)) - lop(i)=MAX(lop(i),lo(i)) - enddo - end -c - -c - subroutine random_range_outbound(lo,hi,lop,hip,ndim) - implicit none - integer lo(1),hi(1),lop(1),hip(1),ndim - integer i, range, val,iran - double precision drand - iran(range) = int(drand(0)*dble(range)) + 1 -c iran(range) = range/2 - do i = 1, ndim - range = 2*(hi(i)-lo(i)+1) - val = iran(range) - lop(i) = lo(i) + val - range = hi(i)-lo(i)+1 - val = iran(range) - hip(i) = lop(i) + range - val -c - hip(i) = hip(i)-hi(i) - lop(i) = lop(i)-hi(i) - enddo - end -c -c - integer function count_elems(lo,hi,ndim) - implicit none - integer lo(1),hi(1),ndim,elems,i - elems=1 - do i=1,ndim - elems = elems*(hi(i)-lo(i)+1) - enddo - count_elems = elems - end -c - -c get the next available nindex in the range of lo and hi - integer function next_index(ind,total,ndim,lo,hi,dims) - implicit none - integer ind,total,ndim,lo(ndim),hi(ndim),dims(ndim) - integer i - integer indx(8),nindex -c - nindex = ind + 1 - 200 call conv_1ton(ndim,dims,nindex,indx) -c -c test if indx(i) is in the range of lo(i) and hi(i) - do i=1,ndim - if((indx(i).lt.lo(i)).or.(indx(i).gt.hi(i))) then - nindex = nindex + 1 - if(nindex.gt.total) then - next_index = 0 - goto 300 - else - goto 200 - endif - endif - enddo -c - next_index = nindex - 300 end - -c testing if the indices are unique - integer function unique(ind,ndim,m,n) - implicit none - integer ndim,m,n - integer ind(ndim,m) - integer i,j,marker -c - unique = 1 - do i = 1, n-1 - marker = 0 - do j = 1, ndim - if(ind(j,n).eq.ind(j,i)) marker = marker + 1 - enddo -c - if(marker.eq.ndim) unique = 0 - enddo -c - end - -c - subroutine prnt_rng(me,lo,hi,ndim) - implicit none - integer me,ndim - integer lo(ndim),hi(ndim) - integer i -c - print *, me,': array section [',(lo(i),':',hi(i),i=1,ndim),']' -c - end - -c divide the space into equal size patches according to nproc -c and calculate my lo and hi - subroutine my_space(me,nproc,ndim,total,dims,lo,hi) - implicit none - integer me,nproc,ndim,total - integer dims(ndim),lo(ndim),hi(ndim) - integer div,lop,hip,i -c - div = total/nproc -c - lop = div * me + 1 -c - if(me.eq.(nproc-1)) then - hip = total - else - hip = div * (me+1) - endif -c - call conv_1ton(ndim,dims,lop,lo) - call conv_1ton(ndim,dims,hip,hi) -c -c swap the indices if the lo if larger thant hi - do i = 1,ndim - if(lo(i).gt.hi(i)) then - if(i.eq.ndim) call ga_error('bye',0) - lo(i) = 1 - lo(i+1) = lo(i+1) + 1 - endif - enddo - end - -c convert the index from one dimension to n dimension - subroutine conv_1ton(ndim,dims,ind1,indn) - implicit none - integer ndim - integer dims(ndim) - integer ind1,indn(ndim) - integer range(8),remainder,i -c - remainder = ind1 -c get the range of each dimension - do i=1,ndim - if(i.eq.1) then - range(i) = dims(i) - else - range(i) = range(i-1) * dims(i) - endif - enddo -c -c get the indices in each dimension - do i = ndim,1,-1 - if(i.ne.1) then - indn(i) = remainder/range(i-1) - remainder = remainder - indn(i)*range(i-1) - if(remainder.eq.0) then - remainder = range(i-1) - else - indn(i) = indn(i) + 1 - endif - else - indn(i) = remainder - endif - enddo -c - end - - -c fill array with random numbers - subroutine fill_array_int(a,n,val) - implicit none - integer n - integer a(n),val - integer i - do i= 1, n - a(i) = val - enddo - end - -c initialize the array with random numbers - subroutine init_array_int(a,n) - implicit none - integer n - integer a(n) - double precision drand - integer i - do i= 1, n - a(i) = int(drand(0) * i * 2) - enddo - end - -c if the elements do match, stop the program - subroutine compare_patches_int(eps, - $ total1,array1,lo1,hi1,ndim1,dims1, - $ total2,array2,lo2,hi2,ndim2,dims2) - implicit none - double precision eps - integer ndim1,ndim2,total1,total2 - integer array1(total1),array2(total2) - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer next_index - integer index1,index2 - double precision diff,maxval -c -c initialize index1 and index2, searching from zeros - index1 = 0 - index2 = 0 -c compare corresponding elements in each array - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - do while((index1.ne.0).and.(index2.ne.0)) - diff = abs(array1(index1) - array2(index2)) - maxval = max(abs(array1(index1)), abs(array2(index2))) - if((maxval.eq.0).or.(maxval.lt.eps)) maxval = 1 - if(eps .lt. abs(diff)/maxval) then - print *, 'Error: Comparison failed!' - print *, array1(index1), array2(index2) - call ga_error('bye',0) - endif - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - enddo -c -c at this point both index1 and index2 should be 0 - if((index1.ne.0).or.(index2.ne.0)) then - print *, 'Error: # of elems dont match' - call ga_error('bye',0) - endif -c - end - -c do patch = patch + buf * alpha - subroutine scale_patch_int(total, - $ alpha,arr1,lo1,hi1,ndim1,dims1,beta,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer arr1(total),arr2(total) - integer alpha, beta - integer next_index - integer ind1, ind2 -c - ind1 = 0 - ind2 = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - arr1(ind1) = arr1(ind1)*alpha + arr2(ind2)*beta - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - end - -c transpose an array - subroutine transpose_int(a1,a2,total,ndim,dims) - implicit none - integer ndim,total - integer dims(ndim) - integer a1(total),a2(total) - integer i, j - integer idx - integer bv(8), bunit(8) -c - bv(1)=0 - bunit(1)=1 - do i=2, ndim - bv(i) = 0 - bunit(i) = bunit(i-1) * dims(i-1) - enddo -c - do i=1, total - idx = 1 - do j=1, ndim - idx = idx + bv(j) * bunit(ndim-j+1) - if(mod(i,bunit(j)).eq.0) bv(j) = bv(j) + 1 - if(bv(j).ge.dims(j)) bv(j) = 0 - enddo -c print *, 'i = ',i, 'idx = ',idx - a2(idx) = a1(i) - enddo -c - do i=1, total - a1(i) = a2(i) - enddo -c - end - -c do patch = patch + buf * alpha - integer function dot_patch_int( - $ total,arr1,lo1,hi1,ndim1,dims1,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer arr1(total),arr2(total) - integer next_index - integer ind1, ind2 - integer res -c - ind1 = 0 - ind2 = 0 - res = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - res = res + arr1(ind1)*arr2(ind2) - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - dot_patch_int = res -c - end - - -c fill array with random numbers - subroutine fill_array_dbl(a,n,val) - implicit none - integer n - double precision a(n),val - integer i - do i= 1, n - a(i) = val - enddo - end - -c initialize the array with random numbers - subroutine init_array_dbl(a,n) - implicit none - integer n - double precision a(n) - double precision drand - integer i - do i= 1, n - a(i) = drand(0) * i * 2 - enddo - end - -c if the elements do match, stop the program - subroutine compare_patches_dbl(eps, - $ total1,array1,lo1,hi1,ndim1,dims1, - $ total2,array2,lo2,hi2,ndim2,dims2) - implicit none - double precision eps - integer ndim1,ndim2,total1,total2 - double precision array1(total1),array2(total2) - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer next_index - integer index1,index2 - double precision diff,maxval -c -c initialize index1 and index2, searching from zeros - index1 = 0 - index2 = 0 -c compare corresponding elements in each array - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - do while((index1.ne.0).and.(index2.ne.0)) - diff = abs(array1(index1) - array2(index2)) - maxval = max(abs(array1(index1)), abs(array2(index2))) - if((maxval.eq.0).or.(maxval.lt.eps)) maxval = 1 - if(eps .lt. abs(diff)/maxval) then - print *, 'Error: Comparison failed!' - print *, array1(index1), array2(index2) - call ga_error('bye',0) - endif - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - enddo -c -c at this point both index1 and index2 should be 0 - if((index1.ne.0).or.(index2.ne.0)) then - print *, 'Error: # of elems dont match' - call ga_error('bye',0) - endif -c - end - -c do patch = patch + buf * alpha - subroutine scale_patch_dbl(total, - $ alpha,arr1,lo1,hi1,ndim1,dims1,beta,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double precision arr1(total),arr2(total) - double precision alpha, beta - integer next_index - integer ind1, ind2 -c - ind1 = 0 - ind2 = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - arr1(ind1) = arr1(ind1)*alpha + arr2(ind2)*beta - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - end - -c transpose an array - subroutine transpose_dbl(a1,a2,total,ndim,dims) - implicit none - integer ndim,total - integer dims(ndim) - double precision a1(total),a2(total) - integer i, j - integer idx - integer bv(8), bunit(8) -c - bv(1)=0 - bunit(1)=1 - do i=2, ndim - bv(i) = 0 - bunit(i) = bunit(i-1) * dims(i-1) - enddo -c - do i=1, total - idx = 1 - do j=1, ndim - idx = idx + bv(j) * bunit(ndim-j+1) - if(mod(i,bunit(j)).eq.0) bv(j) = bv(j) + 1 - if(bv(j).ge.dims(j)) bv(j) = 0 - enddo -c print *, 'i = ',i, 'idx = ',idx - a2(idx) = a1(i) - enddo -c - do i=1, total - a1(i) = a2(i) - enddo -c - end - -c do patch = patch + buf * alpha - double precision function dot_patch_dbl( - $ total,arr1,lo1,hi1,ndim1,dims1,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double precision arr1(total),arr2(total) - integer next_index - integer ind1, ind2 - double precision res -c - ind1 = 0 - ind2 = 0 - res = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - res = res + arr1(ind1)*arr2(ind2) - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - dot_patch_dbl = res -c - end - - -c fill array with random numbers - subroutine fill_array_dcpl(a,n,val) - implicit none - integer n - double complex a(n),val - integer i - do i= 1, n - a(i) = val - enddo - end - -c initialize the array with random numbers - subroutine init_array_dcpl(a,n) - implicit none - integer n - double complex a(n) - double precision drand - integer i - do i= 1, n - a(i) = dcmplx(drand(0) * i * 2, - $ -drand(0) * i * 2) - enddo - end - -c if the elements do match, stop the program - subroutine compare_patches_dcpl(eps, - $ total1,array1,lo1,hi1,ndim1,dims1, - $ total2,array2,lo2,hi2,ndim2,dims2) - implicit none - double precision eps - integer ndim1,ndim2,total1,total2 - double complex array1(total1),array2(total2) - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - integer next_index - integer index1,index2 - double precision diff,maxval -c -c initialize index1 and index2, searching from zeros - index1 = 0 - index2 = 0 -c compare corresponding elements in each array - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - do while((index1.ne.0).and.(index2.ne.0)) - diff = abs(array1(index1) - array2(index2)) - maxval = max(abs(array1(index1)), abs(array2(index2))) - if((maxval.eq.0).or.(maxval.lt.eps)) maxval = 1 - if(eps .lt. abs(diff)/maxval) then - print *, 'Error: Comparison failed!' - print *, array1(index1), array2(index2) - call ga_error('bye',0) - endif - index1 = next_index(index1,total1,ndim1,lo1,hi1,dims1) - index2 = next_index(index2,total2,ndim2,lo2,hi2,dims2) - enddo -c -c at this point both index1 and index2 should be 0 - if((index1.ne.0).or.(index2.ne.0)) then - print *, 'Error: # of elems dont match' - call ga_error('bye',0) - endif -c - end - -c do patch = patch + buf * alpha - subroutine scale_patch_dcpl(total, - $ alpha,arr1,lo1,hi1,ndim1,dims1,beta,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double complex arr1(total),arr2(total) - double complex alpha, beta - integer next_index - integer ind1, ind2 -c - ind1 = 0 - ind2 = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - arr1(ind1) = arr1(ind1)*alpha + arr2(ind2)*beta - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - end - -c transpose an array - subroutine transpose_dcpl(a1,a2,total,ndim,dims) - implicit none - integer ndim,total - integer dims(ndim) - double complex a1(total),a2(total) - integer i, j - integer idx - integer bv(8), bunit(8) -c - bv(1)=0 - bunit(1)=1 - do i=2, ndim - bv(i) = 0 - bunit(i) = bunit(i-1) * dims(i-1) - enddo -c - do i=1, total - idx = 1 - do j=1, ndim - idx = idx + bv(j) * bunit(ndim-j+1) - if(mod(i,bunit(j)).eq.0) bv(j) = bv(j) + 1 - if(bv(j).ge.dims(j)) bv(j) = 0 - enddo -c print *, 'i = ',i, 'idx = ',idx - a2(idx) = a1(i) - enddo -c - do i=1, total - a1(i) = a2(i) - enddo -c - end - -c do patch = patch + buf * alpha - double complex function dot_patch_dcpl( - $ total,arr1,lo1,hi1,ndim1,dims1,arr2,lo2,hi2,ndim2,dims2) - implicit none - integer ndim1,ndim2,total - integer lo1(ndim1),hi1(ndim1),lo2(ndim2),hi2(ndim2) - integer dims1(ndim1),dims2(ndim2) - double complex arr1(total),arr2(total) - integer next_index - integer ind1, ind2 - double complex res -c - ind1 = 0 - ind2 = 0 - res = 0 - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) -c - do while(ind1.ne.0) - res = res + arr1(ind1)*arr2(ind2) - ind1 = next_index(ind1,total,ndim1,lo1,hi1,dims1) - ind2 = next_index(ind2,total,ndim2,lo2,hi2,dims2) - enddo -c - dot_patch_dcpl = res -c - end - - diff --git a/global/testing/patch.F b/global/testing/patch.F index 875c47ae1..ac79f7b3b 100644 --- a/global/testing/patch.F +++ b/global/testing/patch.F @@ -2,18 +2,10 @@ # include "config.fh" #endif C $iD: PATch.F,v 1.21 2002/10/15 23:35:52 vinod Exp $ -#if (defined(CRAY) && !defined(__crayx1)) || defined(KSR) -# define xgemm SGEMM -# define ygemm CGEMM -#else -# define xgemm TEST_DGEMM -# define ygemm TEST_ZGEMM -#endif -#if defined(FUJITSU) || defined(CRAY_YMP) -# define THRESH 1.0d-10 -#else + +#define xgemm TEST_DGEMM +#define ygemm TEST_ZGEMM # define THRESH 1.0d-20 -#endif #define MISMATCH(x,y) abs(x-y)/max(1,abs(x)).gt.THRESH c c Be aware that matrix-multiplies with transposes are not being tested diff --git a/global/testing/perf.F b/global/testing/perf.F index c2d294bf2..ddc6323d6 100644 --- a/global/testing/perf.F +++ b/global/testing/perf.F @@ -8,11 +8,6 @@ c remote operations access data on processes 1,2,3 in the round-robin way| c------------------------------------------------------------------------ c -#if defined(__xlc__) || defined(__xlC__) || defined(__IBMC__) -#define SLEEP sleep_ -#else -#define SLEEP sleep -#endif program perf implicit none #include "mafdecls.fh" @@ -157,27 +152,24 @@ subroutine TestPutGetAcc1 if (me .eq. 0) then tg=time_get1(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count, $ local) - else - call SLEEP(1) endif + call ga_sync() c c everybody touches own data call ga_fill_patch(g_a, 1, n, 1, 1 , 1d0*me*loop) if (me .eq. 0) then tp=time_put1(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count, $ local) - else - call SLEEP(1) endif + call ga_sync() c c everybody touches own data call ga_fill_patch(g_a, 1, n, 1, 1 , 1d0*me*loop) if (me .eq. 0) then ta=time_acc1(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count, $ local) - else - call SLEEP(1) endif + call ga_sync() c if (me .eq. 0) then write(6,77)bytes, chunk(loop), @@ -433,27 +425,24 @@ subroutine TestPutGetAcc if (me .eq. 0) then tg=time_get(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count, $ local) - else - call SLEEP(1) endif + call ga_sync() c c everybody touches own data call ga_fill_patch(g_a, 1, n, 1, n , 1d0*me*loop) if (me .eq. 0) then tp=time_put(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count, $ local) - else - call SLEEP(1) endif + call ga_sync() c c everybody touches own data call ga_fill_patch(g_a, 1, n, 1, n , 1d0*me*loop) if (me .eq. 0) then ta=time_acc(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count, $ local) - else - call SLEEP(1) endif + call ga_sync() c if (me .eq. 0) then write(6,77)bytes, chunk(loop), diff --git a/global/testing/perfmod.F b/global/testing/perfmod.F index 50629870b..b43c7a775 100644 --- a/global/testing/perfmod.F +++ b/global/testing/perfmod.F @@ -8,11 +8,6 @@ c remote operations access data on processes 1,2,3 in the round-robin way| c------------------------------------------------------------------------ c -#if defined(__xlc__) || defined(__xlC__) || defined(__IBMC__) -#define SLEEP sleep_ -#else -#define SLEEP sleep -#endif program perfmod implicit none #include "mafdecls.fh" @@ -136,27 +131,24 @@ subroutine TestPutGetAcc if (me .eq. 0) then tg=time_get(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count, $ local) - else - call SLEEP(2) endif + call ga_sync() c c everybody touches own data call ga_fill_patch(g_a, 1, n, 1, n , 1d0*me*loop) if (me .eq. 0) then tp=time_put(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count, $ local) - else - call SLEEP(2) endif + call ga_sync() c c everybody touches own data call ga_fill_patch(g_a, 1, n, 1, n , 1d0*me*loop) if (me .eq. 0) then ta=time_acc(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count, $ local) - else - call SLEEP(2) endif + call ga_sync() c if (me .eq. 0) then write(6,77)bytes, count, tg, 1d-6*bytes/tg, diff --git a/global/testing/perform.F b/global/testing/perform.F index d75adfecb..8b289a899 100644 --- a/global/testing/perform.F +++ b/global/testing/perform.F @@ -2,11 +2,6 @@ # include "config.fh" #endif c $Id: perform.F,v 1.9 2000-05-25 01:09:20 d3h325 Exp $ -#if defined(__xlc__) || defined(__xlC__) || defined(__IBMC__) -#define SLEEP sleep_ -#else -#define SLEEP sleep -#endif program perform c*** c*** Last modification: Fri Jan 13 12:13:27 PST 1995 @@ -127,25 +122,22 @@ subroutine TestPutGetAcc call ga_fill_patch(g_a, 1, n, 1, n , 1d0*me*loop) if (me .eq. 0) then tg= time_get(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count) - else - call SLEEP(2) endif + call ga_sync() c c everybody touches own data call ga_fill_patch(g_a, 1, n, 1, n , 1d0*me*loop) if (me .eq. 0) then tp= time_put(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count) - else - call SLEEP(2) endif + call ga_sync() c c everybody touches own data call ga_fill_patch(g_a, 1, n, 1, n , 1d0*me*loop) if (me .eq. 0) then ta= time_acc(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count) - else - call SLEEP(2) endif + call ga_sync() c if (me .eq. 0) then write(6,77)bytes, count, tg, 1d-6*bytes/tg, diff --git a/global/testing/pg2test.F b/global/testing/pg2test.F index fb9a4bd65..c20b63105 100644 --- a/global/testing/pg2test.F +++ b/global/testing/pg2test.F @@ -3,16 +3,8 @@ #endif c $Id: pg2test.F,v 1.6 2004-12-01 22:39:46 manoj Exp $ c vector boxes lack arithmetic precision -#ifdef CRAY_YMP -# define THRESH 1d-10 -# define THRESHF 1e-5 -#elif defined(FUJITSU) -# define THRESH 1d-12 -# define THRESHF 1e-5 -#else # define THRESH 1d-13 # define THRESHF 1e-5 -#endif #define MISMATCH(x,y) abs(x-y)/max(1d0,abs(x)).gt.THRESH #define MISMATCHF(x,y) abs(x-y)/max(1.0,abs(x)).gt.THRESHF diff --git a/global/testing/pg2testmatmult.F b/global/testing/pg2testmatmult.F index 07a9d879f..756ccd52c 100644 --- a/global/testing/pg2testmatmult.F +++ b/global/testing/pg2testmatmult.F @@ -1,13 +1,9 @@ #if HAVE_CONFIG_H # include "config.fh" #endif -#if (defined(CRAY) && !defined(__crayx1)) || defined(KSR) -# define xgemm SGEMM -# define ygemm CGEMM -#else -# define xgemm TEST_DGEMM -# define ygemm TEST_ZGEMM -#endif + +#define xgemm TEST_DGEMM +#define ygemm TEST_ZGEMM program ga_test c $Id: pg2testmatmult.F,v 1.3.6.1 2007-04-25 21:49:59 manoj Exp $ diff --git a/global/testing/pgtest.F b/global/testing/pgtest.F index 9cda75468..c599aa6fe 100644 --- a/global/testing/pgtest.F +++ b/global/testing/pgtest.F @@ -3,16 +3,8 @@ #endif c $Id: pgtest.F,v 1.9 2005-11-23 10:25:18 manoj Exp $ c vector boxes lack arithmetic precision -#ifdef CRAY_YMP -# define THRESH 1d-10 -# define THRESHF 1e-5 -#elif defined(FUJITSU) -# define THRESH 1d-12 -# define THRESHF 1e-5 -#else # define THRESH 1d-13 # define THRESHF 1e-5 -#endif #define MISMATCH(x,y) abs(x-y)/max(1d0,abs(x)).gt.THRESH #define MISMATCHF(x,y) abs(x-y)/max(1.0,abs(x)).gt.THRESHF diff --git a/global/testing/test.F b/global/testing/test.F index 94232f038..a2cde198d 100644 --- a/global/testing/test.F +++ b/global/testing/test.F @@ -3,13 +3,8 @@ #endif c $Id: test.F,v 1.64.2.11 2007-04-06 22:37:35 d3g293 Exp $ c vector boxes lack arithmetic precision -#if defined(FUJITSU) -# define THRESH 1d-12 -# define THRESHF 1e-5 -#else # define THRESH 1d-13 # define THRESHF 2e-5 -#endif #define MISMATCH(x,y) abs(x-y)/max(1d0,abs(x)).gt.THRESH #define MISMATCHF(x,y) abs(x-y)/max(1.0,abs(x)).gt.THRESHF diff --git a/global/testing/types-test.F b/global/testing/types-test.F index f9685e7c0..fd7f4f6ea 100644 --- a/global/testing/types-test.F +++ b/global/testing/types-test.F @@ -3,13 +3,8 @@ #endif c $Id: test.F,v 1.64.2.11 2007-04-06 22:37:35 d3g293 Exp $ c vector boxes lack arithmetic precision -#if defined(FUJITSU) -# define THRESH 1d-12 -# define THRESHF 1e-5 -#else # define THRESH 1d-13 # define THRESHF 2e-5 -#endif #define MISMATCH(x,y) abs(x-y)/max(1d0,abs(x)).gt.THRESH #define MISMATCHF(x,y) abs(x-y)/max(1.0,abs(x)).gt.THRESHF diff --git a/global/testing/util.c b/global/testing/util.c index 955a1b720..8c595a37e 100644 --- a/global/testing/util.c +++ b/global/testing/util.c @@ -453,12 +453,10 @@ double FATR util_timer_() void FATR set_ma_use_armci_mem_() { int retval; -#if defined(SPARC64_GP) || defined(HPUX64) - if((retval=putenv("MA_USE_ARMCI_MEM=YES")) != 0) -#elif defined(_WIN32) +#if defined(_WIN32) if((retval=_putenv("MA_USE_ARMCI_MEM=YES")) != 0) #else if((retval=setenv("MA_USE_ARMCI_MEM", "YES", 1)) != 0) #endif - GA_Error("setenv failed: insufficient space in the environment",1); + GA_Error("setenv failed: insufficient space in the environment",1); } diff --git a/m4/armci_c_opt.m4 b/m4/armci_c_opt.m4 index 81d3e43e3..8b0800ff2 100644 --- a/m4/armci_c_opt.m4 +++ b/m4/armci_c_opt.m4 @@ -10,52 +10,26 @@ AC_CACHE_CHECK([for specific C optimizations], [armci_cv_c_opt], [ AS_IF([test "x$ARMCI_COPT" != x], [armci_cv_c_opt="$ARMCI_COPT"], [armci_cv_c_opt=]) AS_IF([test "x$armci_cv_c_opt" = x && test "x$enable_opt" = xyes], [ AS_CASE([$ga_cv_target:$ga_cv_c_compiler_vendor:$host_cpu:$ga_armci_network], -[BGL:*:*:*], [armci_cv_c_opt="-O0"], -[BGP:ibm:*:*], [armci_cv_c_opt="-O3 -qstrict -qarch=450 -qtune=450"], -[BGP:gnu:*:*], [armci_cv_c_opt="-O2"], -[CATAMOUNT:*:*:*], [armci_cv_c_opt=], -[CRAY_XT:*:*:*], [armci_cv_c_opt=], [CYGWIN:*:*:*], [armci_cv_c_opt="-malign-double"], -[FUJITSU_VPP64:*:*:*], [armci_cv_c_opt="-x100"], -[FUJITSU_VPP:*:*:*], [armci_cv_c_opt="-x100 -KA32"], -[HPUX64:*:*:*], [armci_cv_c_opt="-Ae"], -[HPUX64:*:ia64:*], [armci_cv_c_opt="-Ae"], -[HPUX:*:*:*], [armci_cv_c_opt="-Ae"], [IBM64:*:*:*], [armci_cv_c_opt="-O3 -qinline=100 -qstrict -qarch=auto -qtune=auto"], [IBM:*:*:*], [armci_cv_c_opt="-O3 -qinline=100 -qstrict -qarch=auto -qtune=auto"], -[LAPI64:*:*:*], [armci_cv_c_opt="-O3 -qinline=100 -qstrict -qarch=auto -qtune=auto"], -[LAPI:*:*:*], [armci_cv_c_opt="-O3 -qinline=100 -qstrict -qarch=auto -qtune=auto"], -[LINUX64:fujitsu:ia64:*], [armci_cv_c_opt="-Kfast"], [LINUX64:fujitsu:x86_64:*], [armci_cv_c_opt="-Kfast"], -[LINUX64:gnu:ia64:*], [armci_cv_c_opt="-O0 -g"], [LINUX64:gnu:x86_64:*], [armci_cv_c_opt="-O3 -funroll-loops"], [LINUX64:ibm:powerpc64:*], [armci_cv_c_opt="-O3 -qinline=100 -qstrict -qarch=auto -qtune=auto"], [LINUX64:ibm:ppc64:*], [armci_cv_c_opt="-O3 -qinline=100 -qstrict -qarch=auto -qtune=auto"], [LINUX64:ibm:x86_64:*], [armci_cv_c_opt=""], -[LINUX64:intel:ia64:*], [armci_cv_c_opt="-w1"], [LINUX64:unknown:alpha:*], [armci_cv_c_opt="-assume no2underscore -fpe3 -check nooverflow -assume accuracy_sensitive -check nopower -check nounderflow"], [LINUX:fujitsu:*:*], [armci_cv_c_opt="-Kfast"], [LINUX:gnu:686:*], [armci_cv_c_opt="-O2 -finline-functions -funroll-loops -march=pentiumpro -malign-double"], -[LINUX:gnu:686:MELLANOX], [armci_cv_c_opt="-O2 -finline-functions -funroll-loops -march=pentiumpro"], [LINUX:gnu:686:OPENIB], [armci_cv_c_opt="-O2 -finline-functions -funroll-loops -march=pentiumpro"], [LINUX:gnu:786:*], [armci_cv_c_opt="-O2 -finline-functions -funroll-loops -march=pentiumpro -malign-double"], -[LINUX:gnu:786:MELLANOX], [armci_cv_c_opt="-O2 -finline-functions -funroll-loops -march=pentiumpro"], [LINUX:gnu:786:OPENIB], [armci_cv_c_opt="-O2 -finline-functions -funroll-loops -march=pentiumpro"], [LINUX:gnu:x86:*], [armci_cv_c_opt="-O2 -finline-functions -funroll-loops -malign-double"], -[LINUX:gnu:x86:MELLANOX], [armci_cv_c_opt="-O2 -finline-functions -funroll-loops "], [LINUX:gnu:x86:OPENIB], [armci_cv_c_opt="-O2 -finline-functions -funroll-loops "], [LINUX:ibm:*:*], [armci_cv_c_opt="-q32"], [LINUX:intel:*:*], [armci_cv_c_opt="-O3 -prefetch"], [MACX64:*:*:*], [armci_cv_c_opt=], [MACX:*:*:*], [armci_cv_c_opt=], -[NEC64:*:*:*], [armci_cv_c_opt="-Cvsafe -size_t64"], -[NEC:*:*:*], [armci_cv_c_opt="-Cvsafe"], -[SOLARIS64:fujitsu:*:*], [armci_cv_c_opt="-Kfast -KV9FMADD -x0"], -[SOLARIS64:gnu:*:*], [armci_cv_c_opt="-dalign"], -[SOLARIS64:gnu:i386:*], [armci_cv_c_opt="-dalign -xarch=amd64"], -[SOLARIS:fujitsu:*:*], [armci_cv_c_opt="-Kfast -KV8PFMADD -x0"], -[SOLARIS:gnu:*:*], [armci_cv_c_opt="-dalign"], -[SOLARIS:gnu:i386:*], [armci_cv_c_opt="-dalign -xarch=sse2"], [armci_cv_c_opt=]) ])]) AC_SUBST([ARMCI_COPT], [$armci_cv_c_opt]) diff --git a/m4/armci_cxx_opt.m4 b/m4/armci_cxx_opt.m4 index c122e9349..1324a812e 100644 --- a/m4/armci_cxx_opt.m4 +++ b/m4/armci_cxx_opt.m4 @@ -11,8 +11,6 @@ AS_IF([test "x$ARMCI_CXXOPT" != x], [armci_cv_cxx_opt="$ARMCI_CXXOPT"], [armci_c AS_IF([test "x$armci_cv_cxx_opt" = x && test "x$enable_opt" = xyes], [ AS_CASE([$ga_cv_target:$ga_cv_cxx_compiler_vendor:$host_cpu:$ga_armci_network], [LINUX:*:*:*], [armci_cv_cxx_opt="-O0"], -[NEC64:*:*:*], [armci_cv_cxx_opt="-Cvsafe -size_t64"], -[NEC:*:*:*], [armci_cv_cxx_opt="-Cvsafe"], [armci_cv_cxx_opt=]) ])]) AC_SUBST([ARMCI_CXXOPT], [$armci_cv_cxx_opt]) diff --git a/m4/armci_f77_opt.m4 b/m4/armci_f77_opt.m4 index 686b1665e..ffa3d972d 100644 --- a/m4/armci_f77_opt.m4 +++ b/m4/armci_f77_opt.m4 @@ -10,44 +10,24 @@ AC_CACHE_CHECK([for specific Fortran optimizations], [armci_cv_f77_opt], [ AS_IF([test "x$ARMCI_FOPT" != x], [armci_cv_f77_opt="$ARMCI_FOPT"], [armci_cv_f77_opt=]) AS_IF([test "x$armci_cv_f77_opt" = x && test "x$enable_opt" = xyes], [ AS_CASE([$ga_cv_target:$ga_cv_f77_compiler_vendor:$host_cpu:$ga_armci_network], -[BGL:*:*:*], [armci_cv_f77_opt="-O0"], -[BGP:ibm:*:*], [armci_cv_f77_opt="-O3 -qstrict -qarch=450 -qtune=450"], -[BGP:gnu:*:*], [armci_cv_f77_opt="-O2"], -[CATAMOUNT:*:*:*], [armci_cv_f77_opt="-O3"], -[CRAY_XT:*:*:*], [armci_cv_f77_opt=], [CYGWIN:*:*:*], [armci_cv_f77_opt=], -[FUJITSU_VPP64:*:*:*], [armci_cv_f77_opt="-Sw"], -[FUJITSU_VPP:*:*:*], [armci_cv_f77_opt="-Sw -KA32"], -[HPUX64:*:*:*], [armci_cv_f77_opt="-O3 +Odataprefetch +Ofastaccess"], -[HPUX64:*:ia64:*], [armci_cv_f77_opt=], -[HPUX:*:*:*], [armci_cv_f77_opt="-O3 +Odataprefetch"], [IBM64:*:*:*], [armci_cv_f77_opt=], [IBM:*:*:*], [armci_cv_f77_opt="-O4 -qarch=auto -qstrict"], -[LAPI64:*:*:*], [armci_cv_f77_opt=], -[LAPI:*:*:*], [armci_cv_f77_opt=], -[LINUX64:fujitsu:ia64:OPENIB], [armci_cv_f77_opt="-Kfast -X9 -Am -fw -Kthreadsafe"], -[LINUX64:fujitsu:ia64:*], [armci_cv_f77_opt="-Kfast -X9 -Am -fw"], [LINUX64:fujitsu:x86_64:OPENIB],[armci_cv_f77_opt="-Kfast -X9 -Am -fw -Kthreadsafe"], [LINUX64:fujitsu:x86_64:*], [armci_cv_f77_opt="-Kfast -X9 -Am -fw"], [LINUX64:gnu:x86_64:*], [armci_cv_f77_opt="-fstrength-reduce -mfpmath=sse"], [LINUX64:ibm:powerpc64:*], [armci_cv_f77_opt="-O4 -qarch=auto -qstrict"], [LINUX64:ibm:ppc64:*], [armci_cv_f77_opt="-O4 -qarch=auto -qstrict"], -[LINUX64:intel:ia64:*], [armci_cv_f77_opt="-O3 -hlo -ftz -pad -w -cm -w90"], [LINUX64:intel:x86_64:*], [armci_cv_f77_opt="-O3 -w -cm -xW -tpp7"], [LINUX64:pathscale:x86_64:*], [armci_cv_f77_opt="-O3 -OPT:Ofast"], [LINUX64:portland:x86_64:*], [armci_cv_f77_opt="-fast -Mdalign -O3"], -[LINUX64:sgi:ia64:*], [armci_cv_f77_opt="-macro-expand"], -[LINUX64:unknown:ia64:*], [armci_cv_f77_opt="-assume no2underscore -fpe3 -check nooverflow -assume accuracy_sensitive -check nopower -check nounderflow"], [LINUX:fujitsu:*:OPENIB], [armci_cv_f77_opt="-Kfast -X9 -Am -fw -Kthreadsafe"], [LINUX:fujitsu:*:*], [armci_cv_f77_opt="-Kfast -X9 -Am -fw"], [LINUX:gnu:686:*], [armci_cv_f77_opt="-O3 -funroll-loops -march=pentiumpro -malign-double"], -[LINUX:gnu:686:MELLANOX], [armci_cv_f77_opt="-O3 -funroll-loops -march=pentiumpro"], [LINUX:gnu:686:OPENIB], [armci_cv_f77_opt="-O3 -funroll-loops -march=pentiumpro"], [LINUX:gnu:786:*], [armci_cv_f77_opt="-O3 -funroll-loops -march=pentiumpro -malign-double"], -[LINUX:gnu:786:MELLANOX], [armci_cv_f77_opt="-O3 -funroll-loops -march=pentiumpro"], [LINUX:gnu:786:OPENIB], [armci_cv_f77_opt="-O3 -funroll-loops -march=pentiumpro"], [LINUX:gnu:x86:*], [armci_cv_f77_opt="-O3 -funroll-loops -malign-double"], -[LINUX:gnu:x86:MELLANOX], [armci_cv_f77_opt="-O3 -funroll-loops"], [LINUX:gnu:x86:OPENIB], [armci_cv_f77_opt="-O3 -funroll-loops"], [LINUX:intel:686:*], [armci_cv_f77_opt="-O4 -prefetch -unroll -ip -xK -tpp6"], [LINUX:intel:786:*], [armci_cv_f77_opt="-O4 -prefetch -unroll -ip -xW -tpp7"], @@ -57,14 +37,6 @@ AS_CASE([$ga_cv_target:$ga_cv_f77_compiler_vendor:$host_cpu:$ga_armci_network], [LINUX:portland:*:*], [armci_cv_f77_opt="-Mvect -Munroll -Mdalign -Minform,warn -Mnolist -Minfo=loop -Munixlogical"], [MACX64:intel:*:*], [armci_cv_f77_opt="-O3 -prefetch -w -cm"], [MACX:*:*:*], [armci_cv_f77_opt=], -[NEC64:*:*:*], [armci_cv_f77_opt="-Cvsafe -size_t64"], -[NEC:*:*:*], [armci_cv_f77_opt="-Cvsafe"], -[SOLARIS64:fujitsu:*:*], [armci_cv_f77_opt="-fw -Kfast -KV8PFMADD"], -[SOLARIS64:gnu:*:*], [armci_cv_f77_opt="-dalign"], -[SOLARIS64:gnu:i386:*], [armci_cv_f77_opt="-dalign -xarch=sse2"], -[SOLARIS:fujitsu:*:*], [armci_cv_f77_opt="-fw -Kfast -KV8PFMADD"], -[SOLARIS:gnu:*:*], [armci_cv_f77_opt="-dalign"], -[SOLARIS:gnu:i386:*], [armci_cv_f77_opt="-dalign -xarch=sse2"], [armci_cv_f77_opt=]) ])]) AC_SUBST([ARMCI_FOPT], [$armci_cv_f77_opt]) diff --git a/m4/armci_nb_noncont.m4 b/m4/armci_nb_noncont.m4 deleted file mode 100644 index b5966a1a7..000000000 --- a/m4/armci_nb_noncont.m4 +++ /dev/null @@ -1,11 +0,0 @@ -# ARMCI_ENABLE_NB_NONCONT -# ----------------------- -# Not sure what this is for. -AC_DEFUN([ARMCI_ENABLE_NB_NONCONT], -[AC_ARG_ENABLE([nb_noncont], - [AS_HELP_STRING([--enable-nb-noncont], [TODO])], - [enable_nb_noncont=yes - AC_DEFINE([NB_NONCONT], [1], [TODO])], - [enable_nb_noncont=no]) -AM_CONDITIONAL([NB_NONCONT], [test x$enable_nb_noncont = xyes]) -])dnl diff --git a/m4/armci_setup.m4 b/m4/armci_setup.m4 index 8d56616fe..54150c5bb 100644 --- a/m4/armci_setup.m4 +++ b/m4/armci_setup.m4 @@ -7,57 +7,6 @@ AC_DEFUN([ARMCI_SETUP], [AC_REQUIRE([GA_ARMCI_NETWORK]) AS_CASE([$ga_armci_network], -[CRAY_SHMEM], [ - AC_DEFINE([CLUSTER], [1], [TODO]) - AC_DEFINE([CRAY_XT], [1], [TODO]) - AC_DEFINE([CRAY_SHMEM], [1], [TODO]) - ], -[ELAN3], [ - AC_DEFINE([PTHREADS], [1], [TODO]) - AC_DEFINE([SERVER_THREAD], [1], [TODO]) - AC_DEFINE([_REENTRANT], [1], [TODO]) - AC_DEFINE([ACC_SMP], [1], [TODO]) - AC_DEFINE([QUADRICS], [1], [TODO]) - AS_IF([test x$LIBELAN_NATTACH != x], - [AC_DEFINE([MULTI_CTX], [1], [TODO])], - [AC_DEFINE([ALLOC_MUNMAP], [1], [TODO])]) - ], -[ELAN4], [ - AC_DEFINE([PTHREADS], [1], [TODO]) - AC_DEFINE([SERVER_THREAD], [1], [TODO]) - AC_DEFINE([_REENTRANT], [1], [TODO]) - AC_DEFINE([ACC_SMP], [1], [TODO]) - AC_DEFINE([QUADRICS], [1], [TODO]) - AC_DEFINE([DOELAN4], [1], [TODO]) - ], -[GEMINI], [ - AC_DEFINE([LIBONESIDED], [1], [for Gemini]) - AC_DEFINE([CRAY_UGNI], [1], [for Gemini]) - AC_DEFINE([GEMINI], [1], [for Gemini]) - ], -[GM], [ - AC_DEFINE([PTHREADS], [1], [TODO]) - AC_DEFINE([DATA_SERVER], [1], [TODO]) - AC_DEFINE([ALLOW_PIN], [1], [TODO]) - AC_DEFINE([SERVER_THREAD], [1], [TODO]) - AC_DEFINE([_REENTRANT], [1], [TODO]) - AC_DEFINE([GM], [1], [TODO]) - ], -[LAPI], [ - AC_DEFINE([LAPI], [1], [TODO]) - AS_IF([test x$LAPI_RDMA != x], - [AC_DEFINE([ALLOW_PIN], [1], [TODO]) - AC_DEFINE([LAPI_RDMA], [1], [TODO])]) - ], -[MELLANOX], [ - AC_DEFINE([PTHREADS], [1], [TODO]) - AC_DEFINE([DATA_SERVER], [1], [TODO]) - AC_DEFINE([SERVER_THREAD], [1], [TODO]) - AC_DEFINE([_REENTRANT], [1], [TODO]) - AC_DEFINE([VAPI], [1], [TODO]) - AC_DEFINE([ALLOW_PIN], [1], [TODO]) - AC_DEFINE([MELLANOX], [1], [TODO]) - ], [MPI_TS], [ AC_DEFINE([PTHREADS], [1], [TODO]) AC_DEFINE([DATA_SERVER], [1], [TODO]) @@ -91,8 +40,6 @@ AS_CASE([$ga_armci_network], [MPI_SPAWN], [ AC_DEFINE([DATA_SERVER], [1], [TODO]) AC_DEFINE([MPI_SPAWN], [1], [TODO]) - AS_IF([test x$ARMCI_SPAWN_CRAY_XT != x], - [AC_DEFINE([SPAWN_CRAY_XT], [1], [TODO])]) ], [OPENIB], [ AC_DEFINE([PTHREADS], [1], [TODO]) @@ -104,23 +51,9 @@ AS_CASE([$ga_armci_network], AC_DEFINE([PEND_BUFS], [1], [TODO]) AC_DEFINE([OPENIB], [1], [TODO]) ], -[PORTALS], [ - AC_DEFINE([DATA_SERVER], [1], [TODO]) - AC_DEFINE([_REENTRANT], [1], [TODO]) - AC_DEFINE([SERVER_THREAD], [1], [TODO]) - AC_DEFINE([CRAY_XT], [1], [TODO]) - AC_DEFINE([PORTALS], [1], [TODO]) - ], [SOCKETS], [ AC_DEFINE([DATA_SERVER], [1], [TODO]) AC_DEFINE([SOCKETS], [1], [TODO]) - ], -[VIA], [ - AC_DEFINE([PTHREADS], [1], [TODO]) - AC_DEFINE([DATA_SERVER], [1], [TODO]) - AC_DEFINE([SERVER_THREAD], [1], [TODO]) - AC_DEFINE([_REENTRANT], [1], [TODO]) - AC_DEFINE([VIA], [1], [TODO]) ] ) AS_IF([test x$REPORT_SHMMAX != x], diff --git a/m4/ax_pthread.m4 b/m4/ax_pthread.m4 index f1825b989..2fa2d68ab 100644 --- a/m4/ax_pthread.m4 +++ b/m4/ax_pthread.m4 @@ -120,7 +120,7 @@ fi # We must check for the threads library under a number of different # names; the ordering is very important because some systems -# (e.g. DEC) have both -lpthread and -lpthreads, where one of the +# have both -lpthread and -lpthreads, where one of the # libraries is broken (non-POSIX). # Create a list of thread flags to try. Items starting with a "-" are @@ -160,14 +160,6 @@ case $host_os in ax_pthread_flags="-kthread lthread $ax_pthread_flags" ;; - hpux*) - - # From the cc(1) man page: "[-mt] Sets various -D flags to enable - # multi-threading and also sets -lpthread." - - ax_pthread_flags="-mt -pthread pthread $ax_pthread_flags" - ;; - openedition*) # IBM z/OS requires a feature-test macro to be defined in order to @@ -208,7 +200,7 @@ AS_IF([test "x$GCC" = "xyes"], # correctly enabled case $host_os in - darwin* | hpux* | linux* | osf* | solaris*) + darwin* | linux* | osf* | solaris*) ax_pthread_check_macro="_REENTRANT" ;; diff --git a/m4/ga_64bit_flag.m4 b/m4/ga_64bit_flag.m4 index be113ce95..43a979b38 100644 --- a/m4/ga_64bit_flag.m4 +++ b/m4/ga_64bit_flag.m4 @@ -7,9 +7,6 @@ # Known flags: # -m64 GNU # -q64 IBM -# +DD64 HPUX -# +DA2.0W HPUX (obsolete form of +DD64) -# -64 SGI TFP, not sure, others might be -mips64, -align64? # AC_DEFUN([GA_64BIT_FLAG], [AC_CACHE_CHECK([for flag to indicate 64-bits], [ga_cv_64bit_flag], diff --git a/m4/ga_ar.m4 b/m4/ga_ar.m4 index 39689a62b..8726f8fd0 100644 --- a/m4/ga_ar.m4 +++ b/m4/ga_ar.m4 @@ -7,14 +7,9 @@ # # Known archivers: # ar - all known systems -# sxar - special to NEC/NEC64 # AC_DEFUN([GA_AR], [ AC_ARG_VAR([AR], [archiver used by libtool (default: ar)]) AC_ARG_VAR([AR_FLAGS], [archiver flags used by libtool (default: cru)]) AC_ARG_VAR([RANLIB], [generates index to archive (default: ranlib)]) -AS_IF([test "x$AR" = x], - [AS_CASE([$ga_cv_target], [NEC|NEC64], [AR=sxar])]) -AS_IF([test "x$RANLIB" = x], - [AS_CASE([$ga_cv_target], [NEC|NEC64], [RANLIB=true])]) ])dnl diff --git a/m4/ga_armci_network.m4 b/m4/ga_armci_network.m4 index d510d67c9..18a228e81 100644 --- a/m4/ga_armci_network.m4 +++ b/m4/ga_armci_network.m4 @@ -144,49 +144,6 @@ AS_IF([test "x$happy" = xyes], [armci_network_external=0; $2]) ])dnl -# _GA_ARMCI_NETWORK_CRAY_SHMEM([ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) -# ---------------------------------------------------------------------- -AC_DEFUN([_GA_ARMCI_NETWORK_CRAY_SHMEM], [ -AC_MSG_NOTICE([searching for CRAY_SHMEM...]) -happy=yes -AS_IF([test "x$happy" = xyes], - [AC_CHECK_HEADER([mpp/shmem.h], [], - [AC_CHECK_HEADER([shmem.h], [], [happy=no])])]) -AS_IF([test "x$happy" = xyes], - [AC_SEARCH_LIBS([shmem_init], [sma], [], [happy=no]) - AS_CASE([$ac_cv_search_shmem_init], - ["none required"], [], - [no], [], - [# add sma to ARMCI_NETWORK_LIBS if not there - AS_CASE([$ARMCI_NETWORK_LIBS], - [*sma*], [], - [ARMCI_NETWORK_LIBS="$ARMCI_NETWORK_LIBS -lsma"])])]) -AS_IF([test "x$happy" = xyes], - [ga_armci_network=CRAY_SHMEM; with_cray_shmem=yes; $1], - [$2]) -])dnl - -# _GA_ARMCI_NETWORK_LAPI([ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) -# ---------------------------------------------------------------- -AC_DEFUN([_GA_ARMCI_NETWORK_LAPI], [ -AC_MSG_NOTICE([searching for LAPI...]) -happy=yes -AS_IF([test "x$happy" = xyes], - [AC_CHECK_HEADER([lapi.h], [], [happy=no])]) -AS_IF([test "x$happy" = xyes], - [AC_SEARCH_LIBS([LAPI_Init], [lapi_r lapi], [], [happy=no]) - AS_CASE([$ac_cv_search_LAPI_Init], - ["none required"], [], - [no], [], - [# add missing lib to ARMCI_NETWORK_LIBS if not there - AS_CASE([$ARMCI_NETWORK_LIBS], - [*$ac_cv_search_LAPI_Init*], [], - [ARMCI_NETWORK_LIBS="$ARMCI_NETWORK_LIBS $ac_cv_search_LAPI_Init"])])]) -AS_IF([test "x$happy" = xyes], - [ga_armci_network=LAPI; with_lapi=yes; $1], - [$2]) -])dnl - # _GA_ARMCI_NETWORK_MPI_TS([ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) # --------------------------------------------------------------------- AC_DEFUN([_GA_ARMCI_NETWORK_MPI_TS], [ @@ -301,41 +258,6 @@ AS_IF([test "x$happy" = xyes], [$2]) ])dnl -# _GA_ARMCI_NETWORK_PORTALS([ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) -# ------------------------------------------------------------------- -AC_DEFUN([_GA_ARMCI_NETWORK_PORTALS], [ -AC_MSG_NOTICE([searching for PORTALS...]) -happy=yes -AS_IF([test "x$happy" = xyes], - [AC_CHECK_HEADER([portals/portals3.h], [], [happy=no])]) -AS_IF([test "x$happy" = xyes], - [AC_CHECK_HEADER([portals/nal.h], [], [happy=no])]) -AS_IF([test "x$happy" = xyes], - [AC_SEARCH_LIBS([PtlInit], [portals], [], [happy=no])]) -AS_IF([test "x$happy" = xyes], - [ga_armci_network=PORTALS; with_portals=yes; $1], - [$2]) -])dnl - -# _GA_ARMCI_NETWORK_DMAPP([ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) -# ----------------------------------------------------------------- -AC_DEFUN([_GA_ARMCI_NETWORK_DMAPP], [ -AC_MSG_NOTICE([searching for DMAPP...]) -happy=yes -AS_IF([test "x$happy" = xyes], - [ga_armci_network=DMAPP; with_dmapp=yes; $1], - [$2]) -AS_IF([test "x$happy" = xyes], - [AC_SEARCH_LIBS([gethugepagesize], [hugetlbfs]) - AS_CASE([$ac_cv_search_gethugepagesize], - ["none required"], [], - [no], [], - [# add missing lib to ARMCI_NETWORK_LIBS if not there - AS_CASE([$ARMCI_NETWORK_LIBS], - [*$ac_cv_search_gethugepagesize*], [], - [ARMCI_NETWORK_LIBS="$ARMCI_NETWORK_LIBS $ac_cv_search_gethugepagesize"])])]) -])dnl - # _GA_ARMCI_NETWORK_OFI([ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) # ------------------------------------------------------------------- AC_DEFUN([_GA_ARMCI_NETWORK_OFI], [ @@ -346,54 +268,6 @@ AS_IF([test "x$happy" = xyes], [$2]) ])dnl -# _GA_ARMCI_NETWORK_GEMINI([ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) -# ------------------------------------------------------------------ -# TODO when gemini headers and libraries become available, fix this -AC_DEFUN([_GA_ARMCI_NETWORK_GEMINI], [ -AC_MSG_NOTICE([searching for GEMINI...]) -happy=yes -AS_IF([test "x$happy" = xyes], - [AC_CHECK_HEADER([numatoolkit.h], [], [happy=no], [ -AC_INCLUDES_DEFAULT -#include ])]) -AS_IF([test "x$happy" = xyes], - [AC_SEARCH_LIBS([NTK_Init], [numatoolkit], [], [happy=no])]) -# CPPFLAGS must have CRAY_UGNI before looking for the next headers. -gemini_save_CPPFLAGS="$CPPFLAGS"; CPPFLAGS="$CPPFLAGS -DCRAY_UGNI" -AS_IF([test "x$happy" = xyes], - [AC_CHECK_HEADER([onesided.h], [], [happy=no])]) -AS_IF([test "x$happy" = xyes], - [AC_CHECK_HEADER([gni.h], [], [happy=no])]) -CPPFLAGS="$gemini_save_CPPFLAGS" -AS_IF([test "x$happy" = xyes], - [AC_SEARCH_LIBS([gniInit], [onesided], [], [happy=no])]) -AS_IF([test "x$happy" = xyes], - [ga_armci_network=GEMINI; with_gemini=yes; $1], - [$2]) -# check for a function introduced in libonesided/1.5 -# we purposefully abuse the ac_cv_search_onesided_mem_htflush value -AS_IF([test "x$happy" = xyes], - [AC_SEARCH_LIBS([onesided_mem_htflush], [onesided]) - AS_IF([test "x$ac_cv_search_onesided_mem_htflush" != xno], - [ac_cv_search_onesided_mem_htflush=1], - [ac_cv_search_onesided_mem_htflush=0]) - AC_DEFINE_UNQUOTED([HAVE_ONESIDED_MEM_HTFLUSH], - [$ac_cv_search_onesided_mem_htflush], - [set to 1 if libonesided has onesided_mem_htflush (added in v1.5)]) - ]) -# check for a function introduced in libonesided/1.6 -# we purposefully abuse the ac_cv_search_onesided_fadd value -AS_IF([test "x$happy" = xyes], - [AC_SEARCH_LIBS([onesided_fadd], [onesided]) - AS_IF([test "x$ac_cv_search_onesided_fadd" != xno], - [ac_cv_search_onesided_fadd=1], - [ac_cv_search_onesided_fadd=0]) - AC_DEFINE_UNQUOTED([HAVE_ONESIDED_FADD], - [$ac_cv_search_onesided_fadd], - [set to 1 if libonesided has onesided_fadd (added in v1.6)]) - ]) -])dnl - # GA_ARMCI_NETWORK # ---------------- # This macro allows user to choose the armci network but also allows the @@ -411,10 +285,6 @@ AC_ARG_ENABLE([autodetect], armci_network_external=0 armci_network_count=0 _GA_ARMCI_NETWORK_WITH([armci], [external; path to external ARMCI library]) -_GA_ARMCI_NETWORK_WITH([cray-shmem],[Cray XT shmem]) -_GA_ARMCI_NETWORK_WITH([dmapp], [(Comex) Cray DMAPP]) -_GA_ARMCI_NETWORK_WITH([gemini], [Cray XE Gemini using libonesided]) -_GA_ARMCI_NETWORK_WITH([lapi], [IBM LAPI]) _GA_ARMCI_NETWORK_WITH([mpi-mt], [(Comex) MPI-2 multi-threading]) _GA_ARMCI_NETWORK_WITH([mpi-pt], [(Comex) MPI-2 multi-threading with progress thread]) _GA_ARMCI_NETWORK_WITH([mpi-pr], [(Comex) MPI-1 two-sided with progress rank]) @@ -425,7 +295,6 @@ _GA_ARMCI_NETWORK_WITH([ofa], [(Comex) Infiniband OpenIB]) _GA_ARMCI_NETWORK_WITH([ofi], [(Comex) OFI]) _GA_ARMCI_NETWORK_WITH([openib], [Infiniband OpenIB]) _GA_ARMCI_NETWORK_WITH([portals4], [(Comex) Portals4]) -_GA_ARMCI_NETWORK_WITH([portals], [Cray XT portals]) _GA_ARMCI_NETWORK_WITH([sockets], [Ethernet TCP/IP]) # Temporarily add ARMCI_NETWORK_CPPFLAGS to CPPFLAGS. ga_save_CPPFLAGS="$CPPFLAGS"; CPPFLAGS="$CPPFLAGS $ARMCI_NETWORK_CPPFLAGS" @@ -435,10 +304,6 @@ ga_save_LDFLAGS="$LDFLAGS"; LDFLAGS="$LDFLAGS $ARMCI_NETWORK_LDFLAGS" ga_save_LIBS="$LIBS"; LIBS="$ARMCI_NETWORK_LIBS $LIBS" AS_IF([test "x$enable_autodetect" = xyes], [AC_MSG_NOTICE([searching for ARMCI_NETWORK...]) - AS_IF([test "x$ga_armci_network" = x && test "x$with_cray_shmem" != xno], - [_GA_ARMCI_NETWORK_CRAY_SHMEM()]) - AS_IF([test "x$ga_armci_network" = x && test "x$with_lapi" != xno], - [_GA_ARMCI_NETWORK_LAPI()]) dnl AS_IF([test "x$ga_armci_network" = x && test "x$with_mpi_ts" != xno], dnl [_GA_ARMCI_NETWORK_MPI_TS()]) dnl AS_IF([test "x$ga_armci_network" = x && test "x$with_mpi_mt" != xno], @@ -457,12 +322,6 @@ dnl [_GA_ARMCI_NETWORK_MPI_SPAWN()]) [_GA_ARMCI_NETWORK_OPENIB()]) AS_IF([test "x$ga_armci_network" = x && test "x$with_portals4" != xno], [_GA_ARMCI_NETWORK_PORTALS4()]) - AS_IF([test "x$ga_armci_network" = x && test "x$with_portals" != xno], - [_GA_ARMCI_NETWORK_PORTALS()]) - AS_IF([test "x$ga_armci_network" = x && test "x$with_dmapp" != xno], - [_GA_ARMCI_NETWORK_DMAPP()]) - AS_IF([test "x$ga_armci_network" = x && test "x$with_gemini" != xno], - [_GA_ARMCI_NETWORK_GEMINI()]) AS_IF([test "x$ga_armci_network" = x && test "x$with_armci" != xno], [_GA_ARMCI_NETWORK_ARMCI()]) AS_IF([test "x$ga_armci_network" = x && test "x$with_ofi" != xno], @@ -480,15 +339,6 @@ dnl [_GA_ARMCI_NETWORK_MPI_SPAWN()]) [1], [AS_IF([test "x$ga_armci_network" = xARMCI], [_GA_ARMCI_NETWORK_ARMCI([], [AC_MSG_ERROR([test for ARMCI_NETWORK=ARMCI failed])])]) - AS_IF([test "x$ga_armci_network" = xCRAY_SHMEM], - [_GA_ARMCI_NETWORK_CRAY_SHMEM([], - [AC_MSG_ERROR([test for ARMCI_NETWORK=CRAY_SHMEM failed])])]) - AS_IF([test "x$ga_armci_network" = xDMAPP], - [_GA_ARMCI_NETWORK_DMAPP([], - [AC_MSG_ERROR([test for ARMCI_NETWORK=DMAPP failed])])]) - AS_IF([test "x$ga_armci_network" = xLAPI], - [_GA_ARMCI_NETWORK_LAPI([], - [AC_MSG_ERROR([test for ARMCI_NETWORK=LAPI failed])])]) AS_IF([test "x$ga_armci_network" = xMPI_TS], [_GA_ARMCI_NETWORK_MPI_TS([], [AC_MSG_ERROR([test for ARMCI_NETWORK=MPI_TS failed])])]) @@ -516,12 +366,6 @@ dnl [_GA_ARMCI_NETWORK_MPI_SPAWN()]) AS_IF([test "x$ga_armci_network" = xPORTALS4], [_GA_ARMCI_NETWORK_PORTALS4([], [AC_MSG_ERROR([test for ARMCI_NETWORK=PORTALS4 failed])])]) - AS_IF([test "x$ga_armci_network" = xPORTALS], - [_GA_ARMCI_NETWORK_PORTALS([], - [AC_MSG_ERROR([test for ARMCI_NETWORK=PORTALS failed])])]) - AS_IF([test "x$ga_armci_network" = xGEMINI], - [_GA_ARMCI_NETWORK_GEMINI([], - [AC_MSG_ERROR([test for ARMCI_NETWORK=GEMINI failed])])]) AS_IF([test "x$ga_armci_network" = xOFI], [_GA_ARMCI_NETWORK_OFI([], [AC_MSG_ERROR([test for ARMCI_NETWORK=OFI failed])])]) @@ -529,9 +373,6 @@ dnl [_GA_ARMCI_NETWORK_MPI_SPAWN()]) [AC_MSG_WARN([too many armci networks specified: $armci_network_count]) AC_MSG_WARN([the following were specified:]) _GA_ARMCI_NETWORK_WARN([armci]) - _GA_ARMCI_NETWORK_WARN([cray-shmem]) - _GA_ARMCI_NETWORK_WARN([dmapp]) - _GA_ARMCI_NETWORK_WARN([lapi]) _GA_ARMCI_NETWORK_WARN([mpi-ts]) _GA_ARMCI_NETWORK_WARN([mpi-mt]) _GA_ARMCI_NETWORK_WARN([mpi-pt]) @@ -541,8 +382,6 @@ dnl [_GA_ARMCI_NETWORK_MPI_SPAWN()]) _GA_ARMCI_NETWORK_WARN([ofa]) _GA_ARMCI_NETWORK_WARN([openib]) _GA_ARMCI_NETWORK_WARN([portals4]) - _GA_ARMCI_NETWORK_WARN([portals]) - _GA_ARMCI_NETWORK_WARN([gemini]) _GA_ARMCI_NETWORK_WARN([ofi]) _GA_ARMCI_NETWORK_WARN([sockets]) AC_MSG_ERROR([please select only one armci network])])]) @@ -553,9 +392,6 @@ LDFLAGS="$ga_save_LDFLAGS" # Remove ARMCI_NETWORK_LIBS from LIBS. LIBS="$ga_save_LIBS" _GA_ARMCI_NETWORK_AM_CONDITIONAL([armci]) -_GA_ARMCI_NETWORK_AM_CONDITIONAL([cray-shmem]) -_GA_ARMCI_NETWORK_AM_CONDITIONAL([dmapp]) -_GA_ARMCI_NETWORK_AM_CONDITIONAL([lapi]) _GA_ARMCI_NETWORK_AM_CONDITIONAL([mpi-ts]) _GA_ARMCI_NETWORK_AM_CONDITIONAL([mpi-mt]) _GA_ARMCI_NETWORK_AM_CONDITIONAL([mpi-pt]) @@ -564,9 +400,7 @@ _GA_ARMCI_NETWORK_AM_CONDITIONAL([mpi-spawn]) _GA_ARMCI_NETWORK_AM_CONDITIONAL([mpi3]) _GA_ARMCI_NETWORK_AM_CONDITIONAL([ofa]) _GA_ARMCI_NETWORK_AM_CONDITIONAL([openib]) -_GA_ARMCI_NETWORK_AM_CONDITIONAL([gemini]) _GA_ARMCI_NETWORK_AM_CONDITIONAL([portals4]) -_GA_ARMCI_NETWORK_AM_CONDITIONAL([portals]) _GA_ARMCI_NETWORK_AM_CONDITIONAL([ofi]) _GA_ARMCI_NETWORK_AM_CONDITIONAL([sockets]) AC_SUBST([ARMCI_NETWORK_LDFLAGS]) @@ -575,8 +409,6 @@ AC_SUBST([ARMCI_NETWORK_CPPFLAGS]) # permanent hack AS_CASE([$ga_armci_network], -[DMAPP], [ARMCI_SRC_DIR=comex], -[GEMINI], [ARMCI_SRC_DIR=src-gemini], [MPI_MT], [ARMCI_SRC_DIR=comex], [MPI_PT], [ARMCI_SRC_DIR=comex], [MPI_PR], [ARMCI_SRC_DIR=comex], @@ -586,40 +418,20 @@ AS_CASE([$ga_armci_network], [OFI], [ARMCI_SRC_DIR=comex], [OPENIB], [ARMCI_SRC_DIR=src], [PORTALS4], [ARMCI_SRC_DIR=comex], -[PORTALS], [ARMCI_SRC_DIR=src-portals], [ARMCI_SRC_DIR=src]) AC_SUBST([ARMCI_SRC_DIR]) -AM_CONDITIONAL([ARMCI_SRC_DIR_PORTALS], [test "x$ARMCI_SRC_DIR" = "xsrc-portals"]) -AM_CONDITIONAL([ARMCI_SRC_DIR_GEMINI], [test "x$ARMCI_SRC_DIR" = "xsrc-gemini"]) AM_CONDITIONAL([ARMCI_SRC_DIR_COMEX], [test "x$ARMCI_SRC_DIR" = "xcomex"]) AM_CONDITIONAL([ARMCI_SRC_DIR_SRC], [test "x$ARMCI_SRC_DIR" = "xsrc"]) AS_IF([test "x$ARMCI_SRC_DIR" = "xcomex"], [armci_network_external=1]) AM_CONDITIONAL([ARMCI_NETWORK_EXTERNAL], [test "x$armci_network_external" = x1]) AM_CONDITIONAL([ARMCI_NETWORK_COMEX], [test "x$ARMCI_SRC_DIR" = "xcomex"]) -# tcgmsg5 requires this -AS_IF([test x$ga_armci_network = xLAPI], -[AC_DEFINE([NOTIFY_SENDER], [1], - [this was defined unconditionally when using LAPI for tcgmsg 5]) -AC_DEFINE([LAPI], [1], [tcgmsg 5 requires this when using LAPI]) -]) - -ga_cray_xt_networks=no -AS_IF([test x$ga_armci_network = xPORTALS], [ga_cray_xt_networks=yes]) -AS_IF([test x$ga_armci_network = xCRAY_SHMEM], [ga_cray_xt_networks=yes]) -AM_CONDITIONAL([CRAY_XT_NETWORKS], [test x$ga_cray_xt_networks = xyes]) - ga_cv_sysv_hack=no # Only perform this hack for ARMCI build. AS_IF([test "x$ARMCI_TOP_BUILDDIR" != x], [ - AS_IF([test x$ga_cv_sysv = xno], - [AS_CASE([$ga_armci_network], - [PORTALS|GEMINI], [ga_cv_sysv_hack=no], - [ga_cv_sysv_hack=yes])], - [ga_cv_sysv_hack=yes]) -AS_IF([test x$ga_cv_sysv_hack = xyes], - [AC_DEFINE([SYSV], [1], - [Defined if we want this system to use SYSV shared memory])]) + ga_cv_sysv_hack=yes + AC_DEFINE([SYSV], [1], + [Defined if we want this system to use SYSV shared memory]) ]) AM_CONDITIONAL([SYSV], [test x$ga_cv_sysv_hack = xyes]) @@ -639,19 +451,16 @@ AM_CONDITIONAL([HAVE_ARMCI_MSG_INIT], [test "x$ga_armci_network" != xARMCI]) AM_CONDITIONAL([HAVE_ARMCI_MSG_FINALIZE],[test "x$ga_armci_network" != xARMCI]) # the armci iterators only available in the conglomerate sources AS_CASE([$ga_armci_network], - [ARMCI|GEMINI|PORTALS], [], + [ARMCI], [], [AC_DEFINE([HAVE_ARMCI_STRIDE_INFO_INIT], [1], [])]) AM_CONDITIONAL([HAVE_ARMCI_STRIDE_INFO_INIT], - [test "x$ga_armci_network" != xARMCI && test "x$ga_armci_network" != xGEMINI && test "x$ga_armci_network" != xPORTALS]) + [test "x$ga_armci_network" != xARMCI]) # ugly hack for working around NWChem memory requirements # and MPI_PR startup verus the 'classic' ARMCI startup delay_tcgmsg_mpi_startup=1 AS_CASE([$ga_armci_network], [ARMCI], [delay_tcgmsg_mpi_startup=0], -[CRAY_SHMEM], [delay_tcgmsg_mpi_startup=1], -[DMAPP], [delay_tcgmsg_mpi_startup=0], -[LAPI], [delay_tcgmsg_mpi_startup=1], [MPI_TS], [delay_tcgmsg_mpi_startup=0], [MPI_MT], [delay_tcgmsg_mpi_startup=0], [MPI_PT], [delay_tcgmsg_mpi_startup=0], @@ -661,9 +470,7 @@ AS_CASE([$ga_armci_network], [OFA], [delay_tcgmsg_mpi_startup=0], [OFI], [delay_tcgmsg_mpi_startup=0], [OPENIB], [delay_tcgmsg_mpi_startup=1], -[GEMINI], [delay_tcgmsg_mpi_startup=1], [PORTALS4], [delay_tcgmsg_mpi_startup=0], -[PORTALS], [delay_tcgmsg_mpi_startup=1], [SOCKETS], [delay_tcgmsg_mpi_startup=1]) AC_DEFINE_UNQUOTED([NEED_DELAY_TCGMSG_MPI_STARTUP], [$delay_tcgmsg_mpi_startup], diff --git a/m4/ga_as.m4 b/m4/ga_as.m4 deleted file mode 100644 index ae26a73db..000000000 --- a/m4/ga_as.m4 +++ /dev/null @@ -1,15 +0,0 @@ -# GA_AS -# ----- -# Certain systems may require specific assemblers (instead of $CC). -# -# Known assemblers: -# sxas - special to NEC/NEC64 -# -AC_DEFUN([GA_AS], [ -AS_IF([test "x$CCAS" = x], - [AS_CASE([$ga_cv_target], [NEC|NEC64], [CCAS=sxas])]) -AS_IF([test "x$CCASFLAGS" = x], - [AS_CASE([$ga_cv_target], - [NEC], [CCASFLAGS=], - [NEC64], [CCASFLAGS="-h size_t64"])]) -])dnl diff --git a/m4/ga_blas.m4 b/m4/ga_blas.m4 index b5ac1dd71..55f68eb88 100644 --- a/m4/ga_blas.m4 +++ b/m4/ga_blas.m4 @@ -99,7 +99,7 @@ AS_IF([test "x$enable_f77" = xno], # many flavors of BLAS that we test for explicitly, although the list could # probably be reduced based on currently available systems. # -# Apparently certain compilers on BGP define sgemm and dgemm, so we must +# Apparently certain compilers define sgemm and dgemm, so we must # test for a different BLAS routine. cgemm seems okay. AC_DEFUN([GA_BLAS], [AC_REQUIRE([AC_F77_LIBRARY_LDFLAGS]) @@ -268,26 +268,6 @@ AS_IF([test $ga_blas_ok = no], LIBS="$ga_save_LIBS"]) AC_MSG_RESULT([$ga_blas_ok])]) -# SCSL library (SCSL stands for SGI/Cray Scientific Library) -AS_IF([test $ga_blas_ok = no], - [AC_MSG_CHECKING([for BLAS in SGI/Cray Scientific Library]) - # add -lscs to BLAS_LIBS if missing from LIBS - AS_CASE([$LIBS], [*scs*], [], [BLAS_LIBS="-lscs"]) - LIBS="$BLAS_LIBS $LIBS" - GA_RUN_BLAS_TEST() - LIBS="$ga_save_LIBS" - AC_MSG_RESULT([$ga_blas_ok])]) - -# SGIMATH library -AS_IF([test $ga_blas_ok = no], - [AC_MSG_CHECKING([for BLAS in SGIMATH library]) - # add -lcomplib.sgimath to BLAS_LIBS if missing from LIBS - AS_CASE([$LIBS], [*complib.sgimath*], [], [BLAS_LIBS="-lcomplib.sgimath"]) - LIBS="$BLAS_LIBS $LIBS" - GA_RUN_BLAS_TEST() - LIBS="$ga_save_LIBS" - AC_MSG_RESULT([$ga_blas_ok])]) - # IBM ESSL library (might require generic BLAS lib, too) AS_IF([test $ga_blas_ok = no], [AC_MSG_CHECKING([for BLAS in IBM ESSL library]) diff --git a/m4/ga_c_opt.m4 b/m4/ga_c_opt.m4 index 548a848f1..891b77407 100644 --- a/m4/ga_c_opt.m4 +++ b/m4/ga_c_opt.m4 @@ -9,31 +9,16 @@ AC_CACHE_CHECK([for specific C optimizations], [ga_cv_c_opt], [ AS_IF([test "x$GA_COPT" != x], [ga_cv_c_opt="$GA_COPT"], [ga_cv_c_opt=]) AS_IF([test "x$ga_cv_c_opt" = x && test "x$enable_opt" = xyes], [ AS_CASE([$ga_cv_target:$ga_cv_c_compiler_vendor:$host_cpu], -[BGL:*:*], [ga_cv_c_opt="-O0"], -[BGP:ibm:*], [ga_cv_c_opt="-O3 -qstrict -qarch=450 -qtune=450"], -[BGP:gnu:*], [ga_cv_c_opt="-O2"], -[CATAMOUNT:*:*], [ga_cv_c_opt=], -[CRAY_XT:*:*], [ga_cv_c_opt=], [CYGWIN:*:*], [ga_cv_c_opt=], -[FUJITSU_VPP64:*:*], [ga_cv_c_opt=], -[FUJITSU_VPP:*:*], [ga_cv_c_opt="-KA32"], -[HPUX64:*:*], [ga_cv_c_opt="-Ae"], -[HPUX64:*:ia64], [ga_cv_c_opt="-Ae"], -[HPUX:*:*], [ga_cv_c_opt="-Ae"], [IBM64:*:*], [ga_cv_c_opt=], [IBM:*:*], [ga_cv_c_opt=], -[LAPI64:*:*], [ga_cv_c_opt=], -[LAPI:*:*], [ga_cv_c_opt=], -[LINUX64:fujitsu:ia64], [ga_cv_c_opt="-Kfast"], [LINUX64:fujitsu:x86_64], [ga_cv_c_opt="-Kfast"], -[LINUX64:gnu:ia64], [ga_cv_c_opt="-O3 -funroll-loops"], [LINUX64:gnu:powerpc64], [ga_cv_c_opt="-funroll-loops"], [LINUX64:gnu:ppc64], [ga_cv_c_opt="-funroll-loops"], [LINUX64:gnu:x86_64], [ga_cv_c_opt="-O2 -funroll-loops"], [LINUX64:ibm:powerpc64], [ga_cv_c_opt="-qinline=100 -qstrict -qarch=auto -qtune=auto"], [LINUX64:ibm:ppc64], [ga_cv_c_opt="-qinline=100 -qstrict -qarch=auto -qtune=auto"], [LINUX64:ibm:x86_64], [ga_cv_c_opt=], -[LINUX64:intel:ia64], [ga_cv_c_opt="-fno-alias -ftz"], [LINUX:fujitsu:*], [ga_cv_c_opt="-Kfast"], [LINUX:gnu:786], [ga_cv_c_opt="-O2 -funroll-loops -malign-double"], [LINUX:gnu:*], [ga_cv_c_opt="-O2 -funroll-loops"], @@ -42,14 +27,6 @@ AS_CASE([$ga_cv_target:$ga_cv_c_compiler_vendor:$host_cpu], [LINUX:intel:*], [ga_cv_c_opt="-O3 -prefetch"], [MACX64:*:*], [ga_cv_c_opt=], [MACX:*:*], [ga_cv_c_opt=], -[NEC64:*:*], [ga_cv_c_opt="-Cvsafe -size_t64"], -[NEC:*:*], [ga_cv_c_opt="-Cvsafe"], -[SOLARIS64:fujitsu:*], [ga_cv_c_opt="-Kfast -KV9FMADD"], -[SOLARIS64:gnu:*], [ga_cv_c_opt="-dalign -xarch=v9"], -[SOLARIS64:gnu:i386], [ga_cv_c_opt="-dalign -xarch=amd64"], -[SOLARIS:fujitsu:*], [ga_cv_c_opt="-Kfast -KV8PFMADD"], -[SOLARIS:gnu:*], [ga_cv_c_opt="-dalign"], -[SOLARIS:gnu:i386], [ga_cv_c_opt="-dalign -xarch=sse2"], [ga_cv_c_opt=]) ])]) AC_SUBST([GA_COPT], [$ga_cv_c_opt]) diff --git a/m4/ga_compiler_vendor.m4 b/m4/ga_compiler_vendor.m4 index e65b4915c..3f374e6a0 100644 --- a/m4/ga_compiler_vendor.m4 +++ b/m4/ga_compiler_vendor.m4 @@ -12,14 +12,12 @@ AC_LANG_CASE([Fortran], [ac_ext=F]) AC_LANG_CASE([Fortran 77], [ac_ext=F]) ga_cv_compiler_vendor=unknown ga_cpp_vendor_symbols= -for vendor in intel ibm pathscale amd cray gnu sun hp dec borland comeau kai lcc metrowerks sgi microsoft watcom portland fujitsu +for vendor in intel ibm pathscale amd gnu sun hp borland comeau kai lcc metrowerks microsoft watcom portland fujitsu do AS_CASE([$vendor], [amd], [ga_cpp_vendor_symbols="defined(__OPEN64__)"], [borland], [ga_cpp_vendor_symbols="defined(__BORLANDC__) || defined(__TURBOC__)"], [comeau], [ga_cpp_vendor_symbols="defined(__COMO__)"], -[cray], [ga_cpp_vendor_symbols="defined(_CRAYC) || defined(_ADDR64)"], -[dec], [ga_cpp_vendor_symbols="defined(__DECC) || defined(__DECCXX) || defined(__DECC_VER) || defined(__DECCXX_VER)"], [fujitsu], [ga_cpp_vendor_symbols="defined(__fcc__) || defined(__fcc_version__) || defined(_FCC_VER) || defined(__FCC_VER_)"], [gnu], [ga_cpp_vendor_symbols="defined(__GNUC__)"], [hp], [ga_cpp_vendor_symbols="defined(__HP_cc) || defined(__HP_aCC)"], @@ -31,7 +29,6 @@ AS_CASE([$vendor], [microsoft], [ga_cpp_vendor_symbols="defined(_MSC_VER)"], [pathscale], [ga_cpp_vendor_symbols="defined(__PATHCC__) || defined(__PATHSCALE__)"], [portland], [ga_cpp_vendor_symbols="defined(__PGI)"], -[sgi], [ga_cpp_vendor_symbols="defined(__sgi) || defined(sgi)"], [sun], [ga_cpp_vendor_symbols="defined(__SUNPRO_C) || defined(__SUNPRO_CC)"], [watcom], [ga_cpp_vendor_symbols="defined(__WATCOMC__)"]) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],[ diff --git a/m4/ga_cross_compiling.m4 b/m4/ga_cross_compiling.m4 index c2c11e789..c64f243d2 100644 --- a/m4/ga_cross_compiling.m4 +++ b/m4/ga_cross_compiling.m4 @@ -8,7 +8,6 @@ AC_DEFUN([GA_CROSS_COMPILING], [ AC_REQUIRE([GA_TARGET]) AC_CACHE_CHECK([whether we are cross compiling], [ga_cv_cross_compiling], - [AS_IF([test "x$ga_cv_target_base" = xBGP], [cross_compiling=yes]) - ga_cv_cross_compiling=$cross_compiling]) + [ga_cv_cross_compiling=$cross_compiling]) AM_CONDITIONAL([CROSS_COMPILING], [test "x$cross_compiling" = xyes]) ])dnl diff --git a/m4/ga_cxx_opt.m4 b/m4/ga_cxx_opt.m4 index ca44dd1ff..01749db7d 100644 --- a/m4/ga_cxx_opt.m4 +++ b/m4/ga_cxx_opt.m4 @@ -10,8 +10,6 @@ AS_IF([test "x$GA_CXXOPT" != x], [ga_cv_cxx_opt="$GA_CXXOPT"], [ga_cv_cxx_opt=]) AS_IF([test "x$ga_cv_cxx_opt" = x && test "x$enable_opt" = xyes], [ AS_CASE([$ga_cv_target:$ga_cv_cxx_compiler_vendor:$host_cpu], [LINUX:*:*], [ga_cv_cxx_opt="-O0"], -[NEC64:*:*], [ga_cv_cxx_opt="-Cvsafe -size_t64"], -[NEC:*:*], [ga_cv_cxx_opt="-Cvsafe"], [ga_cv_cxx_opt=]) ])]) AC_SUBST([GA_CXXOPT], [$ga_cv_cxx_opt]) diff --git a/m4/ga_f77_fixed.m4 b/m4/ga_f77_fixed.m4 index ce0cea9ce..9f18f3698 100644 --- a/m4/ga_f77_fixed.m4 +++ b/m4/ga_f77_fixed.m4 @@ -8,10 +8,8 @@ # -fixed: Intel compiler (ifort), Sun compiler (f95) # -qfixed: IBM compiler (xlf*) # -Mfixed: Portland Group compiler -# -fixedform: SGI compiler # -f fixed: Absoft Fortran # +source=fixed: HP Fortran -# -fix: Lahey/Fujitsu Fortran # AC_DEFUN([GA_F77_FIXED], [ AC_CACHE_CHECK([whether $F77 needs a flag to compile fixed format source], diff --git a/m4/ga_f77_opt.m4 b/m4/ga_f77_opt.m4 index 568f0e371..5896011f0 100644 --- a/m4/ga_f77_opt.m4 +++ b/m4/ga_f77_opt.m4 @@ -9,27 +9,13 @@ AC_CACHE_CHECK([for specific Fortran optimizations], [ga_cv_f77_opt], [ AS_IF([test "x$GA_FOPT" != x], [ga_cv_f77_opt="$GA_FOPT"], [ga_cv_f77_opt=]) AS_IF([test "x$ga_cv_f77_opt" = x && test "x$enable_opt" = xyes], [ AS_CASE([$ga_cv_target:$ga_cv_f77_compiler_vendor:$host_cpu], -[BGL:*:*], [ga_cv_f77_opt="-O0"], -[BGP:ibm:*], [ga_cv_f77_opt="-O3 -qstrict -qarch=450 -qtune=450"], -[BGP:gnu:*], [ga_cv_f77_opt="-O2"], -[CATAMOUNT:*:*], [ga_cv_f77_opt="-O3"], -[CRAY_XT:*:*], [ga_cv_f77_opt=], [CYGWIN:*:*], [ga_cv_f77_opt=], -[FUJITSU_VPP64:*:*], [ga_cv_f77_opt="-Sw"], -[FUJITSU_VPP:*:*], [ga_cv_f77_opt="-Sw -KA32"], -[HPUX64:*:*], [ga_cv_f77_opt="-O1"], -[HPUX64:*:ia64], [ga_cv_f77_opt="-O1"], -[HPUX:*:*], [ga_cv_f77_opt="-O1"], [IBM64:*:*], [ga_cv_f77_opt="-qarch=auto"], [IBM:*:*], [ga_cv_f77_opt="-qarch=auto"], -[LAPI64:*:*], [ga_cv_f77_opt="-qarch=auto"], -[LAPI:*:*], [ga_cv_f77_opt="-qarch=auto"], [LINUX64:*:alpha], [ga_cv_f77_opt="-align_dcommons -fpe3 -check nooverflow -assume accuracy_sensitive -check nopower -check nounderflow"], -[LINUX64:fujitsu:ia64], [ga_cv_f77_opt="-Kfast -X9 -Am -fw"], [LINUX64:fujitsu:x86_64], [ga_cv_f77_opt="-Kfast -X9 -Am -fw"], [LINUX64:gnu:x86_64], [ga_cv_f77_opt="-O"], [LINUX64:ibm:x86_64], [ga_cv_f77_opt=], -[LINUX64:intel:ia64], [ga_cv_f77_opt="-cm -w90 -w95 -align"], [LINUX64:intel:powerpc64], [ga_cv_f77_opt=], [LINUX64:intel:ppc64], [ga_cv_f77_opt=], [LINUX64:intel:x86_64], [ga_cv_f77_opt="-O3 -w -cm -xW -tpp7"], @@ -45,14 +31,6 @@ AS_CASE([$ga_cv_target:$ga_cv_f77_compiler_vendor:$host_cpu], [MACX64:intel:*], [ga_cv_f77_opt="-O3 -prefetch -w -cm"], [MACX:gnu:*], [ga_cv_f77_opt="-O3 -funroll-loops"], [MACX:intel:*], [ga_cv_f77_opt="-O3 -prefetch -w -cm"], -[NEC64:*:*], [ga_cv_f77_opt="-Cvsafe -size_t64"], -[NEC:*:*], [ga_cv_f77_opt="-Cvsafe"], -[SOLARIS64:fujitsu:*], [ga_cv_f77_opt="-fw -Kfast -KV9FMADD"], -[SOLARIS64:gnu:*], [ga_cv_f77_opt="-xs -dalign -xarch=v9"], -[SOLARIS64:gnu:i386], [ga_cv_f77_opt="-xs -dalign -xarch=amd64"], -[SOLARIS:fujitsu:*], [ga_cv_f77_opt="-fw -Kfast -KV8PFMADD"], -[SOLARIS:gnu:*], [ga_cv_f77_opt="-xs -dalign"], -[SOLARIS:gnu:i386], [ga_cv_f77_opt="-xs -dalign -xarch=sse2"], [ga_cv_f77_opt=]) ])]) AC_SUBST([GA_FOPT], [$ga_cv_f77_opt]) diff --git a/m4/ga_f77_underscore.m4 b/m4/ga_f77_underscore.m4 index d0b297fac..b73670728 100644 --- a/m4/ga_f77_underscore.m4 +++ b/m4/ga_f77_underscore.m4 @@ -10,7 +10,6 @@ # -funderscoring GNU # -fno-second-underscore GNU # -f absoft compiler (OSX?) -# +ppu HPUX some compiler? # AC_DEFUN([GA_F77_UNDERSCORE], [AC_CACHE_CHECK([for $F77 flag to add single underscore to external names], diff --git a/m4/ga_mpi_unwrap.m4 b/m4/ga_mpi_unwrap.m4 index 7daf8a19f..247c244c3 100644 --- a/m4/ga_mpi_unwrap.m4 +++ b/m4/ga_mpi_unwrap.m4 @@ -76,7 +76,7 @@ AC_LANG_CASE( ], [C++], [AS_CASE([$wrapped], [*_r], [compilers="bgxlC_r xlC_r"], - [*], [compilers="icpc pgCC pathCC sxc++ xlC bgxlC openCC sunCC crayc++ g++ c++ gpp aCC cxx cc++ cl.exe FCC KCC RCC CC"]) + [*], [compilers="icpc pgCC pathCC sxc++ xlC bgxlC openCC sunCC craycxx g++ c++ gpp aCC cxx cc++ cl.exe FCC KCC RCC CC"]) ], [Fortran 77], [AS_CASE([$wrapped], [*_r], [compilers="bgxlf95_r xlf95_r bgxlf90_r xlf90_r bgxlf_r xlf_r"], diff --git a/m4/ga_mpicc.m4 b/m4/ga_mpicc.m4 index 6b79b39c5..0cf4f9eeb 100644 --- a/m4/ga_mpicc.m4 +++ b/m4/ga_mpicc.m4 @@ -4,18 +4,13 @@ # # Known C compilers # cc generic compiler name -# ccc Fujitsu ?? old Cray ?? # cl -# ecc Intel on IA64 ?? # gcc GNU # icc Intel -# bgxlc Intel on BG/P -# bgxlc_r Intel on BG/P, thread safe # xlc Intel # xlc_r Intel, thread safe # pgcc Portland Group # pathcc PathScale -# sxcc NEC SX # fcc Fujitsu # opencc AMD's x86 open64 # suncc Sun's Studio @@ -28,7 +23,6 @@ # hcc # mpxlc_r # mpxlc -# sxmpicc NEC SX # mpifcc Fujitsu # mpgcc # mpcc @@ -37,9 +31,6 @@ # AC_DEFUN([GA_PROG_MPICC], [AC_ARG_VAR([MPICC], [MPI C compiler]) -AS_CASE([$ga_cv_target_base], -[BGP], [ga_mpicc_pref=mpixlc_r; ga_cc_pref=bgxlc_r], -[]) # In the case of using MPI wrappers, set CC=MPICC since CC will override # absolutely everything in our list of compilers. # Save CC, just in case. @@ -62,7 +53,7 @@ AS_IF([test x$with_mpi_wrappers = xyes], ga_cv_mpic_naked="$CC" CC="$MPICC"], [AC_MSG_ERROR([CC/MPICC case failure])])]) -ga_cc="bgxlc_r bgxlc xlc_r xlc pgcc pathcc icc sxcc fcc opencc suncc craycc gcc cc ecc cl ccc" +ga_cc="xlc_r xlc pgcc pathcc icc sxcc fcc opencc suncc craycc gcc cc ecc cl ccc" ga_mpicc="mpicc mpixlc_r mpixlc hcc mpxlc_r mpxlc sxmpicc mpifcc mpgcc mpcc cmpicc cc" AS_IF([test x$with_mpi_wrappers = xyes], [CC_TO_TEST="$ga_mpicc_pref $ga_mpicc"], diff --git a/m4/ga_mpicxx.m4 b/m4/ga_mpicxx.m4 index 86d5f708c..8cd47824b 100644 --- a/m4/ga_mpicxx.m4 +++ b/m4/ga_mpicxx.m4 @@ -15,22 +15,18 @@ # icpc Intel C++ compiler # KCC KAI C++ compiler # RCC Rational C++ -# bgxlC Intel -# bgxlC_r Intel, thread safe # xlC AIX C Set++ # xlC_r AIX C Set++, thread safe # pgCC Portland Group # pathCC PathScale -# sxc++ NEC SX # openCC AMD's x86 open64 # sunCC Sun's Studio -# crayc++ Cray +# craycxx Cray # # Known MPI C++ compilers # mpic++ # mpicxx # mpiCC -# sxmpic++ NEC SX # hcp # mpxlC_r # mpxlC @@ -45,9 +41,6 @@ # AC_DEFUN([GA_PROG_MPICXX], [AC_ARG_VAR([MPICXX], [MPI C++ compiler]) -AS_CASE([$ga_cv_target_base], -[BGP], [ga_mpicxx_pref=mpixlcxx_r; ga_cxx_pref=bgxlC_r], -[]) # In the case of using MPI wrappers, set CXX=MPICXX since CXX will override # absolutely everything in our list of compilers. AS_IF([test x$with_mpi_wrappers = xyes], @@ -69,7 +62,7 @@ AS_IF([test x$with_mpi_wrappers = xyes], ga_cv_mpicxx_naked="$CXX" CXX="$MPICXX"], [AC_MSG_ERROR([CXX/MPICXX case failure])])]) -ga_cxx="icpc pgCC pathCC sxc++ xlC_r xlC bgxlC_r bgxlC openCC sunCC crayc++ g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC" +ga_cxx="icpc pgCC pathCC sxc++ xlC_r xlC openCC sunCC g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC" ga_mpicxx="mpic++ mpicxx mpiCC sxmpic++ hcp mpxlC_r mpxlC mpixlcxx_r mpixlcxx mpg++ mpc++ mpCC cmpic++ mpiFCC CC" AS_IF([test x$with_mpi_wrappers = xyes], [CXX_TO_TEST="$ga_mpicxx_pref $ga_mpicxx"], diff --git a/m4/ga_mpif77.m4 b/m4/ga_mpif77.m4 index fdf19e1b5..42817fa34 100644 --- a/m4/ga_mpif77.m4 +++ b/m4/ga_mpif77.m4 @@ -9,17 +9,13 @@ # preferred above all. # # Known Fortran 95 compilers: -# bgxlf95 IBM BlueGene/P F95 cross-compiler -# blrts_xlf95 IBM BlueGene/L F95 cross-compiler -# efc Intel Fortran 95 compiler for IA64 # f95 generic compiler name # fort Compaq/HP Fortran 90/95 compiler for Tru64 and Linux/Alpha -# ftn native Fortran 95 compiler on Cray X1,XT4,XT5 +# ftn native Fortran 95 compiler on Cray # g95 original gcc-based f95 compiler (gfortran is a fork) # gfortran GNU Fortran 95+ compiler (released in gcc 4.0) # ifc Intel Fortran 95 compiler for Linux/x86 (now ifort) # ifort Intel Fortran 95 compiler for Linux/x86 (was ifc) -# lf95 Lahey-Fujitsu F95 compiler # pghpf/pgf95 Portland Group F95 compiler # xlf95 IBM (AIX) F95 compiler # pathf95 PathScale @@ -29,40 +25,30 @@ # # Known MPI Fortran 95 compilers: # cmpifc ?? not sure if this is even F95 -# ftn native Fortran 95 compiler on Cray XT4,XT5 +# ftn native Fortran 95 compiler on Cray # mpif95 generic compiler name -# mpixlf95 IBM BlueGene/P Fortran 95 -# mpixlf95_r IBM BlueGene/P Fortran 95, reentrant code -# mpxlf95 IBM BlueGene/L Fortran 95 -# mpxlf95_r IBM BlueGene/L Fortran 95, reentrant code +# mpixlf95 IBM Blue Gene Fortran 95 +# mpixlf95_r IBM Blue Gene Fortran 95, reentrant code # # Known Fortran 90 compilers: -# blrts_xlf90 IBM BlueGene/L F90 cross-compiler # epcf90 "Edinburgh Portable Compiler" F90 # f90 generic compiler name # fort Compaq/HP Fortran 90/95 compiler for Tru64 and Linux/Alpha # pgf90 Portland Group F90 compiler # xlf90 IBM (AIX) F90 compiler # pathf90 PathScale -# sxf90 NEC SX Fortran 90 # openf90 AMD's x86 open64 # sunf90 Sun's Studio # # Known MPI Fortran 90 compilers: -# cmpif90c ?? -# mpf90 ?? +# mpifrt Fujitsu # mpif90 generic compiler name -# mpxlf90 IBM BlueGene/L Fortran 90 -# mpxlf90_r IBM BlueGene/L Fortran 90, reentrant code -# sxmpif90 NEC SX Fortran 90 # # Known Fortran 77 compilers: # af77 Apogee F77 compiler for Intergraph hardware running CLIX -# blrts_xlf IBM BlueGene/L F77 cross-compiler -# cf77 native F77 compiler under older Crays (prefer over fort77) # f77 generic compiler names # fl32 Microsoft Fortran 77 "PowerStation" compiler -# fort77 native F77 compiler under HP-UX (and some older Crays) +# fort77 native F77 compiler on older UNIX systems # frt Fujitsu F77 compiler # g77 GNU Fortran 77 compiler # pgf77 Portland Group F77 compiler @@ -70,19 +56,11 @@ # pathf77 PathScale # # Known MPI Fortran 77 compilers: -# hf77 ?? -# mpf77 ?? # mpif77 generic compiler name -# mpxlf IBM BlueGene/L Fortran 77 -# mpxlf_r IBM BlueGene/L Fortran 77, reentrant code # mpifrt Fujitsu # AC_DEFUN([GA_PROG_MPIF77], [AC_ARG_VAR([MPIF77], [MPI Fortran 77 compiler]) -AS_CASE([$ga_cv_target_base], -[BGP], [ga_mpif77_pref=mpixlf77_r;ga_f77_pref=bgxlf_r], -[BGL], [ga_mpif77_pref=mpxlf95; ga_f77_pref=blrts_xlf95], -[]) # If FC is set, override F77. Similarly for MPIFC/MPIF77 and FCFLAGS/FFLAGS. AS_IF([test "x$FC" != x], [F77="$FC"]) AS_IF([test "x$MPIFC" != x], [MPIF77="$MPIFC"]) @@ -109,9 +87,9 @@ AS_IF([test x$with_mpi_wrappers = xyes], ga_cv_mpif77_naked="$F77" F77="$MPIF77"], [AC_MSG_ERROR([F77/MPIF77 case failure])])]) -ga_mpif95="mpif95 mpxlf95_r mpxlf95 ftn" -ga_mpif90="mpif90 mpxlf90_r mpxlf90 mpf90 cmpif90c sxmpif90" -ga_mpif77="mpif77 hf77 mpxlf_r mpxlf mpifrt mpf77 cmpifc" +ga_mpif95="mpif95 mpixlf95_r mpixlf95 ftn" +ga_mpif90="mpif90 mpixlf90_r mpixlf90 mpf90 cmpif90c sxmpif90" +ga_mpif77="mpif77 hf77 mpixlf_r mpixlf mpifrt mpf77 cmpifc" ga_f95="xlf95 pgf95 pathf95 ifort g95 f95 fort ifc efc openf95 sunf95 crayftn gfortran lf95 ftn" ga_f90="xlf90 f90 pgf90 pghpf pathf90 epcf90 sxf90 openf90 sunf90" ga_f77="xlf f77 frt pgf77 pathf77 g77 cf77 fort77 fl32 af77" diff --git a/m4/ga_msg_comms.m4 b/m4/ga_msg_comms.m4 index 93e5eb6d2..7fdd20820 100644 --- a/m4/ga_msg_comms.m4 +++ b/m4/ga_msg_comms.m4 @@ -19,31 +19,14 @@ AC_ARG_WITH([mpi], with_mpi_need_parse=no AS_CASE([$with_mpi], [yes], [with_mpi_wrappers=yes; ga_msg_comms=MPI], - [no], [AS_CASE([$ga_cv_target_base], - [MACX|LAPI|CYGNUS|CYGWIN|INTERIX], [ga_msg_comms=TCGMSG5], - [ga_msg_comms=TCGMSG])], [*], [with_mpi_need_parse=yes; ga_msg_comms=MPI]) dnl postpone parsing with_mpi until we know sizeof(void*) dnl AS_IF([test x$with_mpi_need_parse = xyes], dnl [GA_ARG_PARSE([with_mpi], [GA_MP_LIBS], [GA_MP_LDFLAGS], [GA_MP_CPPFLAGS])]) AM_CONDITIONAL([MSG_COMMS_MPI], [test "x$ga_msg_comms" = xMPI]) -AM_CONDITIONAL([MSG_COMMS_TCGMSG4], [test "x$ga_msg_comms" = xTCGMSG]) -AM_CONDITIONAL([MSG_COMMS_TCGMSG5], [test "x$ga_msg_comms" = xTCGMSG5]) AS_CASE([$ga_msg_comms], [MPI], [AC_DEFINE([MSG_COMMS_MPI], [1], - [Use MPI for messaging])], - [TCGMSG], [AC_DEFINE([MSG_COMMS_TCGMSG4], [1], - [Use TCGMSG (ipcv4.0) for messaging]) - AC_DEFINE([MSG_COMMS_TCGMSG], [1], - [Use TCGMSG for messaging]) - AC_DEFINE([TCGMSG], [1], - [deprecated, use MSG_COMMS_TCGMSG])], - [TCGMSG5], [AC_DEFINE([MSG_COMMS_TCGMSG5], [1], - [Use TCGMSG (ipcv5.0) for messaing]) - AC_DEFINE([MSG_COMMS_TCGMSG], [1], - [Use TCGMSG for messaging]) - AC_DEFINE([TCGMSG], [1], - [deprecated, use MSG_COMMS_TCGMSG])]) + [Use MPI for messaging])]) AC_SUBST([GA_MP_LIBS]) AC_SUBST([GA_MP_LDFLAGS]) AC_SUBST([GA_MP_CPPFLAGS]) diff --git a/m4/ga_pario.m4 b/m4/ga_pario.m4 index edd84110d..a3960cb27 100644 --- a/m4/ga_pario.m4 +++ b/m4/ga_pario.m4 @@ -43,29 +43,15 @@ if test x$LARGE_FILES != x ; then fi fi if test x$TARGET = xSOLARIS ; then - PARIO_CPPFLAGS="$PARIO_CPPFLAGS -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64" + PARIO_CPPFLAGS=`getconf LFS_CFLAGS` PARIO_CFLAGS=`getconf LFS_CFLAGS` fi if test x$TARGET = xLINUX ; then PARIO_CPPFLAGS="$PARIO_CPPFLAGS -D_LARGEFILE64_SOURCE" PARIO_CFLAGS=`getconf LFS_CFLAGS` fi - if test x$TARGET = xBGL -o x$TARGET = xBGP ; then - PARIO_CPPFLAGS="$PARIO_CPPFLAGS -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE" - fi - if test x$TARGET = xHPUX ; then - PARIO_CPPFLAGS="$PARIO_CPPFLAGS -D_LARGEFILE64_SOURCE" - PARIO_CFLAGS=`getconf XBS5_ILP32_OFFBIG_CFLAGS` - fi - if test x$TARGET = xHPUX64 ; then - PARIO_CPPFLAGS="$PARIO_CPPFLAGS -D_LARGEFILE64_SOURCE" - PARIO_CFLAGS=`getconf XBS5_LP64_OFF64_CFLAGS` - fi PARIO_CPPFLAGS="$PARIO_CPPFLAGS -DLARGE_FILES" fi -if test x$TARGET = xDECOSF ; then - PARIO_LDFLAGS="-laio -lpthreads" -fi if test x$USE_LINUXAIO != x ; then PARIO_CPPFLAGS="$PARIO_CPPFLAGS -DLINUXAIO" PARIO_LDFLAGS="$PARIO_LDFLAGS -lrt" @@ -88,9 +74,6 @@ dnl ########################################################################## dnl FROM pario/eaf/GNUmakefile dnl ########################################################################## PARIO_CPPFLAGS="$PARIO_CPPFLAGS -DEAF_STATS" -if test x$ga_armci_network = xPORTALS ; then - PARIO_CPPFLAGS="$PARIO_CPPFLAGS -DCRAY_XT" -fi dnl ########################################################################## dnl FROM pario/dra/GNUmakefile diff --git a/m4/ga_target.m4 b/m4/ga_target.m4 index 62a67c394..6bcc2779c 100644 --- a/m4/ga_target.m4 +++ b/m4/ga_target.m4 @@ -1,51 +1,17 @@ # GA_TARGET() # ----------- # Attempt to determine the old TARGET variable automatically. -# Deprecated TARGETs: -# CRAY-SV1 -# cray-sv2 -# CRAY-T3E -# CRAY-YMP -# CYGNUS -# DECOSF -# HITACHI -# INTERIX -# SGI -# SGI64 -# SGI_N32 -# SGITFP + AC_DEFUN([GA_TARGET], [# AH_TEMPLATE for all known TARGETs -AH_TEMPLATE([BGL], [Define to 1 on BlueGene/L systems]) -AH_TEMPLATE([BGP], [Define to 1 on BlueGene/P systems]) -AH_TEMPLATE([CATAMOUNT], [Define to 1 on Cray XT systems using Catamount]) -AH_TEMPLATE([CRAY_SV1], [Define to 1 on Cray SV1 systems]) -AH_TEMPLATE([CRAY_SV2], [Define to 1 on Cray SV2 systems]) -AH_TEMPLATE([CRAY_T3E], [Define to 1 on Cray T3E systems]) -AH_TEMPLATE([CRAY_XT], [Define to 1 on Cray XT systems]) -AH_TEMPLATE([CRAY_YMP], [Define to 1 on Cray YMP systems]) AH_TEMPLATE([CYGNUS], [Define to 1 on Cygnus systems]) AH_TEMPLATE([CYGWIN], [Define to 1 on Cygwin systems]) -AH_TEMPLATE([DECOSF], [Define to 1 on DEC OSF]) -AH_TEMPLATE([FUJITSU_VPP], [Define to 1 on fujitsu systems]) -AH_TEMPLATE([FUJITSU_VPP64],[Define to 1 on fujitsu systems]) -AH_TEMPLATE([HITACHI], [Define to 1 on hitachi systems]) -AH_TEMPLATE([HPUX], [Define to 1 on HP-UX systems]) -AH_TEMPLATE([HPUX64], [Define to 1 on 64bit HP-UX systems]) AH_TEMPLATE([IBM], [Define to 1 on IBM SP systems]) AH_TEMPLATE([IBM64], [Define to 1 on 64bit IBM SP systems]) -AH_TEMPLATE([INTERIX], [Define to 1 on ??? systems]) -AH_TEMPLATE([LAPI], [Define to 1 on IBM systems with LAPI]) -AH_TEMPLATE([LAPI64], [Define to 1 on 64bit IBM systems with LAPI]) AH_TEMPLATE([LINUX], [Define to 1 on generic Linux systems]) AH_TEMPLATE([LINUX64], [Define to 1 on generic 64bit Linux systems]) AH_TEMPLATE([MACX], [Define to 1 on OSX systems]) AH_TEMPLATE([MACX64], [Define to 1 on 64bit OSX systems]) -AH_TEMPLATE([NEC], [Define to 1 on NEC systems]) -AH_TEMPLATE([NEC64], [Define to 1 on 64bit NEC systems]) -AH_TEMPLATE([SGI], [Define to 1 on ??? systems]) -AH_TEMPLATE([SGI_N32], [Define to 1 on ??? systems]) -AH_TEMPLATE([SGITFP], [Define to 1 on ??? systems]) AH_TEMPLATE([SOLARIS], [Define to 1 on Solaris systems]) AH_TEMPLATE([SOLARIS64], [Define to 1 on 64bit Solaris systems]) AC_REQUIRE([AC_CANONICAL_BUILD]) @@ -53,28 +19,14 @@ AC_REQUIRE([AC_CANONICAL_HOST]) AC_CACHE_CHECK([for TARGET base (64bit-ness checked later)], [ga_cv_target_base], [ga_cv_target_base=UNKNOWN -AS_IF([test "x$ga_cv_target_base" = xUNKNOWN], - [AS_IF([test -f /bgsys/drivers/ppcfloor/arch/include/common/bgp_personality.h], - [ga_cv_target_base=BGP])]) -AS_IF([test "x$ga_cv_target_base" = xUNKNOWN], - [AS_IF([test -d /bgl/BlueLight/ppcfloor/bglsys/include], - [ga_cv_target_base=BGL])]) AS_IF([test "x$ga_cv_target_base" = xUNKNOWN], [AS_CASE([$host], - [*bgl*], [ga_cv_target_base=BGL], - [*bgp*], [ga_cv_target_base=BGP], - #[TODO], [ga_cv_target_base=CATAMOUNT], - #[TODO], [ga_cv_target_base=CRAY_XT], [*cygwin*], [ga_cv_target_base=CYGWIN], - [*fujitsu*], [ga_cv_target_base=FUJITSU_VPP], - [*hpux*], [ga_cv_target_base=HPUX], [*ibm*], [ga_cv_target_base=IBM], - #[TODO], [ga_cv_target_base=LAPI], [*linux*], [ga_cv_target_base=LINUX], [*darwin*], [ga_cv_target_base=MACX], [*apple*], [ga_cv_target_base=MACX], [*mingw32*], [ga_cv_target_base=MINGW], - [*superux*], [ga_cv_target_base=NEC], [*solaris*], [ga_cv_target_base=SOLARIS])]) ])dnl AC_DEFINE_UNQUOTED([$ga_cv_target_base], [1], @@ -85,7 +37,7 @@ dnl AS_IF([test "x$ARMCI_TOP_BUILDDIR" != x], [ AC_CACHE_CHECK([whether we think this system is what we call SYSV], [ga_cv_sysv], [AS_CASE([$ga_cv_target_base], - [SUN|SOLARIS|SGI|SGI_N32|SGITFP|HPUX|IBM|DECOSF|LINUX|INTERIX|NEC|LAPI], + [SUN|SOLARIS|IBM|LINUX], [ga_cv_sysv=yes], [ga_cv_sysv=no]) ]) @@ -94,30 +46,12 @@ dnl AS_IF([test "x$ARMCI_TOP_BUILDDIR" != x], [ [Define if we want this system to use SYSV shared memory])]) dnl ]) # Hopefully these will never be used and we can remove them soon. -AM_CONDITIONAL([BGL], [test "$ga_cv_target_base" = BGL]) -AM_CONDITIONAL([BGP], [test "$ga_cv_target_base" = BGP]) -AM_CONDITIONAL([CATAMOUNT], [test "$ga_cv_target_base" = CATAMOUNT]) -AM_CONDITIONAL([CRAY_SV1], [test "$ga_cv_target_base" = CRAY_SV1]) -AM_CONDITIONAL([CRAY_SV2], [test "$ga_cv_target_base" = CRAY_SV2]) -AM_CONDITIONAL([CRAY_T3E], [test "$ga_cv_target_base" = CRAY_T3E]) -AM_CONDITIONAL([CRAY_XT], [test "$ga_cv_target_base" = CRAY_XT]) -AM_CONDITIONAL([CRAY_YMP], [test "$ga_cv_target_base" = CRAY_YMP]) AM_CONDITIONAL([CYGNUS], [test "$ga_cv_target_base" = CYGNUS]) AM_CONDITIONAL([CYGWIN], [test "$ga_cv_target_base" = CYGWIN]) -AM_CONDITIONAL([DECOSF], [test "$ga_cv_target_base" = DECOSF]) -AM_CONDITIONAL([FUJITSU_VPP], [test "$ga_cv_target_base" = FUJITSU_VPP]) -AM_CONDITIONAL([HITACHI], [test "$ga_cv_target_base" = HITACHI]) -AM_CONDITIONAL([HPUX], [test "$ga_cv_target_base" = HPUX]) AM_CONDITIONAL([IBM], [test "$ga_cv_target_base" = IBM]) -AM_CONDITIONAL([INTERIX], [test "$ga_cv_target_base" = INTERIX]) -AM_CONDITIONAL([LAPI], [test "$ga_cv_target_base" = LAPI]) AM_CONDITIONAL([LINUX], [test "$ga_cv_target_base" = LINUX]) AM_CONDITIONAL([MACX], [test "$ga_cv_target_base" = MACX]) AM_CONDITIONAL([MINGW], [test "$ga_cv_target_base" = MINGW]) -AM_CONDITIONAL([NEC], [test "$ga_cv_target_base" = NEC]) -AM_CONDITIONAL([SGI], [test "$ga_cv_target_base" = SGI]) -AM_CONDITIONAL([SGI_N32], [test "$ga_cv_target_base" = SGI_N32]) -AM_CONDITIONAL([SGITFP], [test "$ga_cv_target_base" = SGITFP]) AM_CONDITIONAL([SOLARIS], [test "$ga_cv_target_base" = SOLARIS]) ])dnl diff --git a/m4/ga_thread_safe.m4 b/m4/ga_thread_safe.m4 index aadfd72c3..6b2a62232 100644 --- a/m4/ga_thread_safe.m4 +++ b/m4/ga_thread_safe.m4 @@ -5,7 +5,7 @@ # This was taken from older GNUmakefiles... the original doc follows # Procedures are thread safe; should also specify the max number of threads # via ARMCI_MAX_THREADS and thread library via THREAD_LIBRARY. Only supported -# for SOCKETS ELAN4 OPENIB LAPI64. +# for SOCKETS OPENIB. AC_DEFUN([GA_THREAD_SAFE], [AC_ARG_ENABLE([thread-safety], [AS_HELP_STRING([--enable-thread-safety], [**unsupported** turn on thread safety])], diff --git a/m4/ga_warn_flags.m4 b/m4/ga_warn_flags.m4 index 2f00ab740..4f3f4beb0 100644 --- a/m4/ga_warn_flags.m4 +++ b/m4/ga_warn_flags.m4 @@ -13,8 +13,6 @@ AS_CASE([$vendor], [amd], [result="-Wall -W -Wdeclaration-after-statement"], [borland], [result=], [comeau], [result=], -[cray], [result=], -[dec], [result=], [fujitsu], [result="-Xc -pvctl,fullmsg"], [gnu], [result="-Wall -Wextra -Wdeclaration-after-statement -Wno-unused-parameter -pedantic -Wno-long-long -Wnested-externs -ansi"], [hp], [result=], @@ -26,7 +24,6 @@ AS_CASE([$vendor], [microsoft], [result=], [pathscale], [result="-Wall -fullwarn -Wno-unused-parameter -pedantic -Wno-long-long -Wnested-externs"], [portland], [result="-Xc"], -[sgi], [result=], [sun], [result=], [watcom], [result=]) ], diff --git a/m4/ma_enable_armci_mem.m4 b/m4/ma_enable_armci_mem.m4 deleted file mode 100644 index 8baeab55e..000000000 --- a/m4/ma_enable_armci_mem.m4 +++ /dev/null @@ -1,8 +0,0 @@ -# MA_ENABLE_ARMCI_MEM_OPTION -# ------------------------------ -# Whether to enable ARMCI in MA. -AC_DEFUN([MA_ENABLE_ARMCI_MEM_OPTION], -[AS_IF([test x$TARGET != xBGL], - [AC_DEFINE([ENABLE_ARMCI_MEM_OPTION], [1], [enables ARMCI in MA])]) -AM_CONDITIONAL([ENABLE_ARMCI_MEM_OPTION], [test x$TARGET != xBGL]) -])dnl diff --git a/ma/CMakeLists.txt b/ma/CMakeLists.txt index d0a60c3a0..d527ba7bb 100644 --- a/ma/CMakeLists.txt +++ b/ma/CMakeLists.txt @@ -62,6 +62,9 @@ install (FILES DESTINATION include/ga ) +list (APPEND GA_HEADER_PATHS ${CMAKE_CURRENT_LIST_DIR} ${CMAKE_CURRENT_BINARY_DIR}) +set (GA_HEADER_PATHS ${GA_HEADER_PATHS} PARENT_SCOPE) + # ------------------------------------------------------------- # MA library installation # ------------------------------------------------------------- diff --git a/ma/MA.tex b/ma/MA.tex index 64911e255..fceb88c58 100644 --- a/ma/MA.tex +++ b/ma/MA.tex @@ -29,23 +29,6 @@ memory allocation is not supported by all versions of the language. MA is designed to be portable across a variety of platforms. -The following platforms are currently supported: - -\begin{itemize} -\item Cray Y-MP C90 -\item Cray T3D -\item DEC Alpha -\item HP 9000/700 -\item IBM RS/6000 -\item Intel DELTA -\item Intel iPSC/860 -\item Intel Paragon -\item KSR1 -\item SGI -\item SGI Power Challenge -\item SPARC -\end{itemize} - The implementation of MA uses the following memory layout: \begin{quote} diff --git a/ma/ma.c b/ma/ma.c index 8bbbdbf79..d5e2ce84f 100644 --- a/ma/ma.c +++ b/ma/ma.c @@ -25,6 +25,9 @@ #include "scope.h" #include "table.h" +// this was only ever disabled for Blue Gene, which has been removed. +#define ENABLE_ARMCI_MEM_OPTION 1 + #if defined(ENABLE_CUDA_MEM) extern int cudaMallocManaged(void** devPtr, size_t size, unsigned int flags); #elif defined(ENABLE_ARMCI_MEM_OPTION) @@ -86,7 +89,7 @@ extern void* ARMCI_Malloc_local(long bytes); /* block lengths are integral multiples of this */ /* - * Note that for machines such as the KSR on which sizeof(pointer) + * Note that for machines on which sizeof(pointer) * and sizeof(long) are different than sizeof(int), alignment issues * can be tricky. For example, the fields of a struct (e.g., * client_space of AD) can be improperly aligned if the struct is @@ -97,11 +100,7 @@ extern void* ARMCI_Malloc_local(long bytes); * problem is solved, but the sum of sizes of preceding fields can * still potentially cause difficulty. */ -#if defined(BGQ) -#define ALIGNMENT 32 -#else #define ALIGNMENT sizeof(size_t) -#endif /* min size of block split and placed on free list */ #define MINBLOCKSIZE mai_round((size_t)(ALIGNMENT + BLOCK_OVERHEAD_FIXED), \ @@ -2509,9 +2508,11 @@ public Boolean MA_init( /* segment consists of heap and stack */ total_bytes = heap_bytes + stack_bytes; #ifdef NOUSE_MMAP +#if HAVE_MALLOPT /* disable memory mapped malloc */ mallopt(M_MMAP_MAX, 0); mallopt(M_TRIM_THRESHOLD, -1); +#endif #endif /* allocate the segment of memory */ #ifdef ENABLE_CUDA_MEM diff --git a/ma/mafdecls.fh.in b/ma/mafdecls.fh.in index cfb3d13db..f60fda858 100644 --- a/ma/mafdecls.fh.in +++ b/ma/mafdecls.fh.in @@ -124,60 +124,36 @@ ! variables ! -#ifdef HPUX -# define HP_SHARED_COMMON_ -#endif ! common blocks #ifdef INTEL_64ALIGN !DIR$ ATTRIBUTES ALIGN : 64 :: mbc_byte -#endif -#ifdef HP_SHARED_COMMON -*$HP$ shared_common /mbc_byte/ #endif common /mbc_byte/ byte_mb(2) character*1 byte_mb #ifdef INTEL_64ALIGN !DIR$ ATTRIBUTES ALIGN : 64 :: mbc_int -#endif -#ifdef HP_SHARED_COMMON -*$HP$ shared_common /mbc_int/ #endif common /mbc_int/ int_mb(2) integer int_mb -#ifdef HP_SHARED_COMMON -*$HP$ shared_common /mbc_log/ -#endif common /mbc_log/ log_mb(2) logical log_mb #ifdef INTEL_64ALIGN !DIR$ ATTRIBUTES ALIGN : 64 :: mbc_real -#endif -#ifdef HP_SHARED_COMMON -*$HP$ shared_common /mbc_real/ #endif common /mbc_real/ real_mb(2) real real_mb #ifdef INTEL_64ALIGN !DIR$ ATTRIBUTES ALIGN : 64 :: mbc_dbl -#endif -#ifdef HP_SHARED_COMMON -*$HP$ shared_common /mbc_dbl/ #endif common /mbc_dbl/ dbl_mb(2) double precision dbl_mb #ifdef INTEL_64ALIGN !DIR$ ATTRIBUTES ALIGN : 64 :: mbc_scpl -#endif -#ifdef HP_SHARED_COMMON -*$HP$ shared_common /mbc_scpl/ #endif common /mbc_scpl/ scpl_mb(2) complex scpl_mb #ifdef INTEL_64ALIGN !DIR$ ATTRIBUTES ALIGN : 64 :: mbc_dcpl -#endif -#ifdef HP_SHARED_COMMON -*$HP$ shared_common /mbc_dcpl/ #endif common /mbc_dcpl/ dcpl_mb(2) double complex dcpl_mb diff --git a/ma/man/man3/MA.3 b/ma/man/man3/MA.3 index 4f6900226..40264fc60 100644 --- a/ma/man/man3/MA.3 +++ b/ma/man/man3/MA.3 @@ -18,31 +18,7 @@ because dynamic memory allocation is not supported by all versions of the language. MA is designed to be portable across a variety of platforms. -The following platforms are currently supported: -.in +0.5i -.nf -alpha_osf13 DEC Alpha, DEC OSF/1 V1.3 -alpha_osf13_i8 DEC Alpha, DEC OSF/1 V1.3, f77 -i8 -crayc90_7c_dp Cray Y-MP C90, UNICOS 7.C, cf77 -Wf-dp -crayt3d_802_dp Cray T3D, UNICOS 8.0.2, cf77 -Wf-dp -delta_pgi40 Intel DELTA, NX/M R1.5 (cross-compiled by PGI 4.0) -hp700_ux90 HP 9000/700, HP-UX 9.0, gcc -ipsc_pgi40 Intel iPSC/860, NX/M R1.4 (cross-compiled by PGI 4.0) -ksr1_1141 KSR1, KSR OS R1.1.4.1 -ksr1_1141_i4 KSR1, KSR OS R1.1.4.1, f77 -i4 -ksr1_1141_i4r8 KSR1, KSR OS R1.1.4.1, f77 -i4 -r8 -ksr1_1141_r8 KSR1, KSR OS R1.1.4.1, f77 -r8 -paragon_osf12 Intel Paragon, OSF/1 R1.2 -rs_aix32 IBM RS/6000, AIX 3.2 -sgi_405f SGI, IRIX 4.0.5F -sgi_5111 SGI, IRIX 5.1.1.1 -sgi_60 SGI Power Challenge, IRIX 6.0 -sgi_60_i8 SGI Power Challenge, IRIX 6.0, f77 -i8 -sun4m_412 Sun SPARC, SunOS 4.1.x -sun4x_55 Sun SPARC, SunOS 5.5.x -.fi -.in .SH IMPLEMENTATION Memory layout: diff --git a/pario/CMakeLists.txt b/pario/CMakeLists.txt index b2766f0f6..2a4940746 100644 --- a/pario/CMakeLists.txt +++ b/pario/CMakeLists.txt @@ -85,6 +85,15 @@ install (FILES DESTINATION include/ga ) +list (APPEND GA_HEADER_PATHS + ${CMAKE_CURRENT_LIST_DIR} + ${CMAKE_CURRENT_LIST_DIR}/dra + ${CMAKE_CURRENT_LIST_DIR}/sf + ${CMAKE_CURRENT_LIST_DIR}/eaf + ${CMAKE_CURRENT_LIST_DIR}/elio) + +set (GA_HEADER_PATHS ${GA_HEADER_PATHS} PARENT_SCOPE) + # ------------------------------------------------------------- # PARIO executable files # ------------------------------------------------------------- diff --git a/pario/dra/bign.c b/pario/dra/bign.c index 0f262e0d1..ab4c8373f 100644 --- a/pario/dra/bign.c +++ b/pario/dra/bign.c @@ -295,11 +295,7 @@ int main(int argc, char **argv) int status, me; int max_arrays = 10; double max_sz = 1e8, max_disk = 1e10, max_mem = 1e6; -#if defined(IBM) - int stack = 9000000, heap = 4000000; -#else int stack = 1200000, heap = 800000; -#endif MP_INIT(argc,argv); GA_Initialize(); diff --git a/pario/dra/disk.arrays.c b/pario/dra/disk.arrays.c index 65162fc83..947d8a9d3 100644 --- a/pario/dra/disk.arrays.c +++ b/pario/dra/disk.arrays.c @@ -86,11 +86,7 @@ /* #define DRA_DBLE_BUFFER */ -#if defined(SP1)|| defined(SP) || defined(LAPI) -# define DRA_NUM_IOPROCS 8 -#else # define DRA_NUM_IOPROCS 1 -#endif #ifndef DRA_NUM_FILE_MGR # define DRA_NUM_FILE_MGR DRA_NUM_IOPROCS @@ -401,13 +397,7 @@ Integer dai_io_procs(Integer d_a) Integer num; /* this one of many possibilities -- depends on the system */ - /* -#ifdef _CRAYMPP -num = DRA_NUM_IOPROCS; -#else -num = (INDEPFILES(d_a)) ? INFINITE_NUM_PROCS: DRA_NUM_IOPROCS; -#endif -*/ + /* num = (INDEPFILES(d_a)) ? INFINITE_NUM_PROCS: DRA_NUM_IOPROCS; */ if (INDEPFILES(d_a)) { num = pnga_cluster_nnodes(); } else { diff --git a/pario/dra/ffflush.F b/pario/dra/ffflush.F index c08af46d5..d4c6178c0 100644 --- a/pario/dra/ffflush.F +++ b/pario/dra/ffflush.F @@ -5,13 +5,7 @@ subroutine ffflush(unit) integer unit c -#ifdef CRAY -* if(unit.eq.6)then -* call flush(101) -* else -* call flush(unit) -* endif -#elif HAVE_F77_FLUSH +#if HAVE_F77_FLUSH call F77_FLUSH(unit) #endif c diff --git a/pario/dra/perfn.c b/pario/dra/perfn.c index 836e92c36..4535bde35 100644 --- a/pario/dra/perfn.c +++ b/pario/dra/perfn.c @@ -621,11 +621,7 @@ int main(int argc, char **argv) int max_arrays = 10; double max_sz = 1e8, max_disk = 1e10, max_mem = 1e6; int numfiles, numioprocs; -#if defined(IBM) - int stack = 9000000, heap = 4000000; -#else int stack = 12000000, heap = 8000000; -#endif MP_INIT(argc,argv); GA_Initialize(); diff --git a/pario/eaf/eaf.c b/pario/eaf/eaf.c index f4dd3e22b..0f34d713a 100644 --- a/pario/eaf/eaf.c +++ b/pario/eaf/eaf.c @@ -592,28 +592,9 @@ void EAF_Errmsg(int code, char *msg) */ int EAF_Truncate(int fd, eaf_off_t length) { -#ifdef CRAY - int rc; -#endif - if (!valid_fd(fd)) return EAF_ERR_INVALID_FD; -#ifdef CRAY - /* ftruncate does not work with Cray FFIO, we need to implement it - * as a sequence of generic close, truncate, open calls - */ - - rc = elio_close(file[fd].elio_fd); - if(rc) return rc; - if(truncate(file[fd].fname, (off_t) length)) return EAF_ERR_TRUNCATE; - if (!(file[fd].elio_fd = elio_open(file[fd].fname, file[fd].type, ELIO_PRIVATE))) { - free(file[fd].fname); - file[fd].fname = 0; - return ELIO_PENDING_ERR; - } -#else if(elio_truncate(file[fd].elio_fd, (Off_t)length)) return EAF_ERR_TRUNCATE; -#endif return EAF_OK; /* return elio_truncate(file[fd].elio_fd, (Off_t) length);*/ diff --git a/pario/eaf/eaf_f2c.c b/pario/eaf/eaf_f2c.c index aa9243188..b1afad39a 100644 --- a/pario/eaf/eaf_f2c.c +++ b/pario/eaf/eaf_f2c.c @@ -17,10 +17,6 @@ #include "typesf2c.h" #include "farg.h" -#if defined(CRAY) && defined(__crayx1) -#undef CRAY -#endif - #define eaf_aread_ F77_FUNC_(eaf_aread,EAF_AREAD) #define eaf_awrite_ F77_FUNC_(eaf_awrite,EAF_AWRITE) #define eaf_close_ F77_FUNC_(eaf_close,EAF_CLOSE) diff --git a/pario/eaf/test.F b/pario/eaf/test.F index 69cae5808..891e4838c 100644 --- a/pario/eaf/test.F +++ b/pario/eaf/test.F @@ -107,9 +107,7 @@ subroutine test2 goto 1000 endif -#ifndef __crayx1 call sleep(10) -#endif write(0,*) 'Finished reading file=',fnum, $ ' fd=',fd(fnum), $ ' iter=',iter diff --git a/pario/elio/elio.c b/pario/elio/elio.c index 8a6798315..67e21bcfe 100644 --- a/pario/elio/elio.c +++ b/pario/elio/elio.c @@ -42,11 +42,7 @@ #include "../sf/coms.h" -#if defined(CRAY) && defined(__crayx1) -#undef CRAY -#endif - -#if defined(AIX) || defined(DECOSF) || defined(SGI64) || defined(CRAY) || defined(LINUXAIO) +#if defined(AIX) || defined(LINUXAIO) /* systems with Asynchronous I/O */ #else # ifndef NOAIO @@ -96,17 +92,7 @@ #endif /* structure to emulate control block in Posix AIO */ -#if defined (CRAY) -# if defined(FFIO) - typedef struct { struct ffsw stat; int filedes; }io_status_t; -# else -# include - typedef struct { struct iosw stat; int filedes; }io_status_t; -# endif - io_status_t cb_fout[MAX_AIO_REQ]; - io_status_t *cb_fout_arr[MAX_AIO_REQ]; - -#elif defined(AIO) +#if defined(AIO) # include # if defined(AIX) # define INPROGRESS EINPROG @@ -291,13 +277,8 @@ int elio_set_cb(Fd_t fd, Off_t doffset, int reqn, void *buf, Size_t bytes) { #if defined(AIO) off_t offset = (off_t) doffset; -# if defined(CRAY) - if(offset != SEEK(fd->fd, offset, SEEK_SET))return (SEEKFAIL); - cb_fout_arr[reqn] = cb_fout+reqn; - cb_fout[reqn].filedes = fd->fd; -# else - cb_fout[reqn].aio_offset = offset; - cb_fout_arr[reqn] = cb_fout+reqn; + cb_fout[reqn].aio_offset = offset; + cb_fout_arr[reqn] = cb_fout+reqn; cb_fout[reqn].aio_buf = buf; cb_fout[reqn].aio_nbytes = bytes; # if defined(AIX) @@ -306,7 +287,6 @@ int elio_set_cb(Fd_t fd, Off_t doffset, int reqn, void *buf, Size_t bytes) cb_fout[reqn].aio_sigevent.sigev_notify = SIGEV_NONE; cb_fout[reqn].aio_fildes = fd->fd; # endif -# endif #endif return ELIO_OK; } @@ -370,10 +350,7 @@ int elio_awrite(Fd_t fd, Off_t doffset, const void* buf, Size_t bytes, io_reques if((rc=elio_set_cb(fd, offset, aio_i, (void*) buf, bytes))) ELIO_ERROR(rc,0); -# if defined(CRAY) - rc = WRITEA(fd->fd, (char*)buf, bytes, &cb_fout[aio_i].stat, DEFARG); - stat = (rc < 0)? -1 : 0; -# elif defined(AIX) +# if defined(AIX) # if !defined(AIX52) && !defined(_AIO_AIX_SOURCE) stat = aio_write(fd->fd, cb_fout + aio_i); # endif @@ -551,9 +528,6 @@ int elio_aread(Fd_t fd, Off_t doffset, void* buf, Size_t bytes, io_request_t * r #ifdef AIO int aio_i; #endif -#ifdef CRAY - int rc; -#endif if (doffset >= ABSURDLY_LARGE) ELIO_ERROR(SEEKFAIL,0); @@ -603,10 +577,7 @@ int elio_aread(Fd_t fd, Off_t doffset, void* buf, Size_t bytes, io_request_t * r *req_id = (io_request_t) aio_i; if((stat=elio_set_cb(fd, offset, aio_i, (void*) buf, bytes))) ELIO_ERROR((int)stat,0); -# if defined(CRAY) - rc = READA(fd->fd, buf, bytes, &cb_fout[aio_i].stat, DEFARG); - stat = (rc < 0)? -1 : 0; -# elif defined(AIX) +# if defined(AIX) #if !defined(AIX52) && !defined(_AIO_AIX_SOURCE) stat = aio_read(fd->fd, cb_fout+aio_i); #endif @@ -648,23 +619,7 @@ int elio_wait(io_request_t *req_id) if(*req_id != ELIO_DONE ) { # ifdef AIO -# if defined(CRAY) - -# if defined(FFIO) - { - struct ffsw dumstat, *prdstat=&(cb_fout[*req_id].stat); - fffcntl(cb_fout[*req_id].filedes, FC_RECALL, prdstat, &dumstat); - if (FFSTAT(*prdstat) == FFERR) ELIO_ERROR(SUSPFAIL,0); - } -# else - { - struct iosw *statlist[1]; - statlist[0] = &(cb_fout[*req_id].stat); - recall(cb_fout[*req_id].filedes, 1, statlist); - } -# endif - -# elif defined(AIX) +# if defined(AIX) # if !defined(AIX52) && !defined(_AIO_AIX_SOURCE) do { /* I/O can be interrupted on SP through rcvncall ! */ rc =(int)aio_suspend(1, cb_fout_arr+(int)*req_id); @@ -675,11 +630,6 @@ int elio_wait(io_request_t *req_id) if((int)aio_suspend((const struct aiocb *const*)(cb_fout_arr+(int)*req_id), 1, NULL) != 0) rc =-1; # endif if(rc ==-1) ELIO_ERROR(SUSPFAIL,0); - -# if defined(DECOSF) - /* on DEC aio_return is required to clean internal data structures */ - if(aio_return(cb_fout+(int)*req_id) == -1) ELIO_ERROR(RETUFAIL,0); -# endif #endif while(aio_req[aio_i] != *req_id && aio_i < MAX_AIO_REQ) aio_i++; @@ -715,21 +665,7 @@ int elio_probe(io_request_t *req_id, int* status) } else { #ifdef AIO -# if defined(CRAY) - -# if defined(FFIO) - { - struct ffsw dumstat, *prdstat=&(cb_fout[*req_id].stat); - fffcntl(cb_fout[*req_id].filedes, FC_ASPOLL, prdstat, &dumstat); - errval = (FFSTAT(*prdstat) == 0) ? INPROGRESS: 0; - } -# else - - errval = ( IO_DONE(cb_fout[*req_id].stat) == 0)? INPROGRESS: 0; - -# endif - -# elif defined(AIX) +# if defined(AIX) errval = aio_error(cb_fout[(int)*req_id].aio_handle); # else errval = aio_error(cb_fout+(int)*req_id); @@ -760,38 +696,6 @@ int elio_probe(io_request_t *req_id, int* status) } -#if defined(CRAY) && defined(FFIO) -static int cray_part_info(char *dirname,long *pparts,long *sparts) -{ - struct statfs stats; - long temp,count=0; - - if(statfs(dirname, &stats, sizeof(struct statfs), 0) == -1) return -1; - - temp = stats.f_priparts; - while(temp != 0){ - count++; - temp <<= 1; - } - *pparts = count; - - if(stats.f_secparts != 0){ - - temp = (stats.f_secparts << count); - count = 0; - while(temp != 0){ - count++; - temp <<= 1; - } - *sparts = count; - } - return ELIO_OK; - -} - -#endif - - /*\ Noncollective File Open \*/ Fd_t elio_open(const char* fname, int type, int mode) @@ -865,48 +769,7 @@ Fd_t elio_open(const char* fname, int type, int mode) } #endif -#if defined(CRAY) && defined(FFIO) - { - struct ffsw ffstat; - long pparts, sparts, cbits, cblocks; - extern long _MPP_MY_PE; - char *ffio_str="cache:256"; /* intern I/O buffer/cache 256*4096 bytes */ - /* JN: we do not want read-ahead write-behind*/ - - if(cray_part_info(dirname,&pparts,&sparts) != ELIO_OK){ - free(fd); - ELIO_ERROR_NULL(STATFAIL, 0); - } - - ptype |= ( O_BIG | O_PLACE | O_RAW ); - cbits = (sparts != 0) ? 1 : 0; - - if( sparts != 0) { - - /* stripe is set so we only select secondary partitions with cbits */ - if(mode == ELIO_SHARED){ - cbits = ~((~0L)<fd = OPEN(fname, ptype, FOPEN_MODE, cbits, cblocks, &ffstat, NULL); - else - fd->fd = OPEN(fname, ptype, FOPEN_MODE, 0L , 0 , &ffstat, ffio_str); - - } -#else fd->fd = OPEN(fname, ptype, FOPEN_MODE ); -#endif if( (int)fd->fd == -1) { free(fd); diff --git a/pario/elio/eliop.h b/pario/elio/eliop.h index 7b0bfe636..536cb21c0 100644 --- a/pario/elio/eliop.h +++ b/pario/elio/eliop.h @@ -27,15 +27,7 @@ extern void GA_Error(char*, int); #endif -#if (defined(SP) || defined(SP1)) -#define PIOFS 1 -#endif - - -#if (defined(CRAY) && !defined(__crayx1)) -# include -# define STATVFS statfs -#elif defined(__FreeBSD__) && !defined(GLIBC) +#if defined(__FreeBSD__) && !defined(GLIBC) # include # include # define STATVFS statfs @@ -44,7 +36,7 @@ extern void GA_Error(char*, int); # define STATVFS _stat # define S_ISDIR(mode) ((mode&S_IFMT) == S_IFDIR) # define S_ISREG(mode) ((mode&S_IFMT) == S_IFREG) -#elif defined(LINUX) || defined(CYGWIN) || defined(BGQ) || defined(__GLIBC__) +#elif defined(LINUX) || defined(CYGWIN) || defined(__GLIBC__) # include # define STATVFS statfs # define NO_F_FRSIZE @@ -61,7 +53,7 @@ extern void GA_Error(char*, int); #include -#if (defined(CRAY) && defined(FFIO)) +#if defined(FFIO) # include # include # include @@ -77,10 +69,6 @@ extern int elio_pending_error; #if !defined(PRINT_AND_ABORT) -# if defined(SUN) - extern int fprintf(); - extern void fflush(); -# endif # define PRINT_AND_ABORT(msg, val){\ fprintf(stderr, "ELIO fatal error: %s %ld\n", msg, val);\ fprintf(stdout, "ELIO fatal error: %s %ld\n", msg, val);\ diff --git a/pario/elio/stat.c b/pario/elio/stat.c index 88248cb45..343bac262 100644 --- a/pario/elio/stat.c +++ b/pario/elio/stat.c @@ -6,10 +6,6 @@ #include "eliop.h" #include "chemio.h" - -#if defined(CRAY) && defined(__crayx1) -#undef CRAY -#endif #define DEBUG_ 0 @@ -97,11 +93,7 @@ int elio_stat(char *fname, stat_t *statinfo) if(!S_ISREG(ufs_stat.st_mode) && !S_ISDIR(ufs_stat.st_mode)) ELIO_ERROR(TYPEFAIL, 1); -# if defined(CRAY) - if(statfs(fname, &ufs_statfs, sizeof(ufs_statfs), 0) != 0) -# else if(STATVFS(fname, &ufs_statfs) != 0) -# endif ELIO_ERROR(STATFAIL,1); # if defined(WIN32) @@ -110,18 +102,7 @@ int elio_stat(char *fname, stat_t *statinfo) # else /* get number of available blocks */ -# if defined(CRAY) || defined(NEC) - /* f_bfree == f_bavail -- naming changes */ - -# ifdef CRAY - if(ufs_statfs.f_secnfree != 0) /* check for secondary partition */ - statinfo->avail = (avail_t) ufs_statfs.f_secnfree; - else -# endif - statinfo->avail = (avail_t) ufs_statfs.f_bfree; -# else - statinfo->avail = (avail_t) ufs_statfs.f_bavail; -# endif + statinfo->avail = (avail_t) ufs_statfs.f_bavail; # ifdef NO_F_FRSIZE /* on some older systems it was f_bsize */ diff --git a/pario/makefile.h b/pario/makefile.h index 2bd2bad49..37312015f 100644 --- a/pario/makefile.h +++ b/pario/makefile.h @@ -34,10 +34,6 @@ ifdef LARGE_FILES AIO_LARGE_FILES = $(shell /usr/bin/oslevel | awk -F. \ '{ if ($$1 == 4 && $$2 == 2 && $$3 <= 0 ) \ print "NO"}') - endif - ifeq ($(TARGET), SOLARIS) - LIB_DEFINES += -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 - CC += $(shell getconf LFS_CFLAGS) endif # # LINUX: kernel 2.4 is needed @@ -47,31 +43,6 @@ ifdef LARGE_FILES LIB_DEFINES += $(shell getconf LFS_CFLAGS) endif - ifeq ($(TARGET), BGL) - LIB_DEFINES += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE - endif - - ifeq ($(TARGET), BGP) - LIB_DEFINES += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE - endif - - ifeq ($(TARGET), BGQ) - LIB_DEFINES += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE - endif - -# -# HP targets tested on HPUX 11.0 -# - ifeq ($(TARGET), HPUX) - LIB_DEFINES += -D_LARGEFILE64_SOURCE - LIB_DEFINES += $(shell getconf XBS5_ILP32_OFFBIG_CFLAGS) - endif - - ifeq ($(TARGET), HPUX64) - LIB_DEFINES += -D_LARGEFILE64_SOURCE - LIB_DEFINES += $(shell getconf XBS5_LP64_OFF64_CFLAGS) - endif - LIB_DEFINES += -DLARGE_FILES endif @@ -80,9 +51,6 @@ ifdef LIB_TARGETS ifdef HPIODIR LIB_DEFINES += -DHPIODIR=\'$(HPIODIR)/\' endif -ifeq ($(TARGET), DECOSF) - LOC_LIBS += -laio -lpthreads -endif endif ifdef USE_LINUXAIO diff --git a/pario/pabody b/pario/pabody index 7edc3962f..00ec1353f 100755 --- a/pario/pabody +++ b/pario/pabody @@ -1,4 +1,4 @@ -#!/usr/local/bin/perl +#!/usr/bin/env perl print "Enter ascii SDDF file: "; #$file = ; diff --git a/INSTALL b/support/INSTALL similarity index 96% rename from INSTALL rename to support/INSTALL index 7d1c323be..ff7d92558 100644 --- a/INSTALL +++ b/support/INSTALL @@ -231,11 +231,11 @@ parse its `' header file. The option `-nodtk' can be used as a workaround. If GNU CC is not installed, it is therefore recommended to try - ./configure CC="cc" + ./configure CC="cc" and if that doesn't work, try - ./configure CC="cc -nodtk" + ./configure CC="cc -nodtk" On Solaris, don't put `/usr/ucb' early in your `PATH'. This directory contains several dysfunctional programs; working variants of @@ -245,7 +245,7 @@ in your `PATH', put it _after_ `/usr/bin'. On Haiku, software installed for all users goes in `/boot/common', not `/usr/local'. It is recommended to use the following options: - ./configure --prefix=/boot/common + ./configure --prefix=/boot/common Specifying the System Type ========================== @@ -258,22 +258,22 @@ a message saying it cannot guess the machine type, give it the `--build=TYPE' option. TYPE can either be a short name for the system type, such as `sun4', or a canonical name which has the form: - CPU-COMPANY-SYSTEM + CPU-COMPANY-SYSTEM where SYSTEM can have one of these forms: - OS - KERNEL-OS + OS + KERNEL-OS - See the file `config.sub' for the possible values of each field. If +See the file `config.sub' for the possible values of each field. If `config.sub' isn't included in this package, then this package doesn't need to know the machine type. - If you are _building_ compiler tools for cross-compiling, you should +If you are _building_ compiler tools for cross-compiling, you should use the option `--target=TYPE' to select the type of system they will produce code for. - If you want to _use_ a cross compiler, that generates code for a +If you want to _use_ a cross compiler, that generates code for a platform different from the build platform, you should specify the "host" platform (i.e., that on which the generated programs will eventually be run) with `--host=TYPE'. @@ -281,7 +281,7 @@ eventually be run) with `--host=TYPE'. Sharing Defaults ================ - If you want to set default values for `configure' scripts to share, +If you want to set default values for `configure' scripts to share, you can create a site shell script called `config.site' that gives default values for variables like `CC', `cache_file', and `prefix'. `configure' looks for `PREFIX/share/config.site' if it exists, then @@ -298,7 +298,7 @@ configure again during the build, and the customized values of these variables may be lost. In order to avoid this problem, you should set them in the `configure' command line, using `VAR=value'. For example: - ./configure CC=/usr/local2/bin/gcc + ./configure CC=/usr/local2/bin/gcc causes the specified `gcc' to be used as the C compiler (unless it is overridden in the site shell script). @@ -306,7 +306,7 @@ overridden in the site shell script). Unfortunately, this technique does not work for `CONFIG_SHELL' due to an Autoconf bug. Until the bug is fixed you can use this workaround: - CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash + CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash `configure' Invocation ====================== diff --git a/README_AUTOTOOLS.md b/support/README_AUTOTOOLS.md similarity index 100% rename from README_AUTOTOOLS.md rename to support/README_AUTOTOOLS.md diff --git a/README_RELEASES.md b/support/README_RELEASES.md similarity index 100% rename from README_RELEASES.md rename to support/README_RELEASES.md diff --git a/README_SICM.md b/support/README_SICM.md similarity index 100% rename from README_SICM.md rename to support/README_SICM.md diff --git a/tcgmsg/examples/Makefile.proto b/tcgmsg/examples/Makefile.proto index 31ee6da09..ae68c78ab 100644 --- a/tcgmsg/examples/Makefile.proto +++ b/tcgmsg/examples/Makefile.proto @@ -5,101 +5,6 @@ INSTALL = echo "$@" is built -IF CRAY -#CRAY ... note that ranf is much faster than dran48 being used - FC = cf77 - CC = cc -#older UNICOS -# LIBS = -lbsd -lnet -lrpc - LIBS = - FFLAGS = -Zv -Wf"-dp" - CFLAGS = -I$(LIBDIR) -DCRAY - SCFBLAS = scfblas.o -ENDIF CRAY -IF SGI - FC = f77 - FFLAGS = -O2 - CC = cc - CFLAGS = -O -I$(LIBDIR) -I/usr/include/bsd -I/usr/include/sun -DSGI - LIBS = -lmalloc -lbsd -lrpcsvc -lsun - SCFBLAS = scfblas.o -ENDIF SGI -IF SGITFP -#SGI IRIX ... note that need -lmalloc ... default break in Stress - FC = f77 - FFLAGS = -O2 -d8 -i8 - CC = cc - DEFINES = -DSGI -DSGITFP - CFLAGS = -O $(DEFINES) - LIBS = -lmalloc -lbsd -lrpcsvc - SCFBLAS = scfblas.o -ENDIF SGI -IF KSR - FC = f77 - FFLAGS = -para -g -r8 - CC = cc - CFLAGS = -para -g -I$(LIBDIR) - LIBS = -lrpc - SCFBLAS = scfblas.o -ENDIF KSR -IF APOLLO -#APOLLO GN10000 ... Apollo PRISM Domain/OS - FC = f77 - FFLAGS = -O - CC = cc - CFLAGS = -O -DAPOLLO -DCONVEX -I$(LIBDIR) \ - -A nansi -A sys,bsd4.3 -A run,bsd4.3 - LIBS = - SCFBLAS = scfblas.o -ENDIF -IF CONVEX - FC = fc - FFLAGS = -O2 - CC = cc - CFLAGS = -O -DCONVEX -I$(LIBDIR) - LIBS = - SCFBLAS = -ENDIF CONVEX -IF HPUX - FC = f77 - CC = cc - LIBS = - FFLAGS = -O +ppu - CFLAGS = -O -DHPUX -DEXTNAME -DPLOT -I$(LIBDIR) - SCFBLAS = scfblas.o -ENDIF HPUX -IF HPUXNOEXT - FC = f77 - CC = cc - LIBS = - FFLAGS = -O - CFLAGS = -O -DHPUX -DPLOT -I$(LIBDIR) - SCFBLAS = scfblas.o -ENDIF HPUXNOEXT -IF SUN - FC = f77 - CC = cc - LIBS = - FFLAGS = -O3 -dalign -Nl99 - CFLAGS = -O -dalign -DSUN -DPLOT -I$(LIBDIR) - SCFBLAS = scfblas.o -ENDIF SUN -IF DEC - FC = f77 - CC = cc - LIBS = - FFLAGS = -O2 - CFLAGS = -O -DULTRIX -DPLOT -I$(LIBDIR) - SCFBLAS = scfblas.o -ENDIF DEC -IF DECOSF - FC = f77 - CC = cc - LIBS = - FFLAGS = -O -integer_size 64 -align dcommons -align records - CFLAGS = -O -DDECOSF -I$(LIBDIR) - SCFBLAS = scfblas.o -ENDIF DEC IF LINUX FC = f77 CC = cc @@ -108,45 +13,6 @@ IF LINUX CFLAGS = -O -I$(LIBDIR) SCFBLAS = scfblas.o ENDIF LINUX -IF IPSC DELTA PARAGON - AS = as860 - FC = if77 - CC = icc - LIBS = -node -lkmath -IF DELTA - DEFINES = -DIPSC -DDELTA -DPLOT - INSTALL = rcp "$@" delta1: & - CFLAGS = -i860 -O $(DEFINES) -I$(LIBDIR) -Knoieee - FFLAGS = -O2 -i860 -Knoieee -ELSEIF PARAGON - DEFINES = -DIPSC -DPARAGON -DPLOT - INSTALL = echo "$@" is built - CFLAGS = -nx -g -Knoieee $(DEFINES) - FFLAGS = -nx -O2 -Knoieee -ELSE - DEFINES = -DIPSC -DPLOT - INSTALL = rcp "$@" micron: & - CFLAGS = -i860 -O $(DEFINES) -I$(LIBDIR) -Knoieee - FFLAGS = -O2 -i860 -Knoieee -ENDIF - SCFBLAS = -ENDIF IPSC DELTA PARAGON -IF NEXT - FC = f77 - CC = cc - FFLAGS = -g -f -N9 - CFLAGS = -g -DNEXT -DEXTNAME -DPLOT -I$(LIBDIR) - LIBS = - SCFBLAS = scfblas.o -ENDIF NEXT -IF NEXTNOEXT - FC = f77 - CC = cc - FFLAGS = -g -f - CFLAGS = -g -DNEXT -DPLOT -I$(LIBDIR) - LIBS = - SCFBLAS = scfblas.o -ENDIF NEXT IF IBM #IBM RISC 6000 ... AIX 3.1 ... -qEXTNAME # ... see the comments in the Makefile for the toolkit. @@ -165,26 +31,6 @@ IF IBMNOEXT CFLAGS = -O -DPLOT -DAIX -I$(LIBDIR) SCFBLAS = ENDIF IBMNOEXT -IF ARDENT - FC = fc - CC = cc - LIBS = - FFLAGS = -43 -O2 - CFLAGS = -O2 -43 -DARDENT -DPLOT -I$(LIBDIR) - SCFBLAS = scfblas.o -ENDIF ARDENT -IF ALLIANT ALLIANTMPP - FC = fortran - CC = fxc -IF ALLIANT - LIBS = -lalgebra -lsignal -lcommon -ELSE - LIBS = -lalgebra -lsignal -lcommon -lsw -ENDIF - FFLAGS = -g -Ogv -AS -uniproc -OM - CFLAGS = -Ogv -uniproc -DALLIANT -I$(LIBDIR) - SCFBLAS = -ENDIF ALLIANT ###################################### # Should not need to modify below here @@ -284,12 +130,8 @@ mc.o: msgtypesf.h # Jacobi example ################ -IF CRAY - JOBJ = fexit.o timer.o getmem.o -ELSEIF IBMNOEXT ALLIANT +IF IBMNOEXT JOBJ = fexit.o timer.o getmem.o mxv_dgemv.o -ELSEIF IPSC DELTA - JOBJ = fexit.o timer.o getmem.o mxv_daxpy1.o daxpy1.o ELSE JOBJ = fexit.o timer.o getmem.o mxv_fortran.o ENDIF @@ -318,13 +160,11 @@ template.p: echo "# This template is piped thru sed to replace" > template.p echo "# PROGRAM with the name of the program." >> template.p echo "# Add other hosts and processes as desired." >> template.p -IF CRAY NEXT NEXTNOEXT +IF NEXTNOEXT echo "$$LOGNAME `hostname` 1 `pwd`/PROGRAM /tmp" >> template.p echo "$$LOGNAME `hostname` 1 `pwd`/PROGRAM /tmp" >> template.p echo "$$LOGNAME `hostname` 1 `pwd`/PROGRAM /tmp" >> template.p echo "$$LOGNAME `hostname` 1 `pwd`/PROGRAM /tmp" >> template.p -ELSEIF IPSC - echo '0 $$ ' PROGRAM . >> template.p ELSE echo "`whoami` `hostname` 4 `pwd`/PROGRAM /tmp" >> template.p ENDIF diff --git a/tcgmsg/examples/demo.proto b/tcgmsg/examples/demo.proto index d6336933d..7917c58d0 100644 --- a/tcgmsg/examples/demo.proto +++ b/tcgmsg/examples/demo.proto @@ -6,23 +6,6 @@ echo " Example Message Passing Programs" echo " --------------------------------" echo "" -IF IPSC -if (! -e template.p) make template.p - -# Get cube size etc. - -echo -n 'Input options for parallel (default = -w -t 4 -C tcgmsg) ' -set cube="$<" -if ("$cube" == "") set cube = '-w -t 4 -C tcgmsg' -echo " " -ELSEIF DELTA -# Get mesh size for the Delta - -echo -n 'Input desired submesh with no spaces (default = 4,4) ' -set mesh="$<" -if ("$mesh" == "") set mesh = "4,4" -echo " " -ELSE # Find/generate template PROCGRP file if (! -e template.p) then @@ -31,7 +14,6 @@ if (! -e template.p) then make template.p endif -ENDIF # Check no. of arguments @@ -93,30 +75,11 @@ goto RUNIT RUNIT: -IF DELTA -echo " " -echo "Run $PROGRAM on the mesh ($mesh)" -echo " " - -# Actually run the sucker ... assumes can use rcp and rsh to delta1 - -echo rcp $PROGRAM delta1: -rcp $PROGRAM delta1: -echo rsh delta1 mexec "-t\(${mesh}\) -f '$PROGRAM $ARGS'" -rsh delta1 mexec "-t\(${mesh}\) -f '$PROGRAM $ARGS'" - -ELSE - # Generate the actual PROCGRP file from the template and print out # summary of it sed "s/PROGRAM/$PROGRAM/" < template.p > {$PROGRAM}.p -IF IPSC -echo "time parallel $cube $PROGRAM" -time parallel $cube $PROGRAM -ELSE - echo "" echo 'The following hosts/processes will be used:' echo ' ' @@ -126,5 +89,3 @@ echo "" echo time parallel $PROGRAM $ARGS echo "" time parallel $PROGRAM $ARGS -ENDIF -ENDIF diff --git a/tcgmsg/examples/fexit.f.proto b/tcgmsg/examples/fexit.f.proto index 2d3d2744a..2523c7b6d 100644 --- a/tcgmsg/examples/fexit.f.proto +++ b/tcgmsg/examples/fexit.f.proto @@ -1,13 +1,6 @@ subroutine fexit -IF ARDENT -c Fortran returns a non-zero status unless we STOP, which -c produces the irritating FORTRAN STOP message, or we -c explicitly call exit - call exit(0) -ELSE -ENDIF end -IF HPUX HPUXNOEXT IBM IBMNOEXT +IF IBM IBMNOEXT subroutine flush(iunit) end ENDIF diff --git a/tcgmsg/examples/getmem.c b/tcgmsg/examples/getmem.c index 65dd757fb..962dcc150 100644 --- a/tcgmsg/examples/getmem.c +++ b/tcgmsg/examples/getmem.c @@ -5,14 +5,10 @@ /*$Id: getmem.c,v 1.2 1995-02-02 23:24:10 d3g681 Exp $*/ extern char * memalign(); -#if (defined(AIX) || defined(NEXT) || defined(HPUX)) && !defined(EXTNAME) +#if defined(AIX) && !defined(EXTNAME) #define getmem_ getmem #endif -#if defined(CRAY) || defined(ARDENT) -#define getmem_ GETMEM -#endif - /* getmem gets n real*8 storage locations and returns its address (iaddr) and offset (ioff) within the real*8 array work so that the usable memory is (work(i+ioff),i=1,n). diff --git a/tcgmsg/examples/grid.c b/tcgmsg/examples/grid.c index b39889c69..0d7e84a99 100644 --- a/tcgmsg/examples/grid.c +++ b/tcgmsg/examples/grid.c @@ -20,11 +20,7 @@ static long plot_type = 0; /* 0 means no plot */ #include "sndrcv.h" -#if defined(DELTA) || defined(IPSC) -#define htonl(a) (a) -#endif - -#if !defined(AIX) && !defined(DECOSF) +#if !defined(AIX) extern char *malloc(); #endif extern void exit(); @@ -457,14 +453,7 @@ double Operate(grid, ncols, nrows, ngrid, do_sums) for (i=1; i #endif +#if HAVE_STRINGS_H +# include +#endif #if HAVE_MEMORY_H # include #endif diff --git a/tcgmsg/tests/hpuxargs.f b/tcgmsg/tests/hpuxargs.f deleted file mode 100644 index d0898f966..000000000 --- a/tcgmsg/tests/hpuxargs.f +++ /dev/null @@ -1,10 +0,0 @@ -c -c $Header: /tmp/hpctools/ga/tcgmsg-mpi/hpuxargs.f,v 1.2 1999-06-08 21:08:29 d3h325 Exp $ -c - integer function hpargc() - hpargc = iargc() + 1 - end - integer function hpargv(index, arg, maxlen) - character*256 arg - hpargv = igetarg(index,arg,maxlen) - end diff --git a/tcgmsg/tests/pvm.c b/tcgmsg/tests/pvm.c deleted file mode 100644 index f4b99dda9..000000000 --- a/tcgmsg/tests/pvm.c +++ /dev/null @@ -1,196 +0,0 @@ -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#if HAVE_STDIO_H -# include -#endif - -#include - -#define MAX_PROC 128 - -#define TCGTIME_ TCGTIME - -double TCGTIME_(); - -#define ENCODING PvmDataRaw - - -long NODEID_() -{ - return((long) pvm_get_PE(pvm_mytid())); -} - - -long NNODES_() -{ - return((long)pvm_gsize(0)); -} - - -/** - * Error handler - */ -void Error(string, code) - char *string; - long code; -{ - - (void) fflush(stdout); - (void) fflush(stderr); - - (void) fprintf(stderr, "%3d:%s %ld(%x)\n", NODEID_(), string, code, code); - (void) perror("system message"); - - (void) fflush(stdout); - (void) fflush(stderr); - - globalexit(1); -} - - -void SND_(type, buf, lenbuf, node, sync) - long *type; - char *buf; - long *lenbuf; - long *node; - long *sync; -{ - long tid=pvm_gettid("", *node); - - pvm_psend(tid, *type, buf, *lenbuf, PVM_BYTE); -} - - -void RCV_(type, buf, lenbuf, lenmes, nodeselect, nodefrom, sync) - long *type; - char *buf; - long *lenbuf; - long *lenmes; - long *nodeselect; - long *nodefrom; - long *sync; -{ - int tid=*nodeselect, tidfrom; - - if(tid >-1) tid=pvm_gettid("", *nodeselect); - pvm_precv(tid, *type, buf, *lenbuf, PVM_BYTE, &tidfrom, 0, 0); - *nodefrom = pvm_get_PE(tidfrom); -} - - -/** - * Time passing a message round a ring - */ -void RingTest() -{ - long me = NODEID_(); - long type = 4; - long left = (me + NNODES_() - 1) % NNODES_(); - long right = (me + 1) % NNODES_(); - char *buf, *buf2; - unsigned char sum, sum2; - long lenbuf, lenmes, nodefrom; - double start, used, rate; - long max_len; - long i; - long sync = 1; - char *malloc(); - - i = 0; - lenbuf = sizeof(long); - - if (me == 0) { - (void) printf("Ring test ... time network performance\n---------\n\n"); - /* - (void) printf("Input maximum message size: "); - (void) fflush(stdout); - if (scanf("%ld", &max_len) != 1) - Error("RingTest: error reading max_len",(long) -1); - if ( (max_len <= 0) || (max_len >= 4*1024*1024) ) - max_len = 256*1024; - */ - } - max_len = 512*1024; - /* type = 4 | MSGINT;*/ - /* BRDCST_(&type, (char *) &max_len, &lenbuf, &i);*/ - - if ( (buf = malloc((unsigned) max_len)) == (char *) NULL) - Error("failed to allocate buf",max_len); - - if (me == 0) { - if ( (buf2 = malloc((unsigned) max_len)) == (char *) NULL) - Error("failed to allocate buf2",max_len); - - for (i=0; i 0) - rate = 1.0e-6 * (double) (NNODES_() * lenbuf) / (double) used; - else - rate = 0.0; - rate = rate * nloops; - printf("len=%6ld bytes, nloop=%4ld, used=%8.4f s, rate=%8.4f Mb/s (0x%x, 0x%x)\n", - lenbuf, nloops, used, rate, sum, sum2); - (void) fflush(stdout); - } - else { - while (loop--) { - RCV_(&type, buf, &lenbuf, &lenmes, &right, &nodefrom, &sync); - SND_(&type, buf, &lenbuf, &left, &sync); - } - } - lenbuf *= 2; - } - - if (me == 0) - (void) free(buf2); - - (void) free(buf); -} - - -int main(int argc, char **argv) -{ - long me, nproc; - long buf[10]; - long type=1, len=sizeof(buf), node, sync=1, i; - - - me = NODEID_(); - nproc = NNODES_(); - /* - if(me==0){ - for(i=0;i<10;i++)buf[i]=i; - - node=1; - SND_(&type, buf, &len, &node, &sync); - printf("me=%d nproc = %d\n", me, nproc); - fflush(stdout); - }else{ - node=0; - RCV_(&type, buf, &len, &len, &node, &node, &sync); - printf("me=%d nproc = %d\n", me, nproc); - fflush(stdout); - for(i=0;i<10;i++)printf("%d ",buf[i]);; - } - */ - RingTest(); -} diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index efceea836..5acf04b63 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -48,6 +48,8 @@ add_library(ga_tools OBJECT ga-wprof.c ) +list (APPEND GA_HEADER_PATHS ${CMAKE_CURRENT_LIST_DIR}) +set (GA_HEADER_PATHS ${GA_HEADER_PATHS} PARENT_SCOPE) target_include_directories(ga_tools BEFORE PRIVATE ${PROJECT_SOURCE_DIR}/ma diff --git a/travis/build-run.sh b/travis/build-run.sh index 349c40f0b..5b779bf84 100755 --- a/travis/build-run.sh +++ b/travis/build-run.sh @@ -55,6 +55,10 @@ else fi case "$os" in Darwin) + xcode_v=$(clang --version 2>&1 |head -1 |cut -d ' ' -f 4 |cut -d . -f 1) + if [[ $( [ $xcode_v -ge 15 ] && echo 1) ]] ; then + export LDFLAGS=" -ld_classic " + fi echo "Mac CFLAGS" $CFLAGS ;; Linux) diff --git a/travis/install-armci-mpi.sh b/travis/install-armci-mpi.sh index 719ec738e..2dea2837d 100755 --- a/travis/install-armci-mpi.sh +++ b/travis/install-armci-mpi.sh @@ -5,6 +5,10 @@ set -x TRAVIS_ROOT="$1" os=`uname` +if [ "x$os" = "xDarwin" ] ; then + brew install libtool + which libtool +fi case "$MPI_IMPL" in mpich) export PATH=$TRAVIS_ROOT/mpich/bin:$PATH diff --git a/travis/install-intel.sh b/travis/install-intel.sh index 2f87c550f..b85c87def 100755 --- a/travis/install-intel.sh +++ b/travis/install-intel.sh @@ -28,12 +28,12 @@ case "$os" in Darwin) mkdir -p ~/mntdmg ~/apps/oneapi || true cd ~/Downloads - dir_base="18342" - dir_hpc="18341" - base="m_BaseKit_p_2022.1.0.92_offline" - hpc="m_HPCKit_p_2022.1.0.86_offline" - curl -LJO https://registrationcenter-download.intel.com/akdlm/irc_nas/"$dir_base"/"$base".dmg - curl -LJO https://registrationcenter-download.intel.com/akdlm/irc_nas/"$dir_hpc"/"$hpc".dmg + dir_base="cd013e6c-49c4-488b-8b86-25df6693a9b7" + dir_hpc="edb4dc2f-266f-47f2-8d56-21bc7764e119" + base="m_BaseKit_p_2023.2.0.49398" + hpc="m_HPCKit_p_2023.2.0.49443" + curl -sS -LJO https://registrationcenter-download.intel.com/akdlm/IRC_NAS/"$dir_base"/"$base".dmg + curl -sS -LJO https://registrationcenter-download.intel.com/akdlm/IRC_NAS/"$dir_hpc"/"$hpc".dmg echo "installing BaseKit" hdiutil attach "$base".dmg -mountpoint ~/mntdmg -nobrowse sudo ~/mntdmg/bootstrapper.app/Contents/MacOS/install.sh --cli --eula accept \ @@ -62,13 +62,43 @@ case "$os" in ;; Linux) export APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 - wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ - && sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ - && echo "deb https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list \ - && sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" \ - && sudo apt-get update \ - && sudo apt-get -y install intel-oneapi-compiler-fortran intel-oneapi-compiler-dpcpp-cpp-and-cpp-classic intel-oneapi-mkl \ - && sudo apt-get -y install intel-oneapi-mpi-devel + export TERM=dumb + rm -f l_Base*sh l_HP*sh + tries=0 ; until [ "$tries" -ge 10 ] ; do \ + dir_base="20f4e6a1-6b0b-4752-b8c1-e5eacba10e01" + dir_hpc="1b2baedd-a757-4a79-8abb-a5bf15adae9a" + base="l_BaseKit_p_2024.0.0.49564" + hpc="l_HPCKit_p_2024.0.0.49589" + wget -nv https://registrationcenter-download.intel.com/akdlm/IRC_NAS/"$dir_hpc"/"$hpc".sh \ + && wget -nv https://registrationcenter-download.intel.com/akdlm/IRC_NAS/"$dir_base"/"$base".sh \ + && break ;\ + tries=$((tries+1)) ; echo attempt no. $tries ; sleep 30 ; done + sh ./"$base".sh -a -c -s --action install --components intel.oneapi.lin.mkl.devel --install-dir $IONEAPI_ROOT --eula accept + if [[ "$?" != 0 ]]; then + df -h + echo "base kit install failed: exit code " "${?}" + exit 1 + fi + rm -rf $IONEAPI_ROOT/mkl/latest/lib/ia32 + rm -rf $IONEAPI_ROOT/mkl/latest/lib/intel64/*sycl* + rm -rf $IONEAPI_ROOT/mkl/latest/lib/intel64/*_pgi_* + rm -rf $IONEAPI_ROOT/mkl/latest/lib/intel64/*_gf_* + intel_components="intel.oneapi.lin.ifort-compiler:intel.oneapi.lin.dpcpp-cpp-compiler" + if [[ "$MPI_IMPL" == "intel" ]]; then + intel_components+=":intel.oneapi.lin.mpi.devel" + fi + sh ./"$hpc".sh -a -c -s --action install \ + --components "$intel_components" \ + --install-dir $IONEAPI_ROOT --eula accept + if [[ "$?" != 0 ]]; then + df -h + echo "hpc kit install failed: exit code " "${?}" + exit 1 + fi + rm -rf $IONEAPI_ROOT/compiler/latest/linux/lib/oclfpga + rm -f ./"$hpc".sh ./"$base".sh + rm -rf $IONEAPI_ROOT/compiler/latest/linux/lib/oclfpga || true + source "$IONEAPI_ROOT"/setvars.sh --force || true export I_MPI_F90=ifort export I_MPI_F77=ifort