# .github/workflows/ci.yml

# Main CI workflow: workflow name, triggers, concurrency policy and
# workflow-wide environment.
name: ci
on:
  pull_request:
    # Only run on pull requests that touch one of the paths below. Order
    # matters: the `!` pattern removes a file matched by an earlier pattern.
    paths:
      # NOTE: keep these paths in sync with the paths that trigger the
      # fuzzydata GitHub Actions in .github/workflows/fuzzydata-test.yml
      - '.github/workflows/**'
      - '.github/actions/**'
      - '!.github/workflows/push-to-main.yml'
      - 'asv_bench/**'
      - 'modin/**'
      - 'requirements/**'
      - 'scripts/**'
      - 'environment-dev.yml'
      - 'requirements-dev.txt'
      - 'setup.cfg'
      - 'setup.py'
      - 'versioneer.py'
  push:
  schedule:
    # The python-filter job compares `github.event.schedule` against these
    # exact strings to choose a Python version, so keep both in sync.
    - cron: '30 2 * * WED'
    - cron: '30 2 * * THU'
concurrency:
  # Cancel other jobs in the same branch. We don't care whether CI passes
  # on old commits.
  group: ${{ github.workflow }}-${{ github.ref }}
  # Only refs under `refs/pull/` have their in-progress runs cancelled.
  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
env:
  MODIN_GITHUB_CI: true

jobs:
  python-filter:
    # Select the Python version used by downstream jobs. Each scheduled run
    # maps its cron string to a specific version; every other trigger
    # (push, pull request) falls through to the default.
    runs-on: ubuntu-latest
    outputs:
      python-version: ${{ steps.choose.outputs.python-version }}
    steps:
    - id: choose
      run: |
        # The patterns are quoted so that `*` is matched literally.
        case "${{ github.event.schedule }}" in
          "30 2 * * WED") version="3.10" ;;
          "30 2 * * THU") version="3.11" ;;
          *) version="3.9" ;;
        esac
        echo "python-version=${version}" >> "$GITHUB_OUTPUT"

  # lint-flake8:
  #   needs: [python-filter]
  #   name: lint (flake8)
  #   runs-on: ubuntu-latest
  #   steps:
  #     - uses: actions/checkout@v4
  #     - uses: ./.github/actions/python-only
  #       with:
  #         python-version: ${{ needs.python-filter.outputs.python-version }}
  #     # NOTE: If you are changing the set of packages installed here, make sure that
  #     # the dev requirements match them.
  #     - run: pip install flake8 flake8-print flake8-no-implicit-concat
  #     # NOTE: keep the flake8 command here in sync with the pre-commit hook in
  #     # /contributing/pre-commit
  #     - run: flake8 modin/ asv_bench/benchmarks scripts/doc_checker.py

  execution-filter:
    # Choose which executions we want to run all tests for on a pull request.
    # We always test 'native' and 'python' executions completely because they
    # are fast, but we only test ray, dask, and unidist, if we think this pull
    # request is affecting how we execute with those engines specifically.
    runs-on: ubuntu-latest
    outputs:
      ray: ${{ steps.filter.outputs.ray }}
      dask: ${{ steps.filter.outputs.dask }}
      unidist: ${{ steps.filter.outputs.unidist }}
      # JSON list built by the `engines` step below.
      engines: ${{ steps.engines.outputs.engines }}
      # The `experimental` flag is produced by the paths-filter step
      # (id: filter); there is no step with id `experimental`.
      experimental: ${{ steps.filter.outputs.experimental }}
    steps:
    - uses: actions/checkout@v4
    - uses: dorny/paths-filter@v3
      id: filter
      with:
        # Each top-level key becomes a step output of the same name. The
        # `shared` anchor is aliased into every engine filter so that a
        # change to the dispatching code counts as a change to every engine.
        filters: |
          shared: &shared
            - 'modin/core/execution/dispatching/**'
          ray:
            - *shared
            - 'modin/core/execution/ray/**'
          dask:
            - *shared
            - 'modin/core/execution/dask/**'
          unidist:
            - *shared
            - 'modin/core/execution/unidist/**'
          experimental:
            - 'modin/experimental/**'
    - uses: actions/setup-python@v5
    - id: engines
      # Emit `engines=<json list>`: always 'python' and 'native', plus 'ray'
      # and/or 'dask' when the corresponding filter output is 'true'.
      run: |
        python -c "import sys, json; print('engines=' + json.dumps(['python', 'native'] + (sys.argv[1] == 'true' and ['ray'] or []) + (sys.argv[2] == 'true' and ['dask'] or []) ))" \
              "${{ steps.filter.outputs.ray }}" "${{ steps.filter.outputs.dask }}" >> "$GITHUB_OUTPUT"
    - id: show-variables
      # Debugging aid: print the filter results to the job log.
      run: echo ${{ steps.filter.outputs.ray }} , ${{ steps.filter.outputs.dask }} , ${{ steps.filter.outputs.unidist }} , ${{ steps.filter.outputs.experimental }}

  test-sanity:
    # The "sanity" tests run on each pull request to test that a subset of the
    # full tests work with the slower engines (ray, dask, and unidist-MPI).
    needs: [execution-filter, python-filter]
    # If we don't need any of these, we get a single job with an empty matrix
    # (that is, os, execution, etc. are not set and so we treat them as "").
    # So, if the matrix is going to be empty, we need to skip this job completely:
    # https://stackoverflow.com/a/77118991
    if: |
      github.event_name == 'pull_request' &&
      (
        needs.execution-filter.outputs.ray != 'true' ||
        needs.execution-filter.outputs.dask != 'true' ||
        needs.execution-filter.outputs.unidist != 'true'
      )
    strategy:
      matrix:
        os:
          - ubuntu
          - windows
        python-version: [ "${{ needs.python-filter.outputs.python-version }}" ]
        # Single-valued helper dimensions: they exist only so that the
        # `exclude` entries below can match on the execution-filter outputs.
        running-all-ray-tests: [ "${{ needs.execution-filter.outputs.ray }}" ]
        running-all-dask-tests: [ "${{needs.execution-filter.outputs.dask}}" ]
        running-all-unidist-tests: [ "${{needs.execution-filter.outputs.unidist}}" ]
        # `execution` is a plain engine name (a string), so every expression
        # in this job reads `matrix.execution` directly.
        execution: [ray, dask, unidist]
        # If we're going to run all tests for an engine because we've detected
        # a change to that engine, we don't need to run these sanity tests on
        # it, so drop the corresponding matrix combination.
        exclude:
          - running-all-ray-tests: 'true'
            execution: ray
          - running-all-dask-tests: 'true'
            execution: dask
          - running-all-unidist-tests: 'true'
            execution: unidist
    runs-on: ${{ matrix.os }}-latest
    defaults:
      run:
        shell: bash -l {0}
    env:
      MODIN_ENGINE: ${{ matrix.execution }}
      UNIDIST_BACKEND: "mpi"
      # Pass `-n 2` to pytest everywhere except on unidist and on Windows.
      PARALLEL: ${{ matrix.execution != 'unidist' && matrix.os != 'windows' && '-n 2' || '' }}
      # ray and dask use plain pytest; unidist runs pytest under mpiexec with
      # the fake AWS credentials that are also given to the moto service.
      PYTEST_COMMAND: >-
        ${{
          (
            (matrix.execution == 'ray' || matrix.execution == 'dask') &&
            'python -m pytest'
          ) ||
          (
            matrix.execution == 'unidist' &&
            'mpiexec -n 1 -genv AWS_ACCESS_KEY_ID foobar_key -genv AWS_SECRET_ACCESS_KEY foobar_secret python -m pytest'
          ) ||
          'UNKNOWN_PYTEST_COMMAND'
        }}
    name: test-${{ matrix.os }}-sanity (engine ${{ matrix.execution }}, python ${{matrix.python-version}})
    services:
      moto:
        # An empty image name is used on Windows, where the expression
        # evaluates to ''.
        image: ${{ matrix.os != 'windows' && 'motoserver/moto:5.0.13' || '' }}
        ports:
          - 5000:5000
        env:
          AWS_ACCESS_KEY_ID: foobar_key
          AWS_SECRET_ACCESS_KEY: foobar_secret
    steps:
      - uses: actions/checkout@v4
      - uses: ./.github/actions/mamba-env
        with:
          # unidist has a dedicated environment file per OS; every other
          # engine uses the shared development environment.
          environment-file: >-
            ${{
              (matrix.os == 'ubuntu' && matrix.execution == 'unidist' && 'requirements/env_unidist_linux.yml') ||
              (matrix.os == 'windows' && matrix.execution == 'unidist' && 'requirements/env_unidist_win.yml') ||
              'environment-dev.yml'
            }}
          activate-environment: ${{ matrix.execution == 'unidist' && 'modin_on_unidist' || 'modin' }}
          python-version: ${{matrix.python-version}}
      - name: Install HDF5
        run: sudo apt update && sudo apt install -y libhdf5-dev
        if: matrix.os != 'windows'
      - name: Limit ray memory
        run: echo "MODIN_MEMORY=1000000000" >> "$GITHUB_ENV"
        if: matrix.os != 'windows' && matrix.execution == 'ray'
      - name: Tell Modin to use existing ray cluster
        run: echo "MODIN_RAY_CLUSTER=True" >> "$GITHUB_ENV"
        if: matrix.os == 'windows' && matrix.execution == 'ray'
      - name: Start local ray cluster
        # Try a few times to start ray to work around
        # https://github.com/modin-project/modin/issues/4562
        uses: nick-fields/retry@v3
        with:
          timeout_minutes: 5
          max_attempts: 5
          command: ray start --head --port=6379 --object-store-memory=1000000000
        if: matrix.os == 'windows' && matrix.execution == 'ray'
      - run: MODIN_BENCHMARK_MODE=True $PYTEST_COMMAND modin/tests/pandas/internals/test_benchmark_mode.py
      - run: $PYTEST_COMMAND $PARALLEL modin/tests/test_partition_api.py
      - run: $PYTEST_COMMAND modin/tests/pandas/extensions
      - name: xgboost tests
        run: |
          # TODO(https://github.com/modin-project/modin/issues/5194): Uncap xgboost
          # when we use collective instead of rabit.
          mamba install "xgboost>=1.7.1,<2.0.0" scikit-learn -c conda-forge
          $PYTEST_COMMAND $PARALLEL \
                  modin/tests/experimental/xgboost/test_default.py \
                  modin/tests/experimental/xgboost/test_xgboost.py \
                  modin/tests/experimental/xgboost/test_dmatrix.py
        if: matrix.os != 'windows' && needs.execution-filter.outputs.experimental == 'true'
      - run: $PYTEST_COMMAND $PARALLEL modin/tests/experimental/test_pipeline.py
        if: matrix.os != 'windows' && matrix.execution != 'unidist' && needs.execution-filter.outputs.experimental == 'true'
      - name: "test DF: binary, default, iter"
        run: |
          $PYTEST_COMMAND $PARALLEL \
                  modin/tests/pandas/dataframe/test_binary.py \
                  modin/tests/pandas/dataframe/test_default.py \
                  modin/tests/pandas/dataframe/test_iter.py
        if: matrix.os != 'windows'
      - name: "test DF: reduce, udf, window, pickle"
        run: |
          $PYTEST_COMMAND $PARALLEL \
                  modin/tests/pandas/dataframe/test_reduce.py \
                  modin/tests/pandas/dataframe/test_udf.py \
                  modin/tests/pandas/dataframe/test_window.py \
                  modin/tests/pandas/dataframe/test_pickle.py
        if: matrix.os != 'windows'
      # test_series.py and test_map_metadata.py run in full on ray; the other
      # engines skip tests marked `exclude_in_sanity`.
      - run: $PYTEST_COMMAND modin/tests/pandas/test_series.py
        if: matrix.execution == 'ray'
      - run: $PYTEST_COMMAND -m "not exclude_in_sanity" modin/tests/pandas/test_series.py
        if: matrix.execution != 'ray'
      - run: $PYTEST_COMMAND modin/tests/pandas/dataframe/test_map_metadata.py
        if: matrix.execution == 'ray'
      - run: $PYTEST_COMMAND -m "not exclude_in_sanity" modin/tests/pandas/dataframe/test_map_metadata.py
        if: matrix.execution != 'ray'
      - name: "test rolling, expanding, reshape, general, concat"
        run: |
          $PYTEST_COMMAND $PARALLEL \
                  modin/tests/pandas/test_rolling.py \
                  modin/tests/pandas/test_expanding.py \
                  modin/tests/pandas/test_reshape.py \
                  modin/tests/pandas/test_general.py \
                  modin/tests/pandas/test_concat.py
        if: matrix.os != 'windows'
      - run: $PYTEST_COMMAND $PARALLEL modin/tests/numpy
      - run: $PYTEST_COMMAND -m "not exclude_in_sanity" modin/tests/pandas/test_io.py --verbose
        if: matrix.execution != 'unidist'
      - uses: nick-fields/retry@v3
        # to avoid issues with non-stable `to_csv` tests for unidist on MPI backend.
        # for details see: https://github.com/modin-project/modin/pull/6776
        with:
          timeout_minutes: 15
          max_attempts: 3
          command: conda run --no-capture-output -n modin_on_unidist $PYTEST_COMMAND -m "not exclude_in_sanity" modin/tests/pandas/test_io.py --verbose
        if: matrix.execution == 'unidist'
      - run: $PYTEST_COMMAND modin/tests/experimental/test_io_exp.py
      - run: $PYTEST_COMMAND $PARALLEL modin/tests/interchange/dataframe_protocol/test_general.py
      - run: $PYTEST_COMMAND $PARALLEL modin/tests/interchange/dataframe_protocol/pandas/test_protocol.py
      - name: Stop local ray cluster
        run: ray stop
        if: matrix.os == 'windows' && matrix.execution == 'ray'
      - uses: ./.github/actions/upload-coverage