Commit

docs: Add wiki
lllAlexanderlll committed Jan 15, 2024
1 parent 3cf1ed3 commit b7169c4
Showing 18 changed files with 454 additions and 2 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/build_documentation.yml
@@ -12,8 +12,7 @@ jobs:
       - uses: actions/checkout@v4
       - name: Check path
         run: |
-          pwd
-          ls -alsh
+          pip install myst-parser
       - uses: ammaraskar/sphinx-action@master
         with:
           docs-folder: "docs/"
13 changes: 13 additions & 0 deletions .readthedocs.yaml
@@ -0,0 +1,13 @@
version: "2"

build:
  os: "ubuntu-22.04"
  tools:
    python: "3.10"

python:
  install:
    - requirements: docs/requirements.txt

sphinx:
  configuration: docs/source/conf.py
Empty file removed docs/.gitkeep
Empty file.
20 changes: 20 additions & 0 deletions docs/Makefile
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
35 changes: 35 additions & 0 deletions docs/make.bat
@@ -0,0 +1,35 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
2 changes: 2 additions & 0 deletions docs/requirements.txt
@@ -0,0 +1,2 @@
sphinx==7.1.2
sphinx-rtd-theme==1.3.0rc1
7 changes: 7 additions & 0 deletions docs/source/api.rst
@@ -0,0 +1,7 @@
API
===

.. autosummary::
   :toctree: generated

   lumache
3 changes: 3 additions & 0 deletions docs/source/benchmarking.rst
@@ -0,0 +1,3 @@
Benchmarking
============

**EDIT "docs/source/benchmarking.rst" IN ORDER TO MAKE CHANGES HERE**
35 changes: 35 additions & 0 deletions docs/source/conf.py
@@ -0,0 +1,35 @@
# Configuration file for the Sphinx documentation builder.

# -- Project information

project = "LLMGym"
copyright = "2023, Fraunhofer"
author = "Max Lübbering"

release = "0.1"
version = "0.1.0"

# -- General configuration

extensions = [
    "sphinx.ext.duration",
    "sphinx.ext.doctest",
    "sphinx.ext.autodoc",
    "sphinx.ext.autosummary",
    "sphinx.ext.intersphinx",
]

intersphinx_mapping = {
    "python": ("https://docs.python.org/3/", None),
    "sphinx": ("https://www.sphinx-doc.org/en/master/", None),
}
intersphinx_disabled_domains = ["std"]

templates_path = ["_templates"]

# -- Options for HTML output

html_theme = "sphinx_rtd_theme"

# -- Options for EPUB output
epub_show_urls = "footnote"
90 changes: 90 additions & 0 deletions docs/source/configuration.rst
@@ -0,0 +1,90 @@
.. role:: python(code)
:language: python

Configuration
=============

**EDIT "docs/source/configuration.rst" IN ORDER TO MAKE CHANGES HERE**

Training configs are defined in YAML-formatted files; see :file:`data/config_lorem_ipsum.yaml`. These configs are very explicit, specifying all training parameters to keep model trainings as transparent and reproducible as possible. Each config setting is reflected in pydantic classes in :file:`src/llm_gym/config/*.py`. In the config you define which config class to load via the field :python:`type_hint`, which specifies the concrete class. A second field, :python:`config`, then takes all the constructor arguments for that config class. This way it is easy to swap out, e.g., DataLoaders while still having input validation in place.
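
As a hedged sketch (the scheduler field names are taken from the scheduler example further below; the exact layout of :file:`data/config_lorem_ipsum.yaml` may differ), such a :python:`type_hint`/:python:`config` pair could look like:

```yaml
scheduler:
  type_hint: StepLR   # selects the concrete class via ClassResolver
  config:             # constructor arguments, validated by pydantic
    step_size: 1
    gamma: 0.9
```
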

Pydantic and ClassResolver
--------------------------

The mechanism introduced to instantiate classes via :python:`type_hint` in the :file:`config.yaml` utilizes:

1) Omegaconf to load the config yaml file
2) Pydantic for the validation of the config
3) ClassResolver to instantiate the correct, concrete class of a class hierarchy.

Firstly, Omegaconf loads the config YAML file and resolves internal references such as :python:`${subconfig.attribute}`.

Then, Pydantic validates the whole config as-is and checks that each of the sub-configs is a :python:`pydantic.BaseModel` class.
For configs which allow different concrete classes to be instantiated by :python:`ClassResolver`, the special member names :python:`type_hint` and :python:`config` are introduced.
With this, we utilize Pydantic's feature to auto-select a fitting type based on the keys in the config YAML file.

:python:`ClassResolver` replaces large if-else control structures to infer the correct concrete type with a :python:`type_hint` used for correct class selection:

.. code-block:: python

   activation_resolver = ClassResolver(
       [nn.ReLU, nn.Tanh, nn.Hardtanh],
       base=nn.Module,
       default=nn.ReLU,
   )

   type_hint = "ReLU"
   activation_kwargs = {...}
   activation_resolver.make(type_hint, activation_kwargs)

In our implementation we go a step further, as both

* a :python:`type_hint` in a :python:`BaseModel` config must be of type :python:`llm_gym.config.lookup_types.LookupEnum` and
* :python:`config` is a union of allowed concrete configs of base type :python:`BaseModel`.

:python:`config` hereby replaces :python:`activation_kwargs` from the example above with pydantic-validated :python:`BaseModel` configs.

With this, a mapping between the type-hint strings needed for `class-resolver` and the concrete classes is introduced, while still allowing pydantic to select the correct concrete config:

.. code-block:: python

   from enum import Enum

   import torch
   from pydantic import BaseModel, PositiveFloat, PositiveInt, confloat, conint

   class LookupEnum(Enum):
       @classmethod
       def _missing_(cls, value: str) -> type:
           """Constructs the Enum by member name, if not constructable by value."""
           return cls.__dict__[value]

   class SchedulerTypes(LookupEnum):
       StepLR = torch.optim.lr_scheduler.StepLR
       ConstantLR = torch.optim.lr_scheduler.ConstantLR

   class StepLRConfig(BaseModel):
       step_size: conint(ge=1)
       gamma: confloat(ge=0.0)

   class ConstantLRConfig(BaseModel):
       factor: PositiveFloat
       total_iters: PositiveInt

   class SchedulerConfig(BaseModel):
       type_hint: SchedulerTypes
       config: StepLRConfig | ConstantLRConfig
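
To illustrate the name-based fallback in :python:`LookupEnum._missing_`, here is a minimal, stdlib-only sketch (the :python:`Color` enum is an invented stand-in, not part of the code base):

```python
from enum import Enum

class LookupEnum(Enum):
    @classmethod
    def _missing_(cls, value):
        # Fall back to lookup by member *name* when lookup by value fails.
        return cls.__dict__[value]

class Color(LookupEnum):
    RED = (255, 0, 0)
    GREEN = (0, 255, 0)

# Construction by value still works as usual:
assert Color((255, 0, 0)) is Color.RED
# Construction by member name, as used for type_hint strings in the YAML config:
assert Color("RED") is Color.RED
```

This is exactly what lets pydantic turn the string :python:`"StepLR"` from the YAML file into the corresponding enum member.
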

To allow a user-friendly instantiation, all class resolvers are defined in the :python:`ResolverRegistry`, and :python:`build_component_by_config` is introduced as a convenience function. Dependencies can be passed through with the :python:`extra_kwargs` argument:

.. code-block:: python

   resolvers = ResolverRegister(config=config)
   optimizer = ...  # our example dependency
   scheduler = resolvers.build_component_by_config(
       config=config.scheduler, extra_kwargs=dict(optimizer=optimizer)
   )

To add a new resolver, use :python:`add_resolver`; the added resolver will then be accessible via the :python:`register_key` given when adding it.

For access use the :python:`build_component_by_key_query` function of the :python:`ResolverRegistry`.



66 changes: 66 additions & 0 deletions docs/source/entrypoints.rst
@@ -0,0 +1,66 @@
.. role:: python(code)
:language: python

.. role:: bash(code)
:language: bash

**EDIT "docs/source/entrypoints.rst" IN ORDER TO MAKE CHANGES HERE**

Entrypoints
===========

We use `click <https://click.palletsprojects.com/en/>`_ as a tool to add new entry points and their CLI arguments.
For this we have a main entry point from which all other entry points are started.

The main entry point is :file:`src/llm_gym/__main__.py:main()`.
We register other sub-entrypoints by using our main :python:`click.group`, called :python:`main`, as follows:

.. code-block:: python

   @main.command(name="my_new_entry_point")

See the following full example:

.. code-block:: python

   from pathlib import Path

   import click
   import click_pathlib

   @click.group()
   def main() -> None:
       pass

   config_option = click.option(
       "--config_file_path",
       type=click_pathlib.Path(exists=False),
       required=True,
       help="Path to a file with the YAML config file.",
   )

   @main.command(name="do_stuff")
   @config_option
   @click.option(
       "--my_cli_argument",
       type=int,
       required=True,
       help="New integer argument",
   )
   def entry_point_do_stuff(config_file_path: Path, my_cli_argument: int):
       print(f"Do stuff with {config_file_path} and {my_cli_argument}...")
       ...

   if __name__ == "__main__":
       main()

With

.. code-block:: toml

   [project.scripts]
   llm_gym = "llm_gym.__main__:main"

in our :file:`pyproject.toml`, we can start only main with :bash:`llm_gym` (which does nothing), or a specific sub-entrypoint, e.g., :bash:`llm_gym do_stuff --config_file_path config_files/config.yaml --my_cli_argument 3537`.
Alternatively, directly use :bash:`src/llm_gym/__main__.py do_stuff --config_file_path config_files/config.yaml --my_cli_argument 3537`.
9 changes: 9 additions & 0 deletions docs/source/future_work.rst
@@ -0,0 +1,9 @@
Future Work
===========

The team is currently working on our already established LLM code base to bring multi-modality into the mix. This extension will be based on ideas similar to CoCa and/or AudioPaLM, which would enable users either to use different encoders for different modalities in conjunction with a text-based decoder, or to use a decoder-only architecture.
Modalities other than text can then be used, namely:

* image
* audio
* video
45 changes: 45 additions & 0 deletions docs/source/index.rst
@@ -0,0 +1,45 @@
Welcome to LLMGym's documentation!
===================================

**EDIT "docs/source/index.rst" IN ORDER TO MAKE CHANGES HERE**

<TODO: Add abstract --> still needed: USPs, key features; include FSDP here;>

<TODO: CAN ADD LINKS TO SPECIFIC THINGS USERS CAN EXPLORE AT FIRST>


.. note::

This project is under active development.

.. toctree::
:caption: Getting Started

quickstart
configuration
model_cards
benchmarking
known_issues

.. toctree::
:caption: Datasets

memmap

.. toctree::
:caption: Entrypoints

entrypoints

.. toctree::
:caption: VSCode Setup

vs_code_setup


.. toctree::
:caption: Future Work

future_work


7 changes: 7 additions & 0 deletions docs/source/known_issues.rst
@@ -0,0 +1,7 @@
Known Issues
============

**EDIT "docs/source/known_issues.rst" IN ORDER TO MAKE CHANGES HERE**

1. hardcoded dataset path :file:`/raid/s3/opengptx/mehdi/temp/temp_data/train_text_document.bin` in :file:`config/config.yaml`
2. Dependency on Weights & Biases
43 changes: 43 additions & 0 deletions docs/source/memmap.rst
@@ -0,0 +1,43 @@
.. role:: python(code)
:language: python

.. role:: bash(code)
:language: bash

MemMap Datasets
===============

**EDIT "docs/source/memmap.rst" IN ORDER TO MAKE CHANGES HERE**

MemMapDataset Index Generator
-----------------------------

The :python:`MemMapDataset` requires an index file providing the necessary pointers into the raw data file. The :python:`MemMapDataset` can create the index file lazily; however, it is advised to create it beforehand. This can be done by running

.. code-block:: bash

   llm_gym create_memmap_index <path/to/jsonl/file>

The index will be created in the same directory as the raw data file. For further options you may look into the usage documentation via :bash:`llm_gym create_memmap_index --help`.

Packed Dataset Generator
------------------------

The :python:`PackedMemMapDatasetContinuous` and :python:`PackedMemMapDatasetMegatron` require a packed data file. To create the data file, you first have to generate a :python:`MemMapDataset` index file as described `above <memMapDataset-index-generator>`_. Assuming the index and raw data are located in the same directory, you can simply execute the following command:

.. code-block:: bash

   llm_gym create_packed_data <path/to/jsonl/file>

The packed data file will be created in the same directory as the raw data file. For further options you may look into the usage documentation via :bash:`llm_gym create_packed_data --help`.

Packed Data Format
~~~~~~~~~~~~~~~~~~

The packed data file is a bytestream containing both the tokenized data as well as an index denoting the start and length of the tokenized documents inside the bytestream. The data file consists of 3 concatenated parts:

header segment | data segment | index segment

* **header segment**: This section is an 8-byte integer which encodes the length of the data segment in bytes.
* **data segment**: This section contains a concatenation of all documents in the form of 4-byte tokens. An end-of-sequence token is placed between consecutive documents.
* **index segment**: This section contains a pickled index which locates the documents inside the data segment. The index is basically a list of tuples, where each tuple contains the start position and length in bytes of the corresponding document, e.g., :python:`[(start_doc1, len_doc1), (start_doc2, len_doc2), ...]`.
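
The layout above can be illustrated with a small, stdlib-only round-trip sketch. Note the byte order, the EOS token id, and counting the EOS token into a document's length are assumptions here, not guaranteed to match the actual implementation:

```python
import pickle
import struct

TOKEN_SIZE = 4  # bytes per token, as described above
EOS_TOKEN = 0   # assumed end-of-sequence token id

def write_packed(path, documents):
    """Write header segment | data segment | index segment."""
    data = bytearray()
    index = []
    for doc in documents:
        start = len(data)
        for token in doc + [EOS_TOKEN]:  # EOS separates consecutive documents
            data += struct.pack("<I", token)          # 4-byte token
        index.append((start, len(data) - start))      # (start, length) in bytes
    with open(path, "wb") as f:
        f.write(struct.pack("<Q", len(data)))         # 8-byte header: data length
        f.write(data)
        f.write(pickle.dumps(index))                  # pickled index segment

def read_packed(path):
    """Recover the token documents via the header and the pickled index."""
    with open(path, "rb") as f:
        (data_len,) = struct.unpack("<Q", f.read(8))
        data = f.read(data_len)
        index = pickle.loads(f.read())
    return [
        list(struct.unpack(f"<{length // TOKEN_SIZE}I", data[start:start + length]))
        for start, length in index
    ]
```

For example, writing :python:`[[1, 2], [3]]` and reading the file back yields the same documents with the assumed EOS token appended to each.
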