Merge pull request #1 from pyjanitor-devs/infra-docs

Infra + docs
pyjanitor-devs · Feb 18, 2023 · cfadc99 · cfadc99
2 parents cbab671 + 3a02037
commit cfadc99
Show file tree

Hide file tree

Showing 44 changed files with 1,137 additions and 513 deletions.
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
@@ -0,0 +1,76 @@
+# Publish documentation
+name: documentation
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell
+    defaults:
+      run:
+        shell: bash -l {0}
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      # See: https://github.com/marketplace/actions/setup-miniconda
+      - name: Setup miniconda
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          auto-update-conda: true
+          miniforge-variant: Mambaforge
+          environment-file: environment.yml
+          use-mamba: true
+
+      - name: Install pyjviz
+        # use editable mode to avoid _pytest.pathlib.ImportPathMismatchError
+        run: pip install -e .
+
+      - name: Build docs
+        run: mkdocs build
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: website
+          path: site/
+
+      - name: Docs preview
+        if: ${{ github.event_name == 'pull_request' }}
+        uses: nwtgck/actions-netlify@v1.1
+        with:
+          publish-dir: "./site"
+          production-deploy: false
+          github-token: ${{ secrets.GHPAGES_TOKEN }}
+          deploy-message: "Deploy from GitHub Actions"
+          enable-pull-request-comment: true
+          enable-commit-comment: false
+          overwrites-pull-request-comment: true
+          alias: deploy-preview-${{ github.event.number }}
+        env:
+          NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
+          NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }}
+        timeout-minutes: 1
+
+      - name: Deploy website
+        if: ${{ github.event_name == 'push' }}
+        uses: peaceiris/actions-gh-pages@v3
+        with:
+          # https://github.com/peaceiris/actions-gh-pages#%EF%B8%8F-set-personal-access-token-personal_token
+          personal_token: ${{ secrets.GHPAGES_TOKEN }}
+          publish_dir: ./site/
+          publish_branch: gh-pages
+          # destination_dir: manuscript
+          allow_empty_commit: false
+          keep_files: false
+          force_orphan: true
+          enable_jekyll: false
+          disable_nojekyll: false
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,35 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v4.4.0
+  hooks:
+  - id: trailing-whitespace
+  - id: end-of-file-fixer
+  - id: check-yaml
+  - id: check-added-large-files
+- repo: https://github.com/psf/black
+  rev: 23.1.0
+  hooks:
+  - id: black
+    args: [--config, pyproject.toml]
+# - repo: https://github.com/pycqa/isort
+#   rev: 5.11.2
+#   hooks:
+#     - id: isort
+#       name: isort (python)
+- repo: https://github.com/econchick/interrogate
+  rev: 1.5.0
+  hooks:
+  - id: interrogate
+    args: [-c, pyproject.toml]
+- repo: https://github.com/terrencepreilly/darglint
+  rev: v1.8.1
+  hooks:
+  - id: darglint
+    args: [-v 2]  # this config makes the error messages a bit less cryptic.
+- repo: https://github.com/PyCQA/flake8
+  rev: 6.0.0
+  hooks:
+    - id: flake8
+      args: [--exclude, nbconvert_config.py]
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,5 @@
+{
+    "editor.rulers": [
+        79
+    ]
+}
diff --git a/README.md b/README.md
@@ -59,7 +59,7 @@ The [`resulting SVG file`][res] of the run shows method chain calls along with i
 
 ## How it works?
 
-`pyjviz` provides the way to create logfile which contains RDF graph of program behaviour. The visualization features of pyjviz provided in the package itself are based on RDF graph translation to graphviz dot lanuage. `pyjanitor` method chains are represented using certain RDF data schema (ref here to shacl defs). Using `pandas` extentions API `pyjanitor` (and `pandas`) method call arguments and returns are saved into RDF log. 
+`pyjviz` provides a way to create a logfile which contains an RDF graph of program behaviour. The visualization features of pyjviz provided in the package itself are based on RDF graph translation to the graphviz dot language. `pyjanitor` method chains are represented using a certain RDF data schema (ref here to shacl defs). Using the `pandas` extensions API, `pyjanitor` (and `pandas`) method call arguments and returns are saved into the RDF log.
 
 > **Note**
 > Visualisation of the pyjviz RDF graph is not a main goal of the provided package. The Graphviz-based visualization available in the package is rather a reference implementation with quite limited (but still useful) capabilities.
@@ -68,12 +68,10 @@ Python objects from `pyjviz` point of view have `object identity` and `object st
 
 E.g. the simplest form of a pandas dataframe 'carbon copy' can be obtained by taking the output of method head() and converting it to HTML format - the result of the df.head().to_html() call. A more comprehensive CC would be a dataframe plot as generated by the .plot method and saved as a byte sequence. Note that a 'carbon copy' does not necessarily capture all details of the original object state. If one needs the precise object state, one would have to use a CC class which guarantees that. A CC like that would be based on the .to_csv method in the example above.
 
-The way how particular call argument/return or other python objects are saved into RDF log is specified using CCGlance `carbon copy` class. For pandas dataframe it will save just shape of dataframe and its head() output serialized as HTML. If user wants to have other CC of the object it is always possible to use .cc() method ((ref here, rename .pin() to .cc()) 
+The way particular call arguments/returns or other python objects are saved into the RDF log is specified using the CCGlance `carbon copy` class. For a pandas dataframe it will save just the shape of the dataframe and its head() output serialized as HTML. If the user wants to have another CC of the object it is always possible to use the .cc() method (ref here, rename .pin() to .cc()).
 
 --------
 
 Obj is a representation of a pyjanitor object like a pandas DataFrame. However, input args are not objects but rather object states. The state of an object is represented by the RDF class ObjState. The idea of separating object and object state is introduced to enable pyjviz to visualize situations where an object has multiple states used in a method chain due to in-place operations. Such practice is discouraged by most data packages but may still be used. In most cases, where an object has only the state defined when the object is created, there is no difference between object and object state since there is a one-to-one correspondence (isomorphism). So in some contexts below, a reference to an object may imply object state instead.
 
 pyjviz also introduces the MethodCall RDF class. It represents a pyjanitor method call. A MethodCall object has incoming links from input objects and an outgoing link to an object representing the return object.
-
-
diff --git a/docs/index.md b/docs/index.md
diff --git a/doc/tmp/tmp2eu_4xcg.html → docs/tmp/tmp2eu_4xcg.html b/doc/tmp/tmp2eu_4xcg.html → docs/tmp/tmp2eu_4xcg.html
@@ -31,4 +31,4 @@
       <td>800.0</td>
     </tr>
   </tbody>
-</table>
+</table>
diff --git a/doc/tmp/tmpc3kafay7.html → docs/tmp/tmpc3kafay7.html b/doc/tmp/tmpc3kafay7.html → docs/tmp/tmpc3kafay7.html
@@ -27,4 +27,4 @@
       <td>675.0</td>
     </tr>
   </tbody>
-</table>
+</table>
diff --git a/doc/tmp/tmpd_ffzcp4.html → docs/tmp/tmpd_ffzcp4.html b/doc/tmp/tmpd_ffzcp4.html → docs/tmp/tmpd_ffzcp4.html
@@ -33,4 +33,4 @@
       <td>675.0</td>
     </tr>
   </tbody>
-</table>
+</table>
diff --git a/doc/tmp/tmpij_bu_q6.html → docs/tmp/tmpij_bu_q6.html b/doc/tmp/tmpij_bu_q6.html → docs/tmp/tmpij_bu_q6.html
@@ -27,4 +27,4 @@
       <td>675.0</td>
     </tr>
   </tbody>
-</table>
+</table>
diff --git a/doc/tmp/tmpqji_p6l7.html → docs/tmp/tmpqji_p6l7.html b/doc/tmp/tmpqji_p6l7.html → docs/tmp/tmpqji_p6l7.html
@@ -27,4 +27,4 @@
       <td>675.0</td>
     </tr>
   </tbody>
-</table>
+</table>
diff --git a/doc/tmp/tmpt16kh8sh.html → docs/tmp/tmpt16kh8sh.html b/doc/tmp/tmpt16kh8sh.html → docs/tmp/tmpt16kh8sh.html
@@ -38,4 +38,4 @@
       <td>675.0</td>
     </tr>
   </tbody>
-</table>
+</table>
diff --git a/doc/why-janitor.py.ttl.dot.svg → docs/why-janitor.py.ttl.dot.svg b/doc/why-janitor.py.ttl.dot.svg → docs/why-janitor.py.ttl.dot.svg
diff --git a/environment.yml b/environment.yml
@@ -0,0 +1,18 @@
+name: pyjviz
+channels:
+  - conda-forge
+dependencies:
+  - python=3.9
+  - rdflib
+  - graphviz
+  - python-graphviz
+  - pandas
+  - pandas-flavor
+  - black
+  - mypy
+  - flake8
+  - mkdocs
+  - pre-commit
+  - mkdocs-material
+  - mkdocstrings
+  - pymdown-extensions
diff --git a/examples/image-processing/image-processing.py b/examples/image-processing/image-processing.py
@@ -18,92 +18,115 @@
 # apply overload causes problems in printing of dataframes
 # so it is included here to make nested call visualization work
 old_apply = pd.Series.apply
+
+
 @pf.register_series_method
 def apply(s: pd.Series, func) -> pd.Series:
     ret = old_apply(s, func)
     return ret
 
+
 @pf.register_series_method
 def load_images(file_pathes: pd.Series) -> pd.DataFrame:
-    #ipdb.set_trace()
+    # ipdb.set_trace()
     df = pd.DataFrame()
     for file_path in file_pathes:
         x_image = imread(file_path)
         im_name = os.path.basename(file_path)
-        df = df.append({'im_name': im_name, 'image': x_image}, ignore_index = True)
-
+        df = df.append(
+            {"im_name": im_name, "image": x_image}, ignore_index=True
+        )
+
     return df
 
+
 @pf.register_dataframe_method
 def subplot(df: pd.DataFrame, *, image_col, title_col, title):
     return df
 
+
 @pf.register_dataframe_method
 def binarize_images(df: pd.DataFrame, thresholding_method) -> pd.DataFrame:
-    df['gray_leaf'] = df.image.apply(rgb2gray)
-    df['binarized'] = None
+    df["gray_leaf"] = df.image.apply(rgb2gray)
+    df["binarized"] = None
     for t in df.itertuples():
         thresh = thresholding_method(t.gray_leaf)
-        df.at[t.Index, 'binarized'] = (t.gray_leaf < thresh)
+        df.at[t.Index, "binarized"] = t.gray_leaf < thresh
     return df
 
+
 @pf.register_dataframe_method
 def morphology(df: pd.DataFrame) -> pd.DataFrame:
-    df['closed'] = df.binarized.apply(area_closing)
-    df['opened'] = df.closed.apply(area_opening)
+    df["closed"] = df.binarized.apply(area_closing)
+    df["opened"] = df.closed.apply(area_opening)
     return df
 
+
 @pf.register_dataframe_method
 def labeling(df):
-    df['label_im'] = df.opened.apply(label)
-    df['regions'] = df.label_im.apply(regionprops)
+    df["label_im"] = df.opened.apply(label)
+    df["regions"] = df.label_im.apply(regionprops)
     return df
 
+
 @pf.register_dataframe_method
 def get_properties_of_each_region(df: pd.DataFrame) -> pd.DataFrame:
-    properties = ['area','convex_area','bbox_area',
-                  'major_axis_length', 'minor_axis_length',
-                  'perimeter', 'equivalent_diameter',
-                  'mean_intensity', 'solidity', 'eccentricity']
+    properties = [
+        "area",
+        "convex_area",
+        "bbox_area",
+        "major_axis_length",
+        "minor_axis_length",
+        "perimeter",
+        "equivalent_diameter",
+        "mean_intensity",
+        "solidity",
+        "eccentricity",
+    ]
     res_df = []
     for t in df.itertuples():
-        #ipdb.set_trace()
-        p_df = pd.DataFrame(regionprops_table(t.label_im, t.gray_leaf, properties=properties))
+        # ipdb.set_trace()
+        p_df = pd.DataFrame(
+            regionprops_table(t.label_im, t.gray_leaf, properties=properties)
+        )
         p_df = p_df[(p_df.index != 0) & (p_df.area > 100)]
-        p_df['im_name'] = t.im_name
+        p_df["im_name"] = t.im_name
         res_df.append(p_df)
     return pd.concat(res_df)
 
+
 @pf.register_dataframe_method
 def apply_feature_engeneering(df: pd.DataFrame) -> pd.DataFrame:
-    df['ratio_length'] = (df['major_axis_length'] / df['minor_axis_length'])
-    df['perimeter_ratio_major'] = (df['perimeter'] / df['major_axis_length'])
-    df['perimeter_ratio_minor'] = (df['perimeter'] / df['minor_axis_length'])
-    df['area_ratio_convex'] = df['area'] / df['convex_area']
-    df['area_ratio_bbox'] = df['area'] / df['bbox_area']
-    df['peri_over_dia'] = df['perimeter'] / df['equivalent_diameter']
-    final_df = df[df.drop('type', axis=1).columns].astype(float)
+    df["ratio_length"] = df["major_axis_length"] / df["minor_axis_length"]
+    df["perimeter_ratio_major"] = df["perimeter"] / df["major_axis_length"]
+    df["perimeter_ratio_minor"] = df["perimeter"] / df["minor_axis_length"]
+    df["area_ratio_convex"] = df["area"] / df["convex_area"]
+    df["area_ratio_bbox"] = df["area"] / df["bbox_area"]
+    df["peri_over_dia"] = df["perimeter"] / df["equivalent_diameter"]
+    final_df = df[df.drop("type", axis=1).columns].astype(float)
     final_df = final_df.replace(np.inf, 0)
-    final_df['type'] = df['type']
+    final_df["type"] = df["type"]
     return final_df
 
+
 file_pathes = pd.Series(glob.glob("dataset/*.jpg"))
 
 with pyjviz.CB("initial-phase") as cc:
-    initial_phase_df = (file_pathes
-                        .load_images()#.subplot(image_col = 'image', title_col = 'im_name', title = '(Original Image by Gino Borja, AIM)')
-                        .binarize_images(threshold_otsu)#.subplot(image_col = 'binarized', title_col = file_pathes, title = 'binarized')
-                        .morphology()#.subplot(image_col = 'opened', title_col = file_pathes, title = 'opened')
-                        .labeling()#.subplot(image_col = 'label_im', title_col = file_pathes, title = 'labeled')
-                        )
-if 1:    
+    initial_phase_df = (
+        file_pathes.load_images()  # .subplot(image_col = 'image', title_col = 'im_name', title = '(Original Image by Gino Borja, AIM)')
+        .binarize_images(
+            threshold_otsu
+        )  # .subplot(image_col = 'binarized', title_col = file_pathes, title = 'binarized')
+        .morphology()  # .subplot(image_col = 'opened', title_col = file_pathes, title = 'opened')
+        .labeling()  # .subplot(image_col = 'label_im', title_col = file_pathes, title = 'labeled')
+    )
+if 1:
     with pyjviz.CB("build-features"):
-        final_df = (initial_phase_df
-                    .get_properties_of_each_region()
-                    .assign(type = lambda x: x.im_name.apply(lambda x: x.split('.')[0]))
-                    .drop(columns = 'im_name')
-                    .apply_feature_engeneering()
-                    )
-
-pyjviz.save_dot(vertical = True)
+        final_df = (
+            initial_phase_df.get_properties_of_each_region()
+            .assign(type=lambda x: x.im_name.apply(lambda x: x.split(".")[0]))
+            .drop(columns="im_name")
+            .apply_feature_engeneering()
+        )
 
+pyjviz.save_dot(vertical=True)