Update GNN reference implementation: add DGL backend

mlcommons · Nov 4, 2024 · 880a9b7 · 880a9b7
1 parent c8c1e61
commit 880a9b7
Show file tree

Hide file tree

Showing 22 changed files with 1,486 additions and 42 deletions.
diff --git a/upcomming_benchmarks/graph/R-GAT/README.md → graph/R-GAT/README.md b/upcomming_benchmarks/graph/R-GAT/README.md → graph/R-GAT/README.md
@@ -1,6 +1,6 @@
 # MLPerf™ Inference Benchmarks for Text to Image
 
-This is the reference implementation for MLPerf Inference text to image
+This is the reference implementation for MLPerf Inference text to image. Two implementations are currently supported: GraphLearn for PyTorch (GLT) and Deep Graph Library (DGL), both using PyTorch as the backbone of the model.
 
 ## Supported Models
 
@@ -47,13 +47,14 @@ Install loadgen:
 cd $LOADGEN_FOLDER
 CFLAGS="-std=c++14" python setup.py install
 ```
-### Install graphlearn for pytorch
 
-Install pytorch geometric:
+### Install pytorch geometric
+
 ```bash
 export TORCH_VERSION=$(python -c "import torch; print(torch.__version__)")
 pip install torch-geometric torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-${TORCH_VERSION}.html
 ```
+### Install graphlearn for pytorch (Only for GLT implementation)
 
 Follow the installation instructions at: https://github.com/alibaba/graphlearn-for-pytorch.git
 
@@ -80,7 +81,7 @@ cd $GRAPH_FOLDER
 python3 tools/split_seeds.py --path igbh --dataset_size tiny
 ```
 
-**Compress graph (optional)**
+**Compress graph (optional, only for GLT implementation)**
 ```bash
 cd $GRAPH_FOLDER
 python3 tools/compress_graph.py --path igbh --dataset_size tiny --layout <CSC or CSR>
@@ -99,7 +100,7 @@ cd $GRAPH_FOLDER
 python3 tools/split_seeds.py --path igbh --dataset_size full
 ```
 
-**Compress graph (optional)**
+**Compress graph (optional, only for GLT implementation)**
 ```bash
 cd $GRAPH_FOLDER
 python3 tools/compress_graph.py --path igbh --dataset_size full --layout <CSC or CSR>
@@ -114,16 +115,22 @@ TODO
 ```bash
 # Go to the benchmark folder
 cd $GRAPH_FOLDER
-# Run the benchmark
-python3 main.py --dataset igbh-tiny --dataset-path igbh/ --profile debug [--model-path <path_to_ckpt>] [--in-memory] [--device <cpu or gpu>] [--dtype <fp16 or fp32>] [--scenario <SingleStream, MultiStream, Server or Offline>] [--layout <COO, CSC or CSR>]
+# Run the benchmark GLT
+python3 main.py --dataset igbh-glt-tiny --dataset-path igbh/ --profile debug-glt [--model-path <path_to_ckpt>] [--in-memory] [--device <cpu or gpu>] [--dtype <fp16 or fp32>] [--scenario <SingleStream, MultiStream, Server or Offline>] [--layout <COO, CSC or CSR>]
+
+# Run the benchmark DGL
+python3 main.py --dataset igbh-dgl-tiny --dataset-path igbh/ --profile debug-dgl [--model-path <path_to_ckpt>] [--in-memory] [--device <cpu or gpu>] [--dtype <fp16 or fp32>] [--scenario <SingleStream, MultiStream, Server or Offline>]
 ```
 
 #### Local run
 ```bash
 # Go to the benchmark folder
 cd $GRAPH_FOLDER
-# Run the benchmark
-python3 main.py --dataset igbh --dataset-path igbh/ [--model-path <path_to_ckpt>] [--in-memory] [--device <cpu or gpu>] [--dtype <fp16 or fp32>] [--scenario <SingleStream, MultiStream, Server or Offline>] [--layout <COO, CSC or CSR>]
+# Run the benchmark GLT
+python3 main.py --dataset igbh-glt --dataset-path igbh/ --profile rgat-glt-full [--model-path <path_to_ckpt>] [--in-memory] [--device <cpu or gpu>] [--dtype <fp16 or fp32>] [--scenario <SingleStream, MultiStream, Server or Offline>] [--layout <COO, CSC or CSR>]
+
+# Run the benchmark DGL
+python3 main.py --dataset igbh-dgl --dataset-path igbh/ --profile rgat-dgl-full [--model-path <path_to_ckpt>] [--in-memory] [--device <cpu or gpu>] [--dtype <fp16 or fp32>] [--scenario <SingleStream, MultiStream, Server or Offline>]
 ```
 #### Run using docker
 

diff --git a/upcomming_benchmarks/graph/R-GAT/backend.py → graph/R-GAT/backend.py b/upcomming_benchmarks/graph/R-GAT/backend.py → graph/R-GAT/backend.py
diff --git a/graph/R-GAT/backend_dgl.py b/graph/R-GAT/backend_dgl.py
@@ -0,0 +1,97 @@
+
+from typing import Optional, List, Union, Any
+from dgl_utilities.feature_fetching import IGBHeteroGraphStructure, Features, IGBH
+from dgl_utilities.components import build_graph, get_loader, RGAT
+from dgl_utilities.pyg_sampler import PyGSampler
+import os
+import torch
+import logging
+import backend
+from typing import Literal
+
+# Configure the root logger to emit records at INFO level and above, and
+# create a named logger for messages originating from this backend module.
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("backend-dgl")
+
+
+
+class BackendDGL(backend.Backend):
+    """R-GAT inference backend built on this benchmark's DGL utilities.
+
+    Construction builds the feature store, the graph, the neighbor sampler
+    and the RGAT model, and optionally restores weights from a checkpoint.
+    """
+
+    def __init__(
+        self,
+        model_type="rgat",
+        type: Literal["fp16", "fp32"] = "fp16",
+        device: Literal["cpu", "gpu"] = "gpu",
+        ckpt_path: str = None,
+        igbh: IGBH = None,
+        batch_size: int = 1,
+        layout: Literal["CSC", "CSR", "COO"] = "COO",
+        edge_dir: str = "in",
+    ):
+        """Build the graph, sampler and model used by ``predict``.
+
+        Args:
+            model_type: Accepted for interface parity; not read here.
+            type: "fp32" selects ``torch.float32``; any other value selects
+                ``torch.float16``.
+            device: "gpu" selects the "cuda" torch device; any other value
+                selects "cpu".
+            ckpt_path: Optional path to a checkpoint whose
+                "model_state_dict" entry is loaded into the model.
+            igbh: Dataset wrapper; its ``igbh_dataset`` attribute is read
+                unconditionally, so it is required despite the ``None``
+                default.
+            batch_size: Accepted for interface parity; not read here.
+            layout: Accepted for interface parity; not read here.
+            edge_dir: Accepted for interface parity; not read here.
+
+        Raises:
+            FileNotFoundError: If ``ckpt_path`` is given but does not exist.
+        """
+        super(BackendDGL, self).__init__()
+        # Number of predict() calls served so far.
+        self.i = 0
+
+        # Set device and type
+        self.device = torch.device("cuda" if device == "gpu" else "cpu")
+        self.type = torch.float32 if type == "fp32" else torch.float16
+
+        # Create Node and neighbor loader
+        self.fan_out = [5, 10, 15]
+        self.igbh_graph_structure = igbh.igbh_dataset
+        self.feature_store = Features(
+            self.igbh_graph_structure.dir,
+            self.igbh_graph_structure.dataset_size,
+            self.igbh_graph_structure.in_memory,
+            use_fp16=self.igbh_graph_structure.use_fp16,
+        )
+        self.feature_store.build_features(use_journal_conference=True)
+        self.graph = build_graph(
+            self.igbh_graph_structure, "dgl", features=self.feature_store
+        )
+        # Reuse fan_out so the sampler and the model depth cannot drift apart.
+        self.neighbor_loader = PyGSampler(self.fan_out)
+
+        # Load model architecture
+        self.model = RGAT(
+            backend="dgl",
+            device=device,
+            graph=self.graph,
+            in_feats=1024,
+            h_feats=512,
+            num_classes=2983,
+            num_layers=len(self.fan_out),
+            n_heads=4,
+        ).to(self.type).to(self.device)
+        self.model.eval()
+
+        # Load model checkpoint
+        if ckpt_path is not None:
+            try:
+                ckpt = torch.load(ckpt_path, map_location=self.device)
+            except FileNotFoundError as e:
+                # __init__ must not return a value (Python raises TypeError
+                # for a non-None return), so report the problem and let the
+                # original exception propagate to the caller.
+                log.error("Checkpoint file not found: %s", e)
+                raise
+            if ckpt is not None:
+                self.model.load_state_dict(ckpt["model_state_dict"])
+
+    def version(self):
+        """Return the version string of the installed torch package."""
+        return torch.__version__
+
+    def name(self):
+        """Return the identifier string of this backend."""
+        return "pytorch-SUT"
+
+    def image_format(self):
+        """Return the string "NCHW"."""
+        return "NCHW"
+
+    def load(self):
+        """Return this backend; all setup already happened in ``__init__``."""
+        return self
+
+    def predict(self, inputs: torch.Tensor):
+        """Sample around the given seed nodes and run the model on the batch.
+
+        Args:
+            inputs: Seed node ids, passed to the sampler under the "paper"
+                node type.
+
+        Returns:
+            The first element of the model's output (the batch predictions).
+        """
+        self.i += 1
+        # Call counter kept for debugging; logged instead of printed so it
+        # does not flood stdout during benchmark runs.
+        log.debug("predict call %d", self.i)
+        with torch.no_grad():
+            # Get batch
+            batch = self.neighbor_loader.sample(self.graph, {"paper": inputs})
+            batch_preds, _ = self.model(batch, self.device, self.feature_store)
+        return batch_preds
+
diff --git a/...benchmarks/graph/R-GAT/backend_pytorch.py → graph/R-GAT/backend_glt.py b/...benchmarks/graph/R-GAT/backend_pytorch.py → graph/R-GAT/backend_glt.py
@@ -13,7 +13,7 @@
 import graphlearn_torch as glt
 
 logging.basicConfig(level=logging.INFO)
-log = logging.getLogger("backend-pytorch")
+log = logging.getLogger("backend-glt")
 
 
 class CustomNeighborLoader(NodeLoader):
@@ -114,19 +114,19 @@ def get_neighbors(self, seeds: torch.Tensor):
         return result
 
 
-class BackendPytorch(backend.Backend):
+class BackendGLT(backend.Backend):
     def __init__(
         self,
         model_type="rgat",
         type: Literal["fp16", "fp32"] = "fp16",
         device: Literal["cpu", "gpu"] = "gpu",
         ckpt_path: str = None,
-        igbh_dataset: IGBHeteroDataset = None,
+        igbh: IGBH = None,
         batch_size: int = 1,
         layout: Literal["CSC", "CSR", "COO"] = "COO",
         edge_dir: str = "in",
     ):
-        super(BackendPytorch, self).__init__()
+        super(BackendGLT, self).__init__()
         self.i = 0
         # Set device and type
         if device == "gpu":
@@ -140,6 +140,7 @@ def __init__(
             self.type = torch.float16
         # Create Node and neighbor loade
         self.glt_dataset = glt.data.Dataset(edge_dir=edge_dir)
+        igbh_dataset = igbh.igbh_dataset
         self.glt_dataset.init_node_features(
             node_feature_data=igbh_dataset.feat_dict,
             with_gpu=(device == "gpu"),

diff --git a/upcomming_benchmarks/graph/R-GAT/dataset.py → graph/R-GAT/dataset.py b/upcomming_benchmarks/graph/R-GAT/dataset.py → graph/R-GAT/dataset.py