From 02b519035669e6c042a9153a73a221469911cfbc Mon Sep 17 00:00:00 2001 From: joddiy Date: Wed, 25 Mar 2020 14:04:42 +0800 Subject: [PATCH] upgrade sonnx for nlp and cv models --- examples/onnx/arcface.py | 122 ++ examples/onnx/bert/bert-squad.py | 169 +++ examples/onnx/bert/inputs.json | 27 + examples/onnx/bert/run_onnx_squad.py | 507 ++++++++ examples/onnx/bert/tokenization.py | 399 ++++++ examples/onnx/fer_emotion.py | 111 ++ examples/onnx/mnist.py | 320 +++++ examples/onnx/mobilenet.py | 116 ++ examples/onnx/resnet18.py | 115 ++ examples/onnx/tiny_yolov2.py | 167 +++ examples/onnx/utils.py | 72 ++ examples/onnx/vgg16.py | 114 ++ python/singa/autograd.py | 141 ++- python/singa/sonnx.py | 1719 ++++++++++++++++++-------- python/singa/utils.py | 40 + test/python/test_onnx.py | 259 +++- test/python/test_onnx_backend.py | 1053 ++++++++++++++-- test/python/test_operation.py | 43 +- 18 files changed, 4784 insertions(+), 710 deletions(-) create mode 100644 examples/onnx/arcface.py create mode 100644 examples/onnx/bert/bert-squad.py create mode 100644 examples/onnx/bert/inputs.json create mode 100644 examples/onnx/bert/run_onnx_squad.py create mode 100644 examples/onnx/bert/tokenization.py create mode 100644 examples/onnx/fer_emotion.py create mode 100644 examples/onnx/mnist.py create mode 100644 examples/onnx/mobilenet.py create mode 100644 examples/onnx/resnet18.py create mode 100644 examples/onnx/tiny_yolov2.py create mode 100644 examples/onnx/utils.py create mode 100644 examples/onnx/vgg16.py diff --git a/examples/onnx/arcface.py b/examples/onnx/arcface.py new file mode 100644 index 0000000000..933e1b126b --- /dev/null +++ b/examples/onnx/arcface.py @@ -0,0 +1,122 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+#
+
+import os
+import numpy as np
+from PIL import Image
+from sklearn import preprocessing
+
+from singa import device
+from singa import tensor
+from singa import autograd
+from singa import sonnx
+import onnx
+from utils import download_model, update_batch_size, check_exist_or_download
+
+import logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
+
+
+def preprocess(img):
+    w, h = img.size
+    img = img.crop((0, (h - w) // 2, w, h - (h - w) // 2))
+    img = img.resize((112, 112))
+    img = np.array(img).astype(np.float32)
+    img = np.rollaxis(img, 2, 0)
+    img = np.expand_dims(img, axis=0)
+    return img
+
+
+def get_image():
+    # download image
+    img1 = Image.open(
+        check_exist_or_download(
+            'https://angus-doc.readthedocs.io/en/latest/_images/aurelien.jpg'))
+    img2 = Image.open(
+        check_exist_or_download(
+            'https://angus-doc.readthedocs.io/en/latest/_images/gwenn.jpg'))
+    return img1, img2
+
+
+class Infer:
+
+    def __init__(self, sg_ir):
+        self.sg_ir = sg_ir
+        for idx, tens in sg_ir.tensor_map.items():
+            # allow the tensors to be updated
+            tens.requires_grad = True
+            tens.stores_grad = True
+            sg_ir.tensor_map[idx] = tens
+
+    def forward(self, x):
+        return self.sg_ir.run([x])[0]
+
+
+if __name__ == "__main__":
+
+    download_dir = '/tmp'
+    url = 'https://s3.amazonaws.com/onnx-model-zoo/arcface/resnet100/resnet100.tar.gz'
+    model_path = os.path.join(download_dir, 'resnet100', 'resnet100.onnx')
+
+    logging.info("onnx load model...")
+    download_model(url)
+    onnx_model = onnx.load(model_path)
+
+    # set batch size
+    onnx_model = update_batch_size(onnx_model, 2)
+
+    # prepare the model
+    logging.info("prepare model...")
+    dev = device.create_cuda_gpu()
+    sg_ir = sonnx.prepare(onnx_model, device=dev)
+    autograd.training = False
+    model = Infer(sg_ir)
+
+    # verify against the test dataset
+    # from utils import load_dataset
+    # inputs, ref_outputs = load_dataset(
+    #     os.path.join('/tmp', 'resnet100', 'test_data_set_0'))
+    # x_batch = tensor.Tensor(device=dev, data=inputs[0])
+    # outputs = model.forward(x_batch)
+    # for ref_o, o in zip(ref_outputs, outputs):
+    #     np.testing.assert_almost_equal(ref_o, tensor.to_numpy(o), 4)
+
+    # inference demo
+    logging.info("preprocessing...")
+    img1, img2 = get_image()
+    img1 = preprocess(img1)
+    img2 = preprocess(img2)
+
+    x_batch = tensor.Tensor(device=dev,
+                            data=np.concatenate((img1, img2), axis=0))
+    logging.info("model running...")
+    y = model.forward(x_batch)
+
+    logging.info("postprocessing...")
+    embedding = tensor.to_numpy(y)
+    embedding = preprocessing.normalize(embedding)
+    embedding1 = embedding[0]
+    embedding2 = embedding[1]
+
+    # compute the squared distance between the two embeddings
+    dist = np.sum(np.square(embedding1 - embedding2))
+    # compute the cosine similarity between the two embeddings
+    sim = np.dot(embedding1, embedding2.T)
+    # log the predictions
+    logging.info('Distance = %f' % (dist))
+    logging.info('Similarity = %f' % (sim))
diff --git a/examples/onnx/bert/bert-squad.py b/examples/onnx/bert/bert-squad.py
new file mode 100644
index 0000000000..e4a8488014
--- /dev/null
+++ b/examples/onnx/bert/bert-squad.py
@@ -0,0 +1,169 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under th + +import os +import zipfile +import numpy as np +import json + +from singa import device +from singa import tensor +from singa import sonnx +from singa import autograd +import onnx +import tokenization +from run_onnx_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions + +import sys +sys.path.append(os.path.dirname(__file__) + '/..') +from utils import download_model, update_batch_size, check_exist_or_download + +import logging +logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(message)s') + +max_answer_length = 30 +max_seq_length = 256 +doc_stride = 128 +max_query_length = 64 +n_best_size = 20 +batch_size = 1 + + +def load_vocab(): + url = 'https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip' + download_dir = '/tmp/' + filename = os.path.join(download_dir, 'uncased_L-12_H-768_A-12', '.', + 'vocab.txt') + with zipfile.ZipFile(check_exist_or_download(url), 'r') as z: + z.extractall(path=download_dir) + return filename + + +class Infer: + + def __init__(self, sg_ir): + self.sg_ir = sg_ir + + def forward(self, x): + return sg_ir.run(x) + + +def preprocess(): + vocab_file = load_vocab() + tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, + do_lower_case=True) + predict_file = os.path.join(os.path.dirname(__file__), 'inputs.json') + # print content + with open(predict_file) as json_file: + test_data = json.load(json_file) + print("The input is:", json.dumps(test_data, indent=2)) + + eval_examples = read_squad_examples(input_file=predict_file) + + # Use convert_examples_to_features method from run_onnx_squad to get parameters from the input + input_ids, input_mask, segment_ids, extra_data = convert_examples_to_features( + eval_examples, tokenizer, max_seq_length, doc_stride, max_query_length) + return input_ids, input_mask, segment_ids, extra_data, eval_examples + + +def postprocess(eval_examples, extra_data, all_results): + output_dir = 'predictions' + os.makedirs(output_dir, exist_ok=True) + output_prediction_file = os.path.join(output_dir, "predictions.json") + output_nbest_file = os.path.join(output_dir, "nbest_predictions.json") + write_predictions(eval_examples, extra_data, all_results, n_best_size, + max_answer_length, True, output_prediction_file, + output_nbest_file) + + # print results + with open(output_prediction_file) as json_file: + test_data = json.load(json_file) + print("The result is:", json.dumps(test_data, indent=2)) + + +if __name__ == "__main__": + + url = 'https://media.githubusercontent.com/media/onnx/models/master/text/machine_comprehension/bert-squad/model/bertsquad-10.tar.gz' + download_dir = '/tmp/' + model_path = os.path.join(download_dir, 'download_sample_10', + 'bertsquad10.onnx') + + logging.info("onnx load model...") + download_model(url) + onnx_model = onnx.load(model_path) + + # set batch size + onnx_model = update_batch_size(onnx_model, batch_size) + dev = device.create_cuda_gpu() + autograd.training = False + + # inference + logging.info("preprocessing...") + input_ids, input_mask, segment_ids, extra_data, eval_examples = preprocess() + + sg_ir = None + n = 
len(input_ids)
+    bs = batch_size
+    all_results = []
+
+    tmp_dict = {}
+    for idx in range(0, n):
+        logging.info("starting infer sample {}...".format(idx))
+        item = eval_examples[idx]
+        inputs = [
+            np.array([item.qas_id], dtype=np.int32),
+            segment_ids[idx:idx + bs].astype(np.int32),
+            input_mask[idx:idx + bs].astype(np.int32),
+            input_ids[idx:idx + bs].astype(np.int32),
+        ]
+
+        if sg_ir is None:
+            # prepare the model
+            logging.info("model is none, prepare model...")
+            sg_ir = sonnx.prepare(onnx_model,
+                                  device=dev,
+                                  init_inputs=inputs,
+                                  keep_initializers_as_inputs=False)
+            model = Infer(sg_ir)
+
+        x_batch = []
+        for inp in inputs:
+            tmp_tensor = tensor.from_numpy(inp)
+            tmp_tensor.to_device(dev)
+            x_batch.append(tmp_tensor)
+
+        logging.info("model running for sample {}...".format(idx))
+        outputs = model.forward(x_batch)
+
+        logging.info("handle the result of sample {}...".format(idx))
+        result = []
+        for outp in outputs:
+            result.append(tensor.to_numpy(outp))
+
+        in_batch = result[1].shape[0]
+        start_logits = [float(x) for x in result[1][0].flat]
+        end_logits = [float(x) for x in result[0][0].flat]
+        for i in range(0, in_batch):
+            unique_id = len(all_results)
+            all_results.append(
+                RawResult(unique_id=unique_id,
+                          start_logits=start_logits,
+                          end_logits=end_logits))
+    # postprocessing
+    logging.info("postprocessing...")
+    postprocess(eval_examples, extra_data, all_results)
\ No newline at end of file
diff --git a/examples/onnx/bert/inputs.json b/examples/onnx/bert/inputs.json
new file mode 100644
index 0000000000..b9e313bf79
--- /dev/null
+++ b/examples/onnx/bert/inputs.json
@@ -0,0 +1,27 @@
+{
+    "version": "1.4",
+    "data": [
+        {
+            "paragraphs": [
+                {
+                    "context": "In its early years, the new convention center failed to meet attendance and revenue expectations.[12] By 2002, many Silicon Valley businesses were choosing the much larger Moscone Center in San Francisco over the San Jose Convention Center due to the latter's limited space. A ballot measure to finance an expansion via a hotel tax failed to reach the required two-thirds majority to pass. In June 2005, Team San Jose built the South Hall, a $6.77 million, blue and white tent, adding 80,000 square feet (7,400 m2) of exhibit space",
+                    "qas": [
+                        {
+                            "question": "where are the businesses choosing to go?",
+                            "id": "1"
+                        },
+                        {
+                            "question": "how many votes did the ballot measure need?",
+                            "id": "2"
+                        },
+                        {
+                            "question": "By what year were many Silicon Valley businesses choosing the Moscone Center?",
+                            "id": "3"
+                        }
+                    ]
+                }
+            ],
+            "title": "Conference Center"
+        }
+    ]
+}
diff --git a/examples/onnx/bert/run_onnx_squad.py b/examples/onnx/bert/run_onnx_squad.py
new file mode 100644
index 0000000000..f9a60da47c
--- /dev/null
+++ b/examples/onnx/bert/run_onnx_squad.py
@@ -0,0 +1,507 @@
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Inference for squad/bert using onnx.
+
+This is going to do the same as 'python run_squad.py --do_predict=True ...'
using a squad/bert model +that was converted to onnx. Lots of code was taken from run_squad.py. +You run it with: + + +python onnx_squad.py --model $SQUAD_MODEL/squad.onnx \ + --vocab_file $BERT_BASE_DIR/uncased_L-12_H-768_A-12/vocab.txt + --predict_file $SQUAD_DATA/dev-v1.1.json \ + --bert_config_file $BERT_BASE_DIR/uncased_L-12_H-768_A-12/bert_config.json \ + --output /tmp/ +""" + +import collections +import json +import math + +import numpy as np +import six +import tokenization + +RawResult = collections.namedtuple("RawResult", + ["unique_id", "start_logits", "end_logits"]) + +Feature = collections.namedtuple("Feature", [ + "unique_id", "tokens", "example_index", "token_to_orig_map", + "token_is_max_context" +]) + + +class SquadExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, + qas_id, + question_text, + doc_tokens, + orig_answer_text=None, + start_position=None, + end_position=None): + self.qas_id = qas_id + self.question_text = question_text + self.doc_tokens = doc_tokens + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.end_position = end_position + + def __str__(self): + return self.__repr__() + + def __repr__(self): + s = [] + s.append("qas_id: %s" % (tokenization.printable_text(self.qas_id))) + s.append("question_text: %s" % + (tokenization.printable_text(self.question_text))) + s.append("doc_tokens: [%s]" % (" ".join(self.doc_tokens))) + if self.start_position: + s.append("start_position: %d" % (self.start_position)) + if self.start_position: + s.append("end_position: %d" % (self.end_position)) + return ", ".join(s) + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + + # Because of the sliding window approach taken to scoring documents, a single + # token can appear in multiple documents. E.g. + # Doc: the man went to the store and bought a gallon of milk + # Span A: the man went to the + # Span B: to the store and bought + # Span C: and bought a gallon of + # ... + # + # Now the word 'bought' will have two scores from spans B and C. We only + # want to consider the score with "maximum context", which we define as + # the *minimum* of its left and right context (the *sum* of left and + # right context will always be the same, of course). + # + # In the example the maximum context for 'bought' would be span C since + # it has 1 left context and 3 right context, while span B has 4 left context + # and 0 right context. 
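+    # Worked through with the numbers above (illustrative indices): 'bought'
+    # is token 7 of the doc; span B covers tokens 3..7, scoring
+    # min(4, 0) + 0.01 * 5 = 0.05, while span C covers tokens 6..10, scoring
+    # min(1, 3) + 0.01 * 5 = 1.05, so span C is chosen as the max-context span.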
+ best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, + num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def convert_examples_to_features(examples, tokenizer, max_seq_length, + doc_stride, max_query_length): + """Loads a data file into a list of `InputBatch`s.""" + + res_input_ids = [] + res_input_mask = [] + res_segment_ids = [] + extra = [] + unique_id = 0 + + for (example_index, example) in enumerate(examples): + query_tokens = tokenizer.tokenize(example.question_text) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + # The -3 accounts for [CLS], [SEP] and [SEP] + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + + # We can have documents that are longer than the maximum sequence length. + # To deal with this we do a sliding window approach, where we take chunks + # of the up to our max length with a stride of `doc_stride`. + _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_to_orig_map = {} + token_is_max_context = {} + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in query_tokens: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len( + tokens)] = tok_to_orig_index[split_token_index] + + is_max_context = _check_is_max_context(doc_spans, + doc_span_index, + split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. 
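+            # e.g. with max_seq_length = 8 and five real tokens, the loop
+            # below yields input_mask = [1, 1, 1, 1, 1, 0, 0, 0], with
+            # input_ids and segment_ids padded by trailing zeros the same way.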
+ while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + res_input_ids.append(np.array(input_ids, dtype=np.int64)) + res_input_mask.append(np.array(input_mask, dtype=np.int64)) + res_segment_ids.append(np.array(segment_ids, dtype=np.int64)) + feature = Feature(unique_id=unique_id, + tokens=tokens, + example_index=example_index, + token_to_orig_map=token_to_orig_map, + token_is_max_context=token_is_max_context) + extra.append(feature) + unique_id += 1 + return np.array(res_input_ids), np.array(res_input_mask), np.array( + res_segment_ids), extra + + +def read_squad_examples(input_file): + """Read a SQuAD json file into a list of SquadExample.""" + with open(input_file, "r") as f: + input_data = json.load(f)["data"] + + def is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + examples = [] + for idx, entry in enumerate(input_data): + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + for c in paragraph_text: + if is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + end_position = None + orig_answer_text = None + example = SquadExample(qas_id=qas_id, + question_text=question_text, + doc_tokens=doc_tokens, + orig_answer_text=orig_answer_text, + start_position=start_position, + end_position=end_position) + examples.append(example) + return examples + + +def write_predictions(all_examples, all_features, all_results, n_best_size, + max_answer_length, do_lower_case, output_prediction_file, + output_nbest_file): + """Write final predictions to the json file.""" + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", [ + "feature_index", "start_index", "end_index", "start_logit", + "end_logit" + ]) + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + prelim_predictions = [] + for (feature_index, feature) in enumerate(features): + if not feature.unique_id in unique_id_to_result: + print("feature not in unique_Id", feature.unique_id) + continue + result = unique_id_to_result[feature.unique_id] + + start_indexes = _get_best_indexes(result.start_logits, n_best_size) + end_indexes = _get_best_indexes(result.end_logits, n_best_size) + for start_index in start_indexes: + for end_index in end_indexes: + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. 
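+                    # A candidate survives only if both indices fall inside
+                    # this feature's tokens, map back to the original text,
+                    # start at a max-context position, and span no more than
+                    # max_answer_length tokens; the checks below enforce this.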
+ if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index])) + + prelim_predictions = sorted(prelim_predictions, + key=lambda x: (x.start_logit + x.end_logit), + reverse=True) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"]) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + + tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + tok_text = " ".join(tok_tokens) + + # De-tokenize WordPieces that have been split off. + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, do_lower_case) + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + nbest.append( + _NbestPrediction(text=final_text, + start_logit=pred.start_logit, + end_logit=pred.end_logit)) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. + if not nbest: + nbest.append( + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + assert len(nbest) >= 1 + + total_scores = [] + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_logit"] = float(entry.start_logit) + output["end_logit"] = float(entry.end_logit) + nbest_json.append(output) + + all_predictions[example.qas_id] = nbest_json[0]["text"] + all_nbest_json[example.qas_id] = nbest_json + + with open(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + + with open(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + + +def get_final_text(pred_text, orig_text, do_lower_case): + """Project the tokenized prediction back to the original text.""" + + # When we created the data, we kept track of the alignment between original + # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So + # now `orig_text` contains the span of our original text corresponding to the + # span that we predicted. + # + # However, `orig_text` may contain extra characters that we don't want in + # our prediction. 
+ # + # For example, let's say: + # pred_text = steve smith + # orig_text = Steve Smith's + # + # We don't want to return `orig_text` because it contains the extra "'s". + # + # We don't want to return `pred_text` because it's already been normalized + # (the SQuAD eval script also does punctuation stripping/lower casing but + # our tokenizer does additional normalization like stripping accent + # characters). + # + # What we really want to return is "Steve Smith". + # + # Therefore, we have to apply a semi-complicated alignment heruistic between + # `pred_text` and `orig_text` to get a character-to-charcter alignment. This + # can fail in certain cases in which case we just return `orig_text`. + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for (i, c) in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + # We first tokenize `orig_text`, strip whitespace from the result + # and `pred_text`, and check if they are the same length. If they are + # NOT the same length, the heuristic has failed. If they are the same + # length, we assume the characters are one-to-one aligned. + tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case) + + tok_text = " ".join(tokenizer.tokenize(orig_text)) + + start_position = tok_text.find(pred_text) + if start_position == -1: + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. 
+ tok_s_to_ns_map = {} + for (i, tok_index) in six.iteritems(tok_ns_to_s_map): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + return orig_text + + output_text = orig_text[orig_start_position:(orig_end_position + 1)] + return output_text + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), + key=lambda x: x[1], + reverse=True) + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs \ No newline at end of file diff --git a/examples/onnx/bert/tokenization.py b/examples/onnx/bert/tokenization.py new file mode 100644 index 0000000000..4dd0a31280 --- /dev/null +++ b/examples/onnx/bert/tokenization.py @@ -0,0 +1,399 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import re +import unicodedata +import six + + +def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): + """Checks whether the casing config is consistent with the checkpoint name.""" + + # The casing has to be passed in by the user and there is no explicit check + # as to whether it matches the checkpoint. The casing information probably + # should have been stored in the bert_config.json file, but it's not, so + # we have to heuristically detect it to validate. 
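+    # e.g. an init_checkpoint ending in "uncased_L-12_H-768_A-12/bert_model.ckpt"
+    # names a lowercased model, so do_lower_case=False would trigger the
+    # ValueError below (illustrative path).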
+ + if not init_checkpoint: + return + + m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint) + if m is None: + return + + model_name = m.group(1) + + lower_models = [ + "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12", + "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12" + ] + + cased_models = [ + "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16", + "multi_cased_L-12_H-768_A-12" + ] + + is_bad_config = False + if model_name in lower_models and not do_lower_case: + is_bad_config = True + actual_flag = "False" + case_name = "lowercased" + opposite_flag = "True" + + if model_name in cased_models and do_lower_case: + is_bad_config = True + actual_flag = "True" + case_name = "cased" + opposite_flag = "False" + + if is_bad_config: + raise ValueError( + "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. " + "However, `%s` seems to be a %s model, so you " + "should pass in `--do_lower_case=%s` so that the fine-tuning matches " + "how the model was pre-training. If this error is wrong, please " + "just comment out this check." % (actual_flag, init_checkpoint, + model_name, case_name, opposite_flag)) + + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. 
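+    # e.g. printable_text(u"caf\u00e9") returns the str "café" on Python 3
+    # and its UTF-8 encoded byte string on Python 2 (illustrative input).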
+ if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with open(vocab_file, "rb") as reader: + while True: + token = reader.readline() + token = token.decode("utf-8", "ignore") + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + for item in items: + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class FullTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = convert_to_unicode(text) + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
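+        # e.g. "ab你好" becomes "ab 你  好 " here, so the whitespace split
+        # below yields ["ab", "你", "好"] (illustrative input).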
+ text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenziation.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer. + + Returns: + A list of wordpiece tokens. 
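+          Tokens that cannot be matched against the vocab, or that are longer
+          than max_input_chars_per_word characters, come back as `unk_token`.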
+ """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat in ("Cc", "Cf"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/examples/onnx/fer_emotion.py b/examples/onnx/fer_emotion.py new file mode 100644 index 0000000000..adbadafa9e --- /dev/null +++ b/examples/onnx/fer_emotion.py @@ -0,0 +1,111 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+import numpy as np
+from PIL import Image
+
+from singa import device
+from singa import tensor
+from singa import autograd
+from singa import sonnx
+import onnx
+from utils import download_model, update_batch_size, check_exist_or_download
+
+import logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
+
+def preprocess(img):
+    input_shape = (1, 1, 64, 64)
+    img = img.resize((64, 64), Image.ANTIALIAS)
+    img_data = np.array(img).astype(np.float32)
+    img_data = np.resize(img_data, input_shape)
+    return img_data
+
+
+def get_image_label():
+    labels = [
+        'neutral', 'happiness', 'surprise', 'sadness', 'anger', 'disgust',
+        'fear', 'contempt'
+    ]
+    # download image
+    image_url = 'https://microsoft.github.io/onnxjs-demo/img/fear.8d1417fa.jpg'
+    img = Image.open(check_exist_or_download(image_url))
+
+    return img, labels
+
+
+class Infer:
+
+    def __init__(self, sg_ir):
+        self.sg_ir = sg_ir
+        for idx, tens in sg_ir.tensor_map.items():
+            # allow the tensors to be updated
+            tens.requires_grad = True
+            tens.stores_grad = True
+            sg_ir.tensor_map[idx] = tens
+
+    def forward(self, x):
+        return self.sg_ir.run([x])[0]
+
+
+if __name__ == "__main__":
+
+    url = 'https://onnxzoo.blob.core.windows.net/models/opset_8/emotion_ferplus/emotion_ferplus.tar.gz'
+    download_dir = '/tmp/'
+    model_path = os.path.join(download_dir, 'emotion_ferplus', 'model.onnx')
+
+    logging.info("onnx load model...")
+    download_model(url)
+    onnx_model = onnx.load(model_path)
+
+    # set batch size
+    onnx_model = update_batch_size(onnx_model, 1)
+
+    # prepare the model
+    logging.info("prepare model...")
+    dev = device.create_cuda_gpu()
+    sg_ir = sonnx.prepare(onnx_model, device=dev)
+    autograd.training = False
+    model = Infer(sg_ir)
+
+    # verify against the test dataset
+    # from utils import load_dataset
+    # inputs, ref_outputs = load_dataset(os.path.join('/tmp', 'emotion_ferplus', 'test_data_set_0'))
+    # x_batch = tensor.Tensor(device=dev, data=inputs[0])
+    # outputs = model.forward(x_batch)
+    # for ref_o, o in zip(ref_outputs, outputs):
+    #     np.testing.assert_almost_equal(ref_o, tensor.to_numpy(o), 4)
+
+    # inference
+    logging.info("preprocessing...")
+    img, labels = get_image_label()
+    img = preprocess(img)
+
+    x_batch = tensor.Tensor(device=dev, data=img)
+
+    logging.info("model running...")
+    y = model.forward(x_batch)
+
+    logging.info("postprocessing...")
+    y = tensor.softmax(y)
+    scores = tensor.to_numpy(y)
+    scores = np.squeeze(scores)
+    a = np.argsort(scores)[::-1]
+    for i in a[0:5]:
+        logging.info('class=%s ; probability=%f' % (labels[i], scores[i]))
diff --git a/examples/onnx/mnist.py b/examples/onnx/mnist.py
new file mode 100644
index 0000000000..d4f7d2f696
--- /dev/null
+++ b/examples/onnx/mnist.py
@@ -0,0 +1,320 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.
See the License for the +# specific language governing permissions and limitations +# under th + +import os +import gzip +import numpy as np +import codecs + + +from singa import device +from singa import tensor +from singa import opt +from singa import autograd +from singa import sonnx +import onnx +from utils import check_exist_or_download + +import logging +logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(message)s') + +def load_dataset(): + train_x_url = 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz' + train_y_url = 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz' + valid_x_url = 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz' + valid_y_url = 'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz' + train_x = read_image_file(check_exist_or_download(train_x_url)).astype( + np.float32) + train_y = read_label_file(check_exist_or_download(train_y_url)).astype( + np.float32) + valid_x = read_image_file(check_exist_or_download(valid_x_url)).astype( + np.float32) + valid_y = read_label_file(check_exist_or_download(valid_y_url)).astype( + np.float32) + return train_x, train_y, valid_x, valid_y + + +def read_label_file(path): + with gzip.open(path, 'rb') as f: + data = f.read() + assert get_int(data[:4]) == 2049 + length = get_int(data[4:8]) + parsed = np.frombuffer(data, dtype=np.uint8, offset=8).reshape((length)) + return parsed + + +def get_int(b): + return int(codecs.encode(b, 'hex'), 16) + + +def read_image_file(path): + with gzip.open(path, 'rb') as f: + data = f.read() + assert get_int(data[:4]) == 2051 + length = get_int(data[4:8]) + num_rows = get_int(data[8:12]) + num_cols = get_int(data[12:16]) + parsed = np.frombuffer(data, dtype=np.uint8, offset=16).reshape( + (length, 1, num_rows, num_cols)) + return parsed + + +def to_categorical(y, num_classes): + y = np.array(y, dtype="int") + n = y.shape[0] + categorical = np.zeros((n, num_classes)) + categorical[np.arange(n), y] = 1 + categorical = categorical.astype(np.float32) + return categorical + + +class CNN: + + def __init__(self): + self.conv1 = autograd.Conv2d(1, 20, 5, padding=0) + self.conv2 = autograd.Conv2d(20, 50, 5, padding=0) + self.linear1 = autograd.Linear(4 * 4 * 50, 500, bias=False) + self.linear2 = autograd.Linear(500, 10, bias=False) + self.pooling1 = autograd.MaxPool2d(2, 2, padding=0) + self.pooling2 = autograd.MaxPool2d(2, 2, padding=0) + + def forward(self, x): + y = self.conv1(x) + y = autograd.relu(y) + y = self.pooling1(y) + y = self.conv2(y) + y = autograd.relu(y) + y = self.pooling2(y) + y = autograd.flatten(y) + y = self.linear1(y) + y = autograd.relu(y) + y = self.linear2(y) + return y + + +def accuracy(pred, target): + y = np.argmax(pred, axis=1) + t = np.argmax(target, axis=1) + a = y == t + return np.array(a, "int").sum() / float(len(t)) + + +def train(model, + x, + y, + epochs=1, + batch_size=64, + dev=device.get_default_device()): + batch_number = x.shape[0] // batch_size + + for i in range(epochs): + for b in range(batch_number): + l_idx = b * batch_size + r_idx = (b + 1) * batch_size + + x_batch = tensor.Tensor(device=dev, data=x[l_idx:r_idx]) + target_batch = tensor.Tensor(device=dev, data=y[l_idx:r_idx]) + + output_batch = model.forward(x_batch) + + loss = autograd.softmax_cross_entropy(output_batch, target_batch) + accuracy_rate = accuracy(tensor.to_numpy(output_batch), + tensor.to_numpy(target_batch)) + + sgd = opt.SGD(lr=0.001) + for p, gp in autograd.backward(loss): + sgd.update(p, gp) + sgd.step() + + if b % 1e2 == 0: + 
logging.info("acc %6.2f loss, %6.2f" % + (accuracy_rate, tensor.to_numpy(loss)[0])) + logging.info("training completed") + return x_batch, output_batch + + +def make_onnx(x, y): + return sonnx.to_onnx([x], [y]) + + +class Infer: + + def __init__(self, sg_ir): + self.sg_ir = sg_ir + for idx, tens in sg_ir.tensor_map.items(): + # allow the tensors to be updated + tens.requires_grad = True + tens.stores_grad = True + + def forward(self, x): + return sg_ir.run([x])[0] + + +def re_train(sg_ir, + x, + y, + epochs=1, + batch_size=64, + dev=device.get_default_device()): + batch_number = x.shape[0] // batch_size + + new_model = Infer(sg_ir) + + for i in range(epochs): + for b in range(batch_number): + l_idx = b * batch_size + r_idx = (b + 1) * batch_size + + x_batch = tensor.Tensor(device=dev, data=x[l_idx:r_idx]) + target_batch = tensor.Tensor(device=dev, data=y[l_idx:r_idx]) + + output_batch = new_model.forward(x_batch) + + loss = autograd.softmax_cross_entropy(output_batch, target_batch) + accuracy_rate = accuracy(tensor.to_numpy(output_batch), + tensor.to_numpy(target_batch)) + + sgd = opt.SGD(lr=0.01) + for p, gp in autograd.backward(loss): + sgd.update(p, gp) + sgd.step() + + if b % 1e2 == 0: + logging.info("acc %6.2f loss, %6.2f" % + (accuracy_rate, tensor.to_numpy(loss)[0])) + logging.info("re-training completed") + return new_model + + +class Trans: + + def __init__(self, sg_ir, last_layers): + self.sg_ir = sg_ir + self.last_layers = last_layers + self.append_linear1 = autograd.Linear(500, 128, bias=False) + self.append_linear2 = autograd.Linear(128, 32, bias=False) + self.append_linear3 = autograd.Linear(32, 10, bias=False) + + def forward(self, x): + y = sg_ir.run([x], last_layers=self.last_layers)[0] + y = self.append_linear1(y) + y = autograd.relu(y) + y = self.append_linear2(y) + y = autograd.relu(y) + y = self.append_linear3(y) + y = autograd.relu(y) + return y + + +def transfer_learning(sg_ir, + x, + y, + epochs=1, + batch_size=64, + dev=device.get_default_device()): + batch_number = x.shape[0] // batch_size + + trans_model = Trans(sg_ir, -1) + + for i in range(epochs): + for b in range(batch_number): + l_idx = b * batch_size + r_idx = (b + 1) * batch_size + + x_batch = tensor.Tensor(device=dev, data=x[l_idx:r_idx]) + target_batch = tensor.Tensor(device=dev, data=y[l_idx:r_idx]) + output_batch = trans_model.forward(x_batch) + + loss = autograd.softmax_cross_entropy(output_batch, target_batch) + accuracy_rate = accuracy(tensor.to_numpy(output_batch), + tensor.to_numpy(target_batch)) + + sgd = opt.SGD(lr=0.07) + for p, gp in autograd.backward(loss): + sgd.update(p, gp) + sgd.step() + + if b % 1e2 == 0: + logging.info("acc %6.2f loss, %6.2f" % + (accuracy_rate, tensor.to_numpy(loss)[0])) + logging.info("transfer-learning completed") + return trans_model + + +def test(model, x, y, batch_size=64, dev=device.get_default_device()): + batch_number = x.shape[0] // batch_size + + result = 0 + for b in range(batch_number): + l_idx = b * batch_size + r_idx = (b + 1) * batch_size + + x_batch = tensor.Tensor(device=dev, data=x[l_idx:r_idx]) + target_batch = tensor.Tensor(device=dev, data=y[l_idx:r_idx]) + + output_batch = model.forward(x_batch) + result += accuracy(tensor.to_numpy(output_batch), + tensor.to_numpy(target_batch)) + + logging.info("testing acc %6.2f" % (result / batch_number)) + + +if __name__ == "__main__": + # create device + dev = device.create_cuda_gpu() + #dev = device.get_default_device() + # create model + model = CNN() + # load data + train_x, train_y, valid_x, valid_y = 
load_dataset() + # normalization + train_x = train_x / 255 + valid_x = valid_x / 255 + train_y = to_categorical(train_y, 10) + valid_y = to_categorical(valid_y, 10) + # do training + autograd.training = True + x, y = train(model, train_x, train_y, dev=dev) + onnx_model = make_onnx(x, y) + # logging.info('The model is:\n{}'.format(onnx_model)) + + # Save the ONNX model + model_path = os.path.join('/', 'tmp', 'mnist.onnx') + onnx.save(onnx_model, model_path) + logging.info('The model is saved.') + + # load the ONNX model + onnx_model = onnx.load(model_path) + sg_ir = sonnx.prepare(onnx_model, device=dev) + + # inference + autograd.training = False + logging.info('The inference result is:') + test(Infer(sg_ir), valid_x, valid_y, dev=dev) + + # re-training + autograd.training = True + new_model = re_train(sg_ir, train_x, train_y, dev=dev) + autograd.training = False + test(new_model, valid_x, valid_y, dev=dev) + + # transfer-learning + autograd.training = True + new_model = transfer_learning(sg_ir, train_x, train_y, dev=dev) + autograd.training = False + test(new_model, valid_x, valid_y, dev=dev) \ No newline at end of file diff --git a/examples/onnx/mobilenet.py b/examples/onnx/mobilenet.py new file mode 100644 index 0000000000..e9fd90cba2 --- /dev/null +++ b/examples/onnx/mobilenet.py @@ -0,0 +1,116 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under th + +import os +import numpy as np +from PIL import Image + +from singa import device +from singa import tensor +from singa import autograd +from singa import sonnx +import onnx +from utils import download_model, update_batch_size, check_exist_or_download + +import logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') + +def preprocess(img): + img = img.resize((256, 256)) + img = img.crop((16, 16, 240, 240)) + img = np.array(img).astype(np.float32) / 255. 
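+    # values are now HWC in [0, 1]; rollaxis converts them to CHW so the
+    # per-channel ImageNet mean/std normalization below can be applied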
+ img = np.rollaxis(img, 2, 0) + for channel, mean, std in zip(range(3), [0.485, 0.456, 0.406], + [0.229, 0.224, 0.225]): + img[channel, :, :] -= mean + img[channel, :, :] /= std + img = np.expand_dims(img, axis=0) + return img + + +def get_image_labe(): + # download label + label_url = 'https://s3.amazonaws.com/onnx-model-zoo/synset.txt' + with open(check_exist_or_download(label_url), 'r') as f: + labels = [l.rstrip() for l in f] + + # download image + image_url = 'https://s3.amazonaws.com/model-server/inputs/kitten.jpg' + img = Image.open(check_exist_or_download(image_url)) + return img, labels + + +class Infer: + + def __init__(self, sg_ir): + self.sg_ir = sg_ir + for idx, tens in sg_ir.tensor_map.items(): + # allow the tensors to be updated + tens.requires_grad = True + tens.stores_grad = True + sg_ir.tensor_map[idx] = tens + + def forward(self, x): + return sg_ir.run([x])[0] + + +if __name__ == "__main__": + + url = 'https://s3.amazonaws.com/onnx-model-zoo/mobilenet/mobilenetv2-1.0/mobilenetv2-1.0.tar.gz' + download_dir = '/tmp/' + model_path = os.path.join(download_dir, 'mobilenetv2-1.0', + 'mobilenetv2-1.0.onnx') + + logging.info("onnx load model...") + download_model(url) + onnx_model = onnx.load(model_path) + + # set batch size + onnx_model = update_batch_size(onnx_model, 1) + + # prepare the model + logging.info("prepare model...") + dev = device.create_cuda_gpu() + sg_ir = sonnx.prepare(onnx_model, device=dev) + autograd.training = False + model = Infer(sg_ir) + + # verifty the test dataset + # from utils import load_dataset + # inputs, ref_outputs = load_dataset(os.path.join('/tmp', 'mobilenetv2-1.0', 'test_data_set_0')) + # x_batch = tensor.Tensor(device=dev, data=inputs[0]) + # outputs = model.forward(x_batch) + # for ref_o, o in zip(ref_outputs, outputs): + # np.testing.assert_almost_equal(ref_o, tensor.to_numpy(o), 4) + + # inference + logging.info("preprocessing...") + img, labels = get_image_labe() + img = preprocess(img) + + logging.info("model running...") + x_batch = tensor.Tensor(device=dev, data=img) + y = model.forward(x_batch) + + logging.info("postprocessing...") + y = tensor.softmax(y) + scores = tensor.to_numpy(y) + scores = np.squeeze(scores) + a = np.argsort(scores)[::-1] + for i in a[0:5]: + logging.info('class=%s ; probability=%f' % (labels[i], scores[i])) diff --git a/examples/onnx/resnet18.py b/examples/onnx/resnet18.py new file mode 100644 index 0000000000..c0ef13a930 --- /dev/null +++ b/examples/onnx/resnet18.py @@ -0,0 +1,115 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+import numpy as np
+from PIL import Image
+
+from singa import device
+from singa import tensor
+from singa import autograd
+from singa import sonnx
+import onnx
+from utils import download_model, update_batch_size, check_exist_or_download
+
+import logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
+
+def preprocess(img):
+    img = img.resize((256, 256))
+    img = img.crop((16, 16, 240, 240))
+    img = np.array(img).astype(np.float32) / 255.
+    img = np.rollaxis(img, 2, 0)
+    for channel, mean, std in zip(range(3), [0.485, 0.456, 0.406],
+                                  [0.229, 0.224, 0.225]):
+        img[channel, :, :] -= mean
+        img[channel, :, :] /= std
+    img = np.expand_dims(img, axis=0)
+    return img
+
+
+def get_image_label():
+    # download label
+    label_url = 'https://s3.amazonaws.com/onnx-model-zoo/synset.txt'
+    with open(check_exist_or_download(label_url), 'r') as f:
+        labels = [l.rstrip() for l in f]
+
+    # download image
+    image_url = 'https://s3.amazonaws.com/model-server/inputs/kitten.jpg'
+    img = Image.open(check_exist_or_download(image_url))
+    return img, labels
+
+
+class Infer:
+
+    def __init__(self, sg_ir):
+        self.sg_ir = sg_ir
+        for idx, tens in sg_ir.tensor_map.items():
+            # allow the tensors to be updated
+            tens.requires_grad = True
+            tens.stores_grad = True
+            sg_ir.tensor_map[idx] = tens
+
+    def forward(self, x):
+        return self.sg_ir.run([x])[0]
+
+
+if __name__ == "__main__":
+
+    url = 'https://s3.amazonaws.com/onnx-model-zoo/resnet/resnet18v1/resnet18v1.tar.gz'
+    download_dir = '/tmp/'
+    model_path = os.path.join(download_dir, 'resnet18v1', 'resnet18v1.onnx')
+
+    logging.info("onnx load model...")
+    download_model(url)
+    onnx_model = onnx.load(model_path)
+
+    # set batch size
+    onnx_model = update_batch_size(onnx_model, 1)
+
+    # prepare the model
+    logging.info("prepare model...")
+    dev = device.create_cuda_gpu()
+    sg_ir = sonnx.prepare(onnx_model, device=dev)
+    autograd.training = False
+    model = Infer(sg_ir)
+
+    # verify against the test dataset
+    # from utils import load_dataset
+    # inputs, ref_outputs = load_dataset(os.path.join('/tmp', 'resnet18v1', 'test_data_set_0'))
+    # x_batch = tensor.Tensor(device=dev, data=inputs[0])
+    # outputs = model.forward(x_batch)
+    # for ref_o, o in zip(ref_outputs, outputs):
+    #     np.testing.assert_almost_equal(ref_o, tensor.to_numpy(o), 4)
+
+    # inference
+    logging.info("preprocessing...")
+    img, labels = get_image_label()
+    img = preprocess(img)
+
+    logging.info("model running...")
+    x_batch = tensor.Tensor(device=dev, data=img)
+    y = model.forward(x_batch)
+
+    logging.info("postprocessing...")
+    y = tensor.softmax(y)
+    scores = tensor.to_numpy(y)
+    scores = np.squeeze(scores)
+    a = np.argsort(scores)[::-1]
+    for i in a[0:5]:
+        logging.info('class=%s ; probability=%f' % (labels[i], scores[i]))
\ No newline at end of file
diff --git a/examples/onnx/tiny_yolov2.py b/examples/onnx/tiny_yolov2.py
new file mode 100644
index 0000000000..8aff76976f
--- /dev/null
+++ b/examples/onnx/tiny_yolov2.py
@@ -0,0 +1,167 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under th + +import os +import numpy as np +from PIL import Image, ImageDraw + + +from singa import device +from singa import tensor +from singa import autograd +from singa import sonnx +import onnx +from utils import download_model, update_batch_size, check_exist_or_download + +import logging +logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(message)s') + + +def preprocess(img): + img = np.array(img).astype(np.float32) + img = np.rollaxis(img, 2, 0) + img = np.expand_dims(img, axis=0) + return img + + +def get_image(): + image_url = 'https://raw.githubusercontent.com/simo23/tinyYOLOv2/master/person.jpg' + img = Image.open(check_exist_or_download(image_url)) + img = img.resize((416, 416)) + return img + + +class Infer: + + def __init__(self, sg_ir): + self.sg_ir = sg_ir + for idx, tens in sg_ir.tensor_map.items(): + # allow the tensors to be updated + tens.requires_grad = True + tens.stores_grad = True + sg_ir.tensor_map[idx] = tens + + def forward(self, x): + return sg_ir.run([x])[0] + + +def postprcess(out): + numClasses = 20 + anchors = [1.08, 1.19, 3.42, 4.41, 6.63, 11.38, 9.42, 5.11, 16.62, 10.52] + + def sigmoid(x, derivative=False): + return x * (1 - x) if derivative else 1 / (1 + np.exp(-x)) + + def softmax(x): + scoreMatExp = np.exp(np.asarray(x)) + return scoreMatExp / scoreMatExp.sum(0) + + clut = [(0, 0, 0), (255, 0, 0), (255, 0, 255), (0, 0, 255), (0, 255, 0), + (0, 255, 128), (128, 255, 0), (128, 128, 0), (0, 128, 255), + (128, 0, 128), (255, 0, 128), (128, 0, 255), (255, 128, 128), + (128, 255, 128), (255, 255, 0), (255, 128, 128), (128, 128, 255), + (255, 128, 128), (128, 255, 128)] + label = [ + "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", + "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", + "pottedplant", "sheep", "sofa", "train", "tvmonitor" + ] + + img = get_image() + draw = ImageDraw.Draw(img) + + for cy in range(13): + for cx in range(13): + for b in range(5): + channel = b * (numClasses + 5) + tx = out[channel][cy][cx] + ty = out[channel + 1][cy][cx] + tw = out[channel + 2][cy][cx] + th = out[channel + 3][cy][cx] + tc = out[channel + 4][cy][cx] + x = (float(cx) + sigmoid(tx)) * 32 + y = (float(cy) + sigmoid(ty)) * 32 + + w = np.exp(tw) * 32 * anchors[2 * b] + h = np.exp(th) * 32 * anchors[2 * b + 1] + + confidence = sigmoid(tc) + + classes = np.zeros(numClasses) + for c in range(0, numClasses): + classes[c] = out[channel + 5 + c][cy][cx] + + classes = softmax(classes) + detectedClass = classes.argmax() + if 0.5 < classes[detectedClass] * confidence: + color = clut[detectedClass] + x = x - w / 2 + y = y - h / 2 + draw.line((x, y, x + w, y), fill=color) + draw.line((x, y, x, y + h), fill=color) + draw.line((x + w, y, x + w, y + h), fill=color) + draw.line((x, y + h, x + w, y + h), fill=color) + draw.text((x, y), label[detectedClass], fill=color) + logging.info("bounding box: (%.2f, %.2f, %.2f, %.2f)" % + (x, y, x + w, y + h)) + logging.info('class=%s ; probability=%f' % + (label[detectedClass], + classes[detectedClass] * confidence)) + img.save("result.png") + + +if __name__ == "__main__": + + url = 
'https://onnxzoo.blob.core.windows.net/models/opset_8/tiny_yolov2/tiny_yolov2.tar.gz' + download_dir = '/tmp/' + model_path = os.path.join(download_dir, 'tiny_yolov2', 'Model.onnx') + + logging.info("onnx load model...") + download_model(url) + onnx_model = onnx.load(model_path) + + # set batch size + onnx_model = update_batch_size(onnx_model, 1) + + # prepare the model + logging.info("prepare model...") + dev = device.create_cuda_gpu() + sg_ir = sonnx.prepare(onnx_model, device=dev) + autograd.training = False + model = Infer(sg_ir) + + # verifty the test dataset + # from utils import load_dataset + # inputs, ref_outputs = load_dataset(os.path.join('/tmp', 'tiny_yolov2', 'test_data_set_0')) + # x_batch = tensor.Tensor(device=dev, data=inputs[0]) + # outputs = model.forward(x_batch) + # for ref_o, o in zip(ref_outputs, outputs): + # np.testing.assert_almost_equal(ref_o, tensor.to_numpy(o), 4) + + # inference + logging.info("preprocessing...") + img = get_image() + img = preprocess(img) + + logging.info("model running...") + x_batch = tensor.Tensor(device=dev, data=img) + y = model.forward(x_batch) + + logging.info("postprocessing...") + out = tensor.to_numpy(y)[0] + postprcess(out) diff --git a/examples/onnx/utils.py b/examples/onnx/utils.py new file mode 100644 index 0000000000..aff44927f8 --- /dev/null +++ b/examples/onnx/utils.py @@ -0,0 +1,72 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under th + +import os +import urllib.request +import tarfile +import glob +import onnx +from onnx import numpy_helper +import logging +logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(message)s') + +def download_model(url): + download_dir = '/tmp/' + with tarfile.open(check_exist_or_download(url), 'r') as t: + t.extractall(path=download_dir) + + +def load_dataset(test_data_dir): + # load inputs + inputs = [] + inputs_num = len(glob.glob(os.path.join(test_data_dir, 'input_*.pb'))) + for i in range(inputs_num): + input_file = os.path.join(test_data_dir, 'input_{}.pb'.format(i)) + onnx_tensor = onnx.TensorProto() + with open(input_file, 'rb') as f: + onnx_tensor.ParseFromString(f.read()) + inputs.append(numpy_helper.to_array(onnx_tensor)) + + # load reference outputs + ref_outputs = [] + ref_outputs_num = len(glob.glob(os.path.join(test_data_dir, 'output_*.pb'))) + for i in range(ref_outputs_num): + output_file = os.path.join(test_data_dir, 'output_{}.pb'.format(i)) + onnx_tensor = onnx.TensorProto() + with open(output_file, 'rb') as f: + onnx_tensor.ParseFromString(f.read()) + ref_outputs.append(numpy_helper.to_array(onnx_tensor)) + return inputs, ref_outputs + + +def check_exist_or_download(url): + download_dir = '/tmp/' + name = url.rsplit('/', 1)[-1] + filename = os.path.join(download_dir, name) + if not os.path.isfile(filename): + logging.info("Downloading %s" % url) + urllib.request.urlretrieve(url, filename) + return filename + + +def update_batch_size(onnx_model, batch_size): + model_input = onnx_model.graph.input[0] + model_input.type.tensor_type.shape.dim[0].dim_value = batch_size + model_output = onnx_model.graph.output[0] + model_output.type.tensor_type.shape.dim[0].dim_value = batch_size + return onnx_model diff --git a/examples/onnx/vgg16.py b/examples/onnx/vgg16.py new file mode 100644 index 0000000000..d97b025e8d --- /dev/null +++ b/examples/onnx/vgg16.py @@ -0,0 +1,114 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under th + +import os +import numpy as np +from PIL import Image + +from singa import device +from singa import tensor +from singa import autograd +from singa import sonnx +import onnx +from utils import download_model, update_batch_size, check_exist_or_download + +import logging +logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(message)s') + +def preprocess(img): + img = img.resize((256, 256)) + img = img.crop((16, 16, 240, 240)) + img = np.array(img).astype(np.float32) / 255. 
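+    # same ONNX model zoo convention as the other CNN examples: [0, 1] scaling,
+    # HWC -> CHW, per-channel ImageNet standardization, then a leading batch axis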
+ img = np.rollaxis(img, 2, 0) + for channel, mean, std in zip(range(3), [0.485, 0.456, 0.406], + [0.229, 0.224, 0.225]): + img[channel, :, :] -= mean + img[channel, :, :] /= std + img = np.expand_dims(img, axis=0) + return img + + +def get_image_labe(): + # download label + label_url = 'https://s3.amazonaws.com/onnx-model-zoo/synset.txt' + with open(check_exist_or_download(label_url), 'r') as f: + labels = [l.rstrip() for l in f] + + # download image + image_url = 'https://s3.amazonaws.com/model-server/inputs/kitten.jpg' + img = Image.open(check_exist_or_download(image_url)) + return img, labels + + +class Infer: + + def __init__(self, sg_ir): + self.sg_ir = sg_ir + for idx, tens in sg_ir.tensor_map.items(): + # allow the tensors to be updated + tens.requires_grad = True + tens.stores_grad = True + sg_ir.tensor_map[idx] = tens + + def forward(self, x): + return sg_ir.run([x])[0] + + +if __name__ == "__main__": + url = 'https://s3.amazonaws.com/onnx-model-zoo/vgg/vgg16/vgg16.tar.gz' + download_dir = '/tmp/' + model_path = os.path.join(download_dir, 'vgg16', 'vgg16.onnx') + + logging.info("onnx load model...") + download_model(url) + onnx_model = onnx.load(model_path) + + # set batch size + onnx_model = update_batch_size(onnx_model, 1) + + # prepare the model + logging.info("prepare model...") + dev = device.create_cuda_gpu() + sg_ir = sonnx.prepare(onnx_model, device=dev) + autograd.training = False + model = Infer(sg_ir) + + # verifty the test + # from utils import load_dataset + # inputs, ref_outputs = load_dataset(os.path.join('/tmp', 'vgg16', 'test_data_set_0')) + # x_batch = tensor.Tensor(device=dev, data=inputs[0]) + # outputs = model.forward(x_batch) + # for ref_o, o in zip(ref_outputs, outputs): + # np.testing.assert_almost_equal(ref_o, tensor.to_numpy(o), 4) + + # inference + logging.info("preprocessing...") + img, labels = get_image_labe() + img = preprocess(img) + + logging.info("model running...") + x_batch = tensor.Tensor(device=dev, data=img) + y = model.forward(x_batch) + + logging.info("postprocessing...") + y = tensor.softmax(y) + scores = tensor.to_numpy(y) + scores = np.squeeze(scores) + a = np.argsort(scores)[::-1] + for i in a[0:5]: + logging.info('class=%s ; probability=%f' % (labels[i], scores[i])) diff --git a/python/singa/autograd.py b/python/singa/autograd.py index 8f64eb89da..17ce07e682 100644 --- a/python/singa/autograd.py +++ b/python/singa/autograd.py @@ -638,11 +638,7 @@ class Reshape(Operation): def __init__(self, shape): super(Reshape, self).__init__() - if isinstance(shape, tensor.Tensor): - self.shape = np.asarray(tensor.to_numpy(shape).astype( - np.int32)).tolist() - else: - self.shape = list(shape) + self.shape = list(shape) def forward(self, x): self._shape = x.shape() @@ -656,7 +652,6 @@ def forward(self, x): # handle the shape with -1 hidden_shape = int(np.prod(self._shape) // np.abs(np.prod(shape))) self.cache = [s if s != -1 else hidden_shape for s in shape] - return singa.Reshape(x, self.cache) def backward(self, dy): @@ -1431,7 +1426,6 @@ def __init__(self, self.pad_mode = pad_mode def __call__(self, x): - assert x.shape[1] == self.in_channels, "in_channels mismatched" # if same pad mode, re-compute the padding @@ -1689,10 +1683,8 @@ def forward(self, x): y = singa.GpuPoolingForward(self.handle, x) else: y = singa.CpuPoolingForward(self.handle, x) - if training: self.cache = (x, y) - return y def backward(self, dy): @@ -2243,11 +2235,21 @@ def __init__(self): super(Mul, self).__init__() def forward(self, a, b): - res = singa.__mul__(a, b) + # 
todo: mul is not supported for int tensors yet, so compute in float32 and cast the result back to int
+        _a, _b = a, b
+        dtype0 = _a.data_type()
+        dtype1 = _b.data_type()
+        if dtype0 == singa.kInt or dtype1 == singa.kInt:
+            _a = a.AsType(singa.kFloat32)
+            _b = b.AsType(singa.kFloat32)
+            res = singa.__mul__(_a, _b)
+            res = res.AsType(singa.kInt)
+        else:
+            res = singa.__mul__(_a, _b)
         if training:
-            self.input = (a, b)
-            self.shape0 = list(a.shape())
-            self.shape1 = list(b.shape())
+            self.input = (_a, _b)
+            self.shape0 = list(_a.shape())
+            self.shape1 = list(_b.shape())
             self.shape3 = list(res.shape())
         return res

@@ -2275,6 +2277,9 @@ def __init__(self, axis):
     def forward(self, x):
         self.cache = x.shape()
         cur = list(self.cache)
+        # todo: optimize once scalar tensors are supported
+        if len(self.cache) == 1 and self.axis == [0]:
+            return x
         for i in self.axis:
             cur.insert(i, 1)
         return singa.Reshape(x, cur)
@@ -2542,7 +2547,7 @@ def exp(a):
 class LeakyRelu(Operation):

     def __init__(self, a):
-        super().__init__(self)
+        super(LeakyRelu, self).__init__()
         self.a = a

     def forward(self, x):
@@ -2840,14 +2845,18 @@ def forward(self, x):
         if (self.axis == []):
             newshape = list(filter(lambda i: i != 1, self.cache))
         else:
-            for i in self.axis:
+            for id, i in enumerate(self.axis):
                 assert i < len(self.cache)
+                self.axis[id] = i % len(self.cache)
                 assert self.cache[
                     i] == 1, "the length of axis {} is {}, which should be 1".format(
                         i, self.cache[i])
             for ind, v in enumerate(self.cache):
                 if ind not in self.axis:
                     newshape.append(v)
+            # todo: optimize once scalar tensors are supported
+            if newshape == []:
+                return x
         return singa.Reshape(x, newshape)

     def backward(self, dy):
@@ -3556,6 +3565,8 @@ def forward(self, x):
             self.steps = [1] * len(x_shape)  # steps = None
         for idx, axis in enumerate(self.axes):
             start, end, step = self.starts[idx], self.ends[idx], self.steps[idx]
+            if end > x_shape[axis]:
+                end = x_shape[axis]
             self.cache.append((axis, x_shape[axis], start, end, step))
             xs = []
             for step_idx in range(x_shape[axis])[start:end:step]:
@@ -3658,7 +3669,7 @@ def ceil(x):
 class Split(Operation):

-    def __init__(self, axis, parts):
+    def __init__(self, axis, parts, num_output=None):
         """
         Init a Split, Split a tensor into a list of tensors, along the specified 'axis'.
         Args:
             axis: which axis to split.
         Args:
             parts: list of ints, length of each output, which can be specified using argument 'parts'.
             Otherwise, the tensor is split into equal sized parts.
+        Args:
+            num_output: if 'parts' is None, the tensor is split into 'num_output' equal sized parts.
         """
         super(Split, self).__init__()
         self.axis = axis
         self.parts = parts
+        self.num_output = num_output
+        if self.parts is None:
+            assert self.num_output is not None, "one of 'parts' or 'num_output' must be provided"

     def forward(self, x):
         """
@@ -3680,6 +3696,10 @@ def forward(self, x):
             forward of Split
             Args:
                 x: CTensor to be split.
             Returns:
                 the output CTensor.
             """
+        x_shape = list(x.shape())
+        self.axis = self.axis % len(x_shape)
+        if self.parts is None:
+            self.parts = [x_shape[self.axis] // self.num_output] * self.num_output
         xs = []
         _s = 0
         for _l in self.parts:
@@ -3700,7 +3720,7 @@ def backward(self, *dys):
         return dy


-def split(x, axis, parts):
+def split(x, axis, parts, num_output=None):
     """
     Init a Split, Split a tensor into a list of tensors, along the specified 'axis'.
     Args:
         x: CTensor.
     Args:
         axis: which axis to split.
     Args:
         parts: list of ints, length of each output, which can be specified using argument 'parts'.
         Otherwise, the tensor is split into equal sized parts.
+    Args:
+        num_output: if 'parts' is None, the tensor is split into 'num_output' equal sized parts.
     Returns:
         the output CTensor.
     """
-    return Split(axis, parts)(x)
+    return Split(axis, parts, num_output)(x)
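+# a minimal usage sketch (hypothetical shapes; assumes `x` is a CTensor of
+# shape (4, 6) produced by another autograd operation):
+#   y1, y2 = split(x, axis=1, parts=[2, 4])                  # explicit sizes
+#   y1, y2, y3 = split(x, axis=1, parts=None, num_output=3)  # 3 equal parts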


 class Gather(Operation):
@@ -3931,7 +3953,7 @@ def forward(self, x):
             the output CTensor.
         """
         y = tensor.to_numpy(tensor.from_raw_tensor(x))
-        y = np.array((np.nonzero(y)))
+        y = np.array((np.nonzero(y))).astype(np.int32)
         y = tensor.from_numpy(y)
         y.to_device(x.device())
         return y.data
@@ -3980,9 +4002,10 @@ def forward(self, x):
             the output CTensor.
         """
         if x.data_type() != self.to:
-            x.AsType(self.to)
+            x = x.AsType(self.to)
         return x

+
     def backward(self, dy):
         """
         backward of Cast
@@ -4006,3 +4029,81 @@ def cast(x, to):
         the output CTensor.
     """
     return Cast(to)(x)[0]
+
+
+class OneHot(Operation):
+
+    def __init__(self, axis, depth, values):
+        """
+        Produces a one-hot tensor based on inputs.
+        Args:
+            axis: Axis along which the one-hot representation is added. Default: axis=-1.
+            axis=-1 means that the additional dimension will be inserted as the innermost/last dimension in the output tensor.
+        Args:
+            depth: Scalar specifying the number of classes in the one-hot tensor; it is also the size of the one-hot dimension.
+        Args:
+            values: Rank 1 tensor containing exactly two elements, in the format [off_value, on_value],
+            where 'on_value' is the value used for filling locations specified in the 'indices' input tensor,
+            and 'off_value' is the value used for filling locations other than those specified in the 'indices' input tensor.
+        """
+        super(OneHot, self).__init__()
+        self.axis = axis
+        self.depth = depth
+        self.values = values
+
+    def forward(self, indices):
+        """
+        forward of OneHot
+        ! borrow from onnx
+        Args:
+            indices: Input tensor containing indices; the values are expected to be
+            in the range [-depth, depth-1].
+        Returns:
+            the output CTensor.
+        """
+        values = tensor.to_numpy(tensor.from_raw_tensor(indices))
+        rank = len(values.shape)
+        depth_range = np.arange(self.depth)
+        if self.axis < 0:
+            self.axis += (rank + 1)
+        ls = values.shape[0:self.axis]
+        rs = values.shape[self.axis:rank]
+        targets = np.reshape(depth_range, (1,) * len(ls) + depth_range.shape + (1,) * len(rs))
+        values = np.reshape(np.mod(values, self.depth), ls + (1,) + rs)
+        np_tensor = np.asarray(targets == values, dtype=np.float32)
+        np_tensor = np_tensor * (self.values[1] - self.values[0]) + self.values[0]
+        tmp_tensor = tensor.from_numpy(np_tensor)
+        tmp_tensor.to_device(indices.device())
+        return tmp_tensor.data
+
+    def backward(self, dy):
+        """
+        backward of OneHot
+        Args:
+            dy: CTensor, gradient tensor.
+        Returns:
+            the gradient tensor over the input tensor.
+        """
+        assert False, ('no gradient for backward function')
+
+
+def onehot(axis, indices, depth, values):
+    """
+    Produces a one-hot tensor based on inputs.
+    Args:
+        axis: Axis along which the one-hot representation is added. Default: axis=-1.
+        axis=-1 means that the additional dimension will be inserted as the innermost/last dimension in the output tensor.
+    Args:
+        indices: Input tensor containing indices; the values are expected to be
+        in the range [-depth, depth-1].
+    Args:
+        depth: Scalar specifying the number of classes in the one-hot tensor. This is also the size
+        of the one-hot dimension (specified by the 'axis' attribute) added on in the output tensor.
+        In case 'depth' is of non-integer type, it will be cast to int64 before use.
+ Args: + values: Rank 1 tensor containing exactly two elements, in the format [off_value, on_value], + where 'on_value' is the value used for filling locations specified in 'indices' input tensor, + and 'off_value' is the value used for filling locations other than those specified in 'indices' input tensor. + Returns: + the output CTensor. + """ + return OneHot(axis, depth, values)(indices)[0] diff --git a/python/singa/sonnx.py b/python/singa/sonnx.py index 86dbec745e..fa87485ba4 100755 --- a/python/singa/sonnx.py +++ b/python/singa/sonnx.py @@ -30,111 +30,11 @@ from . import singa_wrap as singa from . import autograd from . import tensor +from singa import utils import collections -deque = collections.deque - - -def postorderRecursive(root, root_t): - """ - return a list by the topological ordering (postorder of Depth-first search) - Args: - root: singa operator - Args: - root_t: tensor - Returns: - deque[int] - """ - - def recursive(root, yid, root_t, res): - if root: - for srcop, yid, y, _ in root.src: - recursive(srcop, yid, y, res) - res.append((root, yid, root_t)) - - res = deque([]) - recursive(root, None, root_t, res) - return res - - -def force_unicode(s): - """ - return string of a bytes - ! borrow from onnx - Args: - s: string or bytes - Returns: - string - """ - try: - return s.decode('utf-8') - except AttributeError: - return s - - -def get_pad_shape(auto_pad, input_spatial_shape, kernel_spatial_shape, - strides_spatial, output_spatial_shape): - """ - return padding shape of conv2d or pooling, - ! borrow from onnx - Args: - auto_pad: string - Args: - input_spatial_shape: list[int] - Args: - kernel_spatial_shape: list[int] - Args: - strides_spatial: list[int] - Args: - output_spatial_shape: list[int] - Returns: - list[int] - """ - pad_shape = [0] * len(input_spatial_shape) - if auto_pad in ('SAME_UPPER', 'SAME_LOWER'): - for i in range(len(input_spatial_shape)): - pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial[i] + \ - kernel_spatial_shape[i] - input_spatial_shape[i] - if (pad_shape[i] % 2) == 0: - pad_shape[i] = pad_shape[i] // 2 - elif auto_pad == 'VALID': - pass - if pad_shape[0] != pad_shape[1]: - # once the padding is odd, it means we must add extra padding at one end of the input - raise ValueError("Not implemented two directional padding") - return pad_shape - - -def get_output_shape(auto_pad, input_spatial_shape, kernel_spatial_shape, - strides_spatial): - """ - return output shape of conv2d or pooling, - ! 
borrow from onnx - Args: - auto_pad: string - Args: - input_spatial_shape: list[int] - Args: - kernel_spatial_shape: list[int] - Args: - strides_spatial: list[int] - Returns: - list[int] - """ - out_shape = [0] * len(input_spatial_shape) - if auto_pad in ('SAME_UPPER', 'SAME_LOWER'): - for i in range(len(input_spatial_shape)): - out_shape[i] = int( - np.ceil( - float(input_spatial_shape[i]) / float(strides_spatial[i]))) - elif auto_pad == 'VALID': - for i in range(len(input_spatial_shape)): - out_shape[i] = int( - np.ceil( - float(input_spatial_shape[i] - - (kernel_spatial_shape[i] - 1)) / - float(strides_spatial[i]))) - return out_shape +OrderedDict = collections.OrderedDict +namedtuple = collections.namedtuple class SingaFrontend(object): @@ -151,7 +51,6 @@ class SingaFrontend(object): _rename_operators = { '_Conv2d': 'Conv', 'ReLU': 'Relu', - 'Dummy': 'Constant', 'MaxPool2d': 'MaxPool', 'AvgPool2d': 'AveragePool', 'SoftMax': 'Softmax', @@ -179,7 +78,7 @@ class SingaFrontend(object): 'atanh': 'Atanh', 'SeLU': 'Selu', 'Elu': 'Elu', - 'Equal': 'Equal', + 'Equal': 'equal', 'Less': 'Less', 'Sign': 'Sign', 'Div': 'Div', @@ -206,7 +105,22 @@ class SingaFrontend(object): 'Not': 'Not', 'Negative': 'Neg', 'Reciprocal': 'Reciprocal', - 'GlobalAveragePool' : 'GlobalAveragePool' + 'ConstantOfShape': 'ConstantOfShape', + 'Dropout': 'Dropout', + 'ReduceSum': 'ReduceSum', + 'ReduceMean': 'ReduceMean', + 'LeakyRelu': 'LeakyRelu', + 'GlobalAveragePool': 'GlobalAveragePool', + 'Squeeze': 'Squeeze', + 'Unsqueeze': 'Unsqueeze', + 'Slice': 'Slice', + 'Ceil': 'Ceil', + 'Split': 'Split', + 'Gather': 'Gather', + 'Tile': 'Tile', + 'NonZero': 'NonZero', + 'Cast': 'Cast', + 'OneHot': 'OneHot', } # this dict indicates the operators that need extra handle @@ -214,7 +128,6 @@ class SingaFrontend(object): _special_operators = { '_Conv2d': '_create_conv_pool', '_Pooling2d': '_create_conv_pool', - 'Dummy': '_create_dummy', '_BatchNorm2d': '_create_batchnorm', 'Concat': '_create_concat', 'Flatten': '_create_flatten', @@ -226,6 +139,18 @@ class SingaFrontend(object): 'HardSigmoid': '_create_hardsigmoid', 'Clip': '_create_clip', 'Transpose': '_create_transpose', + 'ConstantOfShape': '_create_constantOfShape', + 'Dropout': '_create_dropout', + 'ReduceSum': '_create_reduceOp', + 'ReduceMean': '_create_reduceOp', + 'Squeeze': '_create_squeeze', + 'Unsqueeze': '_create_squeeze', + 'Slice': '_create_slice', + 'Split': '_create_split', + 'Gather': '_create_gather', + 'Tile': '_create_tile', + 'Cast': '_create_cast', + 'OneHot': '_create_onehot', } # operators with bool output @@ -238,12 +163,209 @@ class SingaFrontend(object): 'Or': TensorProto.BOOL, 'Xor': TensorProto.BOOL, 'Shape': TensorProto.INT64, + 'NonZero': TensorProto.INT64, } # some ops(such as batchnorm) has inputs we cannot handle directly, # so we record these items firstly so that we can handle then # at other place. 
- _unhandled_operators = {} + _unhandled_operators = { + "_BatchNorm2d": "_special_handle_batchnorm", + "Reshape": "_special_handle_reshape", + "Clip": "_special_handle_clip", + "Slice": "_special_handle_slice", + "Gather": "_special_handle_gather", + "Tile": "_special_handle_tile", + "OneHot": "_special_handle_onehot", + } + + @classmethod + def _create_onehot(cls, op, op_t): + """ + get a onnx node from singa onthot + Args: + op: a given operator + Args: + op_t: the tensor of the operator + Returns: + the onnx node + """ + node = cls._common_singa_tensor_to_onnx_node(op, op_t) + # axis, indices, depth, values + node.attribute.extend([ + helper.make_attribute('axis', op.axis), + ]) + for attr in ['depth', 'values']: + node.input.append(op.name + ":" + attr) + return node + + @classmethod + def _create_cast(cls, op, op_t): + """ + get a onnx node from singa cast + Args: + op: a given operator + Args: + op_t: the tensor of the operator + Returns: + the onnx node + """ + node = cls._common_singa_tensor_to_onnx_node(op, op_t) + + map_dict = { + tensor.float32: TensorProto.FLOAT, # FLOAT to float32 + tensor.int32: TensorProto.INT32, # INT32 to int32 + } + node.attribute.extend([ + helper.make_attribute('to', map_dict[op.to]), + ]) + return node + + @classmethod + def _create_tile(cls, op, op_t): + """ + get a onnx node from singa tile + Args: + op: a given operator + Args: + op_t: the tensor of the operator + Returns: + the onnx node + """ + node = cls._common_singa_tensor_to_onnx_node(op, op_t) + + node.input.append(op.name + ":repeats") + return node + + @classmethod + def _create_gather(cls, op, op_t): + """ + get a onnx node from singa gather + Args: + op: a given operator + Args: + op_t: the tensor of the operator + Returns: + the onnx node + """ + node = cls._common_singa_tensor_to_onnx_node(op, op_t) + + node.attribute.extend([ + helper.make_attribute('axis', op.axis), + ]) + node.input.append(op.name + ":indices") + return node + + @classmethod + def _create_split(cls, op, op_t): + """ + get a onnx node from singa split + Args: + op: a given operator + Args: + op_t: the tensor of the operator + Returns: + the onnx node + """ + node = cls._common_singa_tensor_to_onnx_node(op, op_t) + + node.attribute.extend([ + helper.make_attribute('axis', op.axis), + helper.make_attribute('split', op.parts), + ]) + return node + + @classmethod + def _create_slice(cls, op, op_t): + """ + get a onnx node from singa slice + Args: + op: a given operator + Args: + op_t: the tensor of the operator + Returns: + the onnx node + """ + node = cls._common_singa_tensor_to_onnx_node(op, op_t) + for attr in ['starts', 'ends', 'axes', 'steps']: + node.input.append(op.name + ":" + attr) + return node + + @classmethod + def _create_squeeze(cls, op, op_t): + """ + get a onnx node from singa squeeze and unsqueeze + Args: + op: a given operator + Args: + op_t: the tensor of the operator + Returns: + the onnx node + """ + node = cls._common_singa_tensor_to_onnx_node(op, op_t) + + node.attribute.extend([ + helper.make_attribute('axes', list(op.axis)), + ]) + return node + + @classmethod + def _create_reduceOp(cls, op, op_t): + """ + get a onnx node from singa ReduceSum, ReduceMean, ReduceMax, ReduceMin, etc. 
+ Args: + op: a given operator + Args: + op_t: the tensor of the operator + Returns: + the onnx node + """ + node = cls._common_singa_tensor_to_onnx_node(op, op_t) + + node.attribute.extend([ + helper.make_attribute('axes', list(op.axes)), + helper.make_attribute('keepdims', op.keepdims), + ]) + return node + + @classmethod + def _create_dropout(cls, op, op_t): + """ + get a onnx node from singa Dropout operator + Args: + op: a given operator + Args: + op_t: the tensor of the operator + Returns: + the onnx node + """ + node = cls._common_singa_tensor_to_onnx_node(op, op_t) + + node.attribute.extend([ + helper.make_attribute('ratio', op.ratio), + ]) + return node + + @classmethod + def _create_constantOfShape(cls, op, op_t): + """ + get a onnx node from singa ConstantOfShape operator + Args: + op: a given operator + Args: + op_t: the tensor of the operator + Returns: + the onnx node + """ + node = cls._common_singa_tensor_to_onnx_node(op, op_t) + tensor_type = onnx.TensorProto.FLOAT if isinstance( + op.value, float) else onnx.TensorProto.INT32 + tensor_value = onnx.helper.make_tensor("value", tensor_type, [1], + [op.value]) + node.attribute.extend([ + helper.make_attribute('value', tensor_value), + ]) + return node @classmethod def _create_transpose(cls, op, op_t): @@ -274,35 +396,16 @@ def _create_clip(cls, op, op_t): Returns: the onnx node """ - - nodes = [] - clip_node = cls._common_singa_tensor_to_onnx_node(op, op_t) - - # firstly we add the max and min - for tmp_name in ['min', 'max']: - node_name = op.name + ":" + tmp_name - # moidfy the input of clip - clip_node.input.append(node_name) - - # node = NodeProto() - # node.name = node_name - # node.op_type = cls._rename_operators.get("Dummy", "Dummy") - # node.output.extend([node_name]) - - # node.attribute.extend([helper.make_attribute( - # 'value', helper.make_tensor( - # name=node_name, - # data_type=TensorProto.FLOAT, - # dims=[1], - # vals=[getattr(op,tmp_name)], - # ) - # )]) - # nodes.append(node) - - # then we add the clip op itself - nodes.append(clip_node) - - return nodes + node = cls._common_singa_tensor_to_onnx_node(op, op_t) + if op.min is not None: + node.input.append(op.name + ":min") + else: + node.input.append("") + if op.max is not None: + node.input.append(op.name + ":max") + else: + node.input.append("") + return node @classmethod def _create_hardsigmoid(cls, op, op_t): @@ -497,11 +600,12 @@ def _create_conv_pool(cls, op, op_t): k = [op.handle.kernel_h, op.handle.kernel_w] s = [op.handle.stride_h, op.handle.stride_w] + oddp = op.odd_padding p = [ - op.handle.pad_h, - op.handle.pad_w, - op.handle.pad_w, - op.handle.pad_h, + op.handle.pad_h + oddp[0], + op.handle.pad_w + oddp[1], + op.handle.pad_w + oddp[2], + op.handle.pad_h + oddp[3], ] node.attribute.extend([ @@ -513,6 +617,7 @@ def _create_conv_pool(cls, op, op_t): node.op_type = cls._rename_operators.get('_Conv2d') node.attribute.extend([ helper.make_attribute('group', op.handle.group), + helper.make_attribute('auto_pad', 'NOTSET'), ]) elif op.handle.is_max_pooling: @@ -522,334 +627,769 @@ def _create_conv_pool(cls, op, op_t): return node @classmethod - def _create_dummy(cls, op, op_t): + def _get_singa_op_inputs_outputs(cls, op): """ - get a onnx node from singa dummy (constant) + get inputs and outputs from a given operator Args: op: a given operator - Args: - op_t: the tensor of the operator Returns: - the onnx node + inputs and outputs of the op """ - node = cls._common_singa_tensor_to_onnx_node(op, op_t) - node.attribute.extend([ - helper.make_attribute( - 
'value', - helper.make_tensor( - name=op.name, - data_type=TensorProto.FLOAT, - dims=op_t.shape, - vals=tensor.to_numpy(op_t).flatten().astype(float), - )) - ]) - del node.input[:] - return node + outputs = [op.output_name(idx) for _, idx in op.y_id2idx.items()] + inputs = [ + srcop.output_name(srcop.y_id2idx[yid]) + for (srcop, yid, _, _) in op.src + ] + return inputs, outputs @classmethod - def _common_singa_tensor_to_onnx_node(cls, op, op_t): + def _get_singa_op_type(cls, op): """ - get a onnx node from a singa operator, prepare its type, inputs and outputs + get the operator type from a given operator Args: op: a given operator - Args: - op: the tensor of the operator - Returns: the onnx node + Returns: + operator type """ - node_def = NodeProto() - node_def.name = op.name - - optype = cls._get_singa_op_type(op) - node_def.op_type = cls._rename_operators.get(optype, optype) - - inputs, outputs = cls._get_singa_op_inputs_outputs(op) - node_def.input.extend(inputs) - node_def.output.extend(outputs) + return type(op).__name__ - return node_def + @classmethod + def _special_handle_batchnorm(cls, op, X, W): + """ + hanlde the special operators + Args: + op: a given operator + Args: + op_t: the tensor of the operator + Returns: + onnx tensor list + """ + # for singa, x, scale, bias is input + # and mean and var is attribute + # so we add the mean and var to W + tensor_list = [] + append_inputs = {"mean": op.running_mean, "var": op.running_var} + for tmp_name, append_input in append_inputs.items(): + node_name = op.name + ":" + tmp_name + append_input = tensor.to_numpy(tensor.from_raw_tensor(append_input)) + tensor_list.append(numpy_helper.from_array(append_input, node_name)) + return tensor_list @classmethod - def singa_op_to_onnx_node(cls, op, op_t): + def _special_handle_reshape(cls, op, X, W): """ - get a onnx node from singa operator + hanlde the special operators Args: op: a given operator Args: op_t: the tensor of the operator Returns: - the onnx node + onnx tensor list """ - optype = cls._get_singa_op_type(op) - # wether the operator needs special handler - if optype in cls._special_operators: - translator = getattr(cls, cls._special_operators[optype]) - else: - translator = cls._common_singa_tensor_to_onnx_node - nodes = translator(op, op_t) - if not isinstance(nodes, collections.Iterable): - nodes = [nodes] - nodes = [node for node in nodes if node is not None] - return nodes + node_name = op.name + ":shape" + return [ + numpy_helper.from_array(np.array(op.shape, dtype=np.int64), + node_name) + ] @classmethod - def singa_to_onnx_graph(cls, inputs, y, model_name="sonnx"): + def _special_handle_clip(cls, op, X, W): """ - get onnx model from singa computational graph + hanlde the special operators Args: - inputs: a list of input tensors (each is initialized with a name) + op: a given operator Args: - y: a list of tensors, usually the outputs of the graph + op_t: the tensor of the operator Returns: - the onnx model + onnx tensor list """ - assert len(y) == 1 # assume there is only one output - y = y[0] + tensor_list = [] + # clip add min and max + append_inputs = {"min": op.min, "max": op.max} + for tmp_name, append_input in append_inputs.items(): + node_name = op.name + ":" + tmp_name + tensor_list.append( + helper.make_tensor(node_name, TensorProto.FLOAT, [], + [append_input])) + return tensor_list - graph_def = GraphProto() - graph_def.name = model_name - topol = postorderRecursive(y.creator, y) - # since tensor's name might change - # we record its id - input_tensors = {id(x): x for 
x in inputs} - # print(input_tensors) - X = [] - optype = cls._get_singa_op_type(y.creator) - y_dtype = TensorProto.FLOAT - if optype in cls._bool_operators: - y_dtype = cls._bool_operators[optype] - Y = [helper.make_tensor_value_info(y.name, y_dtype, y.shape)] - for op, yid, op_t in topol: - optype = cls._get_singa_op_type(op) - # print(op.name, cls._get_singa_op_type(op), op_t, optype, yid) - if yid in input_tensors and optype == 'Dummy': - # find the input by its id - op_t = input_tensors[yid] - dtype = TensorProto.FLOAT - if op_t.dtype == tensor.int32: - dtype = TensorProto.INT32 - X.append( - helper.make_tensor_value_info(op.name, dtype, op_t.shape)) - # because the inputs of batchnorm and reshape are differnet with onnx - # we need to add these inputs into onnx model mannully - elif yid in input_tensors and optype == '_BatchNorm2d': - # batchnorm add scale, bias, mean, var as inputs - running_values = { - "mean": op.running_mean, - "var": op.running_var - } - for tmp_name, running_value in running_values.items(): - node_name = op.name + ":" + tmp_name - tmp_device = running_value.device() - running_value.ToHost() - np_running_value = running_value.GetFloatValue( - int(running_value.Size())) - running_value.ToDevice(tmp_device) - X.append( - helper.make_tensor_value_info(node_name, - TensorProto.FLOAT, - np_running_value.shape)) - graph_def.node.extend(cls.singa_op_to_onnx_node(op, op_t)) - elif yid in input_tensors and optype == 'Reshape': - # reshape add shape - node_name = op.name + ":shape" - X.append( - helper.make_tensor_value_info(node_name, TensorProto.FLOAT, - [len(op.shape)])) - graph_def.node.extend(cls.singa_op_to_onnx_node(op, op_t)) - elif yid in input_tensors and optype == 'Clip': - # Clip add min and max - node_name = op.name + ":min" - X.append( - helper.make_tensor_value_info(node_name, TensorProto.FLOAT, - [1])) - node_name = op.name + ":max" - X.append( - helper.make_tensor_value_info(node_name, TensorProto.FLOAT, - [1])) - graph_def.node.extend(cls.singa_op_to_onnx_node(op, op_t)) - else: - graph_def.node.extend(cls.singa_op_to_onnx_node(op, op_t)) + @classmethod + def _special_handle_slice(cls, op, X, W): + """ + hanlde the special operators + Args: + op: a given operator + Args: + op_t: the tensor of the operator + Returns: + onnx tensor list + """ + tensor_list = [] + # slice add starts, ends, axes, steps + append_inputs = { + "starts": op.starts, + "ends": op.ends, + "axes": op.axes, + "steps": op.steps, + } + for tmp_name, append_input in append_inputs.items(): + node_name = op.name + ":" + tmp_name + tensor_list.append( + numpy_helper.from_array(np.array(append_input), node_name)) + return tensor_list - graph_def.input.extend(X) - graph_def.output.extend(Y) + @classmethod + def _special_handle_gather(cls, op, X, W): + """ + hanlde the special operators + Args: + op: a given operator + Args: + op_t: the tensor of the operator + Returns: + onnx tensor list + """ + tensor_list = [] + append_inputs = { + "indices": op.indices, + } + for tmp_name, append_input in append_inputs.items(): + node_name = op.name + ":" + tmp_name + tensor_list.append( + numpy_helper.from_array(np.array(append_input), node_name)) + return tensor_list + + @classmethod + def _special_handle_tile(cls, op, X, W): + """ + hanlde the special operators + Args: + op: a given operator + Args: + op_t: the tensor of the operator + Returns: + onnx tensor list + """ + tensor_list = [] + append_inputs = { + "repeats": op.repeats, + } + for tmp_name, append_input in append_inputs.items(): + node_name 
= op.name + ":" + tmp_name + tensor_list.append( + numpy_helper.from_array(np.array(append_input), node_name)) + return tensor_list + + @classmethod + def _special_handle_onehot(cls, op, X, W): + """ + hanlde the special operators + Args: + op: a given operator + Args: + op_t: the tensor of the operator + Returns: + onnx tensor list + """ + tensor_list = [] + append_inputs = { + "depth": op.depth, + "values": op.values, + } + for tmp_name, append_input in append_inputs.items(): + node_name = op.name + ":" + tmp_name + tensor_list.append( + numpy_helper.from_array(np.array(append_input), node_name)) + return tensor_list + + @classmethod + def handle_special_ops(cls, op, X, W): + """ + hanlde the special operators, + because the inputs of batchnorm and reshape are differnet with onnx + we need to add these inputs into onnx model mannully + Args: + op: a given operator + Args: + X: onnx input list + Args: + X: onnx weight list + Returns: the onnx node + """ + optype = cls._get_singa_op_type(op) + translator = getattr(cls, cls._unhandled_operators[optype]) + tensor_list = translator(op, X, W) + for tensor in tensor_list: + X.append( + helper.make_tensor_value_info(tensor.name, tensor.data_type, + tensor.dims)) + W.append(tensor) + # return X, W + + @classmethod + def _common_singa_tensor_to_onnx_node(cls, op, op_t): + """ + get a onnx node from a singa operator, prepare its type, inputs and outputs + Args: + op: a given operator + Args: + op_t: the tensor of the operator + Returns: the onnx node + """ + node_def = NodeProto() + node_def.name = op.name + + optype = cls._get_singa_op_type(op) + node_def.op_type = cls._rename_operators.get(optype, optype) + + inputs, outputs = cls._get_singa_op_inputs_outputs(op) + node_def.input.extend(inputs) + node_def.output.extend(outputs) + + return node_def + + @classmethod + def singa_op_to_onnx_node(cls, op, op_t): + """ + get a onnx node from singa operator + Args: + op: a given operator + Args: + op_t: the tensor of the operator + Returns: + the onnx node + """ + optype = cls._get_singa_op_type(op) + # wether the operator needs special handler + if optype in cls._special_operators: + translator = getattr(cls, cls._special_operators[optype]) + else: + translator = cls._common_singa_tensor_to_onnx_node + nodes = translator(op, op_t) + if not isinstance(nodes, collections.Iterable): + nodes = [nodes] + nodes = [node for node in nodes if node is not None] + return nodes + + @classmethod + def singa_to_onnx_graph(cls, inputs, y, model_name="sonnx"): + """ + get onnx model from singa computational graph + Args: + inputs: a list of input tensors (each is initialized with a name) + Args: + y: a list of tensors, usually the outputs of the graph + Returns: + the onnx model + """ + assert len( + y + ) == 1, "Not support multiple output now." 
# assume there is only one output + y = y[0] + + graph_def = GraphProto() + graph_def.name = model_name + topol, ws, ins = utils.post_order_recursive(y.creator, y) + + # prepare the input + X = [] + for op_name, op_t in ins.items(): + op_t = inputs.pop(0) + dtype = TensorProto.INT32 if op_t.dtype == tensor.int32 else TensorProto.FLOAT + X.append(helper.make_tensor_value_info(op_name, dtype, op_t.shape)) + + # prepare the output + y_optype = cls._get_singa_op_type(y.creator) + if y_optype in cls._bool_operators: + y_dtype = cls._bool_operators[y_optype] + elif y.dtype == tensor.int32: + y_dtype = TensorProto.INT32 + else: + y_dtype = TensorProto.FLOAT + Y = [helper.make_tensor_value_info(y.name, y_dtype, y.shape)] + + # prepare the weight + W = [] + for op_name, op_t in ws.items(): + dtype = TensorProto.INT32 if op_t.dtype == tensor.int32 else TensorProto.FLOAT + wt = tensor.to_numpy(op_t) + wt = numpy_helper.from_array(wt) + wt.name = op_name + W.append(wt) + X.append(helper.make_tensor_value_info(op_name, dtype, op_t.shape)) + + # iterate the node graph + for op_name, op in topol.items(): + optype = cls._get_singa_op_type(op) + if optype in cls._unhandled_operators: + cls.handle_special_ops(op, X, W) + graph_def.node.extend(cls.singa_op_to_onnx_node(op, op_t)) + + graph_def.input.extend(X) + graph_def.output.extend(Y) + graph_def.initializer.extend(W) return graph_def @classmethod - def singa_to_onnx_model(cls, inputs, y, model_name="sonnx"): + def singa_to_onnx_model(cls, inputs, y, model_name="sonnx"): + """ + get onnx model from singa computational graph + Args: + inputs: a list of input tensors (each is initialized with a name) + Args: + y: a list of tensors, usually the outputs of the graph + Returns: + the onnx model + """ + opset_id = OperatorSetIdProto() + opset_id.version = cls._target_opset_version + model = helper.make_model(cls.singa_to_onnx_graph(inputs, + y, + model_name="sonnx"), + producer_name='sonnx', + opset_imports=[opset_id]) + model = optimizer.optimize(model) + checker.check_model(model) + return model + + +class OnnxNode(object): + """ + Reimplementation of NodeProto from ONNX, but in a form + more convenient to work with from Python. + """ + + def __init__(self, node): + self.name = str(node.name) + self.op_type = str(node.op_type) + self.attrs = OnnxAttributes.from_onnx(node.attribute) + # there may some inputs which we regard as attribute, so we mark them there + self.consumed_inputs = list() + self.inputs = list(node.input) + self.outputs = list(node.output) + + def getattr(self, key, default=None): + return self.attrs[key] if key in self.attrs else default + + +class OnnxAttributes(dict): + """ + This is a more convenient way to work with ONNX attributes + that is not the protobuf representation. + """ + + @staticmethod + def from_onnx(args): + d = OnnxAttributes() + for arg in args: + d[arg.name] = helper.get_attribute_value(arg) + return d + + +class SingaBackend(Backend): + + # This number indicates the onnx operator set version + _known_opset_version = 11 + + # beceuase singa's operators are different from onnx. 
+ # we define a dict for the name projection + _rename_operators = { + 'Relu': 'relu', + 'Softmax': 'SoftMax', + 'Sigmoid': 'sigmoid', + 'Add': 'add', + 'MatMul': 'matmul', + 'Conv': '_Conv2d', + 'MaxPool': '_Pooling2d', + 'AveragePool': '_Pooling2d', + 'BatchNormalization': 'batchnorm_2d', + 'Concat': 'Concat', + 'Flatten': 'Flatten', + 'Gemm': 'Gemm', + 'Reshape': 'Reshape', + 'Sum': 'sum', + 'Cos': 'cos', + 'Cosh': 'cosh', + 'Sin': 'sin', + 'Sinh': 'sinh', + 'Tan': 'tan', + 'Tanh': 'tanh', + 'Acos': 'acos', + 'Acosh': 'acosh', + 'Asin': 'asin', + 'Asinh': 'asinh', + 'Atan': 'atan', + 'Atanh': 'atanh', + 'Selu': 'SeLU', + 'Elu': 'Elu', + 'Equal': 'equal', + 'Less': 'less', + 'Sign': 'sign', + 'Div': 'div', + 'Sub': 'sub', + 'Sqrt': 'sqrt', + 'Log': 'log', + 'Greater': 'greater', + 'HardSigmoid': 'HardSigmoid', + 'Identity': 'identity', + 'Softplus': 'softplus', + 'Softsign': 'softsign', + 'Mean': 'mean', + 'Pow': 'pow', + 'Clip': 'Clip', + 'PRelu': 'prelu', + 'Mul': 'mul', + 'Transpose': 'Transpose', + 'Max': 'max', + 'Min': 'min', + 'Shape': 'shape', + 'And': '_and', + 'Or': '_or', + 'Xor': '_xor', + 'Not': '_not', + 'Neg': 'negative', + 'Reciprocal': 'reciprocal', + 'ConstantOfShape': 'ConstantOfShape', + 'Dropout': 'Dropout', + 'ReduceSum': 'ReduceSum', + 'ReduceMean': 'ReduceMean', + 'LeakyRelu': 'LeakyRelu', + 'GlobalAveragePool': 'GlobalAveragePool', + 'Squeeze': 'Squeeze', + 'Unsqueeze': 'Unsqueeze', + 'Slice': 'Slice', + 'Ceil': 'Ceil', + 'Split': 'Split', + 'Gather': 'Gather', + 'Tile': 'Tile', + 'NonZero': 'nonzero', + 'Cast': 'Cast', + 'OneHot': 'OneHot', + } + + # this dict indicates the operators that need extra handle + # each indicates a function name + _special_operators = { + 'Conv': '_create_conv', + 'MaxPool': '_create_max_avg_pool', + 'AveragePool': '_create_max_avg_pool', + 'BatchNormalization': '_create_batchnorm', + 'Concat': '_create_concat', + 'Flatten': '_create_flatten', + 'Gemm': '_create_gemm', + 'Reshape': '_create_reshape', + 'Softmax': '_create_softmax', + 'Selu': '_create_selu', + 'Elu': '_create_elu', + 'HardSigmoid': '_create_hardsigmoid', + 'Clip': '_create_clip', + 'Transpose': '_create_transpose', + 'ConstantOfShape': '_create_constantOfShape', + 'Dropout': '_create_dropout', + 'ReduceSum': '_create_reduceOp', + 'ReduceMean': '_create_reduceOp', + 'LeakyRelu': '_create_leakyrelu', + 'GlobalAveragePool': '_create_globalaveragepool', + 'Squeeze': '_create_squeeze', + 'Unsqueeze': '_create_squeeze', + 'Slice': '_create_slice', + 'Split': '_create_split', + 'Gather': '_create_gather', + 'Tile': '_create_tile', + 'Cast': '_create_cast', + 'OneHot': '_create_onehot', + 'Constant': "_create_constant" + } + + @classmethod + def _create_constant(cls, onnx_node, inputs, opset_version): + """ + parse onnx constatn node to weights + Args: + onnx_node: a given onnx node + Args: + inputs: the input tensor + Args: + opset_version: the opset version + Returns: + handle, the handle of singa operator + Returns: + forward, the autograd of singa operator + """ + tmp_tensor = onnx_node.getattr('value') + np_dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[tmp_tensor.data_type] + np_tensor = np.frombuffer(tmp_tensor.raw_data, dtype=np_dtype) + if np_tensor.dtype == "int64": + np_tensor = np_tensor.astype(np.int32) + # todo, we cannot support scalar tensor + if np.ndim(np_tensor) == 0: + np_tensor = np.array(np_tensor, ndmin=1) + return None, np_tensor + + @classmethod + def _create_onehot(cls, onnx_node, inputs, opset_version): + """ + get the OneHot operator from onnx node + 
Args: + onnx_node: a given onnx node + Args: + inputs: the input tensor + Args: + opset_version: the opset version + Returns: + handle, the handle of singa operator + Returns: + forward, the autograd of singa operator + """ + axis = onnx_node.getattr("axis", -1) + # we move several inputs to singa's attribuates + # and mark them so we don't use them when we run this operator + depth = tensor.to_numpy(inputs.pop(1)).astype(np.int32) + value = tensor.to_numpy(inputs.pop(1)) + onnx_node.consumed_inputs.extend(onnx_node.inputs[1:]) + _, forward = cls._common_onnx_node_to_singa_op(onnx_node, inputs, + opset_version) + return _, forward(axis, depth, value) + + @classmethod + def _create_cast(cls, onnx_node, inputs, opset_version): + """ + get the Cast operator from onnx node + Args: + onnx_node: a given onnx node + Args: + inputs: the input tensor + Args: + opset_version: the opset version + Returns: + handle, the handle of singa operator + Returns: + forward, the autograd of singa operator + """ + to = onnx_node.getattr("to") + # singa only supports float32 and int32 + map_dict = { + TensorProto.FLOAT: tensor.float32, # FLOAT to float32 + TensorProto.UINT8: None, # UINT8 + TensorProto.INT8: tensor.int32, # INT8 to int32 + TensorProto.UINT16: None, # UINT16 + TensorProto.INT16: tensor.int32, # INT16 to int32 + TensorProto.INT32: tensor.int32, # INT32 to int32 + TensorProto.INT64: tensor.int32, # INT64 to int32 + TensorProto.STRING: None, # stirng + TensorProto.BOOL: None, # bool + } + to = map_dict[to] + assert to != None, "not support cast type: {}".format(to) + _, forward = cls._common_onnx_node_to_singa_op(onnx_node, inputs, + opset_version) + return _, forward(to) + + @classmethod + def _create_tile(cls, onnx_node, inputs, opset_version): + """ + get the Tile operator from onnx node + Args: + onnx_node: a given onnx node + Args: + inputs: the input tensor + Args: + opset_version: the opset version + Returns: + handle, the handle of singa operator + Returns: + forward, the autograd of singa operator + """ + # we move several inputs to singa's attribuates + # and mark them so we don't use them when we run this operator + repeats = tensor.to_numpy(inputs.pop(1)).astype(np.int32).tolist() + onnx_node.consumed_inputs.append(onnx_node.inputs[1]) + _, forward = cls._common_onnx_node_to_singa_op(onnx_node, inputs, + opset_version) + return _, forward(repeats) + + @classmethod + def _create_gather(cls, onnx_node, inputs, opset_version): + """ + get the Gather operator from onnx node + Args: + onnx_node: a given onnx node + Args: + inputs: the input tensor + Args: + opset_version: the opset version + Returns: + handle, the handle of singa operator + Returns: + forward, the autograd of singa operator + """ + axis = onnx_node.getattr("axis", 0) + # we move several inputs to singa's attribuates + # and mark them so we don't use them when we run this operator + indices = tensor.to_numpy(inputs.pop(1)).astype(np.int32).tolist() + onnx_node.consumed_inputs.append(onnx_node.inputs[1]) + _, forward = cls._common_onnx_node_to_singa_op(onnx_node, inputs, + opset_version) + return _, forward(axis, indices) + + @classmethod + def _create_split(cls, onnx_node, inputs, opset_version): + """ + get the Split operator from onnx node + Args: + onnx_node: a given onnx node + Args: + inputs: the input tensor + Args: + opset_version: the opset version + Returns: + handle, the handle of singa operator + Returns: + forward, the autograd of singa operator + """ + axis = onnx_node.getattr("axis", 0) + split = 
onnx_node.getattr("split", None) + num_output = len(onnx_node.outputs) + _, forward = cls._common_onnx_node_to_singa_op(onnx_node, inputs, + opset_version) + return _, forward(axis, split, num_output) + + @classmethod + def _create_slice(cls, onnx_node, inputs, opset_version): + """ + get the Slice operator from onnx node + Args: + onnx_node: a given onnx node + Args: + inputs: the input tensor + Args: + opset_version: the opset version + Returns: + handle, the handle of singa operator + Returns: + forward, the autograd of singa operator + """ + # we move several inputs to singa's attribuates + # and mark them so we don't use them when we run this operator + starts = tensor.to_numpy(inputs.pop(1)).astype(np.int32).tolist() + ends = tensor.to_numpy(inputs.pop(1)).astype(np.int32).tolist() + # sometime onnx may ignore these two inputs, axes and step + if len(inputs) >= 2 and onnx_node.inputs[3] != '': + axes = tensor.to_numpy(inputs.pop(1)).astype(np.int32).tolist() + else: + axes = None + steps = tensor.to_numpy(inputs.pop(1)).astype( + np.int32).tolist() if len(inputs) >= 2 else None + onnx_node.consumed_inputs.extend(onnx_node.inputs[1:]) + _, forward = cls._common_onnx_node_to_singa_op(onnx_node, inputs, + opset_version) + return _, forward(starts, ends, axes, steps) + + @classmethod + def _create_squeeze(cls, onnx_node, inputs, opset_version): + """ + get the Squeeze and Unsqueeze operator from onnx node + Args: + onnx_node: a given onnx node + Args: + inputs: the input tensor + Args: + opset_version: the opset version + Returns: + handle, the handle of singa operator + Returns: + forward, the autograd of singa operator + """ + axes = onnx_node.getattr("axes") + _, forward = cls._common_onnx_node_to_singa_op(onnx_node, inputs, + opset_version) + return _, forward(axes) + + @classmethod + def _create_globalaveragepool(cls, onnx_node, inputs, opset_version): """ - get onnx model from singa computational graph + get the GlobalAveragePool operator from onnx node Args: - inputs: a list of input tensors (each is initialized with a name) + onnx_node: a given onnx node Args: - y: a list of tensors, usually the outputs of the graph + inputs: the input tensor + Args: + opset_version: the opset version Returns: - the onnx model + handle, the handle of singa operator + Returns: + forward, the autograd of singa operator """ - opset_id = OperatorSetIdProto() - opset_id.version = cls._target_opset_version - model = helper.make_model(cls.singa_to_onnx_graph(inputs, - y, - model_name="sonnx"), - producer_name='sonnx', - opset_imports=[opset_id]) - # print('The model is:\n{}'.format(model)) - model = optimizer.optimize(model) - checker.check_model(model) - return model + data_format = onnx_node.getattr("data_format", 'channels_first') + _, forward = cls._common_onnx_node_to_singa_op(onnx_node, inputs, + opset_version) + return _, forward(data_format) @classmethod - def _get_singa_op_inputs_outputs(cls, op): + def _create_leakyrelu(cls, onnx_node, inputs, opset_version): """ - get inputs and outputs from a given operator + get the LeakyRelu operator from onnx node Args: - op: a given operator + onnx_node: a given onnx node + Args: + inputs: the input tensor + Args: + opset_version: the opset version Returns: - inputs and outputs of the op + handle, the handle of singa operator + Returns: + forward, the autograd of singa operator """ - outputs = [op.output_name(idx) for yid, idx in op.y_id2idx.items()] - inputs = [ - srcop.output_name(srcop.y_id2idx[yid]) - for (srcop, yid, _, _) in op.src - ] - return 
inputs, outputs + alpha = onnx_node.getattr("alpha", 0.01) + _, forward = cls._common_onnx_node_to_singa_op(onnx_node, inputs, + opset_version) + return _, forward(alpha) @classmethod - def _get_singa_op_type(cls, op): + def _create_reduceOp(cls, onnx_node, inputs, opset_version): """ - get the operator type from a given operator + get the ReduceSum, ReduceMean, ReduceMax, ReduceMin, etc, operator from onnx node Args: - op: a given operator + onnx_node: a given onnx node + Args: + inputs: the input tensor + Args: + opset_version: the opset version Returns: - operator type + handle, the handle of singa operator + Returns: + forward, the autograd of singa operator """ - return type(op).__name__ - - -class OnnxNode(object): - """ - Reimplementation of NodeProto from ONNX, but in a form - more convenient to work with from Python. - We may temporarily edit these nodes to get them into Caffe2 form, - before actually translating into the Caffe2 protobuf, since this - is easier than decomposing everything, and putting it back together - when we're ready. - """ - - def __init__(self, node): - self.name = str(node.name) - self.op_type = str(node.op_type) - self.attrs = OnnxAttributes.from_onnx(node.attribute) - self.inputs = list(node.input) - self.outputs = list(node.output) - - def getattr(self, key, default=None): - return self.attrs[key] if key in self.attrs else default - - -class OnnxAttributes(dict): - """ - This is a more convenient way to work with ONNX/Caffe2 attributes - that is not the protobuf representation. - """ - - @staticmethod - def from_onnx(args): - d = OnnxAttributes() - for arg in args: - d[arg.name] = helper.get_attribute_value(arg) - return d - - -class SingaBackend(Backend): - - # This number indicates the onnx operator set version - _known_opset_version = 11 - - # beceuase singa's operators are different from onnx. 
- # we define a dict for the name projection - _rename_operators = { - 'Relu': 'relu', - 'Softmax': 'SoftMax', - 'Sigmoid': 'sigmoid', - 'Add': 'add', - 'MatMul': 'Matmul', - 'Conv': 'conv2d', - 'MaxPool': 'pooling_2d', - 'AveragePool': 'pooling_2d', - 'BatchNormalization': 'batchnorm_2d', - 'Concat': 'Concat', - 'Flatten': 'Flatten', - 'Gemm': 'Gemm', - 'Reshape': 'reshape', - 'Sum': 'sum', - 'Cos': 'cos', - 'Cosh': 'cosh', - 'Sin': 'sin', - 'Sinh': 'sinh', - 'Tan': 'tan', - 'Tanh': 'tanh', - 'Acos': 'acos', - 'Acosh': 'acosh', - 'Asin': 'asin', - 'Asinh': 'asinh', - 'Atan': 'atan', - 'Atanh': 'atanh', - 'Selu': 'SeLU', - 'Elu': 'Elu', - 'Equal': 'equal', - 'Less': 'less', - 'Sign': 'sign', - 'Div': 'div', - 'Sub': 'sub', - 'Sqrt': 'sqrt', - 'Log': 'log', - 'Greater': 'greater', - 'HardSigmoid': 'HardSigmoid', - 'Identity': 'identity', - 'Softplus': 'softplus', - 'Softsign': 'softsign', - 'Mean': 'mean', - 'Pow': 'pow', - 'Clip': 'clip', - 'PRelu': 'prelu', - 'Mul': 'mul', - 'Transpose': 'Transpose', - 'Max': 'max', - 'Min': 'min', - 'Shape': 'shape', - 'And': '_and', - 'Or': '_or', - 'Xor': '_xor', - 'Not': '_not', - 'Neg': 'negative', - 'Reciprocal': 'reciprocal', - 'GlobalAveragePool' : 'globalaveragepool' - } + axes = onnx_node.getattr("axes", None) + keepdims = onnx_node.getattr("keepdims", 1) + _, forward = cls._common_onnx_node_to_singa_op(onnx_node, inputs, + opset_version) + return _, forward(axes, keepdims) - # this dict indicates the operators that need extra handle - # each indicates a function name - _special_operators = { - 'Conv': '_create_conv', - 'MaxPool': '_create_max_avg_pool', - 'AveragePool': '_create_max_avg_pool', - 'BatchNormalization': '_create_batchnorm', - 'Concat': '_create_concat', - 'MatMul': '_create_matmul', - 'Flatten': '_create_flatten', - 'Gemm': '_create_gemm', - 'Reshape': '_create_reshape', - 'Softmax': '_create_softmax', - 'Selu': '_create_selu', - 'Elu': '_create_elu', - 'HardSigmoid': '_create_hardsigmoid', - 'Clip': '_create_clip', - 'Transpose': '_create_transpose', - } + @classmethod + def _create_dropout(cls, onnx_node, inputs, opset_version): + """ + get the Dropout operator from onnx node + Args: + onnx_node: a given onnx node + Args: + inputs: the input tensor + Args: + opset_version: the opset version + Returns: + handle, the handle of singa operator + Returns: + forward, the autograd of singa operator + """ + ratio = onnx_node.getattr("ratio", 0) + _, forward = cls._common_onnx_node_to_singa_op(onnx_node, inputs, + opset_version) + return _, forward(ratio) @classmethod - def _create_transpose(cls, onnx_node, inputs, opset_version): + def _create_constantOfShape(cls, onnx_node, inputs, opset_version): """ - get the Transpose operator from onnx node + get the ConstantOfShape operator from onnx node Args: onnx_node: a given onnx node Args: @@ -861,16 +1401,17 @@ def _create_transpose(cls, onnx_node, inputs, opset_version): Returns: forward, the autograd of singa operator """ - shape = inputs[0].shape - perm = onnx_node.getattr("perm", list(range(len(shape) - 1, -1, -1))) + value = onnx_node.getattr("value", 0) + if isinstance(value, onnx.TensorProto): + value = numpy_helper.to_array(value)[0].item() _, forward = cls._common_onnx_node_to_singa_op(onnx_node, inputs, opset_version) - return _, forward(perm) + return _, forward(value) @classmethod - def _create_clip(cls, onnx_node, inputs, opset_version): + def _create_transpose(cls, onnx_node, inputs, opset_version): """ - get the clip operator from onnx node + get the Transpose operator from 
onnx node
         Args:
             onnx_node: a given onnx node
         Args:
@@ -882,14 +1423,16 @@ def _create_clip(cls, onnx_node, inputs, opset_version):
         Returns:
             forward, the autograd of singa operator
         """
+        shape = inputs[0].shape
+        perm = onnx_node.getattr("perm", list(range(len(shape) - 1, -1, -1)))
         _, forward = cls._common_onnx_node_to_singa_op(onnx_node, inputs,
                                                        opset_version)
-        return _, forward
+        return _, forward(perm)
 
     @classmethod
-    def _create_hardsigmoid(cls, onnx_node, inputs, opset_version):
+    def _create_clip(cls, onnx_node, inputs, opset_version):
         """
-        get the HardSigmoid operator from onnx node
+        get the clip operator from onnx node
         Args:
             onnx_node: a given onnx node
         Args:
@@ -901,16 +1444,24 @@ def _create_hardsigmoid(cls, onnx_node, inputs, opset_version):
         Returns:
             forward, the autograd of singa operator
         """
-        alpha = onnx_node.getattr("alpha", 0.2)
-        beta = onnx_node.getattr("beta", 0.5)
+        # sometimes onnx may omit these two inputs (min, max, or both)
+        if len(inputs) >= 2 and onnx_node.inputs[1] != '':
+            min_v = tensor.to_numpy(inputs.pop(1)).tolist()[0]
+        else:
+            min_v = None
+        if len(inputs) >= 2 and onnx_node.inputs[2] != '':
+            max_v = tensor.to_numpy(inputs.pop(1)).tolist()[0]
+        else:
+            max_v = None
+        onnx_node.consumed_inputs.extend(onnx_node.inputs[1:])
         _, forward = cls._common_onnx_node_to_singa_op(onnx_node, inputs,
                                                        opset_version)
-        return _, forward(alpha, beta)
+        return _, forward(min_v, max_v)
 
     @classmethod
-    def _create_equal(cls, onnx_node, inputs, opset_version):
+    def _create_hardsigmoid(cls, onnx_node, inputs, opset_version):
         """
-        get the equal operator from onnx node
+        get the HardSigmoid operator from onnx node
         Args:
             onnx_node: a given onnx node
         Args:
@@ -922,9 +1473,11 @@ def _create_equal(cls, onnx_node, inputs, opset_version):
         Returns:
             forward, the autograd of singa operator
         """
+        alpha = onnx_node.getattr("alpha", 0.2)
+        beta = onnx_node.getattr("beta", 0.5)
         _, forward = cls._common_onnx_node_to_singa_op(onnx_node, inputs,
                                                        opset_version)
-        return _, forward()
+        return _, forward(alpha, beta)
 
     @classmethod
     def _create_elu(cls, onnx_node, inputs, opset_version):
@@ -982,9 +1535,11 @@ def _create_reshape(cls, onnx_node, inputs, opset_version):
         Returns:
             the autograd of singa operator
         """
+        shape = tensor.to_numpy(inputs.pop(1)).astype(np.int32).tolist()
+        onnx_node.consumed_inputs.append(onnx_node.inputs[1])
         _, forward = cls._common_onnx_node_to_singa_op(onnx_node, inputs,
                                                        opset_version)
-        return _, forward
+        return _, forward(shape)
 
     @classmethod
     def _create_conv(cls, onnx_node, inputs, opset_version):
@@ -1002,24 +1557,27 @@ def _create_conv(cls, onnx_node, inputs, opset_version):
             forward, the autograd of singa operator
         """
         kernel = tuple(onnx_node.attrs["kernel_shape"])
-        # todo: we only support the padding with tuple
+        padding = tuple(
+            onnx_node.attrs["pads"]) if "pads" in onnx_node.attrs else (0, 0)
         stride = tuple(onnx_node.getattr('strides', (1, 1)))
-        padding = tuple(onnx_node.attrs["pads"][0:2]) if "pads" in onnx_node.attrs else (0, 0)
+        # odd_padding defaults to 0; once a SAME pad mode is present, we modify it
+        # for odd_padding, please refer to autograd.py
+        odd_padding = (0, 0, 0, 0)
         if "auto_pad" in onnx_node.attrs:
-            auto_pad = force_unicode(onnx_node.attrs['auto_pad'])
-            out_shape = get_output_shape(auto_pad, inputs[0].shape[2:], kernel, stride)
-            padding = get_pad_shape(auto_pad, inputs[0].shape[2:], kernel, stride, out_shape)
-        dilation = onnx_node.getattr('dilations', 1)
-        group = onnx_node.getattr('group', 1)
+            auto_pad = 
utils.force_unicode(onnx_node.attrs['auto_pad'])
+            if auto_pad in ('SAME_UPPER', 'SAME_LOWER'):
+                padding, odd_padding = utils.get_padding_shape(
+                    auto_pad, inputs[0].shape[2:], kernel, stride)
 
         # not support dilation
-
+        dilation = onnx_node.getattr('dilations', 1)
         if dilation != 1 and list(dilation) != [1, 1]:
             raise ValueError("Not implemented yet for dilation")
+        group = onnx_node.getattr('group', 1)
 
-        # only support 2d
-        if len(kernel) != 2:
-            raise ValueError("Not implemented yet for 2d")
+        # only support 1d or 2d
+        if len(kernel) > 2:
+            raise ValueError("Only implemented for 1d or 2d")
 
         bias = len(inputs) == 3
         x = inputs[0]
@@ -1043,7 +1601,7 @@ def _create_conv(cls, onnx_node, inputs, opset_version):
 
         _, forward = cls._common_onnx_node_to_singa_op(onnx_node, inputs,
                                                        opset_version)
-        return handle, forward
+        return _, forward(handle, odd_padding)
 
     @classmethod
     def _create_max_avg_pool(cls, onnx_node, inputs, opset_version):
@@ -1061,17 +1619,17 @@ def _create_max_avg_pool(cls, onnx_node, inputs, opset_version):
             forward, the autograd of singa operator
         """
         kernel = tuple(onnx_node.attrs["kernel_shape"])
-        # todo: we only support the padding with tuple
         padding = tuple(
-            onnx_node.attrs["pads"][0:2]) if "pads" in onnx_node.attrs else (0,
-                                                                             0)
+            onnx_node.attrs["pads"]) if "pads" in onnx_node.attrs else (0, 0)
         stride = tuple(onnx_node.getattr('strides', (1, 1)))
+        # odd_padding defaults to 0; once a SAME pad mode is present, we modify it
+        # for odd_padding, please refer to autograd.py
+        odd_padding = (0, 0, 0, 0)
         if "auto_pad" in onnx_node.attrs:
-            auto_pad = force_unicode(onnx_node.attrs['auto_pad'])
-            out_shape = get_output_shape(auto_pad, inputs[0].shape[2:], kernel,
-                                         stride)
-            padding = get_pad_shape(auto_pad, inputs[0].shape[2:], kernel,
-                                    stride, out_shape)
+            auto_pad = utils.force_unicode(onnx_node.attrs['auto_pad'])
+            if auto_pad in ('SAME_UPPER', 'SAME_LOWER'):
+                padding, odd_padding = utils.get_padding_shape(
+                    auto_pad, inputs[0].shape[2:], kernel, stride)
 
         # not support count_include_pad and auto_pad
         if "count_include_pad" in onnx_node.attrs or "ceil_mode" in onnx_node.attrs:
@@ -1093,7 +1651,7 @@ def _create_max_avg_pool(cls, onnx_node, inputs, opset_version):
 
         _, forward = cls._common_onnx_node_to_singa_op(onnx_node, inputs,
                                                        opset_version)
-        return handle, forward
+        return _, forward(handle, odd_padding)
 
     @classmethod
     def _create_batchnorm(cls, onnx_node, inputs, opset_version):
@@ -1156,12 +1714,8 @@ def _create_softmax(cls, onnx_node, inputs, opset_version):
         """
         factor = onnx_node.getattr('axis', 1)
         if factor < 0:
-            factor = len(inputs[0].shape
-                        ) + factor  # in order to support the negative axis
-        # alpha = onnx_node.attrs["alpha"]
-        # beta = onnx_node.attrs["beta"]
-        # transA = False if onnx_node.attrs["transA"] == 0 else True
-        # transB = False if onnx_node.attrs["transB"] == 0 else True
+            # in order to support the negative axis
+            factor = len(inputs[0].shape) + factor
         _, forward = cls._common_onnx_node_to_singa_op(onnx_node, inputs,
                                                        opset_version)
         return None, forward(axis=factor)
@@ -1210,32 +1764,13 @@ def _create_flatten(cls, onnx_node, inputs, opset_version):
         """
         factor = onnx_node.getattr('axis', 1)
         if factor < 0:
-            factor = len(inputs[0].shape
-                        ) + factor  # in order to support the negative axis
+            # in order to support the negative axis
+            factor = len(inputs[0].shape) + factor
         _, forward = cls._common_onnx_node_to_singa_op(onnx_node, inputs,
                                                        opset_version)
         return None, forward(start_axis=factor)
 
-    @classmethod
-    def _create_matmul(cls, onnx_node, inputs, 
opset_version): - """ - get the matmul operator from onnx node - Args: - onnx_node: a given onnx node - Args: - inputs: the input tensor - Args: - opset_version: the opset version - Returns: - the handle of singa operator - Returns: - the autograd of singa operator - """ - _, forward = cls._common_onnx_node_to_singa_op(onnx_node, inputs, - opset_version) - return None, forward() - @classmethod def _common_onnx_node_to_singa_op(cls, onnx_node, inputs, opset_version): """ @@ -1253,12 +1788,16 @@ def _common_onnx_node_to_singa_op(cls, onnx_node, inputs, opset_version): a list of SingaOps('name', 'op', 'handle', 'forward') """ onnx_op_type = onnx_node.op_type - autograd_op = getattr( - autograd, cls._rename_operators.get(onnx_op_type, onnx_op_type)) + assert onnx_op_type in cls._rename_operators, "not support operator: {}".format( + onnx_op_type) + autograd_op = getattr(autograd, cls._rename_operators[onnx_op_type]) return None, autograd_op @classmethod - def _onnx_node_to_singa_op(cls, onnx_node, inputs, opset_version): + def _onnx_node_to_singa_op(cls, + onnx_node, + inputs, + opset_version=_known_opset_version): """ get a singa operator(handle and autograd) from a onnx node Args: @@ -1298,10 +1837,19 @@ def run_node(cls, onnx_node, inputs, opset_version=_known_opset_version): inputs), "{}: expected {} but got {}".format( onnx_node.op_type, len(valid_inputs), len(inputs)) - inputs = [inputs[x] for x in valid_inputs] - handle, forward = cls._onnx_node_to_singa_op(onnx_node, inputs, + tmp_inputs = [inputs[x] for x in onnx_node.inputs if x != ""] + handle, forward = cls._onnx_node_to_singa_op(onnx_node, tmp_inputs, opset_version) - return cls._run_node(onnx_node, inputs, handle, forward, opset_version) + # only give the inputs it needs + # consumed_inputs are the inputs marked as attributes + # so we remove it here + tmp_inputs = [ + inputs[x] + for x in onnx_node.inputs + if x not in onnx_node.consumed_inputs + ] + return cls._run_node(onnx_node, tmp_inputs, handle, forward, + opset_version) @classmethod def _run_node(cls, @@ -1323,42 +1871,78 @@ def _run_node(cls, Returns: list, the output of the """ - # since reshape acutally only needs one input tensor - # but onnx regard its shape as another tensor, we need to ommit it outputs = forward(*inputs) if handle is None else forward( handle, *inputs) if not isinstance(outputs, collections.Iterable): outputs = [outputs] - outputs_dict = collections.OrderedDict() + outputs_dict = OrderedDict() for (key, val) in zip(onnx_node.outputs, outputs): outputs_dict[key] = val return outputs_dict @classmethod - def _onnx_node_to_singa_tensor(cls, node_infos, tensor_map, device): + def _init_graph_parameter(cls, graph, init_inputs, device): """ init the singa tensor from onnx infos Args: - node_infos: a given onnx model + graph: a given onnx graph Args: - tensor_map: the tensor map + init_inputs: a list of inputs, which used to init the operators Args: device: the used device + Returns: + a dict of tensors """ - for x in node_infos: - x_shape = tuple( - dim.dim_value for dim in x.type.tensor_type.shape.dim) - tmp_tensor = tensor.from_numpy( - np.random.randn(*x_shape).astype(np.float32)) + tensor_map = {} + # due to https://github.com/onnx/onnx/issues/2417 + # sometimes, input contains all initializer's info + # sometimes, may not + all_inputs = OrderedDict() + for t in graph.input: + all_inputs[t.name] = t + # so we refresh the input by the initializer + for t in graph.initializer: + all_inputs[t.name] = t + initializers = {t.name for t in graph.initializer} 
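+        # e.g. for a typical Conv graph (names here are only illustrative),
+        # graph.input may list both the data input "X" and the weight "W",
+        # while only "W" appears in graph.initializer; "X" then has no entry
+        # in initializers and is filled from init_inputs (or randomly) below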
+        inp_idx = 0
+        for name, x in all_inputs.items():
+            if name in initializers:
+                # if it has an initializer, we use its value as the input
+                np_tensor = numpy_helper.to_array(x)
+                if np_tensor.dtype == "int64":
+                    np_tensor = np_tensor.astype(np.int32)
+                # todo, we cannot support scalar tensors
+                if np.ndim(np_tensor) == 0:
+                    np_tensor = np.array(np_tensor, ndmin=1)
+            else:
+                # if not, it means it's an input rather than an inner weight
+                # so if the user gives values, we use those values
+                # if not, we just use the input shape given by onnx to init a random value
+                # HOWEVER, the random value may not be correct for some inputs, such as Gather, which needs indices
+                # so for such operators, the user must give the inputs
+                x_shape = tuple(
+                    dim.dim_value for dim in x.type.tensor_type.shape.dim)
+                if init_inputs is not None:
+                    np_tensor = init_inputs[inp_idx]
+                    inp_idx += 1
+                else:
+                    np_tensor = np.random.randn(*x_shape).astype(np.float32)
+            tmp_tensor = tensor.from_numpy(np_tensor)
             tmp_tensor.to_device(device)
+            # todo, for backward
+            tmp_tensor.stores_grad = (name in initializers)
             tensor_map[x.name] = tmp_tensor
+        return tensor_map
 
     @classmethod
-    def _onnx_model_to_singa_net(cls, onnx_model, device, opset_version):
+    def _onnx_model_to_singa_net(cls, model, init_inputs, device,
+                                 opset_version):
         """
         get all intermediate tensors and operators from onnx model
         Args:
-            onnx_model: a given onnx model
+            model: a given onnx model
+        Args:
+            init_inputs: a list of inputs, which are used to init the operators
         Args:
             device: the used device
         Args:
@@ -1368,40 +1952,37 @@ def _onnx_model_to_singa_net(cls, onnx_model, device, opset_version):
         Returns:
             a list of SingaOps('name', 'op', 'handle', 'forward')
         """
-        # runs model checker, optimizer, shape inference engine
-        optimized_model = onnx.utils.polish_model(onnx_model)
-        # print('The model is:\n{}'.format(optimized_model))
-        # this tensor_nap contains all tensors, including outputs of each op
-        tensor_map = {}
-        # this weights only contains the tensors which have stored the gradients
-        weights = {}
+        # init all tensor inputs and weights as a tensor map
+        tensor_map = cls._init_graph_parameter(model.graph, init_inputs, device)
+        # only the weight tensors
+        weights = {x.name: tensor_map[x.name] for x in model.graph.initializer}
+        # the parsed operators queue
         singa_ops = []
-        singa_op = collections.namedtuple('SingaOps',
-                                          ['name', 'op', 'handle', 'forward'])
-        # init the input, output, and intermidate nodes as singa tensors
-        cls._onnx_node_to_singa_tensor(optimized_model.graph.input, tensor_map,
-                                       device)
-        cls._onnx_node_to_singa_tensor(optimized_model.graph.output, tensor_map,
-                                       device)
-        cls._onnx_node_to_singa_tensor(optimized_model.graph.value_info,
-                                       tensor_map, device)
-        # convert constant nodes to tensor, other nodes to handler
-        for node in optimized_model.graph.node:
+        singa_op = namedtuple('SingaOps', ['name', 'op', 'handle', 'forward'])
+        for node in model.graph.node:
             node = OnnxNode(node)
-            if node.op_type == "Constant":
-                requires_grad, stores_grad = False, False
-                tmp_tensor = tensor.Tensor(
-                    device=device,
-                    data=numpy_helper.to_array(node.attrs['value']),
-                    requires_grad=requires_grad,
-                    stores_grad=stores_grad,
-                )
-                tensor_map[node.name] = tmp_tensor
-                weights[node.name] = tmp_tensor
+            # only give the inputs it needs
+            # consumed_inputs are the inputs marked as attributes
+            # so we remove them here
+            inputs = [
+                tensor_map[x]
+                for x in node.inputs
+                if x not in node.consumed_inputs
+            ]
+            handle, forward = cls._onnx_node_to_singa_op(
+                node, inputs, 
opset_version)
+            # if it is Constant, we handle it as a weight
+            # otherwise, we run it and add its outputs to the map for use by later operators
+            if node.op_type == 'Constant':
+                tmp_tensor = tensor.from_numpy(forward)
+                tmp_tensor.to_device(device)
+                tmp_name = node.outputs.pop(0)
+                weights[tmp_name] = tmp_tensor
+                tensor_map[tmp_name] = tmp_tensor
             else:
-                inputs = [tensor_map[x].clone() for x in node.inputs]
-                handle, forward = cls._onnx_node_to_singa_op(
-                    node, inputs, opset_version)
+                outputs = cls._run_node(node, inputs, handle, forward)
+                for key, val in outputs.items():
+                    tensor_map[key] = val
                 singa_ops.extend([singa_op(node.name, node, handle, forward)])
         return weights, singa_ops
 
@@ -1410,17 +1991,28 @@ def prepare(cls, model, device, **kwargs):
         """
         get the batch norm operator from onnx node
         Args:
-            onnx_node: a given onnx node
-        Args:
-            tensor_map: the input tensor
+            model: a given onnx model
         Args:
             device: the used device
-        Args:
-            opset_version: the opset version
         Returns:
             a list of output values
        """
         super(SingaBackend, cls).prepare(model, device, **kwargs)
+        # when parsing the graph, we use the input shape given by onnx to init a random value
+        # HOWEVER, the random value may not be correct for some inputs, such as Gather, which needs indices
+        # so for such operators, the user must give the inputs
+        init_inputs = kwargs.get("init_inputs", None)
+        # whether initializers are moved into inputs, due to https://github.com/onnx/onnx/issues/2417
+        # sometimes the input contains all the initializers' info, sometimes not
+        cls.keep_initializers_as_inputs = kwargs.get(
+            'keep_initializers_as_inputs', True)
+        # optimize and infer the shape of the model
+        try:
+            model = onnx.utils.polish_model(model)
+        except IndexError as err:
+            # due to https://github.com/onnx/onnx/issues/2417
+            model = onnx.shape_inference.infer_shapes(model)
 
         # check the opset version and ir version
         opset_version = None
         for imp in model.opset_import:
@@ -1439,14 +2031,19 @@ def prepare(cls, model, device, **kwargs):
                 )
             else:
                 opset_version = 1
-        tensor_map, singa_ops = cls._onnx_model_to_singa_net(
-            model, device, opset_version)
-        return SingaRep(model, tensor_map, singa_ops)
+        weights, singa_ops = cls._onnx_model_to_singa_net(
+            model, init_inputs, device, opset_version)
+        return SingaRep(model, weights, singa_ops,
+                        cls.keep_initializers_as_inputs)
 
 
 class SingaRep(BackendRep):
 
-    def __init__(self, model, tensor_map, singa_ops):
+    def __init__(self,
+                 model,
+                 weights,
+                 singa_ops,
+                 keep_initializers_as_inputs=True):
         """
         SingaRep provides the intermediate representation of Singa,
         the user can run the forward of the singa model by run func,
@@ -1455,13 +2052,14 @@ def __init__(self, model, tensor_map, singa_ops):
         Args:
             model: a given operator
         Args:
-            tensor_map: the tensor of the operator
+            weights: the tensor of weights
         Args:
             singa_ops: the tensor of the operator
         """
         super(SingaRep, self).__init__()
         self.model = model
-        self.tensor_map = tensor_map
+        self.tensor_map = weights
+        self.keep_initializers_as_inputs = keep_initializers_as_inputs
         # this each item of singa_ops is: ('name', 'op', 'handle', 'forward')
         # the name is a string, op is OnnxNode,
         # handle is Singa handle to store the tensor into singa operator
@@ -1476,27 +2074,49 @@ def run(self, inputs, **kwargs):
         Returns:
             the onnx node
         """
+        graph = self.model.graph
         # last_layers means we run this model until the last #N layers
         last_layers = kwargs.get('last_layers', len(self.singa_ops))
+        if last_layers != len(self.singa_ops):
+            final_outputs = 
self.singa_ops[last_layers-1].op.outputs + else: + final_outputs = [outp.name for outp in graph.output] # whether return all outputs all_outputs = kwargs.get('all_outputs', False) # get a specific op by its name op_name = kwargs.get('op_name', None) + # record the tensor we added from input + tmp_tensor_map = {name: val for name, val in self.tensor_map.items()} # the dict will be returned - ret_outputs = collections.OrderedDict() - if len(self.model.graph.input) != len(inputs): - raise RuntimeError( - "The length of graph input is different from the tensor input: %d, %d" - % (len(self.model.graph.input), len(inputs))) + ret_outputs = OrderedDict() + if self.keep_initializers_as_inputs: + require_input_len = len(graph.input) - len(graph.initializer) + actual_input_len = len(inputs) + else: + require_input_len = len(graph.input) + actual_input_len = len(inputs) + assert require_input_len == actual_input_len, "The length of graph input is different from the tensor input: %d, %d" % ( + require_input_len, actual_input_len) # run the handle by the order of the list(the list is Topological Sorting) - for x, val in zip(self.model.graph.input, inputs): - self.tensor_map[x.name] = val + for inp in graph.input: + if inp.name not in tmp_tensor_map: + tmp_tensor_map[inp.name] = inputs.pop(0) + for _, op, handle, forward in self.singa_ops[:last_layers]: - inputs = [self.tensor_map[x] for x in op.inputs] + if len(op.consumed_inputs) != 0: + # because if op has consumed_inputs, it means it moved some inputs into attributes + # so when running, we should update these attributes + handle, forward = get_op(op, + [tmp_tensor_map[x] for x in op.inputs]) + inputs = [ + tmp_tensor_map[x] + for x in op.inputs + if x not in op.consumed_inputs + ] outputs = _run_node(op, inputs, handle, forward) for key, val in outputs.items(): - self.tensor_map[key] = val + tmp_tensor_map[key] = val ret_outputs[key] = val if op_name is not None: @@ -1512,12 +2132,13 @@ def run(self, inputs, **kwargs): if all_outputs: return ret_outputs else: - return list(outputs.values()) + return [ret_outputs[outp] for outp in final_outputs] run_node = SingaBackend.run_node _run_node = SingaBackend._run_node prepare = SingaBackend.prepare +get_op = SingaBackend._onnx_node_to_singa_op to_onnx = SingaFrontend.singa_to_onnx_model save = onnx.save load = onnx.load diff --git a/python/singa/utils.py b/python/singa/utils.py index 8c38f6c8f0..78c9f2c05e 100644 --- a/python/singa/utils.py +++ b/python/singa/utils.py @@ -18,10 +18,13 @@ import sys import math import numpy as np +import collections from singa import tensor from . import singa_wrap as singa +OrderedDict = collections.OrderedDict + def update_progress(progress, info): """Display progress bar and user info. 
@@ -231,3 +234,40 @@ def force_unicode(s):
         return s.decode('utf-8')
     except AttributeError:
         return s
+
+
+def post_order_recursive(root, root_t):
+    """
+    traverse the graph in topological ordering (post-order depth-first search)
+    Args:
+        root: singa operator
+    Args:
+        root_t: tensor
+    Returns:
+        nodes, weights and inputs, each as an OrderedDict keyed by name
+    """
+
+    def recursive(root, yid, root_t, nodes, weights, inputs):
+        if root:
+            # srcop: operator for an input of root
+            # yid: id(output of this operator)
+            # y: output of this operator
+            for srcop, yid, y, _ in root.src:
+                recursive(srcop, yid, y, nodes, weights, inputs)
+
+            if type(root).__name__ == 'Dummy':
+                if root_t is not None:
+                    # constant within a node: weight
+                    weights[root.name] = root_t
+                else:
+                    # constant outside a node: input
+                    inputs[root.name] = root_t
+            else:
+                nodes[root.name] = root
+
+    nodes = OrderedDict()
+    weights = OrderedDict()
+    inputs = OrderedDict()
+
+    recursive(root, None, root_t, nodes, weights, inputs)
+    return nodes, weights, inputs
diff --git a/test/python/test_onnx.py b/test/python/test_onnx.py
index 59d8440d45..18afa0b70e 100644
--- a/test/python/test_onnx.py
+++ b/test/python/test_onnx.py
@@ -37,8 +37,19 @@
 autograd.training = True
 
 
+def _tuple_to_string(t):
+    lt = [str(x) for x in t]
+    return '(' + ', '.join(lt) + ')'
+
+
 class TestPythonOnnx(unittest.TestCase):
 
+    def check_shape(self, actual, expect):
+        self.assertEqual(
+            actual, expect, 'shape mismatch, actual shape is %s'
+            ' expected is %s' %
+            (_tuple_to_string(actual), _tuple_to_string(expect)))
+
     def test_conv2d(self):
         x = tensor.Tensor(shape=(2, 3, 3, 3), device=gpu_dev)
         x.gaussian(0.0, 1.0)
@@ -235,7 +246,7 @@ def test_batch_norm(self):
 
         # backend
         sg_ir = sonnx.prepare(model, device=gpu_dev)
-        y_t = sg_ir.run([x, s, bias, mean, var])
+        y_t = sg_ir.run([x, s, bias])  # mean and var have been stored in the graph
 
         np.testing.assert_array_almost_equal(tensor.to_numpy(y),
                                              tensor.to_numpy(y_t[0]),
@@ -299,7 +310,7 @@ def test_reshape(self):
 
         # backend
         sg_ir = sonnx.prepare(model, device=gpu_dev)
-        y_t = sg_ir.run([x, (2, 3)])
+        y_t = sg_ir.run([x])  # the shape has been stored in the graph
 
         np.testing.assert_array_almost_equal(tensor.to_numpy(y),
                                              tensor.to_numpy(y_t[0]),
@@ -889,7 +900,7 @@ def test_clip(self):
 
         # backend
         sg_ir = sonnx.prepare(model, device=gpu_dev)
-        y_t = sg_ir.run([x, min, max])
+        y_t = sg_ir.run([x])  # min and max have been stored in the model
 
         np.testing.assert_array_almost_equal(tensor.to_numpy(y),
                                              tensor.to_numpy(y_t[0]),
@@ -1165,6 +1176,230 @@ def test_reciprocal(self):
                                              tensor.to_numpy(y_t[0]),
                                              decimal=5)
 
+    def test_constantOfShape(self):
+        X = np.array([4, 3, 2]).astype(np.int64)
+        x = tensor.from_numpy(X)
+        x.to_device(cpu_dev)
+
+        y = autograd.constant_of_shape(x, 1.) 
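+        # constant_of_shape reads the target shape (4, 3, 2) from x and fills
+        # a new tensor of that shape with the scalar value 1.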
+ # frontend + model = sonnx.to_onnx([x], [y]) + # print('The model is:\n{}'.format(model)) + + # backend + sg_ir = sonnx.prepare(model, device=gpu_dev, init_inputs=[X]) + y_t = sg_ir.run([x]) + + np.testing.assert_array_almost_equal(tensor.to_numpy(y), + tensor.to_numpy(y_t[0]), + decimal=5) + + def test_dropout(self): + X = np.random.randn(3, 4, 5).astype(np.float32) + + x = tensor.from_numpy(X) + x.to_device(gpu_dev) + y = autograd.dropout(x, 0.5) + + # frontend + model = sonnx.to_onnx([x], [y]) + # print('The model is:\n{}'.format(model)) + + # backend + sg_ir = sonnx.prepare(model, device=gpu_dev) + y_t = sg_ir.run([x]) + + self.check_shape(tensor.to_numpy(y).shape, tensor.to_numpy(y_t[0]).shape) + + def test_reduceSum(self): + X = np.random.randn(3, 4, 5).astype(np.float32) + + x = tensor.from_numpy(X) + x.to_device(gpu_dev) + y = autograd.reduce_sum(x, None, 1) + + # frontend + model = sonnx.to_onnx([x], [y]) + # print('The model is:\n{}'.format(model)) + + # backend + sg_ir = sonnx.prepare(model, device=gpu_dev) + y_t = sg_ir.run([x]) + + np.testing.assert_array_almost_equal(tensor.to_numpy(y).shape, tensor.to_numpy(y_t[0]).shape) + + def test_reduceMean(self): + X = np.random.randn(3, 4, 5).astype(np.float32) + + x = tensor.from_numpy(X) + x.to_device(gpu_dev) + y = autograd.reduce_mean(x, None, 1) + + # frontend + model = sonnx.to_onnx([x], [y]) + # print('The model is:\n{}'.format(model)) + + # backend + sg_ir = sonnx.prepare(model, device=gpu_dev) + y_t = sg_ir.run([x]) + + np.testing.assert_array_almost_equal(tensor.to_numpy(y).shape, tensor.to_numpy(y_t[0]).shape) + + def test_squeeze(self): + X = np.random.randn(3, 1, 2, 1, 1) + + x = tensor.from_numpy(X) + x.to_device(gpu_dev) + y = autograd.squeeze(x, [1, 3, 4]) + + # frontend + model = sonnx.to_onnx([x], [y]) + # print('The model is:\n{}'.format(model)) + + # backend + sg_ir = sonnx.prepare(model, device=gpu_dev) + y_t = sg_ir.run([x]) + + np.testing.assert_array_almost_equal(tensor.to_numpy(y).shape, tensor.to_numpy(y_t[0]).shape) + + def test_unsqueeze(self): + X = np.random.randn(3, 2) + + x = tensor.from_numpy(X) + x.to_device(gpu_dev) + y = autograd.unsqueeze(x, [2, 4, 5]) + + # frontend + model = sonnx.to_onnx([x], [y]) + # print('The model is:\n{}'.format(model)) + + # backend + sg_ir = sonnx.prepare(model, device=gpu_dev) + y_t = sg_ir.run([x]) + + np.testing.assert_array_almost_equal(tensor.to_numpy(y).shape, tensor.to_numpy(y_t[0]).shape) + + def test_slice(self): + X = np.random.randn(20, 10, 5).astype(np.float32) + starts, ends, axes, steps = [0, 0], [3, 10], [0, 1], [1, 1] + x = tensor.from_numpy(X) + x.to_device(gpu_dev) + y = autograd.slice(x, starts, ends, axes, steps) + + # frontend + model = sonnx.to_onnx([x], [y]) + # print('The model is:\n{}'.format(model)) + + # backend + sg_ir = sonnx.prepare(model, device=gpu_dev) + y_t = sg_ir.run([x]) + + np.testing.assert_array_almost_equal(tensor.to_numpy(y).shape, tensor.to_numpy(y_t[0]).shape) + + # todo, we don't support muli outputs + # def test_split(self): + # X = np.array([1., 2., 3., 4., 5., 6.]).astype(np.float32) + # x = tensor.from_numpy(X) + # x.to_device(gpu_dev) + # y = autograd.split(x, 0, (2, 4)) + + # # frontend + # model = sonnx.to_onnx([x], [*y]) + # # print('The model is:\n{}'.format(model)) + + # # backend + # sg_ir = sonnx.prepare(model, device=gpu_dev) + # y_t = sg_ir.run([x])[0] + + # np.testing.assert_array_almost_equal(tensor.to_numpy(y).shape, tensor.to_numpy(y_t).shape) + + def test_gather(self): + X = np.array([0, 1, 
2]).astype(np.float32) + x = tensor.from_numpy(X) + x.to_device(gpu_dev) + y = autograd.gather(x, 0, [0, 1, 3]) + + # frontend + model = sonnx.to_onnx([x], [y]) + # print('The model is:\n{}'.format(model)) + + # backend + sg_ir = sonnx.prepare(model, device=gpu_dev) + y_t = sg_ir.run([x]) + + np.testing.assert_array_almost_equal(tensor.to_numpy(y).shape, tensor.to_numpy(y_t[0]).shape) + + def test_tile(self): + X = np.array([0, 1, 2]).astype(np.float32) + x = tensor.from_numpy(X) + x.to_device(gpu_dev) + y = autograd.tile(x, [2, 2]) + + # frontend + model = sonnx.to_onnx([x], [y]) + # print('The model is:\n{}'.format(model)) + + # backend + sg_ir = sonnx.prepare(model, device=gpu_dev) + y_t = sg_ir.run([x]) + + np.testing.assert_array_almost_equal(tensor.to_numpy(y).shape, tensor.to_numpy(y_t[0]).shape) + + def test_nonzero(self): + X = np.array([[1, 0], [1, 1]]).astype(np.float32) + x = tensor.from_numpy(X) + x.to_device(gpu_dev) + y = autograd.nonzero(x) + + # frontend + model = sonnx.to_onnx([x], [y]) + # print('The model is:\n{}'.format(model)) + + # backend + sg_ir = sonnx.prepare(model, device=gpu_dev) + y_t = sg_ir.run([x]) + + np.testing.assert_array_almost_equal(tensor.to_numpy(y).shape, tensor.to_numpy(y_t[0]).shape) + + def test_cast(self): + X = np.array([[1, 0], [1, 1]]).astype(np.float32) + x = tensor.from_numpy(X) + x.to_device(gpu_dev) + y = autograd.cast(x, tensor.int32) + + # frontend + model = sonnx.to_onnx([x], [y]) + # print('The model is:\n{}'.format(model)) + + # backend + sg_ir = sonnx.prepare(model, device=gpu_dev) + y_t = sg_ir.run([x]) + + np.testing.assert_array_almost_equal(tensor.to_numpy(y).shape, tensor.to_numpy(y_t[0]).shape) + + def test_onehot(self): + axisValue = 1 + on_value = 3 + off_value = 1 + output_type = np.float32 + indices = np.array([[1, 9], [2, 4]], dtype=np.float32) + depth = np.array([10], dtype=np.float32) + values = np.array([off_value, on_value], dtype=output_type) + + x = tensor.from_numpy(indices) + x.to_device(gpu_dev) + y = autograd.onehot(axisValue, x, depth, values) + + # frontend + model = sonnx.to_onnx([x], [y]) + # print('The model is:\n{}'.format(model)) + + # backend + sg_ir = sonnx.prepare(model, device=gpu_dev) + y_t = sg_ir.run([x]) + + self.check_shape(tensor.to_numpy(y).shape, tensor.to_numpy(y_t[0]).shape) + def test_inference(self): x = tensor.Tensor(shape=(2, 3, 3, 3), device=gpu_dev) x.gaussian(0.0, 1.0) @@ -1252,24 +1487,6 @@ def test_transfer_learning(self): sgd.update(p, gp) sgd.step() - def test_globalaveragepool(self): - X = np.array([[[ - [1, 2, 3], - [4, 5, 6], - [7, 8, 9], - ]]]).astype(np.float32) - - x = tensor.from_numpy(X) - x.to_device(gpu_dev) - y = autograd.globalaveragepool(x) - - # frontend - model = sonnx.to_onnx([x], [y]) - # backend - sg_ir = sonnx.prepare(model, device=gpu_dev) - y_t = sg_ir.run([x]) - - np.testing.assert_array_almost_equal(tensor.to_numpy(y), tensor.to_numpy(y_t[0]), decimal=5) if __name__ == '__main__': unittest.main() diff --git a/test/python/test_onnx_backend.py b/test/python/test_onnx_backend.py index 3d7427bf55..a1dc1f0331 100644 --- a/test/python/test_onnx_backend.py +++ b/test/python/test_onnx_backend.py @@ -37,29 +37,31 @@ autograd.training = True -_default_opset_version = 10 +_default_opset_version = 11 -def expect(node, inputs, outputs, name, opset_version=_default_opset_version): +def expect(node, + inputs, + outputs, + name, + opset_version=_default_opset_version, + decimal=5): onnx_node = sonnx.OnnxNode(node) input_tensors = {} input_labels = [x for x in 
onnx_node.inputs if x != ""] # prepare input tensors for key, val in zip(input_labels, inputs): - if node.op_type=="Clip" and key in ("min", "max"): - input_tensors[key] = val.item() - else: - # very important! must be float - if not isinstance(val, np.ndarray) or len(val.shape) == 0: - val = np.array([val]) - x = tensor.from_numpy(val.astype(np.float32)) - x.to_device(gpu_dev) - input_tensors[key] = x + # very important! must be float + if not isinstance(val, np.ndarray) or len(val.shape) == 0: + val = np.array([val]) + x = tensor.from_numpy(val.astype(np.float32)) + x.to_device(gpu_dev) + input_tensors[key] = x outputs_dict = sonnx.run_node(onnx_node, input_tensors, opset_version) for out1, out2 in zip(outputs, outputs_dict.values()): np.testing.assert_array_almost_equal(out1, tensor.to_numpy(out2), - decimal=5) + decimal=decimal) class TestPythonOnnxBackend(unittest.TestCase): @@ -886,7 +888,7 @@ def test_Tan(self): # type: () -> None x = np.random.randn(3, 4, 5).astype(np.float32) y = np.tan(x) - expect(node, inputs=[x], outputs=[y], name='test_tan') + expect(node, inputs=[x], outputs=[y], name='test_tan', decimal=3) def test_Tanh(self): # type: () -> None node = onnx.helper.make_node( @@ -1851,13 +1853,17 @@ def test_pow(self): x = np.array([1, 2, 3]).astype(np.float32) y = np.array([4, 5, 6]).astype(np.float32) # todo, not exactly same z = np.power(x, y) # expected output [1., 32., 729.] - expect(node, inputs=[x, y], outputs=[z], name='test_pow_example') + expect(node, + inputs=[x, y], + outputs=[z], + name='test_pow_example', + decimal=3) x = np.arange(24).reshape(2, 3, 4).astype( np.float32) # todo, cannot too big here y = np.random.randn(2, 3, 4).astype(np.float32) z = np.power(x, y) - expect(node, inputs=[x, y], outputs=[z], name='test_pow') + expect(node, inputs=[x, y], outputs=[z], name='test_pow', decimal=3) def test_pow_broadcast(self): # type: () -> None node = onnx.helper.make_node( @@ -1869,7 +1875,11 @@ def test_pow_broadcast(self): # type: () -> None x = np.array([1, 2, 3]).astype(np.float32) y = np.array(2).astype(np.float32) z = np.power(x, y) # expected output [1., 4., 9.] 
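+        # singa's pow only matches np.power to a few decimals here (see the
+        # todo in test_pow above), so the tolerance is relaxed to decimal=3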
- expect(node, inputs=[x, y], outputs=[z], name='test_pow_bcast_scalar') + expect(node, + inputs=[x, y], + outputs=[z], + name='test_pow_bcast_scalar', + decimal=3) node = onnx.helper.make_node( 'Pow', @@ -1880,7 +1890,11 @@ def test_pow_broadcast(self): # type: () -> None y = np.array([1, 2, 3]).astype(np.float32) # expected output [[1, 4, 27], [4, 25, 216]] z = np.power(x, y).astype(np.float32) - expect(node, inputs=[x, y], outputs=[z], name='test_pow_bcast_array') + expect(node, + inputs=[x, y], + outputs=[z], + name='test_pow_bcast_array', + decimal=3) def test_clip(self): node = onnx.helper.make_node( @@ -2031,160 +2045,974 @@ def test_mul_broadcast(self): # type: () -> None y = np.random.randn(5).astype(np.float32) z = x * y expect(node, inputs=[x, y], outputs=[z], name='test_mul_bcast') - + def test_gemm_default_zero_bias(self): - node = onnx.helper.make_node( - 'Gemm', - inputs=['a', 'b', 'c'], - outputs=['y'] - ) + node = onnx.helper.make_node('Gemm', + inputs=['a', 'b', 'c'], + outputs=['y']) a = np.random.ranf([3, 5]).astype(np.float32) b = np.random.ranf([5, 4]).astype(np.float32) c = np.zeros([1, 4]).astype(np.float32) y = gemm_reference_implementation(a, b, c) - expect(node, inputs=[a, b, c], outputs=[y], - name='test_gemm_default_zero_bias') + expect(node, + inputs=[a, b, c], + outputs=[y], + name='test_gemm_default_zero_bias') def test_gemm_default_no_bias(self): - node = onnx.helper.make_node( - 'Gemm', - inputs=['a', 'b'], - outputs=['y'] - ) + node = onnx.helper.make_node('Gemm', inputs=['a', 'b'], outputs=['y']) a = np.random.ranf([2, 10]).astype(np.float32) b = np.random.ranf([10, 3]).astype(np.float32) y = gemm_reference_implementation(a, b) - expect(node, inputs=[a, b], outputs=[y], - name='test_gemm_default_no_bias') + expect(node, + inputs=[a, b], + outputs=[y], + name='test_gemm_default_no_bias') def test_gemm_default_scalar_bias(self): - node = onnx.helper.make_node( - 'Gemm', - inputs=['a', 'b', 'c'], - outputs=['y'] - ) + node = onnx.helper.make_node('Gemm', + inputs=['a', 'b', 'c'], + outputs=['y']) a = np.random.ranf([2, 3]).astype(np.float32) b = np.random.ranf([3, 4]).astype(np.float32) c = np.array(3.14).astype(np.float32) y = gemm_reference_implementation(a, b, c) - expect(node, inputs=[a, b, c], outputs=[y], - name='test_gemm_default_scalar_bias') + expect(node, + inputs=[a, b, c], + outputs=[y], + name='test_gemm_default_scalar_bias') def test_gemm_default_single_elem_vector_bias(self): - node = onnx.helper.make_node( - 'Gemm', - inputs=['a', 'b', 'c'], - outputs=['y'] - ) + node = onnx.helper.make_node('Gemm', + inputs=['a', 'b', 'c'], + outputs=['y']) a = np.random.ranf([3, 7]).astype(np.float32) b = np.random.ranf([7, 3]).astype(np.float32) c = np.random.ranf([1]).astype(np.float32) y = gemm_reference_implementation(a, b, c) - expect(node, inputs=[a, b, c], outputs=[y], - name='test_gemm_default_single_elem_vector_bias') + expect(node, + inputs=[a, b, c], + outputs=[y], + name='test_gemm_default_single_elem_vector_bias') def test_gemm_default_vector_bias(self): - node = onnx.helper.make_node( - 'Gemm', - inputs=['a', 'b', 'c'], - outputs=['y'] - ) + node = onnx.helper.make_node('Gemm', + inputs=['a', 'b', 'c'], + outputs=['y']) a = np.random.ranf([2, 7]).astype(np.float32) b = np.random.ranf([7, 4]).astype(np.float32) c = np.random.ranf([1, 4]).astype(np.float32) y = gemm_reference_implementation(a, b, c) - expect(node, inputs=[a, b, c], outputs=[y], - name='test_gemm_default_vector_bias') + expect(node, + inputs=[a, b, c], + outputs=[y], + 
name='test_gemm_default_vector_bias') def test_gemm_default_matrix_bias(self): - node = onnx.helper.make_node( - 'Gemm', - inputs=['a', 'b', 'c'], - outputs=['y'] - ) + node = onnx.helper.make_node('Gemm', + inputs=['a', 'b', 'c'], + outputs=['y']) a = np.random.ranf([3, 6]).astype(np.float32) b = np.random.ranf([6, 4]).astype(np.float32) c = np.random.ranf([3, 4]).astype(np.float32) y = gemm_reference_implementation(a, b, c) - expect(node, inputs=[a, b, c], outputs=[y], - name='test_gemm_default_matrix_bias') + expect(node, + inputs=[a, b, c], + outputs=[y], + name='test_gemm_default_matrix_bias') def test_gemm_transposeA(self): - node = onnx.helper.make_node( - 'Gemm', - inputs=['a', 'b', 'c'], - outputs=['y'], - transA=1 - ) + node = onnx.helper.make_node('Gemm', + inputs=['a', 'b', 'c'], + outputs=['y'], + transA=1) a = np.random.ranf([6, 3]).astype(np.float32) b = np.random.ranf([6, 4]).astype(np.float32) c = np.zeros([1, 4]).astype(np.float32) y = gemm_reference_implementation(a, b, c, transA=1) - expect(node, inputs=[a, b, c], outputs=[y], - name='test_gemm_transposeA') + expect(node, inputs=[a, b, c], outputs=[y], name='test_gemm_transposeA') def test_gemm_transposeB(self): - node = onnx.helper.make_node( - 'Gemm', - inputs=['a', 'b', 'c'], - outputs=['y'], - transB=1 - ) + node = onnx.helper.make_node('Gemm', + inputs=['a', 'b', 'c'], + outputs=['y'], + transB=1) a = np.random.ranf([3, 6]).astype(np.float32) b = np.random.ranf([4, 6]).astype(np.float32) c = np.zeros([1, 4]).astype(np.float32) y = gemm_reference_implementation(a, b, c, transB=1) - expect(node, inputs=[a, b, c], outputs=[y], - name='test_gemm_transposeB') + expect(node, inputs=[a, b, c], outputs=[y], name='test_gemm_transposeB') def test_gemm_alpha(self): - node = onnx.helper.make_node( - 'Gemm', - inputs=['a', 'b', 'c'], - outputs=['y'], - alpha=0.5 - ) + node = onnx.helper.make_node('Gemm', + inputs=['a', 'b', 'c'], + outputs=['y'], + alpha=0.5) a = np.random.ranf([3, 5]).astype(np.float32) b = np.random.ranf([5, 4]).astype(np.float32) c = np.zeros([1, 4]).astype(np.float32) y = gemm_reference_implementation(a, b, c, alpha=0.5) - expect(node, inputs=[a, b, c], outputs=[y], - name='test_gemm_alpha') + expect(node, inputs=[a, b, c], outputs=[y], name='test_gemm_alpha') def test_gemm_beta(self): - node = onnx.helper.make_node( - 'Gemm', - inputs=['a', 'b', 'c'], - outputs=['y'], - beta=0.5 - ) + node = onnx.helper.make_node('Gemm', + inputs=['a', 'b', 'c'], + outputs=['y'], + beta=0.5) a = np.random.ranf([2, 7]).astype(np.float32) b = np.random.ranf([7, 4]).astype(np.float32) c = np.random.ranf([1, 4]).astype(np.float32) y = gemm_reference_implementation(a, b, c, beta=0.5) - expect(node, inputs=[a, b, c], outputs=[y], - name='test_gemm_beta') + expect(node, inputs=[a, b, c], outputs=[y], name='test_gemm_beta') def test_gemm_all_attributes(self): - node = onnx.helper.make_node( - 'Gemm', - inputs=['a', 'b', 'c'], - outputs=['y'], - alpha=0.25, - beta=0.35, - transA=1, - transB=1 - ) + node = onnx.helper.make_node('Gemm', + inputs=['a', 'b', 'c'], + outputs=['y'], + alpha=0.25, + beta=0.35, + transA=1, + transB=1) a = np.random.ranf([4, 3]).astype(np.float32) b = np.random.ranf([5, 4]).astype(np.float32) c = np.random.ranf([1, 5]).astype(np.float32) - y = gemm_reference_implementation(a, b, c, transA=1, transB=1, alpha=0.25, beta=0.35) - expect(node, inputs=[a, b, c], outputs=[y], - name='test_gemm_all_attributes') + y = gemm_reference_implementation(a, + b, + c, + transA=1, + transB=1, + alpha=0.25, + beta=0.35) + 
expect(node, + inputs=[a, b, c], + outputs=[y], + name='test_gemm_all_attributes') + + def test_constantOfShape_float_ones(self): + x = np.array([4, 3, 2]).astype(np.int64) + tensor_value = onnx.helper.make_tensor("value", onnx.TensorProto.FLOAT, + [1], [1]) + node = onnx.helper.make_node( + 'ConstantOfShape', + inputs=['x'], + outputs=['y'], + value=tensor_value, + ) + + y = np.ones(x, dtype=np.float32) + expect(node, + inputs=[x], + outputs=[y], + name='test_constantofshape_float_ones') + + def test_constantOfShape_int32_zeros(self): + x = np.array([10, 6]).astype(np.int64) + tensor_value = onnx.helper.make_tensor("value", onnx.TensorProto.INT32, + [1], [0]) + node = onnx.helper.make_node( + 'ConstantOfShape', + inputs=['x'], + outputs=['y'], + value=tensor_value, + ) + y = np.zeros(x, dtype=np.int32) + expect(node, + inputs=[x], + outputs=[y], + name='test_constantofshape_int_zeros') + + # cannot support yet + # def test_int32_shape_zero(self): + # x = np.array([0, ]).astype(np.int64) + # tensor_value = onnx.helper.make_tensor("value", onnx.TensorProto.INT32, + # [1], [0]) + # node = onnx.helper.make_node( + # 'ConstantOfShape', + # inputs=['x'], + # outputs=['y'], + # value=tensor_value, + # ) + # y = np.zeros(x, dtype=np.int32) + # expect(node, inputs=[x], outputs=[y], + # name='test_constantofshape_int_shape_zero') + + def test_reduce_sum_do_not_keepdims(self): + shape = [3, 2, 2] + axes = [1] + keepdims = 0 + + node = onnx.helper.make_node('ReduceSum', + inputs=['data'], + outputs=['reduced'], + axes=axes, + keepdims=keepdims) + + data = np.array( + [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]], + dtype=np.float32) + reduced = np.sum(data, axis=tuple(axes), keepdims=keepdims == 1) + #print(reduced) + #[[4., 6.] + # [12., 14.] + # [20., 22.]] + + expect(node, + inputs=[data], + outputs=[reduced], + name='test_reduce_sum_do_not_keepdims_example') + + np.random.seed(0) + data = np.random.uniform(-10, 10, shape).astype(np.float32) + reduced = np.sum(data, axis=tuple(axes), keepdims=keepdims == 1) + + expect(node, + inputs=[data], + outputs=[reduced], + name='test_reduce_sum_do_not_keepdims_random') + + def test_reduce_sum_keepdims(self): + shape = [3, 2, 2] + axes = [1] + keepdims = 1 + + node = onnx.helper.make_node('ReduceSum', + inputs=['data'], + outputs=['reduced'], + axes=axes, + keepdims=keepdims) + + data = np.array( + [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]], + dtype=np.float32) + reduced = np.sum(data, axis=tuple(axes), keepdims=keepdims == 1) + #print(reduced) + #[[[4., 6.]] + # [[12., 14.]] + # [[20., 22.]]] + + expect(node, + inputs=[data], + outputs=[reduced], + name='test_reduce_sum_keepdims_example') + + np.random.seed(0) + data = np.random.uniform(-10, 10, shape).astype(np.float32) + reduced = np.sum(data, axis=tuple(axes), keepdims=keepdims == 1) + + expect(node, + inputs=[data], + outputs=[reduced], + name='test_reduce_sum_keepdims_random') + + def test_reduce_sum_default_axes_keepdims(self): + shape = [3, 2, 2] + axes = None + keepdims = 1 + + node = onnx.helper.make_node('ReduceSum', + inputs=['data'], + outputs=['reduced'], + keepdims=keepdims) + + data = np.array( + [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]], + dtype=np.float32) + reduced = np.sum(data, axis=axes, keepdims=keepdims == 1) + #print(reduced) + #[[[78.]]] + + expect(node, + inputs=[data], + outputs=[reduced], + name='test_reduce_sum_default_axes_keepdims_example') + + np.random.seed(0) + data = np.random.uniform(-10, 10, shape).astype(np.float32) + reduced = 
np.sum(data, axis=axes, keepdims=keepdims == 1) + + expect(node, + inputs=[data], + outputs=[reduced], + name='test_reduce_sum_default_axes_keepdims_random') + + def test_reduce_sum_negative_axes_keepdims(self): + shape = [3, 2, 2] + axes = [-2] + keepdims = 1 + + node = onnx.helper.make_node('ReduceSum', + inputs=['data'], + outputs=['reduced'], + axes=axes, + keepdims=keepdims) + + data = np.array( + [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]], + dtype=np.float32) + reduced = np.sum(data, axis=tuple(axes), keepdims=keepdims == 1) + # print(reduced) + #[[[4., 6.]] + # [[12., 14.]] + # [[20., 22.]]] + + expect(node, + inputs=[data], + outputs=[reduced], + name='test_reduce_sum_negative_axes_keepdims_example') + + np.random.seed(0) + data = np.random.uniform(-10, 10, shape).astype(np.float32) + reduced = np.sum(data, axis=tuple(axes), keepdims=keepdims == 1) + + expect(node, + inputs=[data], + outputs=[reduced], + name='test_reduce_sum_negative_axes_keepdims_random') + + def test_reduce_mean_do_not_keepdims(self): + shape = [3, 2, 2] + axes = [1] + keepdims = 0 + + node = onnx.helper.make_node('ReduceMean', + inputs=['data'], + outputs=['reduced'], + axes=axes, + keepdims=keepdims) + + data = np.array( + [[[5, 1], [20, 2]], [[30, 1], [40, 2]], [[55, 1], [60, 2]]], + dtype=np.float32) + reduced = np.mean(data, axis=tuple(axes), keepdims=keepdims == 1) + #print(reduced) + #[[12.5, 1.5] + # [35., 1.5] + # [57.5, 1.5]] + + expect(node, + inputs=[data], + outputs=[reduced], + name='test_reduce_mean_do_not_keepdims_example') + + np.random.seed(0) + data = np.random.uniform(-10, 10, shape).astype(np.float32) + reduced = np.mean(data, axis=tuple(axes), keepdims=keepdims == 1) + + expect(node, + inputs=[data], + outputs=[reduced], + name='test_reduce_mean_do_not_keepdims_random') + + def test_reduce_mean_keepdims(self): + shape = [3, 2, 2] + axes = [1] + keepdims = 1 + + node = onnx.helper.make_node('ReduceMean', + inputs=['data'], + outputs=['reduced'], + axes=axes, + keepdims=keepdims) + + data = np.array( + [[[5, 1], [20, 2]], [[30, 1], [40, 2]], [[55, 1], [60, 2]]], + dtype=np.float32) + reduced = np.mean(data, axis=tuple(axes), keepdims=keepdims == 1) + #print(reduced) + #[[[12.5, 1.5]] + # [[35., 1.5]] + # [[57.5, 1.5]]] + + expect(node, + inputs=[data], + outputs=[reduced], + name='test_reduce_mean_keepdims_example') + + np.random.seed(0) + data = np.random.uniform(-10, 10, shape).astype(np.float32) + reduced = np.mean(data, axis=tuple(axes), keepdims=keepdims == 1) + + expect(node, + inputs=[data], + outputs=[reduced], + name='test_reduce_mean_keepdims_random') + + def test_reduce_mean_default_axes_keepdims(self): + shape = [3, 2, 2] + axes = None + keepdims = 1 + + node = onnx.helper.make_node('ReduceMean', + inputs=['data'], + outputs=['reduced'], + keepdims=keepdims) + + data = np.array( + [[[5, 1], [20, 2]], [[30, 1], [40, 2]], [[55, 1], [60, 2]]], + dtype=np.float32) + reduced = np.mean(data, axis=axes, keepdims=keepdims == 1) + #print(reduced) + #[[[18.25]]] + + expect(node, + inputs=[data], + outputs=[reduced], + name='test_reduce_mean_default_axes_keepdims_example') + + np.random.seed(0) + data = np.random.uniform(-10, 10, shape).astype(np.float32) + reduced = np.mean(data, axis=axes, keepdims=keepdims == 1) + + expect(node, + inputs=[data], + outputs=[reduced], + name='test_reduce_mean_default_axes_keepdims_random') + + def test_reduce_mean_negative_axes_keepdims(self): + shape = [3, 2, 2] + axes = [-2] + keepdims = 1 + + node = onnx.helper.make_node('ReduceMean', + 
inputs=['data'], + outputs=['reduced'], + axes=axes, + keepdims=keepdims) + + data = np.array( + [[[5, 1], [20, 2]], [[30, 1], [40, 2]], [[55, 1], [60, 2]]], + dtype=np.float32) + reduced = np.mean(data, axis=tuple(axes), keepdims=keepdims == 1) + # print(reduced) + # [[[12.5, 1.5]] + # [[35., 1.5]] + # [[57.5, 1.5]]] + + expect(node, + inputs=[data], + outputs=[reduced], + name='test_reduce_mean_negative_axes_keepdims_example') + + np.random.seed(0) + data = np.random.uniform(-10, 10, shape).astype(np.float32) + reduced = np.mean(data, axis=tuple(axes), keepdims=keepdims == 1) + + expect(node, + inputs=[data], + outputs=[reduced], + name='test_reduce_mean_negative_axes_keepdims_random') + + def test_squeeze(self): + node = onnx.helper.make_node( + 'Squeeze', + inputs=['x'], + outputs=['y'], + axes=[0], + ) + x = np.random.randn(1, 3, 4, 5).astype(np.float32) + y = np.squeeze(x, axis=0) + + expect(node, inputs=[x], outputs=[y], name='test_squeeze') + + def test_squeeze_negative_axes(self): + node = onnx.helper.make_node( + 'Squeeze', + inputs=['x'], + outputs=['y'], + axes=[-2], + ) + x = np.random.randn(1, 3, 1, 5).astype(np.float32) + y = np.squeeze(x, axis=-2) + expect(node, inputs=[x], outputs=[y], name='test_squeeze_negative_axes') + + def test_unsqueeze_one_axis(self): + x = np.random.randn(3, 4, 5).astype(np.float32) + + for i in range(x.ndim): + node = onnx.helper.make_node( + 'Unsqueeze', + inputs=['x'], + outputs=['y'], + axes=[i], + ) + y = np.expand_dims(x, axis=i) + + expect(node, + inputs=[x], + outputs=[y], + name='test_unsqueeze_axis_' + str(i)) + + def test_unsqueeze_two_axes(self): + x = np.random.randn(3, 4, 5).astype(np.float32) + + node = onnx.helper.make_node( + 'Unsqueeze', + inputs=['x'], + outputs=['y'], + axes=[1, 4], + ) + y = np.expand_dims(x, axis=1) + y = np.expand_dims(y, axis=4) + + expect(node, inputs=[x], outputs=[y], name='test_unsqueeze_two_axes') + + def test_unsqueeze_three_axes(self): + x = np.random.randn(3, 4, 5).astype(np.float32) + + node = onnx.helper.make_node( + 'Unsqueeze', + inputs=['x'], + outputs=['y'], + axes=[2, 4, 5], + ) + y = np.expand_dims(x, axis=2) + y = np.expand_dims(y, axis=4) + y = np.expand_dims(y, axis=5) + + expect(node, inputs=[x], outputs=[y], name='test_unsqueeze_three_axes') + + def test_unsqueeze_unsorted_axes(self): + x = np.random.randn(3, 4, 5).astype(np.float32) + + node = onnx.helper.make_node( + 'Unsqueeze', + inputs=['x'], + outputs=['y'], + axes=[5, 4, 2], + ) + y = np.expand_dims(x, axis=2) + y = np.expand_dims(y, axis=4) + y = np.expand_dims(y, axis=5) + + expect(node, + inputs=[x], + outputs=[y], + name='test_unsqueeze_unsorted_axes') + + def test_unsqueeze_negative_axes(self): + node = onnx.helper.make_node( + 'Unsqueeze', + inputs=['x'], + outputs=['y'], + axes=[-2], + ) + x = np.random.randn(1, 3, 1, 5).astype(np.float32) + y = np.expand_dims(x, axis=-2) + expect(node, + inputs=[x], + outputs=[y], + name='test_unsqueeze_negative_axes') + + def test_slice(self): + node = onnx.helper.make_node( + 'Slice', + inputs=['x', 'starts', 'ends', 'axes', 'steps'], + outputs=['y'], + ) + + x = np.random.randn(20, 10, 5).astype(np.float32) + y = x[0:3, 0:10] + starts = np.array([0, 0], dtype=np.int64) + ends = np.array([3, 10], dtype=np.int64) + axes = np.array([0, 1], dtype=np.int64) + steps = np.array([1, 1], dtype=np.int64) + + expect(node, inputs=[x, starts, ends, axes, steps], outputs=[y], + name='test_slice') + + def test_slice_neg(self): + node = onnx.helper.make_node( + 'Slice', + inputs=['x', 'starts', 'ends', 
'axes', 'steps'], + outputs=['y'], + ) + + x = np.random.randn(20, 10, 5).astype(np.float32) + starts = np.array([0], dtype=np.int64) + ends = np.array([-1], dtype=np.int64) + axes = np.array([1], dtype=np.int64) + steps = np.array([1], dtype=np.int64) + y = x[:, 0:-1] + + expect(node, inputs=[x, starts, ends, axes, steps], outputs=[y], + name='test_slice_neg') + + # not support empty tensor + # def test_slice_start_out_of_bounds(self): + # node = onnx.helper.make_node( + # 'Slice', + # inputs=['x', 'starts', 'ends', 'axes', 'steps'], + # outputs=['y'], + # ) + + # x = np.random.randn(20, 10, 5).astype(np.float32) + # starts = np.array([1000], dtype=np.int64) + # ends = np.array([1000], dtype=np.int64) + # axes = np.array([1], dtype=np.int64) + # steps = np.array([1], dtype=np.int64) + # y = x[:, 1000:1000] + + # expect(node, inputs=[x, starts, ends, axes, steps], outputs=[y], + # name='test_slice_start_out_of_bounds') + + def test_slice_end_out_of_bounds(self): + node = onnx.helper.make_node( + 'Slice', + inputs=['x', 'starts', 'ends', 'axes', 'steps'], + outputs=['y'], + ) + + x = np.random.randn(20, 10, 5).astype(np.float32) + starts = np.array([1], dtype=np.int64) + ends = np.array([1000], dtype=np.int64) + axes = np.array([1], dtype=np.int64) + steps = np.array([1], dtype=np.int64) + y = x[:, 1:1000] + + expect(node, inputs=[x, starts, ends, axes, steps], outputs=[y], + name='test_slice_end_out_of_bounds') + + def test_slice_default_axes(self): + node = onnx.helper.make_node( + 'Slice', + inputs=['x', 'starts', 'ends'], + outputs=['y'], + ) + + x = np.random.randn(20, 10, 5).astype(np.float32) + starts = np.array([0, 0, 3], dtype=np.int64) + ends = np.array([20, 10, 4], dtype=np.int64) + y = x[:, :, 3:4] + + expect(node, inputs=[x, starts, ends], outputs=[y], + name='test_slice_default_axes') + + def test_slice_default_steps(self): + node = onnx.helper.make_node( + 'Slice', + inputs=['x', 'starts', 'ends', 'axes'], + outputs=['y'], + ) + + x = np.random.randn(20, 10, 5).astype(np.float32) + starts = np.array([0, 0, 3], dtype=np.int64) + ends = np.array([20, 10, 4], dtype=np.int64) + axes = np.array([0, 1, 2], dtype=np.int64) + y = x[:, :, 3:4] + + expect(node, inputs=[x, starts, ends, axes], outputs=[y], + name='test_slice_default_steps') + + def test_slice_neg_steps(self): + node = onnx.helper.make_node( + 'Slice', + inputs=['x', 'starts', 'ends', 'axes', 'steps'], + outputs=['y'], + ) + + x = np.random.randn(20, 10, 5).astype(np.float32) + starts = np.array([20, 10, 4], dtype=np.int64) + ends = np.array([0, 0, 1], dtype=np.int64) + axes = np.array([0, 1, 2], dtype=np.int64) + steps = np.array([-1, -3, -2]) + y = x[20:0:-1, 10:0:-3, 4:1:-2] + + expect(node, inputs=[x, starts, ends, axes, steps], outputs=[y], + name='test_slice_neg_steps') + + def test_slice_negative_axes(self): + node = onnx.helper.make_node( + 'Slice', + inputs=['x', 'starts', 'ends', 'axes'], + outputs=['y'], + ) + + x = np.random.randn(20, 10, 5).astype(np.float32) + starts = np.array([0, 0, 3], dtype=np.int64) + ends = np.array([20, 10, 4], dtype=np.int64) + axes = np.array([0, -2, -1], dtype=np.int64) + y = x[:, :, 3:4] + + expect(node, inputs=[x, starts, ends, axes], outputs=[y], + name='test_slice_negative_axes') + + def test_split_1d(self): + input = np.array([1., 2., 3., 4., 5., 6.]).astype(np.float32) + + node = onnx.helper.make_node( + 'Split', + inputs=['input'], + outputs=['output_1', 'output_2', 'output_3'], + axis=0 + ) + + expected_outputs = [np.array([1., 2.]).astype(np.float32), np.array([3., 
+
+    def test_split_1d(self):
+        input = np.array([1., 2., 3., 4., 5., 6.]).astype(np.float32)
+
+        node = onnx.helper.make_node(
+            'Split',
+            inputs=['input'],
+            outputs=['output_1', 'output_2', 'output_3'],
+            axis=0
+        )
+
+        expected_outputs = [np.array([1., 2.]).astype(np.float32),
+                            np.array([3., 4.]).astype(np.float32),
+                            np.array([5., 6.]).astype(np.float32)]
+        expect(node, inputs=[input], outputs=[y for y in expected_outputs],
+               name='test_split_equal_parts_1d')
+
+        node = onnx.helper.make_node(
+            'Split',
+            inputs=['input'],
+            outputs=['output_1', 'output_2'],
+            axis=0,
+            split=[2, 4]
+        )
+
+        expected_outputs = [np.array([1., 2.]).astype(np.float32),
+                            np.array([3., 4., 5., 6.]).astype(np.float32)]
+        expect(node, inputs=[input], outputs=[y for y in expected_outputs],
+               name='test_split_variable_parts_1d')
+
+    def test_split_2d(self):
+        input = np.array([[1., 2., 3., 4., 5., 6.],
+                          [7., 8., 9., 10., 11., 12.]]).astype(np.float32)
+
+        node = onnx.helper.make_node(
+            'Split',
+            inputs=['input'],
+            outputs=['output_1', 'output_2'],
+            axis=1
+        )
+
+        expected_outputs = [
+            np.array([[1., 2., 3.], [7., 8., 9.]]).astype(np.float32),
+            np.array([[4., 5., 6.], [10., 11., 12.]]).astype(np.float32)
+        ]
+
+        expect(node, inputs=[input], outputs=[y for y in expected_outputs],
+               name='test_split_equal_parts_2d')
+
+        node = onnx.helper.make_node(
+            'Split',
+            inputs=['input'],
+            outputs=['output_1', 'output_2'],
+            axis=1,
+            split=[2, 4]
+        )
+
+        expected_outputs = [
+            np.array([[1., 2.], [7., 8.]]).astype(np.float32),
+            np.array([[3., 4., 5., 6.], [9., 10., 11., 12.]]).astype(np.float32)
+        ]
+
+        expect(node, inputs=[input], outputs=[y for y in expected_outputs],
+               name='test_split_variable_parts_2d')
+
+    def test_split_default_values(self):
+        input = np.array([1., 2., 3., 4., 5., 6.]).astype(np.float32)
+
+        # If axis is not specified, split is applied on default axis 0
+        node = onnx.helper.make_node(
+            'Split',
+            inputs=['input'],
+            outputs=['output_1', 'output_2', 'output_3']
+        )
+
+        expected_outputs = [np.array([1., 2.]).astype(np.float32),
+                            np.array([3., 4.]).astype(np.float32),
+                            np.array([5., 6.]).astype(np.float32)]
+        expect(node, inputs=[input], outputs=[y for y in expected_outputs],
+               name='test_split_equal_parts_default_axis')
+
+        node = onnx.helper.make_node(
+            'Split',
+            inputs=['input'],
+            outputs=['output_1', 'output_2'],
+            split=[2, 4]
+        )
+
+        expected_outputs = [np.array([1., 2.]).astype(np.float32),
+                            np.array([3., 4., 5., 6.]).astype(np.float32)]
+        expect(node, inputs=[input], outputs=[y for y in expected_outputs],
+               name='test_split_variable_parts_default_axis')
+
+    # empty tensors are not supported
+    # def test_split_zero_size_splits(self):
+    #     input = np.array([]).astype(np.float32)
+
+    #     # Split an empty tensor into tensors of size zero
+    #     node = onnx.helper.make_node(
+    #         'Split',
+    #         inputs=['input'],
+    #         outputs=['output_1', 'output_2', 'output_3'],
+    #         split=[0, 0, 0]
+    #     )
+
+    #     expected_outputs = [np.array([]).astype(np.float32),
+    #                         np.array([]).astype(np.float32),
+    #                         np.array([]).astype(np.float32)]
+    #     expect(node, inputs=[input], outputs=[y for y in expected_outputs],
+    #            name='test_split_zero_size_splits')
+
+    def test_gather_0(self):
+        node = onnx.helper.make_node(
+            'Gather',
+            inputs=['data', 'indices'],
+            outputs=['y'],
+            axis=0,
+        )
+        data = np.random.randn(5, 4, 3, 2).astype(np.float32)
+        indices = np.array([0, 1, 3])
+        y = np.take(data, indices, axis=0)
-def gemm_reference_implementation(A, B, C=None, alpha=1., beta=1., transA=0,
-                                  transB=0):  # type: (np.ndarray, np.ndarray, Optional[np.ndarray], float, float, int, int) -> np.ndarray
+        expect(node, inputs=[data, indices.astype(np.int64)], outputs=[y],
+               name='test_gather_0')
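+
+        # Gather with a 1-D indices tensor behaves like np.take along the
+        # given axis: each index selects one slice of `data`, so the output
+        # above has shape (3, 4, 3, 2).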
+
+    def test_gather_1(self):
+        node = onnx.helper.make_node(
+            'Gather',
+            inputs=['data', 'indices'],
+            outputs=['y'],
+            axis=1,
+        )
+        data = np.random.randn(5, 4, 3, 2).astype(np.float32)
+        indices = np.array([0, 1, 3])
+        y = np.take(data, indices, axis=1)
+
+        expect(node, inputs=[data, indices.astype(np.int64)], outputs=[y],
+               name='test_gather_1')
+
+    def test_gather_negative_indices(self):
+        node = onnx.helper.make_node(
+            'Gather',
+            inputs=['data', 'indices'],
+            outputs=['y'],
+            axis=0,
+        )
+        data = np.arange(10).astype(np.float32)
+        indices = np.array([0, -9, -10])
+        y = np.take(data, indices, axis=0)
+
+        expect(node, inputs=[data, indices.astype(np.int64)], outputs=[y],
+               name='test_gather_negative_indices')
+
+    def test_tile(self):
+        node = onnx.helper.make_node(
+            'Tile',
+            inputs=['x', 'y'],
+            outputs=['z']
+        )
+
+        x = np.random.rand(2, 3, 4, 5).astype(np.float32)
+
+        repeats = np.random.randint(low=1, high=10,
+                                    size=(np.ndim(x),)).astype(np.int64)
+
+        z = np.tile(x, repeats)
+
+        expect(node,
+               inputs=[x, repeats],
+               outputs=[z],
+               name='test_tile')
+
+    def test_tile_precomputed(self):
+        node = onnx.helper.make_node(
+            'Tile',
+            inputs=['x', 'y'],
+            outputs=['z']
+        )
+
+        x = np.array([
+            [0, 1],
+            [2, 3]
+        ], dtype=np.float32)
+
+        repeats = np.array([2, 2], dtype=np.int64)
+
+        z = np.array([
+            [0, 1, 0, 1],
+            [2, 3, 2, 3],
+            [0, 1, 0, 1],
+            [2, 3, 2, 3]
+        ], dtype=np.float32)
+
+        expect(node,
+               inputs=[x, repeats],
+               outputs=[z],
+               name='test_tile_precomputed')
+
+    def test_onehot_without_axis(self):
+        on_value = 5
+        off_value = 2
+        output_type = np.int32
+        node = onnx.helper.make_node('OneHot',
+                                     inputs=['indices', 'depth', 'values'],
+                                     outputs=['y'])
+        indices = np.array([0, 7, 8], dtype=np.int64)
+        depth = np.float32(12)
+        values = np.array([off_value, on_value], dtype=output_type)
+        y = one_hot(indices, depth, dtype=output_type)
+        y = y * (on_value - off_value) + off_value
+        expect(node,
+               inputs=[indices, depth, values],
+               outputs=[y],
+               name='test_onehot_without_axis')
+
+    def test_onehot_with_axis(self):
+        axisValue = 1
+        on_value = 3
+        off_value = 1
+        output_type = np.float32
+        node = onnx.helper.make_node('OneHot',
+                                     inputs=['indices', 'depth', 'values'],
+                                     outputs=['y'],
+                                     axis=axisValue)
+        indices = np.array([[1, 9], [2, 4]], dtype=np.float32)
+        depth = np.array([10], dtype=np.float32)
+        values = np.array([off_value, on_value], dtype=output_type)
+        y = one_hot(indices, depth, axis=axisValue, dtype=output_type)
+        y = y * (on_value - off_value) + off_value
+        expect(node,
+               inputs=[indices, depth, values],
+               outputs=[y],
+               name='test_onehot_with_axis')
+
+    def test_onehot_with_negative_indices(self):
+        axisValue = 1
+        on_value = 3
+        off_value = 1
+        output_type = np.float32
+        node = onnx.helper.make_node('OneHot',
+                                     inputs=['indices', 'depth', 'values'],
+                                     outputs=['y'],
+                                     axis=axisValue)
+        indices = np.array([0, -7, -8], dtype=np.int64)
+
+        depth = np.array([10], dtype=np.float32)
+        values = np.array([off_value, on_value], dtype=output_type)
+        y = one_hot(indices, depth, axis=axisValue, dtype=output_type)
+        y = y * (on_value - off_value) + off_value
+        expect(node,
+               inputs=[indices, depth, values],
+               outputs=[y],
+               name='test_onehot_negative_indices')
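+
+        # one_hot() (the module-level helper below) maps indices through
+        # np.mod(indices, depth), so a negative index such as -7 selects the
+        # same class as 10 - 7 = 3 here.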
+
+    def test_onehot_with_negative_axis(self):
+        axisValue = -2
+        on_value = 3
+        off_value = 1
+        output_type = np.float32
+        node = onnx.helper.make_node('OneHot',
+                                     inputs=['indices', 'depth', 'values'],
+                                     outputs=['y'],
+                                     axis=axisValue)
+        indices = np.array([[1, 9], [2, 4]], dtype=np.float32)
+        depth = np.array([10], dtype=np.float32)
+        values = np.array([off_value, on_value], dtype=output_type)
+        y = one_hot(indices, depth, axis=axisValue, dtype=output_type)
+        y = y * (on_value - off_value) + off_value
+        expect(node,
+               inputs=[indices, depth, values],
+               outputs=[y],
+               name='test_onehot_with_negative_axis')
+
+
+def one_hot(indices, depth, axis=-1, dtype=np.float32):  # type: ignore
+    ''' Compute one hot from indices at a specific axis '''
+    values = np.asarray(indices)
+    rank = len(values.shape)
+    depth_range = np.arange(depth)
+    if axis < 0:
+        axis += (rank + 1)
+    ls = values.shape[0:axis]
+    rs = values.shape[axis:rank]
+    targets = np.reshape(depth_range,
+                         (1,) * len(ls) + depth_range.shape + (1,) * len(rs))
+    values = np.reshape(np.mod(values, depth), ls + (1,) + rs)
+    return np.asarray(targets == values, dtype=dtype)
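+
+
+# Illustrative usage (not part of the test suite): one_hot([1, 2], 4) returns
+# [[0., 1., 0., 0.], [0., 0., 1., 0.]]; the new one-hot axis is appended
+# last because axis defaults to -1.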
+
+
+def gemm_reference_implementation(
+        A,
+        B,
+        C=None,
+        alpha=1.,
+        beta=1.,
+        transA=0,
+        transB=0
+):  # type: (np.ndarray, np.ndarray, Optional[np.ndarray], float, float, int, int) -> np.ndarray
     A = A if transA == 0 else A.T
     B = B if transB == 0 else B.T
     C = C if C is not None else np.array(0)
@@ -2280,19 +3108,6 @@ def pool(
             y[shape] = f(window_vals[np.where(~np.isnan(window_vals))])
     return y.astype(np.float32)
 
-    def test_globalaveragepool(self):
-        node = onnx.helper.make_node(
-            'GlobalAveragePool',
-            inputs=['x'],
-            outputs=['y'],
-        )
-        x = np.array([[[
-            [1, 2, 3],
-            [4, 5, 6],
-            [7, 8, 9],
-        ]]]).astype(np.float32)
-        y = np.array([[[[5]]]]).astype(np.float32)
-        expect(node, inputs=[x], outputs=[y], name='test_globalaveragepool_precomputed')
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/python/test_operation.py b/test/python/test_operation.py
index 4a1728217b..be3cf2b513 100755
--- a/test/python/test_operation.py
+++ b/test/python/test_operation.py
@@ -3645,6 +3645,7 @@ def test_globalaveragepool_cpu(self):
     def test_globalaveragepool_gpu(self):
         self.globalaveragepool_channel_first(gpu_dev)
         self.globalaveragepool_channel_last(gpu_dev)
+
     def constantOfShape_test(self, dev):
         # float_ones
         X = np.array([4, 3, 2]).astype(np.int64)
@@ -3959,7 +3960,9 @@ def cast_test(self, dev):
         x.to_device(dev)
 
         result = autograd.cast(x, t3)
-        np.testing.assert_array_almost_equal(tensor.to_numpy(result),
+        result_np = tensor.to_numpy(result)
+        assert result_np.dtype == y.dtype, "type %s != %s." % (result_np.dtype, y.dtype)
+        np.testing.assert_array_almost_equal(result_np,
                                              y,
                                              decimal=5)
 
@@ -3969,5 +3972,43 @@ def test_cast_cpu(self):
     def test_cast_gpu(self):
         self.cast_test(gpu_dev)
 
+    def onehot_test(self, dev):
+        def one_hot(indices, depth, axis=-1, dtype=np.float32):  # type: ignore
+            ''' Compute one hot from indices at a specific axis '''
+            values = np.asarray(indices)
+            rank = len(values.shape)
+            depth_range = np.arange(depth)
+            if axis < 0:
+                axis += (rank + 1)
+            ls = values.shape[0:axis]
+            rs = values.shape[axis:rank]
+            targets = np.reshape(depth_range, (1,) * len(ls) + depth_range.shape + (1,) * len(rs))
+            values = np.reshape(np.mod(values, depth), ls + (1,) + rs)
+            return np.asarray(targets == values, dtype=dtype)
+
+        axisValue = 1
+        on_value = 3
+        off_value = 1
+        output_type = np.float32
+        indices = np.array([[1, 9], [2, 4]], dtype=np.float32)
+        depth = np.array([10], dtype=np.float32)
+        values = np.array([off_value, on_value], dtype=output_type)
+        y = one_hot(indices, depth, axis=axisValue, dtype=output_type)
+        y = y * (on_value - off_value) + off_value
+
+        x = tensor.from_numpy(indices)
+        x.to_device(dev)
+
+        result = autograd.onehot(axisValue, x, depth, values)
+        np.testing.assert_array_almost_equal(tensor.to_numpy(result),
+                                             y,
+                                             decimal=5)
+
+    def test_onehot_cpu(self):
+        self.onehot_test(cpu_dev)
+
+    def test_onehot_gpu(self):
+        self.onehot_test(gpu_dev)
+
 if __name__ == '__main__':
     unittest.main()