diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000000000000000000000000000000000000..f3d5c415e090be510b646e88a7a523cdbab93d14 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,38 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: bug +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Screenshots** +If applicable, add screenshots to help explain your problem. + +**Desktop (please complete the following information):** + - OS: [e.g. iOS] + - Browser [e.g. chrome, safari] + - Version [e.g. 22] + +**Smartphone (please complete the following information):** + - Device: [e.g. iPhone6] + - OS: [e.g. iOS8.1] + - Browser [e.g. stock browser, safari] + - Version [e.g. 22] + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000000000000000000000000000000000000..11fc491ef1dae316f2b06bbb40eaba9c757fdfd1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: enhancement +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..14376956ab93b51250e4dafe431a86c1fd2ab85e --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "expman"] + path = expman + url = https://github.com/fabiocarrara/expman +[submodule "models/deeplab"] + path = models/deeplab + url = https://github.com/david8862/tf-keras-deeplabv3p-model-set diff --git a/convert_model.py b/convert_model.py new file mode 100644 index 0000000000000000000000000000000000000000..9821918ec542d88d2d2f99ee40d9f6423c817a88 --- /dev/null +++ b/convert_model.py @@ -0,0 +1,56 @@ +import tensorflow as tf +from tensorflow.keras import backend as K +from adabelief_tf import AdaBeliefOptimizer + +def iou_coef(y_true, y_pred): + y_true = tf.cast(y_true, tf.float32) + y_pred = tf.cast(y_pred, tf.float32) + intersection = K.sum(K.abs(y_true * y_pred), axis=[1, 2, 3]) + union = K.sum(y_true, axis=[1, 2, 3]) + K.sum(y_pred, axis=[1, 2, 3]) - intersection + return K.mean((intersection + 1e-6) / (union + 1e-6)) + +def dice_coef(y_true, y_pred): + y_true = tf.cast(y_true, tf.float32) + y_pred = tf.cast(y_pred, tf.float32) + intersection = K.sum(K.abs(y_true * y_pred), axis=[1, 2, 3]) + return K.mean((2. 
* intersection + 1e-6) / (K.sum(y_true, axis=[1, 2, 3]) + K.sum(y_pred, axis=[1, 2, 3]) + 1e-6)) + +def boundary_loss(y_true, y_pred): + y_true = tf.cast(y_true, tf.float32) + y_pred = tf.cast(y_pred, tf.float32) + dy_true, dx_true = tf.image.image_gradients(y_true) + dy_pred, dx_pred = tf.image.image_gradients(y_pred) + loss = tf.reduce_mean(tf.abs(dy_pred - dy_true) + tf.abs(dx_pred - dx_true)) + return loss * 0.5 + +def enhanced_binary_crossentropy(y_true, y_pred): + y_true = tf.cast(y_true, tf.float32) + y_pred = tf.cast(y_pred, tf.float32) + bce = tf.keras.losses.binary_crossentropy(y_true, y_pred) + boundary = boundary_loss(y_true, y_pred) + return bce + boundary + +def hard_swish(x): + return x * tf.nn.relu6(x + 3) * (1. / 6.) + +# Path to your current .keras model +keras_path = 'runs/b32_c-conv_d-|root|meye|data|NN_human_mouse_eyes|_g1.5_l0.001_num_c1_num_f16_num_s5_r128_se23_sp-random_up-relu_us0/best_model.keras' + +# Load the model with custom objects +custom_objects = { + 'AdaBeliefOptimizer': AdaBeliefOptimizer, + 'iou_coef': iou_coef, + 'dice_coef': dice_coef, + 'hard_swish': hard_swish, + 'enhanced_binary_crossentropy': enhanced_binary_crossentropy, + 'boundary_loss': boundary_loss +} + +print("Loading model from:", keras_path) +model = tf.keras.models.load_model(keras_path, custom_objects=custom_objects) + +# Save as .h5 +h5_path = keras_path.replace('.keras', '.h5') +print("Saving model to:", h5_path) +model.save(h5_path, save_format='h5') +print("Conversion complete!") \ No newline at end of file diff --git a/expman/expman/__init__.py b/expman/expman/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ce8241917216d557fa15f4e5b611438ce7789a86 --- /dev/null +++ b/expman/expman/__init__.py @@ -0,0 +1,7 @@ +from .experiment import Experiment, exp_filter, use_hash_naming +from .exp_group import ExpGroup + +abbreviate = Experiment.abbreviate +from_dir = Experiment.from_dir +gather = ExpGroup.gather +is_exp_dir = Experiment.is_exp_dir diff --git a/expman/expman/__main__.py b/expman/expman/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..14d3e67ca03149c455362e6d459abc626be9e7c5 --- /dev/null +++ b/expman/expman/__main__.py @@ -0,0 +1,58 @@ +import argparse + +from .exp_group import ExpGroup + + +def add_param(args): + exps = ExpGroup.gather(args.run) + for exp in exps: + exp.add_parameter(args.param, args.value) + + +def mv_param(args): + exps = ExpGroup.gather(args.run) + for exp in exps: + exp.rename_parameter(args.param, args.new_param) + + +def rm_param(args): + exps = ExpGroup.gather(args.run) + for exp in exps: + exp.remove_parameter(args.param) + + +def command_line(): + def guess(value): + """ try to guess a python type for the passed string parameter """ + try: + result = eval(value) + except (NameError, ValueError): + result = value + return result + + parser = argparse.ArgumentParser(description='Experiment Manager Utilities') + subparsers = parser.add_subparsers(dest='command') + subparsers.required = True + + parser_add = subparsers.add_parser('add-param') + parser_add.add_argument('run', default='runs/') + parser_add.add_argument('param', help='new param name') + parser_add.add_argument('value', type=guess, help='new param value') + parser_add.set_defaults(func=add_param) + + parser_rm = subparsers.add_parser('rm-param') + parser_rm.add_argument('run', default='runs/') + parser_rm.add_argument('param', help='param to remove') + parser_rm.set_defaults(func=rm_param) + + parser_mv = 
subparsers.add_parser('mv-param') + parser_mv.add_argument('run', default='runs/') + parser_mv.add_argument('param', help='param to rename') + parser_mv.add_argument('new_param', help='new param name') + parser_mv.set_defaults(func=mv_param) + + args = parser.parse_args() + args.func(args) + + +command_line() diff --git a/expman/expman/__pycache__/__init__.cpython-311.pyc b/expman/expman/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..13539884ea530426a316c8758356c6588d6b3a96 Binary files /dev/null and b/expman/expman/__pycache__/__init__.cpython-311.pyc differ diff --git a/expman/expman/__pycache__/exp_group.cpython-311.pyc b/expman/expman/__pycache__/exp_group.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d3194514d5104250f497afeae8c805dd3565319 Binary files /dev/null and b/expman/expman/__pycache__/exp_group.cpython-311.pyc differ diff --git a/expman/expman/__pycache__/experiment.cpython-311.pyc b/expman/expman/__pycache__/experiment.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..59b9705ea7ea4250433e9250828a5ecfc16e9fc7 Binary files /dev/null and b/expman/expman/__pycache__/experiment.cpython-311.pyc differ diff --git a/expman/expman/exp_group.py b/expman/expman/exp_group.py new file mode 100644 index 0000000000000000000000000000000000000000..2ae4f089f28f6b9cb7936c867b05858c33a951b8 --- /dev/null +++ b/expman/expman/exp_group.py @@ -0,0 +1,96 @@ +import os +import pandas as pd + +from glob import glob +from .experiment import Experiment + + +class ExpGroup: + @classmethod + def gather(cls, root='runs/'): + if Experiment.is_exp_dir(root): + exps = (root,) + else: + exps = glob(os.path.join(root, '*')) + exps = filter(Experiment.is_exp_dir, exps) + + exps = map(Experiment.from_dir, exps) + exps = filter(lambda x: x.existing, exps) + exps = tuple(exps) + return cls(exps) + + def __init__(self, experiments=()): + assert isinstance(experiments, (list, tuple)), "'experiments' must be a list or tuple" + self.experiments = experiments + + @staticmethod + def _collect_one(exp_id, exp, csv=None, index_col=None): + params = exp.params.to_frame().transpose().infer_objects() # as DataFrame + params['exp_id'] = exp_id + + if csv is None: + return params + + csv_path = exp.path_to(csv) + if os.path.exists(csv_path): + stuff = pd.read_csv(csv_path, index_col=index_col) + else: # try globbing + csv_files = os.path.join(exp.path, csv) + csv_files = list(glob(csv_files)) + if len(csv_files) == 0: + return pd.DataFrame() + + stuff = map(lambda x: pd.read_csv(x, index_col=index_col, float_precision='round_trip'), csv_files) + stuff = pd.concat(stuff, ignore_index=True) + + stuff['exp_id'] = exp_id + return pd.merge(params, stuff, on='exp_id') + + def collect(self, csv=None, index_col=None, prefix=''): + results = [self._collect_one(exp_id, exp, csv=csv, index_col=index_col) for exp_id, exp in enumerate(self.experiments)] + results = pd.concat(results, ignore_index=True, sort=False) + + if len(results): + # build minimal exp_name + exp_name = '' + params = results.loc[:, :'exp_id'].drop('exp_id', axis=1) + if len(params) > 1: + varying_params = params.loc[:, params.nunique() > 1] + exp_name = varying_params.apply(Experiment.abbreviate, axis=1) + idx = results.columns.get_loc('exp_id') + 1 + results.insert(idx, 'exp_name', prefix + exp_name) + + return results + + def filter(self, filters): + if isinstance(filters, str): + filters = string.split(',') + filters = map(lambda x: 
x.split('='), filters) + filters = {k: v for k, v in filters} + + def __filter_exp(e): + for param, value in filters.items(): + try: + p = e.params[param] + ptype = type(p) + if p != ptype(value): + return False + except: + return False + + return True + + filtered_exps = filter(__filter_exp, self.experiments) + filtered_exps = tuple(filtered_exps) + return ExpGroup(filtered_exps) + + def items(self, short_names=True, prefix=''): + if short_names: + params = self.collect(prefix=prefix) + exp_names = params['exp_name'].values + return zip(exp_names, self.experiments) + + return self.experiments + + def __iter__(self): + return iter(self.experiments) \ No newline at end of file diff --git a/expman/expman/experiment.py b/expman/expman/experiment.py new file mode 100644 index 0000000000000000000000000000000000000000..3e4e15aa4e4ff06f3a05e10f605a8eb8735c8cc8 --- /dev/null +++ b/expman/expman/experiment.py @@ -0,0 +1,233 @@ + +import argparse +import ast +import os +import hashlib +import shutil +import numbers +from glob import glob +from io import StringIO + +import numpy as np +import pandas as pd + + +hash_naming = False + +def use_hash_naming(use_hashes=True): + global hash_naming + assert isinstance(use_hashes, bool), "Value must be a boolean." + hash_naming = use_hashes + +def _guessed_cast(x): + try: + return ast.literal_eval(x) + except: + return x + +def exp_filter(string): + if '=' not in string: + raise argparse.ArgumentTypeError( + 'Filter {} is not in format =[, =[, ...]]'.format(string)) + filters = string.split(',') + filters = map(lambda x: x.split('='), filters) + filters = {k: _guessed_cast(v) for k, v in filters} + return filters + + +class Experiment: + + PARAM_FILENAME = 'params.json' + + @staticmethod + def _abbr(name, value, params): + + def prefix_len(a, b): + return len(os.path.commonprefix((a, b))) + + prefix = [name[:prefix_len(p, name) + 1] for p in params.keys() if p != name] + prefix = max(prefix, key=len) if len(prefix) > 0 else name + + sep = '' + if isinstance(value, str): + sep = '-' + elif isinstance(value, numbers.Number): + value = '{:g}'.format(value) + sep = '-' if prefix[-1].isdigit() else '' + elif isinstance(value, (list, tuple)): + value = map(str, value) + value = map(lambda v: v.replace(os.sep, '|'), value) + value = ','.join(list(value)) + sep = '-' + + return prefix, sep, value + + @classmethod + def abbreviate(cls, params): + if isinstance(params, pd.DataFrame): + params = params.iloc[0] + params = params.replace({np.nan: None}) + + if hash_naming: + exp_name = hashlib.md5(str(sorted(params.items())).encode()).hexdigest() + else: + abbrev_params = {k: '{}{}{}'.format(*cls._abbr(k, v, params)) for k, v in params.items()} + abbrev = sorted(abbrev_params.values()) + exp_name = '_'.join(abbrev) + + return exp_name + + @classmethod + def from_dir(cls, exp_dir): + root = os.path.dirname(exp_dir.rstrip('/')) + params = os.path.join(exp_dir, cls.PARAM_FILENAME) + + assert os.path.exists(exp_dir), "Experiment directory not found: '{}'".format(exp_dir) + assert os.path.exists(params), "Empty run directory found: '{}'".format(params) + + params = cls._read_params(params) + exp = cls(params, root=root, create=False) + return exp + + @classmethod + def is_exp_dir(cls, exp_dir): + if os.path.isdir(exp_dir): + params = os.path.join(exp_dir, cls.PARAM_FILENAME) + if os.path.exists(params): + return True + + return False + + @classmethod + def update_exp_dir(cls, exp_dir): + exp_dir = exp_dir.rstrip('/') + root = os.path.dirname(exp_dir) + name = 
os.path.basename(exp_dir) + params = os.path.join(exp_dir, cls.PARAM_FILENAME) + + assert os.path.exists(exp_dir), "Experiment directory not found: '{}'".format(exp_dir) + assert os.path.exists(params), "Empty run directory found: '{}'".format(params) + + params = cls._read_params(params) + new_name = cls.abbreviate(params) + + if name != new_name: + new_exp_dir = os.path.join(root, new_name) + assert not os.path.exists(new_exp_dir), \ + "Destination experiment directory already exists: '{}'".format(new_exp_dir) + + print('Renaming:\n {} into\n {}'.format(exp_dir, new_exp_dir)) + shutil.move(exp_dir, new_exp_dir) + + def __init__(self, params, root='runs/', ignore=(), create=True): + # relative dir containing this run + self.root = root + # params to be ignored in the run naming + self.ignore = ignore + # parameters of this run + if isinstance(params, argparse.Namespace): + params = vars(params) + + def _sanitize(v): + return tuple(v) if isinstance(v, list) else v + + params = {k: _sanitize(v) for k, v in params.items() if k not in self.ignore} + self.params = pd.Series(params, name='params') + + # whether to create the run directory if not exists + self.create = create + + self.name = self.abbreviate(self.params) + self.path = os.path.join(self.root, self.name) + self.existing = os.path.exists(self.path) + self.found = self.existing + + if not self.existing: + if self.create: + os.makedirs(self.path) + self.write_params() + self.existing = True + else: + print("Run directory '{}' not found, but not created.".format(self.path)) + + else: + param_fname = self.path_to(self.PARAM_FILENAME) + assert os.path.exists(param_fname), "Empty run, parameters not found: '{}'".format(param_fname) + self.params = self._read_params(param_fname) + + + def __str__(self): + s = StringIO() + print('Experiment Dir: {}'.format(self.path), file=s) + print('Params:', file=s) + + # Set display options differently + with pd.option_context('display.max_rows', None, + 'display.max_columns', None, + 'display.width', None): + print(self.params.to_string(), file=s) + + return s.getvalue() + + def __repr__(self): + return self.__str__() + + def path_to(self, path): + path = os.path.join(self.path, path) + return path + + def add_parameter(self, key, value): + assert key not in self.params, "Parameter already exists: '{}'".format(key) + self.params[key] = value + self._update_run_dir() + self.write_params() + + def rename_parameter(self, key, new_key): + assert key in self.params, "Cannot rename non-existent parameter: '{}'".format(key) + assert new_key not in self.params, "Destination name for parameter exists: '{}'".format(key) + + self.params[new_key] = self.params[key] + del self.params[key] + + self._update_run_dir() + self.write_params() + + def remove_parameter(self, key): + assert key in self.params, "Cannot remove non-existent parameter: '{}'".format(key) + del self.params[key] + self._update_run_dir() + self.write_params() + + def _update_run_dir(self): + old_run_dir = self.path + if self.existing: + self.name = self.abbreviate(self.params) + self.path = os.path.join(self.root, self.name) + assert not os.path.exists(self.path), "Cannot rename run, new name exists: '{}'".format(self.path) + shutil.move(old_run_dir, self.path) + + @staticmethod + def _read_params(path): + # read json to pd.Series + params = pd.read_json(path, typ='series') + # transform lists to tuples (for hashability) + params = params.apply(lambda x: tuple(x) if isinstance(x, list) else x) + return params + + def write_params(self): + # write 
Series as json + self.params.to_json(self.path_to(self.PARAM_FILENAME)) + +def test(): + parser = argparse.ArgumentParser(description='Experiment Manager Test') + parser.add_argument('-e', '--epochs', type=int, default=70) + parser.add_argument('-b', '--batch-size', type=int, default=64) + parser.add_argument('-m', '--model', choices=('1d-conv', 'paper'), default='1d-conv') + parser.add_argument('-s', '--seed', type=int, default=23) + parser.add_argument('--no-cuda', action='store_true') + parser.set_defaults(no_cuda=False) + args = parser.parse_args() + + run = Experiment(args, root='prova', ignore=['no_cuda']) + print(run) + print(run.path_to('ckpt/best.h5')) diff --git a/losses.py b/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..52d0b6a6eebd9fd750cff6157b42dc5e4e808102 --- /dev/null +++ b/losses.py @@ -0,0 +1,18 @@ +import tensorflow as tf +from tensorflow.keras import backend as K + +def boundary_loss(y_true, y_pred): + """Additional loss focusing on boundaries""" + # Compute gradients + dy_true, dx_true = tf.image.image_gradients(y_true) + dy_pred, dx_pred = tf.image.image_gradients(y_pred) + + # Compute boundary loss + loss = tf.reduce_mean(tf.abs(dy_pred - dy_true) + tf.abs(dx_pred - dx_true)) + return loss * 0.5 # weight factor + +def enhanced_binary_crossentropy(y_true, y_pred): + """Combine standard BCE with boundary loss""" + bce = tf.keras.losses.binary_crossentropy(y_true, y_pred) + boundary = boundary_loss(y_true, y_pred) + return bce + boundary \ No newline at end of file diff --git a/matlab/Meye.m b/matlab/Meye.m new file mode 100644 index 0000000000000000000000000000000000000000..fde5857a5e93cb4c002cdf81def946ebbc5e9d9d --- /dev/null +++ b/matlab/Meye.m @@ -0,0 +1,310 @@ +classdef Meye + + properties (Access=private) + model + end + + + methods + + % CONSTRUCTOR + %------------------------------------------------------------------ + function self = Meye(modelPath) + % Class constructor + arguments + modelPath char {mustBeText} + end + + % Change the current directory to the directory where the + % original class is, so that the package with the custom layers + % is created there + classPath = getClassPath(self); + oldFolder = cd(classPath); + % Import the model saved as ONNX + self.model = importONNXNetwork(modelPath, ... + 'GenerateCustomLayers',true, ... + 'PackageName','customLayers_meye',... + 'InputDataFormats', 'BSSC',... + 'OutputDataFormats',{'BSSC','BC'}); + + % Manually change the "nearest" option to "linear" inside of + % the automatically generated custom layers. This is necessary + % due to the fact that MATLAB still does not support the proper + % translation between ONNX layers and DLtoolbox layers + self.nearest2Linear([classPath filesep '+customLayers_meye']) + + % Go back to the old current folder + cd(oldFolder) + end + + + % PREDICTION OF SINGLE IMAGES + %------------------------------------------------------------------ + function [pupilMask, eyeProb, blinkProb] = predictImage(self, inputImage, options) + % Predicts pupil location on a single image + arguments + self + inputImage + options.roiPos = [] + options.threshold = [] + end + + roiPos = options.roiPos; + + % Convert the image to grayscale if RGB + if size(inputImage,3) > 1 + inputImage = im2gray(inputImage); + end + + % Crop the frame to the desired ROI + if ~isempty(roiPos) + crop = inputImage(roiPos(2):roiPos(2)+roiPos(4)-1,... 
+ roiPos(1):roiPos(1)+roiPos(3)-1); + else + crop = inputImage; + end + + % Preprocessing + img = double(imresize(crop,[128 128])); + img = img / max(img,[],'all'); + + % Do the prediction + [rawMask, info] = predict(self.model, img); + eyeProb = info(1); + blinkProb = info(2); + + % Reinsert the cropped prediction in the frame + if ~isempty(roiPos) + pupilMask = zeros(size(inputImage)); + pupilMask(roiPos(2):roiPos(2)+roiPos(4)-1,... + roiPos(1):roiPos(1)+roiPos(3)-1) = imresize(rawMask, [roiPos(4), roiPos(3)],"bilinear"); + else + pupilMask = imresize(rawMask,size(inputImage),"bilinear"); + end + + % Apply a threshold to the image if requested + if ~isempty(options.threshold) + pupilMask = pupilMask > options.threshold; + end + + end + + + % PREDICT A MOVIE AND GET A TABLE WITH THE RESULTS + %------------------------------------------------------------------ + function tab = predictMovie(self, moviePath, options) + % Predict an entire video file and returns a results Table + % + % tab = predictMovie(moviePath, name-value) + % + % INPUT(S) + % - moviePath: (char/string) Full path of a video file. + % - name-value pairs + % - roiPos: [x,y,width,height] 4-elements vector defining a + % rectangle containing the eye. Works best if width and + % height are similar. If empty, a prediction will be done on + % a full frame(Default: []). + % - threshold: [0-1] The pupil prediction is binarized based + % on a threshold value to measure pupil size. (Default:0.4) + % + % OUTPUT(S) + % - tab: a MATLAB table containing data of the analyzed video + + arguments + self + moviePath char {mustBeText} + options.roiPos double = [] + options.threshold = 0.4; + end + + % Initialize a video reader + v = VideoReader(moviePath); + totFrames = v.NumFrames; + + % Initialize Variables + frameN = zeros(totFrames,1,'double'); + frameTime = zeros(totFrames,1,'double'); + binaryMask = cell(totFrames,1); + pupilArea = zeros(totFrames,1,'double'); + isEye = zeros(totFrames,1,'double'); + isBlink = zeros(totFrames,1,'double'); + + tic + for i = 1:totFrames + % Progress report + if toc>10 + fprintf('%.1f%% - Processing frame (%u/%u)\n', (i/totFrames)*100 , i, totFrames) + tic + end + + % Read a frame and make its prediction + frame = read(v, i, 'native'); + [pupilMask, eyeProb, blinkProb] = self.predictImage(frame, roiPos=options.roiPos,... + threshold=options.threshold); + + % Save results for this frame + frameN(i) = i; + frameTime(i) = v.CurrentTime; + binaryMask{i} = pupilMask > options.threshold; + pupilArea(i) = sum(binaryMask{i},"all"); + isEye(i) = eyeProb; + isBlink(i) = blinkProb; + end + % Save all the results in a final table + tab = table(frameN,frameTime,binaryMask,pupilArea,isEye,isBlink); + end + + + + % PREVIEW OF A PREDICTED MOVIE + %------------------------------------------------------------------ + function predictMovie_Preview(self, moviePath, options) + % Displays a live-preview of prediction for a video file + + arguments + self + moviePath char {mustBeText} + options.roiPos double = [] + options.threshold double = [] + end + roiPos = options.roiPos; + + + % Initialize a video reader + v = VideoReader(moviePath); + % Initialize images to show + blankImg = zeros(v.Height, v.Width, 'uint8'); + cyanColor = cat(3, blankImg, blankImg+255, blankImg+255); + pupilTransparency = blankImg; + + % Create a figure for the preview + figHandle = figure(... + 'Name','MEYE video preview',... + 'NumberTitle','off',... + 'ToolBar','none',... + 'MenuBar','none', ... 
+ 'Color',[.1, .1, .1]); + + ax = axes('Parent',figHandle,... + 'Units','normalized',... + 'Position',[0 0 1 .94]); + + imHandle = imshow(blankImg,'Parent',ax); + hold on + cyanHandle = imshow(cyanColor,'Parent',ax); + cyanHandle.AlphaData = pupilTransparency; + rect = rectangle('LineWidth',1.5, 'LineStyle','-.','EdgeColor',[1,0,0],... + 'Parent',ax,'Position',[0,0,0,0]); + hold off + title(ax,'MEYE Video Preview', 'Color',[1,1,1]) + + % Movie-Showing loop + while exist("figHandle","var") && ishandle(figHandle) && hasFrame(v) + try + tic + frame = readFrame(v); + + % Actually do the prediction + [pupilMask, eyeProb, blinkProb] = self.predictImage(frame, roiPos=roiPos,... + threshold=options.threshold); + + % Update graphic elements + imHandle.CData = frame; + cyanHandle.AlphaData = imresize(pupilMask, [v.Height, v.Width]); + if ~isempty(roiPos) + rect.Position = roiPos; + end + titStr = sprintf('Eye: %.2f%% - Blink:%.2f%% - FPS:%.1f',... + eyeProb*100, blinkProb*100, 1/toc); + ax.Title.String = titStr; + drawnow + catch ME + warning(ME.message) + close(figHandle) + end + end + disp('Stop preview.') + end + + + end + + + %------------------------------------------------------------------ + %------------------------------------------------------------------ + % INTERNAL FUNCTIONS + %------------------------------------------------------------------ + %------------------------------------------------------------------ + methods(Access=private) + %------------------------------------------------------------------ + function path = getClassPath(~) + % Returns the full path of where the class file is + + fullPath = mfilename('fullpath'); + [path,~,~] = fileparts(fullPath); + end + + %------------------------------------------------------------------ + function [fplist,fnlist] = listfiles(~, folderpath, token) + listing = dir(folderpath); + index = 0; + fplist = {}; + fnlist = {}; + for i = 1:size(listing,1) + s = listing(i).name; + if contains(s,token) + index = index+1; + fplist{index} = [folderpath filesep s]; + fnlist{index} = s; + end + end + end + + % nearest2Linear + %------------------------------------------------------------------ + function nearest2Linear(self, inputPath) + fP = self.listfiles(inputPath, 'Shape_To_Upsample'); + + foundFileToChange = false; + beforePatter = '"half_pixel", "nearest",'; + afterPattern = '"half_pixel", "linear",'; + for i = 1:length(fP) + + % Get the content of the file + fID = fopen(fP{i}, 'r'); + f = fread(fID,'*char')'; + fclose(fID); + + % Send a verbose warning the first time we are manually + % correcting the upsampling layers bug + if ~foundFileToChange && contains(f,beforePatter) + foundFileToChange = true; + msg = ['This is a message from MEYE developers.\n' ... + 'In the current release of the Deep Learning Toolbox ' ... + 'MATLAB does not translate well all the layers in the ' ... + 'ONNX network to native MATLAB layers. In particular the ' ... + 'automatically generated custom layers that have to do ' ... + 'with UPSAMPLING are generated with the ''nearest'' instead of ' ... + 'the ''linear'' mode.\nWe automatically correct for this bug when you ' ... + 'instantiate a Meye object (henche this warning).\nEverything should work fine, ' ... + 'and we hope that in future MATLAB releases this hack wont be ' ... + 'needed anymore.\n' ... + 'If you find bugs or performance issues, please let us know ' ... + 'with an issue ' ... 
+ 'HERE.']; + warning(sprintf(msg)) + end + + % Replace the 'nearest' option with 'linear' + newF = strrep(f, beforePatter, afterPattern); + + % Save the file back in its original location + fID = fopen(fP{i}, 'w'); + fprintf(fID,'%s',newF); + fclose(fID); + end + end + end +end + + diff --git a/matlab/README.md b/matlab/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db265cb379d6a6108a2ba9bfed6d37b3d2e1f7f4 --- /dev/null +++ b/matlab/README.md @@ -0,0 +1,57 @@ +# MEYE pupillometry on MATLAB + +> Try MEYE on a standalone [Web-App](https://www.pupillometry.it/) + +> Learn more on the original [MEYE repo](https://github.com/fabiocarrara/meye) + +> Label your own dataset with [pLabeler](https://github.com/LeonardoLupori/pLabeler) + +Starting from MATLAB version 2021b, MEYE is also available for use on MATLAB! + +Here's a brief tutorial on how to use it in you own experiments. + +## What do you need? + +- [MATLAB 2021b](https://it.mathworks.com/products/matlab.html) or later +- [MATLAB Image Processing Toolbox](https://it.mathworks.com/products/image.html) +- [MATLAB Deep Learning Toolbox](https://it.mathworks.com/products/deep-learning.html) + An additional _support package_ of this toolbox has to be downloaded manually from the Add-On explorer in MATLAB: + - _Deep Learning Toolbox™ Converter for ONNX Model Format_ + ![image](https://user-images.githubusercontent.com/39329654/152327789-dde0af9b-d531-40be-b1a0-5ba17c508a13.png) +- A MEYE model in [ONNX](https://onnx.ai/) format. You can download our latest model [here](https://github.com/fabiocarrara/meye/releases). +![onnxModel](https://user-images.githubusercontent.com/39329654/152552616-1b800398-5794-4f51-b4ed-2e3339cb2d0d.png) + + +## Quick start! + +```matlab +% Create an instance of Meye +meye = Meye('path/to/model.onnx'); + +% Example 1 +% Make predictions on a single Image +% +% Load an image for which you want to predict the pupil +img = imread('path/to/img.tif'); +% Make a prediction on a frame +[pupil, isEye, isBlink] = meye.predictImage(img); + +% Example 2 +% Make predictions on a video file and preview the results +% +meye.predictMovie_Preview('path/to/video'); +``` + +## Examples + +Inside the file [example.m](example.m) you can find 5 extensively commented examples of some use cases for MEYE on MATLAB. +These examples require you to download example data from [here](https://drive.google.com/drive/folders/1BG6O5BEkwXkNKC_1XuB3H9wbx3DeNWwF?usp=sharing). To run the examples succesfully, make sure that the downloaded files are in the same folder as the `example.m` file. + +# Known issues + +## Small issue with _Upsample_ layers +When [importing](https://it.mathworks.com/help/deeplearning/ref/importonnxnetwork.html) a ONNX network, MATLAB tries to translate all the layers of the network from ONNX Operators to built-in MATLAB layers (see [here](https://it.mathworks.com/help/deeplearning/ref/importonnxnetwork.html#mw_dc6cd14c-e8d0-4370-af81-96626a888d9c)). +This operation is not succesful for all the layers and MATLAB tries to overcome erros by automatically generating custom layers to replace the ones that it wasnt able to translate. These _custom_ layers are stored in a folder as MATLAB `.m` class files. +We found a small bug in the way MATLAB translates `Upsample` layers while importing MEYE network. In particular, the custom generated layers perform the upsample with the `nearest` interpolation method, while it should be used the `linear` method for best results. 
+For now, we solved this bug by automatically replacing the `nearest` method with the `linear` one in all the custom generated layers. This restores optimal performance with no additional computational costs, but it's a bit hacky. +We hope that in future releases MATLAB's process of translation to its own built-in layers will be smoother and this trick will not be needed anymore. \ No newline at end of file diff --git a/matlab/example.m b/matlab/example.m new file mode 100644 index 0000000000000000000000000000000000000000..94d2d68343ad58ffb66aaf1404481cf8751b2d44 --- /dev/null +++ b/matlab/example.m @@ -0,0 +1,211 @@ +%% Download all the example material +% +% 1 - Download the latest MEYE model in ONNX format +% ------------------------------------------------------------------------- +% Download the .onnx file from the assets here: +% https://github.com/fabiocarrara/meye/releases + +% EXAMPLE data can be found in this folder: +% https://drive.google.com/drive/folders/1BG6O5BEkwXkNKC_1XuB3H9wbx3DeNWwF?usp=sharing +% +% 2 - Download an example image of a simple mouse eye from: +% https://drive.google.com/file/d/1hcWcC1cAmzY4r-SIWDIgUY0-gpbmetUL/view?usp=sharing +% +% 3 - Download an example of a large image here: +% https://drive.google.com/file/d/16QixvUMtojqfrcy4WXlYJ7CP3K8vrz_C/view?usp=sharing +% +% 4 - Download an example pupillometry video here: +% https://drive.google.com/file/d/1TYj80dzIR1ZjpEvfefH_akhbUjwpvJta/view?usp=sharing + + +%% EXAMPLE 1 +% ------------------------------------------------------------------------- +% Predict the pupil from a simple image of an eye + +% Clean up the workspace +clearvars, clc + +% Change these values according to the filenames of the MEYE model and the +% simple pupil image +MODEL_NAME = 'meye_20220124.onnx'; +IMAGE_NAME = 'pupilImage_simple.png'; + + +% Initialize a MEYE object +meye = Meye(MODEL_NAME); + +% Load the simple image +img = imread(IMAGE_NAME); + +% Predict a single image +[pupilMask, eyeProb, blinkProb] = meye.predictImage(img); + +% Plot the results of the prediction +subplot(1,3,1) +imshow(img) +title('Original Image') + +subplot(1,3,2) +imagesc(pupilMask) +title(sprintf('Prediction (Eye:%.2f%% - Blink:%.2f%%)',eyeProb*100,blinkProb*100)) +axis off, axis image + +subplot(1,3,3) +imshowpair(img, pupilMask) +title('Merge') + + +%% EXAMPLE 2 +% ------------------------------------------------------------------------- +% Binarize the pupil prediction and get the pupil size in pixels + +% Clean up the workspace +clearvars, close all, clc + +% Change these values according to the filenames of the MEYE model and the +% simple pupil image +MODEL_NAME = 'meye_20220124.onnx'; +IMAGE_NAME = 'pupilImage_simple.png'; + + +% Initialize a MEYE object +meye = Meye(MODEL_NAME); + +% Load the simple image +img = imread(IMAGE_NAME); + +% Predict a single image +% You can automatically binarize the prediction by passing the "threshold" +% optional argument. This number can be between 0 and 1. 
If omitted, the +% function returns a raw probability map instead of a binarized image +pupilBinaryMask = meye.predictImage(img, 'threshold', 0.4); + +imshowpair(img, pupilBinaryMask) +title(sprintf('Pupil Size: %u px', sum(pupilBinaryMask,'all'))) + + +%% EXAMPLE 3 +% ------------------------------------------------------------------------- +% Predict the pupil on a large image where the eye is a small portion of +% the image + +% Clean up the workspace +clearvars, close all, clc + +% Change these values according to the filenames of the MEYE model and the +% simple pupil image +MODEL_NAME = 'meye_20220124.onnx'; +IMAGE_NAME = 'pupilImage_large.png'; + + +% Initialize a MEYE object +meye = Meye(MODEL_NAME); + +% Load the simple image +img = imread(IMAGE_NAME); + +% Predict the image +pupilMask = meye.predictImage(img); + +% As you can see from this image, the prediction is not perfect. This is +% because MEYE was trained on images that tightly contained the eye. +subplot(1,2,1) +imshowpair(img, pupilMask) +title('Tomal Image prediction (low-quality)') + +% In order to solve this issue it is possible to restrict the prediction to +% a rectangular Region of Interest (ROI) in the image. This is done simply +% by passing the optional argument "roiPos" to the predictImage function. +% The roiPos is a 4-elements vector containing X,Y, width, height of a +% rectangular shape. Note that X and Y are the coordinates of the top left +% corner of the ROI + +ROI = [90,90,200,200]; +pupilMask = meye.predictImage(img, 'roiPos', ROI); + +% Plot the results with the ROI and see the difference between the 2 methods +subplot(1,2,2) +imshowpair(img, pupilMask) +rectangle('Position',ROI, 'LineStyle','-.','EdgeColor',[1,0,0]) +title('ROI prediction (high quality)') +linkaxes +set(gcf,'Position',[300,600,1000,320]) + + +%% EXAMPLE 4 +% ------------------------------------------------------------------------- +% Show a preview of the prediction of an entire pupillometry video. +% +% As you saw you can adjust a few parameters for the prediction. +% If you want to get a quick preview of how your pre-recorded video will be +% processed, you can use the method predictMovie_Preview. +% Here you can play around with different ROI positions and threshold +% values and see what are the results before analyzing the whole video. + +% Clean up the workspace +clearvars, close all, clc + +% Change these values according to the filenames of the MEYE model and the +% simple pupil image +MODEL_NAME = 'meye_20220124.onnx'; +VIDEO_NAME = 'mouse_example.mp4'; + +% Initialize a MEYE object +meye = Meye(MODEL_NAME); + +% Try to play around moving or resizing the ROI to see how the performances change +ROI = [70, 60, 200, 200]; + +% Change the threshold value to binarize the pupil prediction. +% Use [] to see the raw probability map. Use a number in the range [0:1] to binarize it +threshold = 0.4; + +meye.predictMovie_Preview(VIDEO_NAME,"roiPos", ROI,"threshold",threshold); + + + +%% EXAMPLE 5 +% Predict the entire video and get the results table + +% Clean up the workspace +clearvars, close all, clc + +% Change these values according to the filenames of the MEYE model and the +% simple pupil image +MODEL_NAME = 'meye_20220124.onnx'; +VIDEO_NAME = 'mouse_example.mp4'; + +% Initialize a MEYE object +meye = Meye(MODEL_NAME); + +% Try to play around moving or resizing the ROI to see how the performances change +ROI = [70, 60, 200, 200]; + +% Change the threshold value to binarize the pupil prediction. 
+% Use [] to see the raw probability map. Use a number in the range [0:1] to binarize it +threshold = 0.4; + +% Predict the whole movie and save results in a table +T = meye.predictMovie(VIDEO_NAME, "roiPos", ROI, "threshold", threshold); + +% Show some of the values in the table +disp(head(T)) + +% Plot some of the results +subplot 311 +plot(T.frameTime,T.isEye, 'LineWidth', 2) +title('Eye Probability') +ylabel('Probability'), +xlim([T.frameTime(1) T.frameTime(end)]) + +subplot 312 +plot(T.frameTime,T.isBlink, 'LineWidth', 2) +title('Blink Probability') +ylabel('Probability') +xlim([T.frameTime(1) T.frameTime(end)]) + +subplot 313 +plot(T.frameTime,T.pupilArea, 'LineWidth', 2) +title('Pupil Size') +xlabel('Time (s)'), ylabel('Pupil Area (px)') +xlim([T.frameTime(1) T.frameTime(end)]) diff --git a/models/deeplab.py b/models/deeplab.py new file mode 100644 index 0000000000000000000000000000000000000000..e0625cd015378a2923222b57c2947a49ac9d1861 --- /dev/null +++ b/models/deeplab.py @@ -0,0 +1,78 @@ +import sys +sys.path += ['models/deeplab'] + +import tensorflow as tf + +from tensorflow.keras import backend as K +from tensorflow.keras import layers as L +from tensorflow.keras.models import Model, Sequential + +from deeplabv3p.models.deeplabv3p_resnet50 import Deeplabv3pResNet50 +from deeplabv3p.models.deeplabv3p_mobilenetv3 import Deeplabv3pMobileNetV3Small, Deeplabv3pLiteMobileNetV3Small, Deeplabv3pMobileNetV3Large, Deeplabv3pLiteMobileNetV3Large +from deeplabv3p.models.deeplabv3p_xception import Deeplabv3pXception +from deeplabv3p.models.deeplabv3p_peleenet import Deeplabv3pPeleeNet, Deeplabv3pLitePeleeNet + +AVAILABLE_BACKBONES = { + 'resnet50': Deeplabv3pResNet50, + 'xception': Deeplabv3pXception, + 'mobilenetv3-large': Deeplabv3pMobileNetV3Large, + 'lite-mobilenetv3-large': Deeplabv3pLiteMobileNetV3Large, + 'mobilenetv3-small': Deeplabv3pMobileNetV3Small, + 'lite-mobilenetv3-small': Deeplabv3pLiteMobileNetV3Small, + 'peleenet': Deeplabv3pPeleeNet, + 'lite-peleenet': Deeplabv3pLitePeleeNet, +} + +AVAILABLE_PRETRAINED_WEIGHTS = { + 'resnet50': 'imagenet', + 'xception': None, # 'pascalvoc', # needs fix in upstream + 'mobilenetv3-large': 'imagenet', + 'lite-mobilenetv3-large': 'imagenet', + 'mobilenetv3-small': 'imagenet', + 'lite-mobilenetv3-small': 'imagenet', + 'peleenet': 'imagenet', + 'lite-peleenet': 'imagenet', +} + +def build_model(input_shape, output_shape, config): + + assert input_shape[:2] == output_shape[:2], "Only same input-output HW shapes are supported." 
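+    # heads are attached to two backbone layers below: a sigmoid on 'pred_resize'
+    # (the upsampled mask prediction) gives the pupil mask, and a 2-unit sigmoid
+    # dense layer on the flattened 'image_pooling' output gives the is_eye/is_blink tags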
+ num_classes = output_shape[2] + + # backbone pretends RGB images to use pretrained weights + needs_rgb_conversion = input_shape[2] != 3 + backbone_input_shape = (input_shape[:2] + (3,)) if needs_rgb_conversion else input_shape + backbone_name = config.get('backbone', 'resnet50') + weights = config.get('weights', AVAILABLE_PRETRAINED_WEIGHTS[backbone_name]) + backbone_fn = AVAILABLE_BACKBONES[backbone_name] + backbone, backbone_len = backbone_fn(input_shape=backbone_input_shape, num_classes=num_classes, weights=weights, OS=8) + + # segmentation mask + out_mask = backbone.get_layer('pred_resize').output + out_mask = L.Activation('sigmoid', name='mask')(out_mask) + + # metadata tags (is_eye and is_blink) + middle = backbone.get_layer('image_pooling').output + middle = L.Flatten()(middle) + out_tags = L.Dense(2, activation='sigmoid', name='tags')(middle) + + model = Model(inputs=backbone.input, outputs=[out_mask, out_tags]) + + if needs_rgb_conversion: + gray_input = L.Input(shape=input_shape) + rgb_input = L.Lambda(lambda x: K.tile(x, (1, 1, 1, 3)) , name='gray2rgb')(gray_input) # we assume BHWC + out_mask, out_tags = model(rgb_input) + + # rename outputs + out_mask = L.Lambda(lambda x: x, name='mask')(out_mask) + out_tags = L.Lambda(lambda x: x, name='tags')(out_tags) + model = Model(inputs=gray_input, outputs=[out_mask, out_tags]) + + return model + + +if __name__ == "__main__": + shape = (128, 128, 1) + model = build_model(shape, shape, {'weights': None})#, 'backbone': 'lite-mobilenetv3-small'}) + model.summary() + import pdb; pdb.set_trace() diff --git a/models/deeplab/README.md b/models/deeplab/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ff8723009538fb7d4ef4564f0591d1304d6e589c --- /dev/null +++ b/models/deeplab/README.md @@ -0,0 +1,380 @@ +# TF Keras DeepLab v3+ Modelset + +## Introduction + +An end-to-end semantic segmentation pipeline with DeepLabv3+ models. Implement with tf.keras, including data collection/annotation, model training/tuning, model evaluation and on device deployment. Support different backbones and different head architecture: + +#### Backbone +- [x] Xception +- [x] ResNet50 +- [x] MobileNetV2 +- [x] MobilenetV3(Large/Small) +- [x] PeleeNet ([paper](https://arxiv.org/abs/1804.06882)) + +#### Head +- [x] ASPP +- [x] ASPP Lite(Only Global Pooling + 1x1 Conv) +- [x] Decoder +- [x] Different Output Stride(8/16/32) + +#### Loss +- [x] Categorical Cross Entropy Loss +- [x] Balanced Class Weighted Cross Entropy Loss +- [x] Adaptive Class Weighted Cross Entropy Loss +- [x] Focal Loss + +#### Postprocess +- [x] Numpy CRF (Conditional Random Fields) postprocess implementation + + +#### Train tech +- [x] Transfer training from Imagenet/PascalVOC +- [x] Dynamic learning rate decay (Cosine/Exponential/Polynomial/PiecewiseConstant) +- [x] Weights Average policy for optimizer (EMA/SWA/Lookahead, valid for TF-2.x with tfa) +- [x] GridMask data augmentation ([paper](https://arxiv.org/abs/2001.04086)) +- [x] Multi-GPU training with SyncBatchNorm support (valid for TF-2.2 and later) + +#### On-device deployment +- [x] Tensorflow-Lite Float32/UInt8 model inference +- [x] MNN Float32/UInt8 model inference + + +## Quick Start + +1. Install requirements on Ubuntu 16.04/18.04: + +``` +# pip install -r requirements.txt +``` + +2. Download Deeplabv3+ PascalVOC pretrained weights. 
It's provided by [keras-deeplab-v3-plus](https://github.com/bonlime/keras-deeplab-v3-plus) and imported from [original TF checkpoint](https://github.com/tensorflow/models/tree/master/research/deeplab) +3. Run Deeplab segmentation on your image or video. + +``` +# wget -O weights/deeplabv3_xception_tf_dim_ordering_tf_kernels.h5 https://github.com/bonlime/keras-deeplab-v3-plus/releases/download/1.1/deeplabv3_xception_tf_dim_ordering_tf_kernels.h5 +# python deeplab.py --model_type=xception --weights_path=weights/deeplabv3_xception_tf_dim_ordering_tf_kernels.h5 --classes_path=configs/voc_classes.txt --output_stride=16 --image +# python deeplab.py --model_type=xception --weights_path=weights/deeplabv3_xception_tf_dim_ordering_tf_kernels.h5 --classes_path=configs/voc_classes.txt --output_stride=16 --input= + +``` + +Image segment sample: + +
+*(sample segmentation result images)*
+ + + +## Guide of train/evaluate/demo + +### Train + +1. Prepare dataset + 1. PascalVOC2012 & SBD (VOC2012 train_aug) semantic segmentation dataset + * Run a simple script to download, convert & merge PascalVOC 2012 and SBD: + + ``` + # pushd tools/dataset_converter/voc_augment/ + # ./dataset_prepare.sh + # popd + + ``` + Dataset images & labels will be placed at `VOC2012/` + + 2. MS COCO 2017 segmentation dataset + * Run a simple script to download COCO2017 dataset, and convert annotated instance mask to PNG format semantic segmentation label image: + + ``` + # pushd tools/dataset_converter/mscoco2017/ + # ./dataset_prepare.sh + # popd + + ``` + You can dig into related script for details. Dataset images & labels will be placed at `mscoco2017/` + + 3. ADE20K semantic segmentation dataset + * Run a simple script to download, merge & convert ADE20K dataset: + + ``` + # pushd tools/dataset_converter/ade20k/ + # ./dataset_prepare.sh + # popd + + ``` + Dataset images & labels will be placed at `ADEChallengeData2016/` + + 4. Cityscapes semantic segmentation dataset + * Download the Cityscapes dataset package from `https://www.cityscapes-dataset.com/` (need registration) and put to `tools/dataset_converter/cityscapes/`. Then run a simple script to merge & convert: + + ``` + # pushd tools/dataset_converter/cityscapes/ + # ./dataset_prepare.sh + # popd + + ``` + Dataset images & labels will be placed at `Cityscapes/` + + 5. Customized semantic segmentation dataset + * Collecting target JPG format images and place at `/images` + * Generate semantic segmentation label image. You can use [labelme](https://github.com/wkentaro/labelme) to annotate your image with polygonal segmentation mask and save to a json file. Then run [json_to_dataset.py](https://github.com/david8862/tf-keras-deeplabv3p-model-set/blob/master/tools/dataset_converter/labelme/json_to_dataset.py) to convert json annotations to PascalVOC style PNG format label images: + ``` + # cd tools/dataset_converter/labelme && python json_to_dataset.py -h + usage: json_to_dataset.py [-h] --json_file_path JSON_FILE_PATH + [--classes_path CLASSES_PATH] --png_label_path + PNG_LABEL_PATH + + convert labelme json label to voc png label + + optional arguments: + -h, --help show this help message and exit + --json_file_path JSON_FILE_PATH + path to labelme annotated json label files + --classes_path CLASSES_PATH + path to class definitions, + default=../../../configs/voc_classes.txt + --png_label_path PNG_LABEL_PATH + output path of converted png label images + ``` + + For class names file format, refer to [voc_classes.txt](https://github.com/david8862/tf-keras-deeplabv3p-model-set/blob/master/configs/voc_classes.txt) (not including background class, which would be added as index 0 in code by default). + + * Place the PNG label images at `/labels` + * Create PascalVOC style dataset split (train/val/test) txt files. One line for a image and only include image base name, like: + ``` + 2007_000033 + 2007_000042 + 2007_000061 + ... + ``` + + You can put these dataset files together at `` to create an independent dataset directory + + +2. 
[train.py](https://github.com/david8862/tf-keras-deeplabv3p-model-set/blob/master/train.py) +``` +# python train.py -h +usage: train.py [-h] [--model_type MODEL_TYPE] [--weights_path WEIGHTS_PATH] + [--model_input_shape MODEL_INPUT_SHAPE] + [--output_stride {8,16,32}] [--dataset_path DATASET_PATH] + [--dataset_file DATASET_FILE] + [--val_dataset_file VAL_DATASET_FILE] [--val_split VAL_SPLIT] + [--classes_path CLASSES_PATH] [--batch_size BATCH_SIZE] + [--optimizer {adam,rmsprop,sgd}] [--loss {crossentropy,focal}] + [--weighted_type {None,adaptive,balanced}] + [--learning_rate LEARNING_RATE] + [--average_type {None,ema,swa,lookahead}] + [--decay_type {None,cosine,exponential,polynomial,piecewise_constant}] + [--transfer_epoch TRANSFER_EPOCH] [--freeze_level {0,1,2}] + [--init_epoch INIT_EPOCH] [--total_epoch TOTAL_EPOCH] + [--gpu_num GPU_NUM] [--model_pruning] [--eval_online] + [--eval_epoch_interval EVAL_EPOCH_INTERVAL] + [--save_eval_checkpoint] + +optional arguments: + -h, --help show this help message and exit + --model_type MODEL_TYPE + DeepLabv3+ model type: + mobilenetv2/mobilenetv2_lite/resnet50, + default=mobilenetv2_lite + --weights_path WEIGHTS_PATH + Pretrained model/weights file for fine tune + --model_input_shape MODEL_INPUT_SHAPE + model image input shape as x, + default=512x512 + --output_stride {8,16,32} + model output stride, default=16 + --dataset_path DATASET_PATH + dataset path containing images and label png file, + default=VOC2012/ + --dataset_file DATASET_FILE + train samples txt file, + default=VOC2012/ImageSets/Segmentation/trainval.txt + --val_dataset_file VAL_DATASET_FILE + val samples txt file, default=None + --val_split VAL_SPLIT + validation data persentage in dataset if no val + dataset provide, default=0.1 + --classes_path CLASSES_PATH + path to class definitions, + default=configs/voc_classes.txt + --batch_size BATCH_SIZE + batch size for training, default=16 + --optimizer {adam,rmsprop,sgd} + optimizer for training (adam/rmsprop/sgd), default=sgd + --loss {crossentropy,focal} + loss type for training (crossentropy/focal), + default=crossentropy + --weighted_type {None,adaptive,balanced} + class balance weighted type, default=None + --learning_rate LEARNING_RATE + Initial learning rate, default=0.01 + --average_type {None,ema,swa,lookahead} + weights average type, default=None + --decay_type {None,cosine,exponential,polynomial,piecewise_constant} + Learning rate decay type, default=None + --transfer_epoch TRANSFER_EPOCH + Transfer training stage epochs, default=5 + --freeze_level {0,1,2} + Freeze level of the model in transfer training stage. 
+ 0:NA/1:backbone/2:only open prediction layer + --init_epoch INIT_EPOCH + initial training epochs for fine tune training, + default=0 + --total_epoch TOTAL_EPOCH + total training epochs, default=150 + --gpu_num GPU_NUM Number of GPU to use, default=1 + --model_pruning Use model pruning for optimization, only for TF 1.x + --eval_online Whether to do evaluation on validation dataset during + training + --eval_epoch_interval EVAL_EPOCH_INTERVAL + Number of iteration(epochs) interval to do evaluation, + default=10 + --save_eval_checkpoint + Whether to save checkpoint with best evaluation result +``` + +Following is a reference config cmd for training mobilenetv2 lite model on PascalVOC2012 & SBD dataset: +``` +# python train.py --model_type=mobilenetv2_lite --output_stride=16 --dataset_path=VOC2012/ --dataset_file=VOC2012/ImageSets/Segmentation/train.txt --val_dataset_file=VOC2012/ImageSets/Segmentation/val.txt --batch_size=16 --freeze_level=1 --transfer_epoch=5 --total_epoch=150 --eval_online --eval_epoch_interval=1 --save_eval_checkpoint --weighted_type=adaptive +``` + +Checkpoints during training could be found at `logs/000/`. Choose a best one as result + +You can also use Tensorboard to monitor the loss trend during train: +``` +# tensorboard --logdir=logs/000 +``` + +MultiGPU usage: use `--gpu_num N` to use N GPUs. It use [tf.distribute.MirroredStrategy](https://www.tensorflow.org/guide/distributed_training#mirroredstrategy) to support MultiGPU environment. + + +### Model dump +We' better to dump out inference model from training checkpoint for eval or demo. Following script cmd work for that. + +``` +# python deeplab.py --model_type=mobilenetv2_lite --weights_path=logs/000/.h5 --classes_path=configs/voc_classes.txt --model_input_shape=512x512 --output_stride=16 --dump_model --output_model_file=model.h5 +``` + +Change model_type, input shape & output stride to get different inference model. If "--model_pruning" was added in training, you also need to use "--pruning_model" here for dumping out the pruned model. + +NOTE: One trained model could be dump out for different input shape & output stride (of course with different accuracy performance). + + +### Evaluation +Use [eval.py](https://github.com/david8862/tf-keras-deeplabv3p-model-set/blob/master/eval.py) to do evaluation on the inference model with your test data. It will calculate following metrics: + +* mIOU +* FWIOU (Frequency Weighted IOU) +* PA (Pixel Accuracy) +* MPA (Mean Pixel Accuracy) + +It will also draw confusion matrix chart and IOU result for each class under "result" dir, and optionally save all the segmentation result images & predicted PNG labels for checking. + +``` +# python eval.py --model_path=model.h5 --dataset_path=VOC2012/ --dataset_file=VOC2012/ImageSets/Segmentation/val.txt --classes_path=configs/voc_classes.txt --model_input_shape=512x512 --save_result +``` + +If you enable "--eval_online" option in train.py, evaluation on validation dataset will be executed during training. But that may cost more time for train process. + + +Following is a sample result trained on MobilenetV2_Lite model with VOC2012+SBD dataset: +
+*(sample result images from the trained model)*
+ + +Some experiment on VOC2012+SBD dataset and comparison: + +| Model type | InputSize | Output Stride | TrainSet | TestSet | mIOU | FLOPS | Param | Speed | Ps | +| ----- | ------ | ------ | ------ | ----- | ----- | ----- | ----- | ----- | ----- | +| [ResNet50](https://github.com/david8862/tf-keras-deeplabv3p-model-set/releases/download/1.0.1/deeplabv3p_resnet50_512_os16_voc.tar.gz) | 512x512 | 16 | VOC12&SBD train | VOC12&SBD val | 73.71% | 73.95G | 26.72M | 38ms | Keras on Titan XP | +| [MobileNetV3Large](https://github.com/david8862/tf-keras-deeplabv3p-model-set/releases/download/1.0.1/deeplabv3p_mobilenetv3large_512_os16_voc.tar.gz) | 512x512 | 16 | VOC12&SBD train | VOC12&SBD val | 72.33% | 9.52G | 3.51M | 29ms | Keras on Titan XP | +| [PeleeNet Lite](https://github.com/david8862/tf-keras-deeplabv3p-model-set/releases/download/1.0.2/deeplabv3p_peleenet_lite_512_os16_voc.tar.gz) | 512x512 | 16 | VOC12&SBD train | VOC12&SBD val | 68.23% | 7.64G | 2.59M | 37.8ms | Keras on Titan XP | +| [MobileNetV2 Lite](https://github.com/david8862/tf-keras-deeplabv3p-model-set/releases/download/1.0.0/deeplabv3p_mobilenetv2_lite_512_os16_voc.tar.gz) | 512x512 | 16 | VOC12&SBD train | VOC12&SBD val | 67.83% | 5.24G | 2.11M | 23ms | Keras on Titan XP | +| [MobileNetV3Small Lite](https://github.com/david8862/tf-keras-deeplabv3p-model-set/releases/download/1.0.1/deeplabv3p_mobilenetv3small_lite_512_os16_voc.tar.gz) | 512x512 | 16 | VOC12&SBD train | VOC12&SBD val | 64.81% | 1.36G | 1.06M | 20ms | Keras on Titan XP | + +**NOTE**: If you meet any model loading problem with these pretrained weights due to h5 format compatibility issue, try to run "Model dump" with it again to regenerate the inference model. + + +### Demo +1. [deeplab.py](https://github.com/david8862/tf-keras-deeplabv3p-model-set/blob/master/deeplab.py) +> * Demo script for trained model + +image inference mode +``` +# python deeplab.py --model_type=mobilenetv2_lite --weights_path=model.h5 --classes_path=configs/voc_classes.txt --model_input_shape=512x512 --output_stride=16 --image +``` +video inference mode +``` +# python deeplab.py --model_type=mobilenetv2_lite --weights_path=model.h5 --classes_path=configs/voc_classes.txt --model_input_shape=512x512 --output_stride=16 --input=test.mp4 +``` +For video detection mode, you can use "input=0" to capture live video from web camera and "output=