diff --git a/mindinsight/backend/lineagemgr/lineage_api.py b/mindinsight/backend/lineagemgr/lineage_api.py
index 8d8547d03c7ae8db916a4fbf8b91b65c1b8ad6af..ca96c9370f2c96112f9bc44f74d478a0749f2900 100644
--- a/mindinsight/backend/lineagemgr/lineage_api.py
+++ b/mindinsight/backend/lineagemgr/lineage_api.py
@@ -15,15 +15,22 @@
 """Lineage restful api."""
 import json
 import os
+import itertools
+import pandas as pd
+import numpy as np
 
 from flask import Blueprint, jsonify, request
 
 from mindinsight.conf import settings
 from mindinsight.datavisual.utils.tools import get_train_id
 from mindinsight.datavisual.data_transform.data_manager import DATA_MANAGER
 from mindinsight.lineagemgr.api.model import general_filter_summary_lineage, general_get_summary_lineage
+from mindinsight.lineagemgr.lineage_parser import LineageOrganizer
 from mindinsight.utils.exceptions import MindInsightException, ParamValueError
 from mindinsight.lineagemgr.cache_item_updater import update_lineage_object
+from mindinsight.utils.utils import calc_histogram
+import mindinsight.utils.utils as utils
+from mindinsight.lineagemgr.common.log import logger
 
 BLUEPRINT = Blueprint("lineage", __name__, url_prefix=settings.URL_PATH_PREFIX+settings.API_PREFIX)
 
@@ -159,6 +166,291 @@ def get_dataset_graph():
     return jsonify(dataset_graph)
 
 
+@BLUEPRINT.route("/lineagemgr/optimizer/targets", methods=["GET"])
+def get_optimize_targets():
+    """Get optimize targets."""
+    targets = _get_optimize_targets(DATA_MANAGER)
+    return jsonify({"targets": targets})
+
+
+METRIC_PREFIX = "[M]"
+USER_DEFINED_PREFIX = "[U]"
+
+
+class LineageTable:
+    """Table of lineage data, with metrics and hyper parameters as columns."""
+    METRIC_PREFIX = "[M]"
+    USER_DEFINED_PREFIX = "[U]"
+    LOSS_NAME = "loss"
+
+    def __init__(self, df: pd.DataFrame):
+        self._df = df
+        self._remove_unsupported_columns()
+
+    def _remove_unsupported_columns(self):
+        """Drop columns whose dtype is not a simple numpy number."""
+        columns_to_drop = []
+        for name, data in self._df.iteritems():
+            if not utils.is_simple_numpy_number(data.dtype):
+                columns_to_drop.append(name)
+
+        if columns_to_drop:
+            logger.warning("Unsupported columns: %s", columns_to_drop)
+            self._df = self._df.drop(columns=columns_to_drop)
+
+    @property
+    def target_names(self):
+        """Get names of optimize targets (metrics and loss)."""
+        target_names = [name for name in self._df.columns if name.startswith(self.METRIC_PREFIX)]
+        if self.LOSS_NAME in self._df.columns:
+            target_names.append(self.LOSS_NAME)
+        return target_names
+
+    @property
+    def hyper_param_names(self):
+        """Get names of all hyper parameters."""
+        hyper_param_names = [name for name in self._df.columns if not name.startswith(self.METRIC_PREFIX)]
+        if self.LOSS_NAME in hyper_param_names:
+            hyper_param_names.remove(self.LOSS_NAME)
+
+        return hyper_param_names
+
+    @property
+    def sys_hyper_param_names(self):
+        """Get names of system defined hyper parameters."""
+        names = [name for name in self._df.columns
+                 if not name.startswith(self.METRIC_PREFIX) and not name.startswith(self.USER_DEFINED_PREFIX)]
+        if self.LOSS_NAME in names:
+            names.remove(self.LOSS_NAME)
+
+        return names
+
+    @property
+    def user_defined_hyper_param_names(self):
+        """Get names of user defined hyper parameters."""
+        names = [name for name in self._df.columns if name.startswith(self.USER_DEFINED_PREFIX)]
+        return names
+
+    def calc_hyper_param_importance(self, hyper_param, target):
+        """Calculate the importance of a hyper parameter with respect to a target."""
+        logger.warning("Calculating importance for hyper_param %s, target is %s.", hyper_param, target)
+
+        new_df = self._df[[hyper_param, target]]
+        no_missing_value_df = new_df.dropna()
+        if len(no_missing_value_df) > 2:
+            correlation = no_missing_value_df[target].corr(no_missing_value_df[hyper_param])
+            if np.isnan(correlation):
+                logger.warning("Correlation is nan!")
+                return 0
+            return abs(correlation)
+
+        # Not enough data to calculate importance.
+        return 0
+
+    def get_column(self, name):
+        """Get one column of the table."""
+        return self._df[name]
+
+    def get_best_row(self, sort_by, best="max"):
+        """Get the row with the best value of the sort_by column."""
+        self._df = self._df.reset_index()
+        if best == "max":
+            return self._df.loc[self._df[sort_by].idxmax()]
+        return self._df.loc[self._df[sort_by].idxmin()]
+
+
+def _get_optimize_targets(data_manager):
+    """Get optimize target summaries from the data manager."""
+    # lineage_objs = LineageOrganizer(data_manager=DATA_MANAGER).super_lineage_objs
+    table = _organize_lineage_to_table(data_manager)
+
+    target_summaries = []
+    for target in table.target_names:
+        hyper_parameters = [{
+            "name": hyper_param,
+            "importance": table.calc_hyper_param_importance(hyper_param, target)
+        } for hyper_param in table.hyper_param_names]
+
+        hyper_parameters.sort(key=lambda hyper_param: hyper_param.get("importance"), reverse=True)
+
+        target_summary = {
+            "name": target,
+            "histogram_buckets": calc_histogram(table.get_column(target), bins=5),
+            "goal": "max",  # TODO choose goal smartly
+            "hyper_parameters": hyper_parameters
+        }
+        target_summaries.append(target_summary)
+
+    target_summaries.sort(key=lambda summary: summary.get("name"))
+
+    return target_summaries
+
+
+def _organize_lineage_to_table(data_manager):
+    """Organize lineage objects into a LineageTable."""
+    lineages = general_filter_summary_lineage(data_manager=data_manager)
+    lineage_objects = lineages.get("object", [])
+    cnt_lineages = len(lineage_objects)
+    metric_prefix = "[M]"
+    user_defined_prefix = "[U]"
+
+    # Step 1, get column names.
+    column_names = set()
+    hyper_parameter_names = set()
+    target_names = set()
+    for lineage in lineage_objects:
+        model_lineage = lineage.get("model_lineage", {})
+
+        metric = model_lineage.pop("metric", {})
+        metric_names = tuple(metric_prefix + key for key in metric.keys())
+
+        user_defined = model_lineage.pop("user_defined", {})
+        user_defined_names = tuple(user_defined_prefix + key for key in user_defined.keys())
+
+        column_names.update(model_lineage.keys())
+        column_names.update(metric_names)
+        column_names.update(user_defined_names)
+
+        hyper_parameter_names.update(model_lineage.keys())
+        hyper_parameter_names.update(user_defined_names)
+
+        target_names.update(metric_names)
+
+        model_lineage["metric"] = metric
+        model_lineage["user_defined"] = user_defined
+
+    # Step 2, collect data.
+    column_data = {key: [None] * cnt_lineages for key in column_names}
+    for ind, lineage in enumerate(lineage_objects):
+        model_lineage = lineage.get("model_lineage", {})
+        metric = model_lineage.pop("metric", {})
+        metric_content = {
+            metric_prefix + key: val for key, val in metric.items()
+        }
+
+        user_defined = model_lineage.pop("user_defined", {})
+        user_defined_content = {
+            user_defined_prefix + key: val for key, val in user_defined.items()
+        }
+
+        final_content = {}
+        final_content.update(model_lineage)
+        final_content.update(metric_content)
+        final_content.update(user_defined_content)
+
+        for key, val in final_content.items():
+            column_data[key][ind] = val
+
+    table = LineageTable(pd.DataFrame(column_data))
+    return table
+
+
+@BLUEPRINT.route("/lineagemgr/optimizer/targets/<target_name>/suggestion", methods=["GET"])
+def get_optimize_suggestion(target_name):
+    """Get suggested hyper parameter values for the given target."""
+    goal = request.args.get("goal")
+    suggestions = _get_optimize_suggestion(data_manager=DATA_MANAGER, target_name=target_name, goal=goal)
+    return jsonify({"hyper_parameters": suggestions})
+
+
+def _get_optimize_suggestion(data_manager, target_name, goal):
+    """Calculate suggested hyper parameter values for the given target and goal."""
+    logger.warning("target_name: %s, goal: %s", target_name, goal)
+    table = _organize_lineage_to_table(data_manager)
+
+    sys_hyper_param_names = table.sys_hyper_param_names
+    hypers = {}
+    for name in sys_hyper_param_names:
+        data = table.get_column(name)
+        bound, data_type = _infer_bound_and_type(data, name)
+
+        hypers[name] = {
+            "bound": bound,
+            "type": data_type
+        }
+
+    user_defined_hyper_param_names = table.user_defined_hyper_param_names
+    user_hypers = {}
+    for name in user_defined_hyper_param_names:
+        data = table.get_column(name)
+        bound, data_type = _infer_bound_and_type(data, name)
+
+        user_hypers[name] = {
+            "bound": bound,
+            "type": data_type
+        }
+
+    params_info = {
+        "hyper": hypers,
+        "user_defined": user_hypers
+    }
+
+    target_info = [{
+        "group": "metric" if target_name.startswith(METRIC_PREFIX) else None,
+        "name": target_name[len(METRIC_PREFIX):] if target_name.startswith(METRIC_PREFIX) else None,
+        "goal": goal
+    }]
+
+    logger.warning("params_info: %s", params_info)
+    logger.warning("target_info: %s", target_info)
+
+    from mindinsight.optimizer.utils.param_suggest_auto import suggest
+
+    summary_lineage = general_filter_summary_lineage(data_manager=data_manager)
+    suggestion = suggest(summary_lineage, params_info, target_info)
+
+    logger.warning("suggestion: %s", suggestion)
+
+    history_best = table.get_best_row(sort_by=target_name, best=goal)
+    suggested_hypers = []
+    for hyper_name, suggested_value in zip(itertools.chain(hypers.keys(), user_hypers.keys()), suggestion[0]):
+        suggested_hypers.append({
+            "name": hyper_name,
+            "suggested_value": float(suggested_value),
+            "history_best": float(history_best[hyper_name])
+        })
+
+    return suggested_hypers
+
+
+def _infer_bound_and_type(data, name):
+    """Infer the bound and data type of a hyper parameter column."""
+    # Default to None so callers can tell when the column is unsupported or has no valid data.
+    bound, data_type = None, None
+    if np.issubdtype(data.dtype, np.integer):
+        data_type = "int"
+    elif np.issubdtype(data.dtype, np.floating):
+        data_type = "float"
+    else:
+        logger.warning("Type not supported for hyper param %s, type is %s", name, data.dtype)
+
+    try:
+        bound = _calc_bound(data)
+    except NoValidInputDataException:
+        logger.warning("No valid data for hyper param %s", name)
+
+    return bound, data_type
+
+
+class NoValidInputDataException(Exception):
+    """Raised when a column contains no valid (finite) value."""
+
+
+def _calc_bound(np_value: np.ndarray):
+    """Calculate a search bound [min, max] from observed values."""
+    ma_value = np.ma.masked_invalid(np_value)
+
+    valid_cnt = ma_value.count()
+    if not valid_cnt:
+        raise NoValidInputDataException()
+
+    # Note that max of a masked array with dtype np.float16 returns inf (numpy issue#15077).
+    if np.issubdtype(np_value.dtype, np.floating):
+        max_val = ma_value.max(fill_value=np.NINF)
+        min_val = ma_value.min(fill_value=np.PINF)
+    else:
+        max_val = ma_value.max()
+        min_val = ma_value.min()
+
+    # min_val equals max_val
+    if not min_val < max_val:
+        if min_val == 0:
+            return [-1, 1]
+        if min_val > 0:
+            return [0, 2 * min_val]
+        return [2 * min_val, 0]
+
+    if max_val <= 0:
+        return [2 * min_val, 0]
+
+    if min_val >= 0:
+        return [0, 2 * max_val]
+
+    # min_val < 0 < max_val
+    return [2 * min_val, 2 * max_val]
+
+
 def init_module(app):
     """
     Init module entry.
diff --git a/mindinsight/optimizer/utils/param_suggest_auto.py b/mindinsight/optimizer/utils/param_suggest_auto.py
index 87e41b719ffdb0365941709ea86a8959f0fe7f42..889877ddb25651dc3bb122684aa812fbd731006d 100644
--- a/mindinsight/optimizer/utils/param_suggest_auto.py
+++ b/mindinsight/optimizer/utils/param_suggest_auto.py
@@ -111,14 +111,14 @@ def get_params_target(summary_base_dir, target=None):
 
 def suggest(summary_lineage, params_info, target_info):
     param_matrix, target_matrix = organize_params_target(summary_lineage, params_info, target_info)
-    # No need to do, but now miss bound: # TODO: delete
-    min_list = np.nanmin(param_matrix, axis=0)
-    max_list = np.nanmax(param_matrix, axis=0)
-    index = 0
-    for param_group in params_info:
-        for param_name in params_info[param_group]:
-            params_info[param_group][param_name]["bound"] = [min_list[index], max_list[index]]
-            index += 1
+    # No need to do, but now miss bound:
+    # min_list = np.nanmin(param_matrix, axis=0)
+    # max_list = np.nanmax(param_matrix, axis=0)
+    # index = 0
+    # for param_group in params_info:
+    #     for param_name in params_info[param_group]:
+    #         params_info[param_group][param_name]["bound"] = [min_list[index], max_list[index]]
+    #         index += 1
 
     suggestions = []
     for i, it_target_info in enumerate(target_info):
diff --git a/mindinsight/utils/utils.py b/mindinsight/utils/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..588d996fb96653f9211bcf6a269a56d9bff6dc04
--- /dev/null
+++ b/mindinsight/utils/utils.py
@@ -0,0 +1,64 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Utils."""
+import numpy as np
+
+
+def calc_histogram(np_value: np.ndarray, bins):
+    """
+    Calculates histogram.
+
+    This is a simple wrapper around the error-prone np.histogram() to improve robustness.
+    """
+    ma_value = np.ma.masked_invalid(np_value)
+
+    valid_cnt = ma_value.count()
+    if not valid_cnt:
+        max_val = 0
+        min_val = 0
+    else:
+        # Note that max of a masked array with dtype np.float16 returns inf (numpy issue#15077).
+        if np.issubdtype(np_value.dtype, np.floating):
+            max_val = ma_value.max(fill_value=np.NINF)
+            min_val = ma_value.min(fill_value=np.PINF)
+        else:
+            max_val = ma_value.max()
+            min_val = ma_value.min()
+
+    range_left = min_val
+    range_right = max_val
+
+    if not range_left < range_right:
+        range_left -= 0.5
+        range_right += 0.5
+
+    counts, edges = np.histogram(np_value, bins=bins, range=(range_left, range_right))
+
+    histogram_bins = [None] * len(counts)
+    for ind, count in enumerate(counts):
+        histogram_bins[ind] = [float(edges[ind]), float(edges[ind + 1] - edges[ind]), float(count)]
+
+    return histogram_bins
+
+
+def is_simple_numpy_number(dtype):
+    """Check whether the dtype is a simple numpy integer or floating type."""
+    if np.issubdtype(dtype, np.integer):
+        return True
+
+    if np.issubdtype(dtype, np.floating):
+        return True
+
+    return False
diff --git a/requirements.txt b/requirements.txt
index 444ec104c9a06c495036ce914845ba62db51cfcf..4a6f664b15eddb6ba2428fdb80825f436b594412 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,4 +13,6 @@ protobuf>=3.8.0
 psutil>=5.6.1
 six>=1.12.0
 Werkzeug>=1.0.0
-tabulate>=0.8.6
\ No newline at end of file
+tabulate>=0.8.6
+pandas
+bayesian-optimization
diff --git a/tests/st/func/lineagemgr/api/test_model_api.py b/tests/st/func/lineagemgr/api/test_model_api.py
index 4c421eb011166c2ceebc16affd358b63c0f9db00..63c0bcd35354dcac4c0fc9e11b0b2922131a2b91 100644
--- a/tests/st/func/lineagemgr/api/test_model_api.py
+++ b/tests/st/func/lineagemgr/api/test_model_api.py
@@ -25,6 +25,7 @@ from unittest import TestCase
 
 import pytest
 
+from mindinsight.backend.lineagemgr.lineage_api import _get_optimize_suggestion
 from mindinsight.lineagemgr import filter_summary_lineage, get_summary_lineage
 from mindinsight.lineagemgr.common.exceptions.exceptions import (LineageFileNotFoundError, LineageParamSummaryPathError,
                                                                  LineageParamTypeError, LineageParamValueError,
@@ -828,3 +829,33 @@ class TestModelApi(TestCase):
             BASE_SUMMARY_DIR,
             search_condition
         )
+
+    def test_get_targets(self):
+        """Test _get_optimize_targets with a locally loaded DataManager."""
+        # from mindinsight.datavisual.data_transform.data_manager import DATA_MANAGER
+        from mindinsight.datavisual.data_transform.data_manager import DataManager
+        from mindinsight.backend.lineagemgr.lineage_api import _get_optimize_targets
+        from mindinsight.lineagemgr.cache_item_updater import LineageCacheItemUpdater
+        from tests.utils.tools import check_loading_done
+
+        data_manager = DataManager(summary_base_dir="/home/wenkai/code/gitee_wenkai01/mindinsight_examples/lenet_mnist/0702demo")
+        data_manager.register_brief_cache_item_updater(LineageCacheItemUpdater())
+        data_manager.start_load_data(reload_interval=0)
+        check_loading_done(data_manager)
+        targets = _get_optimize_targets(data_manager)
+        print(targets)
+
+    def test_get_suggestion(self):
+        """Test _get_optimize_suggestion with a locally loaded DataManager."""
+        from mindinsight.datavisual.data_transform.data_manager import DataManager
+        from mindinsight.backend.lineagemgr.lineage_api import _get_optimize_targets
+        from mindinsight.lineagemgr.cache_item_updater import LineageCacheItemUpdater
+        from tests.utils.tools import check_loading_done
+
+        data_manager = DataManager(summary_base_dir="/home/wenkai/code/gitee_wenkai01/mindinsight_examples/lenet_mnist/0702demo")
+        data_manager.register_brief_cache_item_updater(LineageCacheItemUpdater())
+        data_manager.start_load_data(reload_interval=0)
+        check_loading_done(data_manager)
+
+        suggestion = _get_optimize_suggestion(data_manager, target_name="loss", goal="min")
+        print(suggestion)