From 71fab7458c31f7ec637f33ceb7eb3f558e82b2bf Mon Sep 17 00:00:00 2001 From: xjxia Date: Tue, 14 Jan 2025 15:08:13 +0800 Subject: [PATCH] feat(dbm-services): add global monitor for DBHA close #9055 --- dbm-services/common/dbha/ha-module/Makefile | 4 +- .../dbha/ha-module/agent/monitor_agent.go | 38 +- .../common/dbha/ha-module/client/client.go | 5 +- .../common/dbha/ha-module/client/cmdb.go | 95 +++-- .../common/dbha/ha-module/client/hadb.go | 92 +++-- .../common/dbha/ha-module/config/config.go | 23 +- .../dbha/ha-module/constvar/constant.go | 17 +- dbm-services/common/dbha/ha-module/dbha.go | 28 +- .../dbmodule/dbmysql/MySQL_common_switch.go | 2 +- .../dbmodule/mongodb/mongos_callback.go | 4 +- .../ha-module/dbmodule/redis/redis_switch.go | 5 + .../dbha/ha-module/dbmodule/register.go | 2 +- .../globalmonitor/monitor_component.go | 349 ++++++++++++++++++ dbm-services/common/dbha/ha-module/gm/gcm.go | 4 +- dbm-services/common/dbha/ha-module/gm/gqa.go | 5 + .../common/dbha/ha-module/monitor/monitor.go | 144 ++------ .../common/dbha/ha-module/test/client_test.go | 69 ---- .../pkg/handler/hastatus/hastatus_handler.go | 6 +- .../cloud/script_template/dbha_template.py | 2 + 19 files changed, 573 insertions(+), 321 deletions(-) create mode 100644 dbm-services/common/dbha/ha-module/globalmonitor/monitor_component.go delete mode 100644 dbm-services/common/dbha/ha-module/test/client_test.go diff --git a/dbm-services/common/dbha/ha-module/Makefile b/dbm-services/common/dbha/ha-module/Makefile index 487f462fb2..26c1ec22f8 100644 --- a/dbm-services/common/dbha/ha-module/Makefile +++ b/dbm-services/common/dbha/ha-module/Makefile @@ -1,7 +1,7 @@ SHELL := /bin/bash BASE_DIR = $(shell pwd) -VERSION = 0.0.1 -GITHASH = "" +VERSION = $(shell git describe --tags --always --dirty) +GITHASH = $(shell git rev-parse --short HEAD) APPNAME = dbha GOOS ?= linux BUILD_FLAG = " -X main.version=${VERSION} -X main.githash=${GITHASH} " diff --git 
a/dbm-services/common/dbha/ha-module/agent/monitor_agent.go b/dbm-services/common/dbha/ha-module/agent/monitor_agent.go index 24200836da..3225e3b627 100644 --- a/dbm-services/common/dbha/ha-module/agent/monitor_agent.go +++ b/dbm-services/common/dbha/ha-module/agent/monitor_agent.go @@ -100,7 +100,7 @@ func NewMonitorAgent(conf *config.Config, detectType string) (*MonitorAgent, err // report agent's heartbeat info. func (a *MonitorAgent) Process(instances map[string]dbutil.DataBaseDetect) { var wg sync.WaitGroup - startTime := time.Now().Unix() + startTime := time.Now() sem := make(chan struct{}, a.MaxConcurrency) // 创建一个有缓冲的通道,容量为 maxConcurrency log.Logger.Debugf("[%s] need to detect instances number:%d", a.DetectType, len(a.DBInstance)) for _, ins := range instances { @@ -113,9 +113,10 @@ func (a *MonitorAgent) Process(instances map[string]dbutil.DataBaseDetect) { }(ins) } wg.Wait() + interval := int(time.Now().Sub(startTime).Seconds()) log.Logger.Debugf("[%s] detected instances number:%d ,cost: %d", - a.DetectType, len(a.DBInstance), time.Now().Unix()-startTime) - a.DetectPostProcess() + a.DetectType, len(a.DBInstance), interval) + a.DetectPostProcess(interval) time.Sleep(time.Second) } @@ -185,8 +186,8 @@ func (a *MonitorAgent) DoDetectSingle(ins dbutil.DataBaseDetect) { } // DetectPostProcess post agent heartbeat -func (a *MonitorAgent) DetectPostProcess() { - err := a.reporterHeartbeat() +func (a *MonitorAgent) DetectPostProcess(interval int) { + err := a.reporterHeartbeat(interval) if err != nil { log.Logger.Errorf("reporter heartbeat failed. 
err:%s", err.Error()) } @@ -232,15 +233,19 @@ func (a *MonitorAgent) FetchDBInstance() error { a.HashMod = mod a.HashValue = modValue - req := client.DBInstanceInfoRequest{ + req := client.DBInstanceByCityRequest{ LogicalCityIDs: []int{a.CityID}, HashCnt: mod, HashValue: modValue, ClusterTypes: []string{a.DetectType}, } - rawInfo, err := a.CmDBClient.GetDBInstanceInfoByClusterType(req) + rawInfo, err := a.CmDBClient.GetDBInstanceInfoByCityID(req) if err != nil { + minInfo := monitor.GetApiAlertInfo(constvar.CmDBInstanceUrl, err.Error()) + if e := monitor.MonitorSend("get instances failed", minInfo); e != nil { + log.Logger.Warnf(e.Error()) + } log.Logger.Errorf("get instance info from cmdb failed. err:%s", err.Error()) return err } @@ -306,20 +311,20 @@ func (a *MonitorAgent) FetchGMInstance() error { continue } // needn't lock - _, ok := a.GMInstance[info.Ip] + _, ok := a.GMInstance[info.IP] if ok { - a.GMInstance[info.Ip].LastFetchTime = time.Now() + a.GMInstance[info.IP].LastFetchTime = time.Now() } else { - a.GMInstance[info.Ip] = &GMConnection{ - Ip: info.Ip, + a.GMInstance[info.IP] = &GMConnection{ + Ip: info.IP, Port: info.Port, LastFetchTime: time.Now(), IsClose: false, } - err = a.GMInstance[info.Ip].Init() + err = a.GMInstance[info.IP].Init() if err != nil { log.Logger.Errorf("init gm failed. 
gm_ip:%s, gm_port:%d, err:%s", - info.Ip, info.Port, err.Error()) + info.Port, info.Port, err.Error()) return err } } @@ -342,6 +347,7 @@ func (a *MonitorAgent) NeedReportGM(ins dbutil.DataBaseDetect) bool { cachedIns := a.ReportGMCache[ip] now := time.Now() if now.Before(cachedIns.ReporterGMTime.Add(time.Second * time.Duration(cachedIns.ExpireInterval))) { + log.Logger.Debugf("instance[%s] cached, skip report to gm", cachedIns.Ip) return false } } @@ -395,6 +401,7 @@ func (a *MonitorAgent) ReportDetectInfoToGM(reporterInstance dbutil.DataBaseDete //do retry continue } else { + log.Logger.Debugf("reporter instance[%s#%d] to gm[%s#%d] success", ip, port, gmIns.Ip, gmIns.Port) isReported = true gmIns.Mutex.Unlock() a.ReportGMCache[ip] = &CachedHostInfo{ @@ -484,9 +491,8 @@ func (a *MonitorAgent) registerAgentInfoToHaDB() error { } // reporterHeartbeat send agent heartbeat to HA-DB -func (a *MonitorAgent) reporterHeartbeat() error { - interval := time.Now().Sub(a.heartbeat).Seconds() - err := a.HaDBClient.ReporterAgentHeartbeat(a.MonIp, a.DetectType, int(interval), a.HashMod, a.HashValue) +func (a *MonitorAgent) reporterHeartbeat(interval int) error { + err := a.HaDBClient.ReporterAgentHeartbeat(a.MonIp, a.DetectType, interval, a.HashMod, a.HashValue) a.heartbeat = time.Now() return err } diff --git a/dbm-services/common/dbha/ha-module/client/client.go b/dbm-services/common/dbha/ha-module/client/client.go index bd1205d78e..e5d461fdaa 100644 --- a/dbm-services/common/dbha/ha-module/client/client.go +++ b/dbm-services/common/dbha/ha-module/client/client.go @@ -97,8 +97,9 @@ func (c *Client) DoNewForCB( } var retryErr error + var response interface{} for retryIdx := 0; retryIdx < 5; retryIdx++ { - response, retryErr := c.doNewInner(method, url, params, headers, bodyCB) + response, retryErr = c.doNewInner(method, url, params, headers, bodyCB) if retryErr == nil { return response, nil } @@ -211,7 +212,7 @@ func (c *Client) doNewInner(method, url string, params interface{}, 
result, err := bodyCB(b) if err != nil { - log.Logger.Errorf(err.Error()) + log.Logger.Errorf(fmt.Sprintf("%s:%s", util.AtWhere(), err.Error())) return nil, err } return result, nil diff --git a/dbm-services/common/dbha/ha-module/client/cmdb.go b/dbm-services/common/dbha/ha-module/client/cmdb.go index 8db9fb3e5a..0f201d1871 100644 --- a/dbm-services/common/dbha/ha-module/client/cmdb.go +++ b/dbm-services/common/dbha/ha-module/client/cmdb.go @@ -1,14 +1,14 @@ package client import ( + "encoding/json" + "fmt" + "net/http" + "dbm-services/common/dbha/ha-module/config" "dbm-services/common/dbha/ha-module/constvar" "dbm-services/common/dbha/ha-module/log" "dbm-services/common/dbha/ha-module/util" - - "encoding/json" - "fmt" - "net/http" ) // CmDBClient client to request cmdb @@ -16,15 +16,25 @@ type CmDBClient struct { Client } -// DBInstanceInfoByAddressRequest fetch instances list from cmdb by ip -type DBInstanceInfoByAddressRequest struct { +// DBInstanceByAddressRequest fetch instances list from cmdb by ip +type DBInstanceByAddressRequest struct { DBCloudToken string `json:"db_cloud_token"` BKCloudID int `json:"bk_cloud_id"` Addresses []string `json:"addresses"` } -// DBInstanceInfoRequest fetch instances list from cmdb by city and status -type DBInstanceInfoRequest struct { +// DBInstanceByClusterTypeRequest fetch instances list from cmdb by ip +type DBInstanceByClusterTypeRequest struct { + DBCloudToken string `json:"db_cloud_token"` + BKCloudID int `json:"bk_cloud_id"` + Statuses []string `json:"statuses"` + HashCnt int `json:"hash_cnt"` + HashValue int `json:"hash_value"` + ClusterTypes []string `json:"cluster_types"` +} + +// DBInstanceByCityRequest fetch instances list from cmdb by city and status +type DBInstanceByCityRequest struct { DBCloudToken string `json:"db_cloud_token"` BKCloudID int `json:"bk_cloud_id"` LogicalCityIDs []int `json:"logical_city_ids"` @@ -135,7 +145,7 @@ func NewCmDBClient(conf *config.APIConfig, cloudId int) *CmDBClient { // 
GetDBInstanceInfoByIp fetch instance info from cmdb by ip func (c *CmDBClient) GetDBInstanceInfoByIp(ip string) ([]interface{}, error) { var res []interface{} - req := DBInstanceInfoByAddressRequest{ + req := DBInstanceByAddressRequest{ DBCloudToken: c.Conf.BKConf.BkToken, BKCloudID: c.CloudId, Addresses: []string{ip}, @@ -156,41 +166,20 @@ func (c *CmDBClient) GetDBInstanceInfoByIp(ip string) ([]interface{}, error) { return res, nil } -// GetAllDBInstanceInfo detect running, available status instance -func (c *CmDBClient) GetAllDBInstanceInfo() ([]interface{}, error) { - req := DBInstanceInfoRequest{ - DBCloudToken: c.Conf.BKConf.BkToken, - BKCloudID: c.CloudId, - Statuses: []string{constvar.RUNNING, constvar.AVAILABLE}, - } - - response, err := c.DoNew( - http.MethodPost, c.SpliceUrlByPrefix(c.Conf.UrlPre, constvar.CmDBInstanceUrl, ""), req, nil) - if err != nil { - return nil, err - } - if response.Code != 0 { - return nil, fmt.Errorf("%s failed, return code:%d, msg:%s", util.AtWhere(), response.Code, response.Msg) - } - - var res []interface{} - err = json.Unmarshal(response.Data, &res) - if err != nil { - return nil, err - } - - return res, nil -} - -// GetDBInstanceInfoByCity detect running, available status instance -func (c *CmDBClient) GetDBInstanceInfoByCity(cityID int) ([]interface{}, error) { - req := DBInstanceInfoRequest{ +// GetDBInstanceInfoByCityID detect running, available status instance +func (c *CmDBClient) GetDBInstanceInfoByCityID(requestInfo DBInstanceByCityRequest) ([]interface{}, error) { + req := DBInstanceByCityRequest{ DBCloudToken: c.Conf.BKConf.BkToken, BKCloudID: c.CloudId, - LogicalCityIDs: []int{cityID}, + LogicalCityIDs: requestInfo.LogicalCityIDs, Statuses: []string{constvar.RUNNING, constvar.AVAILABLE}, + HashCnt: requestInfo.HashCnt, + HashValue: requestInfo.HashValue, + ClusterTypes: requestInfo.ClusterTypes, } + log.Logger.Debugf("GetDBInstanceInfoByCityID param:%#v", req) + response, err := c.DoNew( http.MethodPost, 
c.SpliceUrlByPrefix(c.Conf.UrlPre, constvar.CmDBInstanceUrl, ""), req, nil) if err != nil { @@ -209,19 +198,18 @@ func (c *CmDBClient) GetDBInstanceInfoByCity(cityID int) ([]interface{}, error) return res, nil } -// GetDBInstanceInfoByClusterType detect running, available status instance -func (c *CmDBClient) GetDBInstanceInfoByClusterType(requestInfo DBInstanceInfoRequest) ([]interface{}, error) { - req := DBInstanceInfoRequest{ - DBCloudToken: c.Conf.BKConf.BkToken, - BKCloudID: c.CloudId, - LogicalCityIDs: requestInfo.LogicalCityIDs, - Statuses: []string{constvar.RUNNING, constvar.AVAILABLE}, - HashCnt: requestInfo.HashCnt, - HashValue: requestInfo.HashValue, - ClusterTypes: requestInfo.ClusterTypes, +// GetDBInstanceByClusterType detect running, available status instance +func (c *CmDBClient) GetDBInstanceByClusterType(requestInfo DBInstanceByClusterTypeRequest) ([]interface{}, error) { + req := DBInstanceByClusterTypeRequest{ + DBCloudToken: c.Conf.BKConf.BkToken, + BKCloudID: c.CloudId, + Statuses: []string{constvar.RUNNING, constvar.AVAILABLE}, + HashCnt: requestInfo.HashCnt, + HashValue: requestInfo.HashValue, + ClusterTypes: requestInfo.ClusterTypes, } - log.Logger.Debugf("GetDBInstanceInfo param:%#v", req) + log.Logger.Debugf("GetDBInstanceByClusterType param:%#v", req) response, err := c.DoNew( http.MethodPost, c.SpliceUrlByPrefix(c.Conf.UrlPre, constvar.CmDBInstanceUrl, ""), req, nil) @@ -241,15 +229,16 @@ func (c *CmDBClient) GetDBInstanceInfoByClusterType(requestInfo DBInstanceInfoRe return res, nil } -// GetDBInstanceInfoByCluster fetch instance info from cmdb by ip -func (c *CmDBClient) GetDBInstanceInfoByCluster(clusterName string) ([]interface{}, error) { +// GetDBInstanceInfoByAddress fetch instance info from cmdb by ip +func (c *CmDBClient) GetDBInstanceInfoByAddress(clusterName string) ([]interface{}, error) { var res []interface{} - req := DBInstanceInfoByAddressRequest{ + req := DBInstanceByAddressRequest{ DBCloudToken: c.Conf.BKConf.BkToken, 
BKCloudID: c.CloudId, Addresses: []string{clusterName}, } + log.Logger.Debugf("GetDBInstanceInfoByAddress param:%#v", req) response, err := c.DoNew( http.MethodPost, c.SpliceUrlByPrefix(c.Conf.UrlPre, constvar.CmDBInstanceUrl, ""), req, nil) if err != nil { diff --git a/dbm-services/common/dbha/ha-module/client/hadb.go b/dbm-services/common/dbha/ha-module/client/hadb.go index 936c6dbbfa..8e89c331b5 100644 --- a/dbm-services/common/dbha/ha-module/client/hadb.go +++ b/dbm-services/common/dbha/ha-module/client/hadb.go @@ -21,21 +21,6 @@ type HaDBClient struct { Client } -// GMInfo gm base info, use to report -type GMInfo struct { - Ip string `json:"ip"` - Port int `json:"port"` - CityID int `json:"city_id"` - CloudID int `json:"cloud_id"` -} - -// AgentInfo gm base info, use to report -type AgentInfo struct { - Ip string `json:"ip"` - CityID int `json:"city_id"` - CloudID int `json:"cloud_id"` -} - // HaStatusRequest request ha status table type HaStatusRequest struct { DBCloudToken string `json:"db_cloud_token"` @@ -50,8 +35,8 @@ type HaStatusResponse struct { RowsAffected int `json:"rowsAffected"` } -// DbStatusRequest request db status -type DbStatusRequest struct { +// HaAgentLogsRequest request ha_agent_logs +type HaAgentLogsRequest struct { DBCloudToken string `json:"db_cloud_token"` BKCloudID int `json:"bk_cloud_id"` Name string `json:"name"` @@ -59,8 +44,8 @@ type DbStatusRequest struct { SetArgs *model.HAAgentLogs `json:"set_args,omitempty"` } -// DbStatusResponse db status response -type DbStatusResponse struct { +// HaAgentLogsResponse ha_agent_logs response +type HaAgentLogsResponse struct { RowsAffected int `json:"rowsAffected"` Uid int `json:"uid"` } @@ -129,16 +114,15 @@ func NewHaDBClient(conf *config.APIConfig, cloudId int) *HaDBClient { return &HaDBClient{c} } -// GetDBDetectInfo get gm info from hadb -func (c *HaDBClient) GetDBDetectInfo() ([]model.HAAgentLogs, error) { - req := DbStatusRequest{ +// GetHADetectInfo get gm info from hadb +func (c 
*HaDBClient) GetHADetectInfo() ([]model.HAAgentLogs, error) { + req := HaAgentLogsRequest{ DBCloudToken: c.Conf.BKConf.BkToken, BKCloudID: c.CloudId, Name: constvar.GetInstanceStatus, - QueryArgs: &model.HAAgentLogs{}, } - log.Logger.Debugf("AgentGetGMInfo param:%#v", req) + log.Logger.Debugf("GetHADetectInfo param:%#v", util.GraceStructString(req.QueryArgs)) response, err := c.DoNew(http.MethodPost, c.SpliceUrlByPrefix(c.Conf.UrlPre, constvar.DbStatusUrl, ""), req, nil) @@ -154,17 +138,18 @@ func (c *HaDBClient) GetDBDetectInfo() ([]model.HAAgentLogs, error) { return nil, err } if len(result) == 0 { - return nil, fmt.Errorf("no gm available") + log.Logger.Debugf("no detected instance found") } + return result, nil } // ReportDBStatus report detected instance's status func (c *HaDBClient) ReportDBStatus(app, agentIp, ip string, port int, dbType, status, bindGM string) error { - var result DbStatusResponse + var result HaAgentLogsResponse currentTime := time.Now() - updateReq := DbStatusRequest{ + updateReq := HaAgentLogsRequest{ DBCloudToken: c.Conf.BKConf.BkToken, BKCloudID: c.CloudId, Name: constvar.UpdateInstanceStatus, @@ -205,7 +190,7 @@ func (c *HaDBClient) ReportDBStatus(app, agentIp, ip string, port int, dbType, s log.Logger.Errorf("bug: update instance status affect rows %d", result.RowsAffected) } - insertReq := DbStatusRequest{ + insertReq := HaAgentLogsRequest{ DBCloudToken: c.Conf.BKConf.BkToken, BKCloudID: c.CloudId, Name: constvar.InsertInstanceStatus, @@ -295,9 +280,10 @@ func (c *HaDBClient) RegisterDBHAInfo( BKCloudID: c.CloudId, Name: constvar.RegisterDBHAInfo, QueryArgs: &model.HaStatus{ - IP: ip, - Module: module, - DbType: dbType, + CloudID: c.CloudId, + IP: ip, + Module: module, + DbType: dbType, }, SetArgs: &model.HaStatus{ IP: ip, @@ -368,8 +354,8 @@ func (c *HaDBClient) GetAliveAgentInfo(cityID int, dbType string, interval int) return result, nil } -// GetAliveHAComponent get alive gm instance from ha_status table -func (c *HaDBClient) 
GetAliveHAComponent(module string, interval int) ([]GMInfo, error) { +// GetAliveHAComponent get alive ha component instance from ha_status table +func (c *HaDBClient) GetAliveHAComponent(module string, interval int) ([]model.HaStatus, error) { currentTime := time.Now().Add(-time.Second * time.Duration(interval)) req := HaStatusRequest{ DBCloudToken: c.Conf.BKConf.BkToken, @@ -382,7 +368,7 @@ func (c *HaDBClient) GetAliveHAComponent(module string, interval int) ([]GMInfo, }, } - log.Logger.Debugf("GetAliveHAInfo param:%#v", util.GraceStructString(req)) + log.Logger.Debugf("GetAliveHAInfo param:%#v", util.GraceStructString(req.QueryArgs)) response, err := c.DoNew(http.MethodPost, c.SpliceUrlByPrefix(c.Conf.UrlPre, constvar.HaStatusUrl, ""), req, nil) @@ -394,7 +380,7 @@ func (c *HaDBClient) GetAliveHAComponent(module string, interval int) ([]GMInfo, return nil, fmt.Errorf("%s failed, return code:%d, msg:%s", util.AtWhere(), response.Code, response.Msg) } - result := make([]GMInfo, 0) + result := make([]model.HaStatus, 0) err = json.Unmarshal(response.Data, &result) if err != nil { log.Logger.Errorf("GetAliveHAInfo failed, unmarshal failed, err:%s, data:%s", err.Error(), response.Data) @@ -481,6 +467,42 @@ func (c *HaDBClient) ReporterGMHeartbeat(gmIP, module string, interval int) erro return nil } +// ReporterMonitorHeartbeat report global monitor heartbeat to ha_status +func (c *HaDBClient) ReporterMonitorHeartbeat(monIP, detectType string) error { + var result HaStatusResponse + + currentTime := time.Now() + req := HaStatusRequest{ + DBCloudToken: c.Conf.BKConf.BkToken, + BKCloudID: c.CloudId, + Name: constvar.ReporterMonitorHeartbeat, + QueryArgs: &model.HaStatus{ + IP: monIP, + DbType: detectType, + }, + SetArgs: &model.HaStatus{ + LastTime: &currentTime, + }, + } + + log.Logger.Debugf("ReporterMonitorHeartbeat param:%#v", util.GraceStructString(req)) + + response, err := c.DoNew(http.MethodPost, + c.SpliceUrlByPrefix(c.Conf.UrlPre, constvar.HaStatusUrl, ""), req, 
nil) + if err != nil { + return err + } + if response.Code != 0 { + return fmt.Errorf("%s failed, return code:%d, msg:%s", util.AtWhere(), response.Code, response.Msg) + } + err = json.Unmarshal(response.Data, &result) + if err != nil { + return err + } + + return nil +} + // QuerySingleTotal check same instance's switch number in a given time period func (c *HaDBClient) QuerySingleTotal(ip string, port int, interval int) (int, error) { var result struct { diff --git a/dbm-services/common/dbha/ha-module/config/config.go b/dbm-services/common/dbha/ha-module/config/config.go index 5d8107d64f..7e06746137 100644 --- a/dbm-services/common/dbha/ha-module/config/config.go +++ b/dbm-services/common/dbha/ha-module/config/config.go @@ -34,6 +34,8 @@ type Config struct { Timezone TimezoneConfig `yaml:"timezone"` // configure for password service PasswdConf APIConfig `yaml:"password_conf"` + // configure for Global Monitor + GlobalMonitorConf *GlobalMonitorConfig `yaml:"global_monitor_conf"` } // LogConfig configure for log @@ -86,6 +88,21 @@ type GMConfig struct { GCM GCMConfig `yaml:"GCM"` } +// GlobalMonitorConfig configure for agent component +type GlobalMonitorConfig struct { + // active type list for db detect, valid type in constant.go + ActiveClusterType []string `yaml:"active_db_type"` + // instance campus for detect + Campus string `yaml:"campus"` + // cloud id for agent, value 0 allowed, so required tag could not assign + CloudID int `yaml:"cloud_id"` + ReportInterval int `yaml:"reporter_interval"` + LocalIP string `yaml:"local_ip"` + // hash mod use to batch fetch cmdb instances + HashMod int `yaml:"hash_mod"` + IgnoreCityList []int `yaml:"ignore_city_list"` +} + // GDMConfig configure for GDM component type GDMConfig struct { DupExpire int `yaml:"dup_expire"` @@ -200,7 +217,7 @@ type BKConfig struct { BkToken string `yaml:"bk_token"` } -// MonitorConfig monitor configure +// MonitorConfig monitor configure for alert type MonitorConfig struct { BkDataId int 
`yaml:"bk_data_id"` AccessToken string `yaml:"access_token"` @@ -209,10 +226,6 @@ type MonitorConfig struct { LocalIP string `yaml:"local_ip"` //value 0 allowed, so required tag could not assign CloudID int `yaml:"cloud_id"` - //interval(second) for global monitor - MonitorInterval int `yaml:"monitor_interval"` - // active cluster type list for agent detect - ActiveDBType []string `yaml:"active_db_type"` } // TimezoneConfig support config timezone diff --git a/dbm-services/common/dbha/ha-module/constvar/constant.go b/dbm-services/common/dbha/ha-module/constvar/constant.go index f7f0ff26b5..be622cb924 100644 --- a/dbm-services/common/dbha/ha-module/constvar/constant.go +++ b/dbm-services/common/dbha/ha-module/constvar/constant.go @@ -64,6 +64,15 @@ const ( // Mongos MONGOS = EnumField("mongos", _("mongos")) # mongos Mongos = "mongos" + + // TendisCacheMetaType storage layer type name in PredixyRedisCluster + TendisCacheMetaType = "tendiscache" + // TendisPlusMetaType storage layer type name in PredixyTendisplusCluster + TendisPlusMetaType = "tendisplus" + // MongodbMetaType storage layer type name in MongoShardedCluster + MongodbMetaType = "mongodb" + // MongoConfigMetaType storage layer type name in MongoShardedCluster + MongoConfigMetaType = "mongo_config" ) // instance role in cmdb @@ -113,7 +122,7 @@ const ( SqlserverHA = "sqlserver_ha" // MongoShardedCluster = EnumField("MongoShardedCluster", _("Mongo分片集群")) - MongoShardCluster = "MongoShardedCluster" + MongoShardedCluster = "MongoShardedCluster" ) // wrapper name in TenDBCluster @@ -168,6 +177,8 @@ const ( ReporterAgentHeartbeat = "reporter_agent_heartbeat" // ReporterGMHeartbeat TODO ReporterGMHeartbeat = "reporter_gm_heartbeat" + // ReporterMonitorHeartbeat TODO + ReporterMonitorHeartbeat = "reporter_monitor_heartbeat" // QuerySingleTotal TODO QuerySingleTotal = "query_single_total" // QueryIntervalTotal TODO @@ -377,6 +388,8 @@ const ( DBHAEventDoubleCheckAuth = "dbha_doublecheck_auth_fail" // 
DBHAEventGlobalMonitor TODO DBHAEventGlobalMonitor = "dbha_global_monitor" + // DBHAEventApiFailed TODO + DBHAEventApiFailed = "dbha_call_api_fail" // MonitorInfoSwitch TODO MonitorInfoSwitch = 0 @@ -384,6 +397,8 @@ const ( MonitorInfoDetect = 1 // MonitorInfoGlobal global monitor for component work normal MonitorInfoGlobal = 2 + // MonitorInfoAPI event name for api alert + MonitorInfoAPI = 3 // MonitorReportType TODO MonitorReportType = "agent" diff --git a/dbm-services/common/dbha/ha-module/dbha.go b/dbm-services/common/dbha/ha-module/dbha.go index 902c23d11c..a8eb401276 100644 --- a/dbm-services/common/dbha/ha-module/dbha.go +++ b/dbm-services/common/dbha/ha-module/dbha.go @@ -4,11 +4,11 @@ import ( "flag" "fmt" "os" - "time" "dbm-services/common/dbha/ha-module/agent" "dbm-services/common/dbha/ha-module/config" "dbm-services/common/dbha/ha-module/constvar" + "dbm-services/common/dbha/ha-module/globalmonitor" "dbm-services/common/dbha/ha-module/gm" "dbm-services/common/dbha/ha-module/log" "dbm-services/common/dbha/ha-module/monitor" @@ -17,16 +17,27 @@ import ( var dbhaType string var configFile string +var showVersion bool +var version = "1.0.0" +var githash = "unknown" // Init TODO func Init() { flag.StringVar(&dbhaType, "type", "", `Input dbha type, ["agent","gm","monitor"]`) flag.StringVar(&configFile, "config_file", "", "Input config file path") + flag.BoolVar(&showVersion, "version", false, "Show version") } func main() { Init() flag.Parse() + + if showVersion { + fmt.Println("Version:", version) + fmt.Println("Git hash info:", githash) + os.Exit(0) + } + if flag.NFlag() != 2 { fmt.Println("args wrong.") os.Exit(1) @@ -81,16 +92,13 @@ func main() { os.Exit(1) } case constvar.MONITOR: - for { - if monInfo, err := monitor.CheckHAComponent(conf); err != nil { - if err = monitor.MonitorSend(err.Error(), monInfo); err != nil { - log.Logger.Fatalf("global monitor run failed. 
err:%s", err.Error()) - os.Exit(1) - } - } - time.Sleep(time.Duration(conf.Monitor.MonitorInterval) * time.Second) + mon := globalmonitor.NewMonitorComponent(conf) + if err = mon.RegisterMonitorInfoToHaDB(); err != nil { + log.Logger.Fatalf("global monitor register failed:%s", err.Error()) + } + if err = mon.Run(); err != nil { + log.Logger.Fatalf("global monitor run failed:%s", err.Error()) } - default: log.Logger.Fatalf("unknow dbha type") os.Exit(1) diff --git a/dbm-services/common/dbha/ha-module/dbmodule/dbmysql/MySQL_common_switch.go b/dbm-services/common/dbha/ha-module/dbmodule/dbmysql/MySQL_common_switch.go index a9951852cf..b70e054c55 100644 --- a/dbm-services/common/dbha/ha-module/dbmodule/dbmysql/MySQL_common_switch.go +++ b/dbm-services/common/dbha/ha-module/dbmodule/dbmysql/MySQL_common_switch.go @@ -934,7 +934,7 @@ func (ins *SpiderCommonSwitch) GetPrimary() error { // SetSpiderNodes get all spider nodes from dbmeta func (ins *SpiderCommonSwitch) SetSpiderNodes() error { cmdbClient := client.NewCmDBClient(&ins.Config.DBConf.CMDB, ins.Config.GetCloudId()) - rawData, err := cmdbClient.GetDBInstanceInfoByCluster(ins.ClusterName) + rawData, err := cmdbClient.GetDBInstanceInfoByAddress(ins.ClusterName) if err != nil { return fmt.Errorf("get all cluster instance info failed:%s", err.Error()) } diff --git a/dbm-services/common/dbha/ha-module/dbmodule/mongodb/mongos_callback.go b/dbm-services/common/dbha/ha-module/dbmodule/mongodb/mongos_callback.go index ff3cbebc89..7e9ffbcf44 100644 --- a/dbm-services/common/dbha/ha-module/dbmodule/mongodb/mongos_callback.go +++ b/dbm-services/common/dbha/ha-module/dbmodule/mongodb/mongos_callback.go @@ -33,7 +33,7 @@ func NewMongosInstanceByCmDB(instances []interface{}, Conf *config.Config) ([]db ) if unmarshalIns, err = UnMarshalMongosInstanceByCmdb(instances, - constvar.MongoShardCluster, constvar.Mongos); err != nil { + constvar.MongoShardedCluster, constvar.Mongos); err != nil { return nil, err } @@ -55,7 +55,7 @@ func 
DeserializeMongos(jsonInfo []byte, conf *config.Config) (dbutil.DataBaseDet } var ret dbutil.DataBaseDetect // gm将agent上报的数据结构转换为gdm通道接收的数据结构 - ret = NewMongosDetectInstanceForGdm(&response, constvar.MongoShardCluster, conf) + ret = NewMongosDetectInstanceForGdm(&response, constvar.MongoShardedCluster, conf) return ret, nil } diff --git a/dbm-services/common/dbha/ha-module/dbmodule/redis/redis_switch.go b/dbm-services/common/dbha/ha-module/dbmodule/redis/redis_switch.go index acaf72f68d..29d20e8be6 100644 --- a/dbm-services/common/dbha/ha-module/dbmodule/redis/redis_switch.go +++ b/dbm-services/common/dbha/ha-module/dbmodule/redis/redis_switch.go @@ -20,6 +20,7 @@ import ( "dbm-services/common/dbha/ha-module/constvar" "dbm-services/common/dbha/ha-module/dbutil" "dbm-services/common/dbha/ha-module/log" + "dbm-services/common/dbha/ha-module/monitor" "dbm-services/common/dbha/ha-module/util" ) @@ -356,6 +357,10 @@ func (ins *RedisSwitch) DoKickTwemproxy(proxy dbutil.ProxyInfo) error { ins.ReportLogs(constvar.InfoResult, fmt.Sprintf("kickoff twemproxy: start kickoff by [%s:%d]", proxy.Ip, proxy.Port)) infos, err := ins.CmDBClient.GetDBInstanceInfoByIp(proxy.Ip) if err != nil { + minInfo := monitor.GetApiAlertInfo(constvar.CmDBInstanceUrl, err.Error()) + if e := monitor.MonitorSend("get instances failed", minInfo); e != nil { + log.Logger.Warnf(e.Error()) + } redisErr := fmt.Errorf("kickoff twemproxy: get twemproxy[%s:%d:%d] from cmdb failed", proxy.Ip, proxy.Port, proxy.AdminPort) ins.ReportLogs(constvar.FailResult, redisErr.Error()) diff --git a/dbm-services/common/dbha/ha-module/dbmodule/register.go b/dbm-services/common/dbha/ha-module/dbmodule/register.go index 408da1efb9..006e9b6519 100644 --- a/dbm-services/common/dbha/ha-module/dbmodule/register.go +++ b/dbm-services/common/dbha/ha-module/dbmodule/register.go @@ -99,7 +99,7 @@ func init() { } // Mongos used - DBCallbackMap[constvar.MongoShardCluster] = Callback{ + DBCallbackMap[constvar.MongoShardedCluster] = 
Callback{ FetchDBCallback: mongodb.NewMongosInstanceByCmDB, DeserializeCallback: mongodb.DeserializeMongos, GetSwitchInstanceInformation: mongodb.NewMongosSwitchInstance,
diff --git a/dbm-services/common/dbha/ha-module/globalmonitor/monitor_component.go b/dbm-services/common/dbha/ha-module/globalmonitor/monitor_component.go
new file mode 100644
index 0000000000..88f4cf6aae
--- /dev/null
+++ b/dbm-services/common/dbha/ha-module/globalmonitor/monitor_component.go
@@ -0,0 +1,379 @@
+// Package globalmonitor monitors whether the DBHA components work normally
+// and whether every machine that needs detection is covered by DBHA.
+package globalmonitor
+
+import (
+	"encoding/json"
+	"fmt"
+	"sort"
+	"strings"
+	"time"
+
+	"dbm-services/bigdata/db-tools/dbactuator/pkg/util"
+	"dbm-services/common/dbha/ha-module/client"
+	"dbm-services/common/dbha/ha-module/config"
+	"dbm-services/common/dbha/ha-module/constvar"
+	"dbm-services/common/dbha/ha-module/log"
+	"dbm-services/common/dbha/ha-module/monitor"
+	"dbm-services/common/dbha/hadb-api/model"
+)
+
+const (
+	// roundStartDelay is slept at the top of every monitor round, including
+	// the rounds that follow a failed fetch.
+	roundStartDelay = 10 * time.Second
+	// agentSlowThreshold is the agent ReportInterval above which an alert is sent.
+	agentSlowThreshold = 20
+	// gmSlowThreshold is the gm ReportInterval above which an alert is sent.
+	gmSlowThreshold = 300
+	// detectFreshness is the maximum age of a detect record that still marks
+	// its machine as covered.
+	detectFreshness = 5 * time.Minute
+)
+
+// MachineInfo instance detail info from cmdb api
+type MachineInfo struct {
+	IP            string `json:"ip"`
+	LogicalCityID int    `json:"logical_city_id"`
+	ClusterType   string `json:"cluster_type"`
+	MachineType   string `json:"machine_type"`
+}
+
+// MonitorComponent global monitor work struct
+type MonitorComponent struct {
+	// active type list for db detect, valid type in constant.go
+	ActiveClusterType []string `yaml:"active_db_type"`
+	// monitor ip
+	MonIp string
+	// all configure file
+	Conf *config.Config
+	// global monitor configure file
+	MonitorConf *config.GlobalMonitorConfig
+	// API client to access cmdb metadata
+	CmDBClient *client.CmDBClient
+	// API client to access hadb
+	HaDBClient *client.HaDBClient
+	// cmdb need detect ip list, rebuilt every round
+	NeedDetectMachines map[string]struct{}
+	// cmdb need detect city list, rebuilt every round
+	NeedDetectCities map[int]struct{}
+	// HA detected ip list, rebuilt every round
+	DetectedMachines map[string]struct{}
+	// HA detected city list, rebuilt every round
+	DetectedCities map[int]struct{}
+	// HA agent list
+	AgentList []model.HaStatus
+	// HA gm list
+	GmList []model.HaStatus
+	// alert info to bk
+	AlertInfo monitor.MonitorInfo
+	// hash mod use to batch fetch cmdb instance
+	HashMod int
+	// skip statistics city list
+	IgnoreCityList []int
+}
+
+// NewMonitorComponent create new global monitor component
+func NewMonitorComponent(conf *config.Config) *MonitorComponent {
+	return &MonitorComponent{
+		ActiveClusterType:  conf.GlobalMonitorConf.ActiveClusterType,
+		Conf:               conf,
+		MonitorConf:        conf.GlobalMonitorConf,
+		CmDBClient:         client.NewCmDBClient(&conf.DBConf.CMDB, conf.GetCloudId()),
+		HaDBClient:         client.NewHaDBClient(&conf.DBConf.HADB, conf.GetCloudId()),
+		MonIp:              conf.GlobalMonitorConf.LocalIP,
+		NeedDetectMachines: make(map[string]struct{}),
+		NeedDetectCities:   make(map[int]struct{}),
+		DetectedMachines:   make(map[string]struct{}),
+		DetectedCities:     make(map[int]struct{}),
+		HashMod:            conf.GlobalMonitorConf.HashMod,
+		IgnoreCityList:     conf.GlobalMonitorConf.IgnoreCityList,
+		AlertInfo: monitor.MonitorInfo{
+			EventName:       constvar.DBHAEventGlobalMonitor,
+			MonitorInfoType: constvar.MonitorInfoGlobal,
+			Global: monitor.GlobalMonitor{
+				ServerIp:           conf.Monitor.LocalIP,
+				UnCoveredInsNumber: 0,
+				UnCoveredCityIDs:   nil,
+				NeedDetectNumber:   0,
+				HADetectedNumber:   0,
+			},
+		},
+	}
+}
+
+// Run executes monitor rounds one after another and never returns.
+// A round whose input cannot be fetched is logged and abandoned; a finished
+// round additionally sleeps MonitorConf.ReportInterval seconds.
+func (m *MonitorComponent) Run() error {
+	for {
+		time.Sleep(roundStartDelay)
+		log.Logger.Infof("------------------global monitor run start-----------------")
+		// start every round from empty statistics, otherwise machines and
+		// cities seen in an earlier round would be counted as present forever
+		m.resetRoundState()
+
+		log.Logger.Debugf("try to get all ha componentinfo")
+		if err := m.getAllHaComponentInfo(); err != nil {
+			log.Logger.Errorf("get all HA component info failed:%s", err.Error())
+			continue
+		}
+		if err := m.getAllDetectedMachineInfo(); err != nil {
+			log.Logger.Errorf("get all HA detected machine failed:%s", err.Error())
+			continue
+		}
+		if err := m.getAllNeedDetectMachineInfo(); err != nil {
+			log.Logger.Errorf("get all CMDB need detect machine failed:%s", err.Error())
+			continue
+		}
+
+		m.checkAllCovered()
+		m.checkComponentNormal()
+		m.reportHeartbeat()
+
+		log.Logger.Infof("------------------global monitor run finish-----------------")
+		time.Sleep(time.Duration(m.MonitorConf.ReportInterval) * time.Second)
+	}
+}
+
+// resetRoundState drops the statistics collected by the previous round.
+func (m *MonitorComponent) resetRoundState() {
+	m.NeedDetectMachines = make(map[string]struct{})
+	m.NeedDetectCities = make(map[int]struct{})
+	m.DetectedMachines = make(map[string]struct{})
+	m.DetectedCities = make(map[int]struct{})
+	m.AlertInfo.Global.UnCoveredInsNumber = 0
+	m.AlertInfo.Global.UnCoveredCityIDs = nil
+}
+
+// RegisterMonitorInfoToHaDB register current monitor info to HA-DB
+func (m *MonitorComponent) RegisterMonitorInfoToHaDB() error {
+	return m.HaDBClient.RegisterDBHAInfo(m.MonIp, 0, constvar.MONITOR, 0, "", "ALL")
+}
+
+// reportHeartbeat send monitor heartbeat to HA-DB; a failure is only logged
+func (m *MonitorComponent) reportHeartbeat() {
+	err := m.HaDBClient.ReporterMonitorHeartbeat(m.MonIp, strings.Join(m.ActiveClusterType, ","))
+	if err != nil {
+		log.Logger.Errorf("report heartbeat failed:%s", err.Error())
+	}
+}
+
+// checkAllCovered compares the machines and cities that need detection with
+// the ones DBHA currently detects, and sends one alert per uncovered kind.
+func (m *MonitorComponent) checkAllCovered() {
+	log.Logger.Infof("all detected city num:%d", len(m.DetectedCities))
+	log.Logger.Infof("all detected machine num:%d", len(m.DetectedMachines))
+	log.Logger.Infof("all need detect city num:%d", len(m.NeedDetectCities))
+	log.Logger.Infof("all need detect machine num:%d", len(m.NeedDetectMachines))
+
+	// undetected machines, sorted so that logs are stable between rounds
+	var unCoveredMachines []string
+	for ip := range m.NeedDetectMachines {
+		if _, ok := m.DetectedMachines[ip]; !ok {
+			unCoveredMachines = append(unCoveredMachines, ip)
+		}
+	}
+	sort.Strings(unCoveredMachines)
+
+	// undetected logical_city_ids, sorted for the same reason
+	var unCoveredCities []int
+	for city := range m.NeedDetectCities {
+		if _, ok := m.DetectedCities[city]; !ok {
+			unCoveredCities = append(unCoveredCities, city)
+		}
+	}
+	sort.Ints(unCoveredCities)
+
+	// fill the whole alert payload before anything is sent, so that both
+	// alerts below carry complete and current numbers
+	m.AlertInfo.Global.NeedDetectNumber = len(m.NeedDetectMachines)
+	m.AlertInfo.Global.HADetectedNumber = len(m.DetectedMachines)
+	m.AlertInfo.Global.UnCoveredInsNumber = len(unCoveredMachines)
+	m.AlertInfo.Global.UnCoveredCityIDs = unCoveredCities
+
+	if len(unCoveredMachines) > 0 {
+		log.Logger.Errorf("uncovered machine list:%#v", unCoveredMachines)
+		if err := monitor.MonitorSend(fmt.Sprintf("%d machines not covered by dbha",
+			len(unCoveredMachines)), m.AlertInfo); err != nil {
+			log.Logger.Warnf("%s", err.Error())
+		}
+	}
+
+	if len(unCoveredCities) > 0 {
+		log.Logger.Errorf("uncovered city list:%#v", unCoveredCities)
+		if err := monitor.MonitorSend(fmt.Sprintf("%d logical_city_ids not covered by dbha",
+			len(unCoveredCities)), m.AlertInfo); err != nil {
+			log.Logger.Warnf("%s", err.Error())
+		}
+	}
+
+	log.Logger.Debugf("global monitor info: %#v", m.AlertInfo.Global)
+}
+
+// checkComponentNormal alerts for every agent or gm whose ReportInterval
+// exceeds its threshold.
+func (m *MonitorComponent) checkComponentNormal() {
+	for _, agent := range m.AgentList {
+		if agent.ReportInterval <= agentSlowThreshold {
+			continue
+		}
+		msg := fmt.Sprintf("agent:%s, cluster_type:%s detect too slow:%d",
+			agent.IP, agent.DbType, agent.ReportInterval)
+		m.sendComponentAlert(msg)
+	}
+	for _, gm := range m.GmList {
+		if gm.ReportInterval <= gmSlowThreshold {
+			continue
+		}
+		msg := fmt.Sprintf("gm:%s, Campuse:%s report too slow:%d", gm.IP, gm.Campus, gm.ReportInterval)
+		m.sendComponentAlert(msg)
+	}
+}
+
+// sendComponentAlert logs msg and sends it as a global monitor alert.
+func (m *MonitorComponent) sendComponentAlert(msg string) {
+	// msg is data, not a format string: it may contain '%'
+	log.Logger.Errorf("%s", msg)
+	if err := monitor.MonitorSend(msg, m.AlertInfo); err != nil {
+		log.Logger.Warnf("%s", err.Error())
+	}
+}
+
+// getCmDBMachineByCluster fetch one hash slice of the cmdb instances of
+// clusterType and record the machines and cities that need detection
+func (m *MonitorComponent) getCmDBMachineByCluster(clusterType string, hashMod, hashValue int) error {
+	req := client.DBInstanceByClusterTypeRequest{
+		HashCnt:      hashMod,
+		HashValue:    hashValue,
+		ClusterTypes: []string{clusterType},
+	}
+
+	// get all instances by cluster type
+	rawInfo, err := m.CmDBClient.GetDBInstanceByClusterType(req)
+	if err != nil {
+		minInfo := monitor.GetApiAlertInfo(constvar.CmDBInstanceUrl, err.Error())
+		if e := monitor.MonitorSend("get instances failed", minInfo); e != nil {
+			log.Logger.Warnf("%s", e.Error())
+		}
+		return fmt.Errorf("fetch all cmdb instance failed:%w", err)
+	}
+
+	// number of machines first seen in this call
+	num := 0
+	for _, v := range rawInfo {
+		cmdbIns := MachineInfo{}
+		rawIns, jsonErr := json.Marshal(v)
+		if jsonErr != nil {
+			log.Logger.Errorf("marshal db instance info failed:%s", jsonErr.Error())
+			return fmt.Errorf("get cmdb instance info failed:%w", jsonErr)
+		}
+		if jsonErr = json.Unmarshal(rawIns, &cmdbIns); jsonErr != nil {
+			log.Logger.Errorf("unmarshal db instance info failed:%s", jsonErr.Error())
+			return fmt.Errorf("get cmdb instance info failed:%w", jsonErr)
+		}
+
+		// should ignore some city
+		if util.HasElem(cmdbIns.LogicalCityID, m.IgnoreCityList) {
+			continue
+		}
+		// only the proxy layer of some cluster types needs HA
+		if isStorageLayerWithoutHA(cmdbIns) {
+			continue
+		}
+
+		if _, ok := m.NeedDetectMachines[cmdbIns.IP]; !ok {
+			m.NeedDetectMachines[cmdbIns.IP] = struct{}{}
+			num++
+		}
+		m.NeedDetectCities[cmdbIns.LogicalCityID] = struct{}{}
+	}
+	log.Logger.Debugf("cluster type:%s, hash_mod:%d, hash_value:%d, need detect machine number:%d",
+		clusterType, hashMod, hashValue, num)
+
+	return nil
+}
+
+// isStorageLayerWithoutHA reports whether ins is a storage-layer machine of a
+// cluster type whose proxy layer is the only one that needs HA.
+func isStorageLayerWithoutHA(ins MachineInfo) bool {
+	switch ins.ClusterType {
+	case constvar.PredixyRedisCluster:
+		return ins.MachineType == constvar.TendisCacheMetaType
+	case constvar.TendisplusCluster:
+		return ins.MachineType == constvar.TendisplusMetaType
+	case constvar.MongoShardedCluster:
+		return ins.MachineType == constvar.MongodbMetaType ||
+			ins.MachineType == constvar.MongoConfigMetaType
+	default:
+		return false
+	}
+}
+
+// getAllNeedDetectMachineInfo get all need detect machine from cmdb
+func (m *MonitorComponent) getAllNeedDetectMachineInfo() error {
+	// with a non-positive hash mod nothing would be fetched and every
+	// machine would silently count as covered
+	if m.HashMod <= 0 {
+		return fmt.Errorf("invalid hash mod:%d, must be positive", m.HashMod)
+	}
+	for _, clusterType := range m.ActiveClusterType {
+		log.Logger.Infof("try to get all instances by cluster type:%s", clusterType)
+		for i := 0; i < m.HashMod; i++ {
+			if err := m.getCmDBMachineByCluster(clusterType, m.HashMod, i); err != nil {
+				return err
+			}
+		}
+	}
+	log.Logger.Debugf("all need detect city info:%#v", m.NeedDetectCities)
+
+	return nil
+}
+
+// getAllHaComponentInfo get all alive component from hadb
+func (m *MonitorComponent) getAllHaComponentInfo() error {
+	interval := m.MonitorConf.ReportInterval
+	log.Logger.Infof("try to get alive agent info in latest %d second", interval)
+	agentInfo, err := m.HaDBClient.GetAliveHAComponent(constvar.Agent, interval)
+	if err != nil {
+		return fmt.Errorf("get alive agent info failed:%w", err)
+	}
+	m.AgentList = agentInfo
+	log.Logger.Debugf("agent list:%#v", m.AgentList)
+	for _, agent := range m.AgentList {
+		m.DetectedCities[agent.CityID] = struct{}{}
+	}
+	log.Logger.Infof("all detected city list:%#v", m.DetectedCities)
+
+	log.Logger.Infof("try to get alive gm info in latest %d second", interval)
+	gmInfo, err := m.HaDBClient.GetAliveHAComponent(constvar.GM, interval)
+	if err != nil {
+		return fmt.Errorf("get alive gm info failed:%w", err)
+	}
+	m.GmList = gmInfo
+	log.Logger.Debugf("gm list:%#v", m.GmList)
+
+	return nil
+}
+
+// getAllDetectedMachineInfo get all detected machine from HADB
+func (m *MonitorComponent) getAllDetectedMachineInfo() error {
+	log.Logger.Infof("try to get all detected instances info from hadb")
+	detectInfo, err := m.HaDBClient.GetHADetectInfo()
+	if err != nil {
+		return err
+	}
+	now := time.Now()
+	for _, ins := range detectInfo {
+		// a record without a detect time can not prove coverage
+		if ins.LastTime == nil {
+			continue
+		}
+		if ins.LastTime.Before(now) && now.Sub(*ins.LastTime) <= detectFreshness {
+			m.DetectedMachines[ins.IP] = struct{}{}
+		}
+	}
+	log.Logger.Debugf("all detected machine info:%#v", m.DetectedMachines)
+
+	return nil
+}
diff --git a/dbm-services/common/dbha/ha-module/gm/gcm.go b/dbm-services/common/dbha/ha-module/gm/gcm.go
index 423f62e00b..af699b49ef 100644
--- 
a/dbm-services/common/dbha/ha-module/gm/gcm.go +++ b/dbm-services/common/dbha/ha-module/gm/gcm.go @@ -1,8 +1,6 @@ package gm import ( - "dbm-services/common/dbha/ha-module/util" - "dbm-services/common/dbha/hadb-api/model" "fmt" "time" @@ -12,6 +10,8 @@ import ( "dbm-services/common/dbha/ha-module/dbutil" "dbm-services/common/dbha/ha-module/log" "dbm-services/common/dbha/ha-module/monitor" + "dbm-services/common/dbha/ha-module/util" + "dbm-services/common/dbha/hadb-api/model" ) // GCM gcm work struct diff --git a/dbm-services/common/dbha/ha-module/gm/gqa.go b/dbm-services/common/dbha/ha-module/gm/gqa.go index 1d1250ec80..bdc56f3f4f 100644 --- a/dbm-services/common/dbha/ha-module/gm/gqa.go +++ b/dbm-services/common/dbha/ha-module/gm/gqa.go @@ -10,6 +10,7 @@ import ( "dbm-services/common/dbha/ha-module/dbmodule" "dbm-services/common/dbha/ha-module/dbutil" "dbm-services/common/dbha/ha-module/log" + "dbm-services/common/dbha/ha-module/monitor" ) // GQA work struct @@ -200,6 +201,10 @@ func (gqa *GQA) getAllInstanceFromCMDB( ip, _ := instance.db.GetAddress() instances, err := gqa.CmDBClient.GetDBInstanceInfoByIp(ip) if err != nil { + minInfo := monitor.GetApiAlertInfo(constvar.CmDBInstanceUrl, err.Error()) + if e := monitor.MonitorSend("get instances failed", minInfo); e != nil { + log.Logger.Warnf(e.Error()) + } log.Logger.Errorf("get mysql instance failed. 
err:%s", err.Error()) return nil, err } diff --git a/dbm-services/common/dbha/ha-module/monitor/monitor.go b/dbm-services/common/dbha/ha-module/monitor/monitor.go index 4297545b0e..3d9b77da1d 100644 --- a/dbm-services/common/dbha/ha-module/monitor/monitor.go +++ b/dbm-services/common/dbha/ha-module/monitor/monitor.go @@ -2,11 +2,8 @@ package monitor import ( - "encoding/json" - "fmt" "strconv" - "dbm-services/common/dbha/ha-module/client" "dbm-services/common/dbha/ha-module/config" "dbm-services/common/dbha/ha-module/constvar" "dbm-services/common/dbha/ha-module/dbutil" @@ -49,7 +46,6 @@ type DetectMonitor struct { // GlobalMonitor HA global monitor struct type GlobalMonitor struct { - CloudId int ServerIp string //not detect logical_city_ids UnCoveredCityIDs []int @@ -59,6 +55,13 @@ type GlobalMonitor struct { NeedDetectNumber int //HA detected instances number HADetectedNumber int + Content string +} + +// APIMonitor api monitor struct +type APIMonitor struct { + ApiName string + Message string } // MonitorInfo the struct of monitor information @@ -68,7 +71,8 @@ type MonitorInfo struct { Switch SwitchMonitor Detect DetectMonitor //global monitor - Global GlobalMonitor + Global GlobalMonitor + ApiInfo APIMonitor } // MonitorInit init monitor moudule by config @@ -107,7 +111,8 @@ func MonitorSendDetect(ins dbutil.DataBaseDetect, eventName string, content stri // MonitorSend send dbha monitor information func MonitorSend(content string, info MonitorInfo) error { addDimension := make(map[string]interface{}) - if info.MonitorInfoType == constvar.MonitorInfoSwitch { + switch info.MonitorInfoType { + case constvar.MonitorInfoSwitch: // switch monitor information dimension add addDimension["instance_role"] = info.Switch.Role addDimension["appid"] = info.Switch.Bzid @@ -127,7 +132,7 @@ func MonitorSend(content string, info MonitorInfo) error { addDimension[constvar.NewMasterHost] = info.Switch.NewMasterHost addDimension[constvar.NewMasterPort] = info.Switch.NewMasterPort 
} - } else if info.MonitorInfoType == constvar.MonitorInfoDetect { + case constvar.MonitorInfoDetect: // detect monitor information dimension add addDimension["appid"] = info.Detect.Bzid addDimension["server_ip"] = info.Detect.ServerIp @@ -136,14 +141,15 @@ func MonitorSend(content string, info MonitorInfo) error { addDimension["cluster_domain"] = info.Detect.Cluster addDimension["machine_type"] = info.Detect.MachineType addDimension["cluster_type"] = info.Detect.ClusterType - } else if info.MonitorInfoType == constvar.MonitorInfoGlobal { - addDimension["cloud_id"] = info.Global.CloudId + case constvar.MonitorInfoGlobal: addDimension["server_ip"] = info.Global.ServerIp - addDimension["cloud_id"] = info.Global.CloudId - addDimension["uncovered_num"] = info.Global.UnCoveredInsNumber + addDimension["uncovered_ins_num"] = info.Global.UnCoveredInsNumber addDimension["need_detect_num"] = info.Global.NeedDetectNumber - addDimension["ha_detect__num"] = info.Global.HADetectedNumber + addDimension["ha_detect_num"] = info.Global.HADetectedNumber addDimension["uncovered_city_ids"] = util.IntSlice2String(info.Global.UnCoveredCityIDs, ",") + case constvar.MonitorInfoAPI: + addDimension["api_name"] = info.ApiInfo.ApiName + addDimension["api_message"] = info.ApiInfo.Message } return SendEvent(info.EventName, content, addDimension) @@ -247,113 +253,13 @@ func GetMonitorInfoByDetect(ins dbutil.DataBaseDetect, eventName string) Monitor } } -// CheckHAComponent check whether HA component work normal -// 1. all need detect CMDB instances should detect -// 2. 
alive agent should found -func CheckHAComponent(conf *config.Config) (MonitorInfo, error) { - cmdbClient := client.NewCmDBClient(&conf.DBConf.CMDB, conf.GetCloudId()) - hadbClient := client.NewHaDBClient(&conf.DBConf.HADB, conf.GetCloudId()) - monitorInfo := MonitorInfo{ - EventName: constvar.DBHAEventGlobalMonitor, - MonitorInfoType: constvar.MonitorInfoGlobal, - Global: GlobalMonitor{ - CloudId: conf.Monitor.CloudID, - ServerIp: conf.Monitor.LocalIP, - UnCoveredInsNumber: 0, - UnCoveredCityIDs: nil, - NeedDetectNumber: 0, - HADetectedNumber: 0, +func GetApiAlertInfo(apiName, message string) MonitorInfo { + return MonitorInfo{ + EventName: constvar.DBHAEventApiFailed, + MonitorInfoType: constvar.MonitorInfoAPI, + ApiInfo: APIMonitor{ + ApiName: apiName, + Message: message, }, } - - //undetected instances - unCoveredIns := map[string]struct{}{} - //undetected logical_city_ids - unCoveredCityIDs := map[int]struct{}{} - //all logical_city_ids detected by agent - allDetectCityIDs := map[int]struct{}{} - - log.Logger.Infof("try to get alive agent info latest 10 minutes") - if agentInfo, err := hadbClient.GetAliveHAComponent(constvar.Agent, 600); err != nil { - return monitorInfo, fmt.Errorf("get alive agent info failed:%s", err.Error()) - } else { - log.Logger.Debugf("all agent info:%#v", agentInfo) - for _, agent := range agentInfo { - allDetectCityIDs[agent.CityID] = struct{}{} - } - } - - //2. 
uncovered logic_city_id - log.Logger.Infof("try to get all need detect instances info from cmdb") - if rawInfo, err := cmdbClient.GetAllDBInstanceInfo(); err != nil { - return monitorInfo, fmt.Errorf("fetch all cmdb instance failed:%s", err.Error()) - } else { - needDetectIpMap := map[string]struct{}{} - log.Logger.Debugf("all cmdb instances number:%d", len(rawInfo)) - - log.Logger.Infof("try to get all detected instances info from hadb") - detectInfo, err := hadbClient.GetDBDetectInfo() - if err != nil { - return monitorInfo, fmt.Errorf("fetch all detected instances from hadb failed:%s", err.Error()) - } - log.Logger.Debugf("HA detected instances number:%d", len(detectInfo)) - monitorInfo.Global.HADetectedNumber = len(detectInfo) - - for _, v := range rawInfo { - found := false - cmdbIns := dbutil.DBInstanceInfoDetail{} - rawIns, jsonErr := json.Marshal(v) - if jsonErr != nil { - log.Logger.Errorf("marshal db instance info failed:%s", jsonErr.Error()) - return monitorInfo, fmt.Errorf("get cmdb instance info failed:%s", jsonErr.Error()) - } - if jsonErr = json.Unmarshal(rawIns, &cmdbIns); jsonErr != nil { - log.Logger.Errorf("unmarshal db instance info failed:%s", jsonErr.Error()) - return monitorInfo, fmt.Errorf("get cmdb instance info failed:%s", jsonErr.Error()) - } - - //TODO, API filter active cluster type more efficient - if _, ok := needDetectIpMap[cmdbIns.IP]; ok || - !util.HasElem(cmdbIns.ClusterType, conf.Monitor.ActiveDBType) { - continue - } else { - needDetectIpMap[cmdbIns.IP] = struct{}{} - } - - for _, detectIns := range detectInfo { - if cmdbIns.IP == detectIns.IP { - found = true - break - } - } - if !found { - unCoveredIns[cmdbIns.IP] = struct{}{} - if _, ok := allDetectCityIDs[cmdbIns.LogicalCityID]; !ok { - unCoveredCityIDs[cmdbIns.LogicalCityID] = struct{}{} - } - } - } - monitorInfo.Global.NeedDetectNumber = len(needDetectIpMap) - } - - if len(unCoveredIns) > 0 { - log.Logger.Errorf("uncovered instances list:%#v", unCoveredIns) - return 
monitorInfo, fmt.Errorf("%d instances not covered by dbha", len(unCoveredIns)) - } - - if len(unCoveredCityIDs) > 0 { - for k := range unCoveredCityIDs { - monitorInfo.Global.UnCoveredCityIDs = append(monitorInfo.Global.UnCoveredCityIDs, k) - } - return monitorInfo, fmt.Errorf("%d logical_city_ids not covered by dbha", len(unCoveredCityIDs)) - } - - if monitorInfo.Global.HADetectedNumber != monitorInfo.Global.NeedDetectNumber { - return monitorInfo, fmt.Errorf("need detect number:%d not equal HA detect number:%d", - monitorInfo.Global.NeedDetectNumber, monitorInfo.Global.HADetectedNumber) - } - - log.Logger.Debugf("global monitor info: %#v", monitorInfo) - - return monitorInfo, nil } diff --git a/dbm-services/common/dbha/ha-module/test/client_test.go b/dbm-services/common/dbha/ha-module/test/client_test.go deleted file mode 100644 index f354d428f6..0000000000 --- a/dbm-services/common/dbha/ha-module/test/client_test.go +++ /dev/null @@ -1,69 +0,0 @@ -package test - -import ( - "fmt" - "net/http" - "testing" - - "dbm-services/common/dbha/ha-module/client" - "dbm-services/common/dbha/ha-module/config" - "dbm-services/common/dbha/ha-module/constvar" - "dbm-services/common/dbha/ha-module/dbmodule" -) - -func TestNewClientByAddrs(t *testing.T) { - addr := "http://127.0.0.1:8080" - c, _ := client.NewClientByAddrs([]string{addr}, constvar.CmDBName) - param := c.ConvertParamForGetRequest(map[string]string{ - "apps": "test1", - }) - result, err := c.DoNew(http.MethodGet, "/cmdb/cluster/query?"+param, nil, nil) - if err != nil { - fmt.Printf("requst failed:%s", err.Error()) - } - fmt.Printf("%s", string(result.Data)) -} - -func TestGetInstanceByCity(t *testing.T) { - GlobalConfig, err := config.ParseConfigureFile("../monitor_agent.yaml") - if err != nil { - fmt.Printf("get config failed. 
err:%s", err.Error()) - t.FailNow() - } - addr := "http://127.0.0.1:8080" - c, _ := client.NewClientByAddrs([]string{addr}, constvar.CmDBName) - cmdbC := client.CmDBClient{ - Client: *c, - } - rawList, err := cmdbC.GetDBInstanceInfoByCity(2) - if err != nil { - fmt.Printf("get instance failed. err:%s", err.Error()) - t.FailNow() - } - dbs, err := dbmodule.DBCallbackMap[constvar.DetectTenDBHA].FetchDBCallback(rawList, GlobalConfig) - for _, info := range dbs { - ip, port := info.GetAddress() - fmt.Printf("%s, %d, %s, %s, %s\n", ip, port, info.GetDBType(), info.GetStatus(), info.GetApp()) - } -} - -func TestGetInstanceByIp(t *testing.T) { - addr := "http://127.0.0.1:8080" - c, _ := client.NewClientByAddrs([]string{addr}, constvar.CmDBName) - cmdbC := client.CmDBClient{ - Client: *c, - } - inf, err := cmdbC.GetDBInstanceInfoByIp("127.0.0.6") - if err != nil { - fmt.Printf("get instance failed. err:%s", err.Error()) - t.FailNow() - } - list, err := dbmodule.DBCallbackMap[constvar.DetectTenDBHA].GetSwitchInstanceInformation(inf, nil) - if err != nil { - fmt.Printf("get switch instance failed. 
err:%s", err.Error()) - t.FailNow() - } - for _, info := range list { - fmt.Printf("%v\n", info) - } -} diff --git a/dbm-services/common/dbha/hadb-api/pkg/handler/hastatus/hastatus_handler.go b/dbm-services/common/dbha/hadb-api/pkg/handler/hastatus/hastatus_handler.go index a68af4b1a3..b6ba44a2cc 100644 --- a/dbm-services/common/dbha/hadb-api/pkg/handler/hastatus/hastatus_handler.go +++ b/dbm-services/common/dbha/hadb-api/pkg/handler/hastatus/hastatus_handler.go @@ -52,6 +52,8 @@ const ( GetAgentInfo = "agent_get_agent_info" // UpdateAgentInfo TODO UpdateAgentInfo = "reporter_agent_heartbeat" + // UpdateMonitorInfo TODO + UpdateMonitorInfo = "reporter_monitor_heartbeat" // UpdateGMInfo TODO UpdateGMInfo = "reporter_gm_heartbeat" // GetAliveAgentInfo TODO @@ -77,9 +79,7 @@ func Handler(ctx *fasthttp.RequestCtx) { switch param.Name { case GetGmInfo, GetAgentInfo: GetHaInfo(ctx, param.QueryArgs) - case UpdateAgentInfo: - UpdateHaInfo(ctx, param.QueryArgs, param.SetArgs) - case UpdateGMInfo: + case UpdateAgentInfo, UpdateGMInfo, UpdateMonitorInfo: UpdateHaInfo(ctx, param.QueryArgs, param.SetArgs) case GetAliveHAInfo: GetAliveHAByModule(ctx, param.QueryArgs) diff --git a/dbm-ui/backend/flow/utils/cloud/script_template/dbha_template.py b/dbm-ui/backend/flow/utils/cloud/script_template/dbha_template.py index ebd2371324..3ef26dd08c 100644 --- a/dbm-ui/backend/flow/utils/cloud/script_template/dbha_template.py +++ b/dbm-ui/backend/flow/utils/cloud/script_template/dbha_template.py @@ -306,6 +306,8 @@ cp /data/install/{{dbha_conf}} $path/dbha/{{dbha_type}}; cp /data/install/dbha $path/dbha/{{dbha_type}}; chmod -R 777 $path/dbha; +systemctl start ntpd +systemctl enable ntpd # 部署dbha服务 cd $path/dbha/{{dbha_type}}