diff --git a/component/clusterd/main.go b/component/clusterd/main.go index a6a239e31897006cc37157b03aa6cbd974ac5b60..0a91bc2f09b46268885890b57ade56e63b37b94e 100644 --- a/component/clusterd/main.go +++ b/component/clusterd/main.go @@ -139,7 +139,7 @@ func initGrpcServer(ctx context.Context) { grpc.UnaryInterceptor(limitQPS)}) recoverService := recover.NewFaultRecoverService(keepAliveInterval, ctx) pubFaultSvc := publicfault.NewPubFaultService(ctx) - dataTraceSvc := &profiling.ProfilingSwitchManager{} + dataTraceSvc := &profiling.SwitchManager{} if err := server.Start(recoverService, pubFaultSvc, dataTraceSvc); err != nil { hwlog.RunLog.Errorf("clusterd grpc server start failed, error: %v", err) } diff --git a/component/clusterd/pkg/application/profiling/grpc_service.go b/component/clusterd/pkg/application/profiling/grpc_service.go index 87dd4040a0ff2196b1079bdaf9f0a869ff97db9e..886e5e44e73e46ab134a74c2fb2ce392950bf33e 100644 --- a/component/clusterd/pkg/application/profiling/grpc_service.go +++ b/component/clusterd/pkg/application/profiling/grpc_service.go @@ -22,8 +22,8 @@ const ( PartsOfJobNs = 2 ) -// ProfilingSwitchManager represents profiling switch manager -type ProfilingSwitchManager struct { +// SwitchManager represents profiling switch manager +type SwitchManager struct { profiling.UnimplementedTrainingDataTraceServer } @@ -39,7 +39,7 @@ const ( ) // ModifyTrainingDataTraceSwitch to modify the profiling marker status by updating the cm -func (ps *ProfilingSwitchManager) ModifyTrainingDataTraceSwitch(ctx context.Context, +func (ps *SwitchManager) ModifyTrainingDataTraceSwitch(ctx context.Context, in *profiling.DataTypeReq) (*profiling.DataTypeRes, error) { jobNsName := in.GetJobNsName() jobNameInfo := strings.Split(jobNsName, "/") @@ -71,7 +71,7 @@ func (ps *ProfilingSwitchManager) ModifyTrainingDataTraceSwitch(ctx context.Cont } // GetTrainingDataTraceSwitch get current profiling marker status -func (ps *ProfilingSwitchManager) GetTrainingDataTraceSwitch(ctx context.Context, +func (ps *SwitchManager) GetTrainingDataTraceSwitch(ctx context.Context, in *profiling.DataStatusReq) (*profiling.DataStatusRes, error) { jobNsName := in.GetJobNsName() jobNameInfo := strings.Split(jobNsName, "/") diff --git a/component/clusterd/pkg/application/statistics/job_output.go b/component/clusterd/pkg/application/statistics/job_output.go index f73b936c2eac28ed1cdd1eaccb0a2d15145f7591..02bee6f7273f069fd819a74641f6cba5478f995e 100644 --- a/component/clusterd/pkg/application/statistics/job_output.go +++ b/component/clusterd/pkg/application/statistics/job_output.go @@ -31,6 +31,7 @@ func init() { GlobalJobOutputMgr = &OutputMgr{} } +// JobOutput process all job statistic by periodically. func (c *OutputMgr) JobOutput(ctx context.Context) { // Create a timer that fires every 3 seconds and updates cm if the data is updated var lastVersion int64 = statistics.InitVersion @@ -56,6 +57,7 @@ func (c *OutputMgr) JobOutput(ctx context.Context) { } } +// BuildCmData build the cmData from curJobStatistic func (c *OutputMgr) BuildCmData(curJobStatistic constant.CurrJobStatistic) map[string]string { tmpSlice := make([]constant.JobStatistic, 0, len(curJobStatistic.JobStatistic)) cmData := make(map[string]string) diff --git a/component/clusterd/pkg/common/logs/log_utils.go b/component/clusterd/pkg/common/logs/log_utils.go index 76485960d1f3760d9aa584823acc028586f81737..3c6fcd11d4c66b15fd467d5a9f772dbeaba92415 100644 --- a/component/clusterd/pkg/common/logs/log_utils.go +++ b/component/clusterd/pkg/common/logs/log_utils.go @@ -18,6 +18,8 @@ const ( var ( jobEventHwLogConfig = &hwlog.LogConfig{LogFileName: jobEventLog, MaxBackups: jobEventMaxBackupLogs, MaxLineLength: jobEventMaxLogLineLength, MaxAge: jobEventMaxAge} + + // JobEventLog is the logger for job event JobEventLog *hwlog.CustomLogger ) diff --git a/component/clusterd/pkg/domain/statistics/job_collect_utils.go b/component/clusterd/pkg/domain/statistics/job_collect_utils.go index 5f17a45c13a2c1e928ce5ab98f14f982719059a9..266c5ccb946c49e275445c99de07292fb8b0c06d 100644 --- a/component/clusterd/pkg/domain/statistics/job_collect_utils.go +++ b/component/clusterd/pkg/domain/statistics/job_collect_utils.go @@ -20,12 +20,18 @@ import ( ) const ( - JobStcNamespace = "mindx-dl" - JobStcCMName = "current-job-statistic" - JobDataCmKey = "data" - TotalJobsCmKey = "totalJob" + // JobStcNamespace is the namespace for job statistics + JobStcNamespace = "mindx-dl" + // JobStcCMName is the configmap name for job statistics + JobStcCMName = "current-job-statistic" + // JobDataCmKey is the key used to store job date in configmap + JobDataCmKey = "data" + // TotalJobsCmKey is the key used to store all jobs + TotalJobsCmKey = "totalJob" + // maxCMJobStatisticNum is the max job statistics number maxCMJobStatisticNum = 10000 - InitVersion = 0 + // InitVersion is job statistics initial version + InitVersion = 0 ) // JobStcMgr used to job statistic data. diff --git a/component/clusterd/pkg/domain/superpod/superpod_storage.go b/component/clusterd/pkg/domain/superpod/superpod_storage.go index d901b60233086c1a16f1be1a1d0160217531e78e..1a76412e7453f1037f5f6efe9d57a587cd74db89 100644 --- a/component/clusterd/pkg/domain/superpod/superpod_storage.go +++ b/component/clusterd/pkg/domain/superpod/superpod_storage.go @@ -56,6 +56,7 @@ func deepCopySuperPodDevice(superPodDevice *api.SuperPodDevice) *api.SuperPodDev return copySuperPodDevice } +// Manager manages SuperPodDevices type Manager struct { snMap map[string]*api.SuperPodDevice rwLock sync.RWMutex diff --git a/component/clusterd/pkg/interface/grpc/grpc_init.go b/component/clusterd/pkg/interface/grpc/grpc_init.go index 95be6ed913b1aa6a414e26149ca71c6b138e0c6c..6bc3b9f5b260a146a450c1a755f6f640072142c0 100644 --- a/component/clusterd/pkg/interface/grpc/grpc_init.go +++ b/component/clusterd/pkg/interface/grpc/grpc_init.go @@ -54,7 +54,7 @@ func isIPValid(ipStr string) error { // Start the grpc server func (server *ClusterInfoMgrServer) Start(recoverSvc *recover.FaultRecoverService, - pubFaultSvc *publicfault.PubFaultService, dataTraceSvc *profiling.ProfilingSwitchManager) error { + pubFaultSvc *publicfault.PubFaultService, dataTraceSvc *profiling.SwitchManager) error { ipStr := os.Getenv("POD_IP") if err := isIPValid(ipStr); err != nil { return err