diff --git a/source/tools/monitor/unity/collector/guard/guardSelfStat.lua b/source/tools/monitor/unity/collector/guard/guardSelfStat.lua index bbdc5e3ddab8e7f3a96b2693358dd810590c4e5c..f02d72e48b844770300a2ae3e7d52a025a08fd80 100644 --- a/source/tools/monitor/unity/collector/guard/guardSelfStat.lua +++ b/source/tools/monitor/unity/collector/guard/guardSelfStat.lua @@ -30,8 +30,17 @@ function CguardSelfStat:_init_(proto, pffi, mnt, resYaml, jperiod) self._lastUser, self._lastSys, _, _ = readProc(self._path) self._period = jperiod - self._cpuLimit = resYaml.config.limit.cpu * jperiod / 100 - self._memLimit = resYaml.config.limit.mem * 1024 * 1024 + + self._cpuLimit = nil + self._memLimit = nil + if resYaml.config.limit then + if resYaml.config.limit.cpu then + self._cpuLimit = resYaml.config.limit.cpu * jperiod / 100 + end + if resYaml.config.limit.mem then + self._memLimit = resYaml.config.limit.mem * 1024 * 1024 + end + end end local function rssRssAnon() @@ -54,17 +63,17 @@ function CguardSelfStat:proc(elapsed, lines) local user, sys, vsize, rss = readProc(self._path) local _user, _sys = user - self._lastUser, sys - self._lastSys - local cpus = _user, _sys + local cpus = _user + _sys self._lastUser, self._lastSys = user, sys - if cpus > self._cpuLimit * elapsed then - print("last cpu usage overflow." .. cpus) + if self._cpuLimit and cpus > self._cpuLimit * elapsed then + print("last cpu usage overflow. user + sys jiffies: " .. cpus) os.exit(1) end local anon = rssRssAnon() - if anon > self._memLimit then - print("last mem usage overflow." .. rss) + if self._memLimit and anon > self._memLimit then + print("last mem usage overflow. rss bytes: " .. rss) os.exit(1) end local vs = { diff --git a/source/tools/monitor/unity/collector/plugin.yaml b/source/tools/monitor/unity/collector/plugin.yaml deleted file mode 100644 index 07e9b3c0ce7a13a3fc1db8ed99c043befd6ede56..0000000000000000000000000000000000000000 --- a/source/tools/monitor/unity/collector/plugin.yaml +++ /dev/null @@ -1,333 +0,0 @@ -config: - freq: 20 # unit second - daemon: true - port: 8400 # bind port - bind_addr: 0.0.0.0 # bind ip - backlog: 32 # listen backlog - identity: # support hostip, curl(need url arg), hostname, file(need path arg), specify(need name arg), env(need name arg) - mode: curl - url: "http://100.100.100.200/latest/meta-data/instance-id" -# name: test_specify -# mode: hostip -# real_timestamps: true -# unix_socket: "/tmp/sysom_unity.sock" - proc_path: /mnt/host/ # in container mode, like -v /:/mnt/host , should use /mnt/host/ -# proc_path: / # in container mode, like -v /:/mnt/host , should use /mnt/host/ - db: - rotate: 7 # tsdb file retention time, unit day - budget: 200 # max query buffer from tsdb. - limit: - cpu: 30 # unit % - mem: 40 # unit mb - tasks: 10 # monitor 10 pid max. - -forkRun: - - - cmd: "/usr/bin/python" - args: ["../test/curl/forkRun.py"] - -oss: - bucket: "netinfo-shenzhen" - endPoint: "oss-cn-shenzhen.aliyuncs.com" - ak: "ak" - sk: "sk" - -diagnose: - io_hang: - block: 60 - time: 15 - cmd: "../../../iosdiag" - report: - title: "iosdiag" - files: - - "/var/log/sysak/iosdiag/hangdetect/result.log.stat" - - "/var/log/sysak/iosdiag/hangdetect/result.log.seq" - net_edge: - block: 300 - time: 60 - so: - virtiostat: 15 - cmd: "../../../netCli" - jruntime: - block: 60 - time: 30 - cmd: "../../../java_collect" - -outline: - - /var/sysom/outline - -pushTo: - to: "Influx" - host: "ld-wz9d17b514mg6kjkx-proxy-tsdb.lindorm.rds.aliyuncs.com" - port: 8242 - url: "/api/v2/write?db=lua" - -container: - mode: "pods" - luaPlugin: ["cg_cpu_stat_sample", "cg_cpuacct_stat"] - directCgPath: - - "/" - - "/kubepods.slice" - - "/kubepods.slice/kubepods-besteffort.slice" - - "/kubepods.slice/kubepods-burstable.slice" - - indirectCgPath: - - "kubepods.slice" - - "kubepods.slice/kubepods-besteffort.slice" - - "kubepods.slice/kubepods-burstable.slice" - - indirectCgPath1: - - path: "/kubepods.slice" - child1: "/kubepods%-pod" - child2: "/cri%-containerd" - - path: "/kubepods.slice/kubepods-besteffort.slice" - child1: "/kubepods%-besteffort%-pod" - child2: "/cri%-containerd" - - path: "/kubepods.slice/kubepods-burstable.slice" - child1: "/kubepods%-burstable%-pod" - -luaPlugins: ["proc_buddyinfo", "proc_diskstats", "proc_meminfo", "proc_mounts", "proc_netdev", - "proc_snmp_stat", "proc_sockstat", "proc_stat", "proc_statm", "proc_vmstat", - "proc_uptime"] - -plugins: - - so: kmsg - description: "collect dmesg info." - - - so: sample - description: "just a example." - - - so: sample_threads - description: "threads example." - - - so: bpfsample2 - description: "bpf threads example." - - - - so: proc_schedstat - description: "collect schedule stat info of percpu" - - - so: proc_loadavg - description: "collect load avg" - - - - so: unity_nosched - description: "nosched:sys hold cpu and didn't scheduling" - - so: net_health - description: "tcp net health." - - so: net_retrans - description: "tcp retrans monitor." - - - so: unity_irqoff - description: "irqoff:detect irq turned off and can't response" - #- - # so: numainfo - # description: "collect numainfo" - #- - # so: cpufreq - # description: "collect cpufreq" - - - so: gpuinfo - description: "collect gpuinfo" - #- - # so: pmu_events - # description: "collect pmu events" - -metrics: - - - title: sysak_proc_cpu_total - from: cpu_total - head: mode - help: "cpu usage info for total." - type: "gauge" - - title: sysak_proc_cpus - from: cpus - head: mode - help: "cpu usage info for per-cpu." - type: "gauge" - - title: sysak_proc_sirq - from: sirq - head: type - help: "system soft irq times." - type: "gauge" - - title: sysak_proc_stat_counters - from: stat_counters - head: counter - help: "system state counter." - type: "gauge" - - title: sysak_proc_meminfo - from: meminfo - head: value - help: "meminfo from /proc/meminfo." - type: "gauge" - - title: sysak_proc_vmstat - from: vmstat - head: value - help: "vmstat info from /proc/vmstat." - type: "gauge" - - title: sysak_proc_self_statm - from: self_statm - head: value - help: "statm info from /proc/self/statm." - type: "gauge" - - title: sysak_proc_networks - from: networks - head: counter - help: "networks info from /proc/net/dev." - type: "gauge" - - title: sysak_proc_disks - from: disks - head: counter - help: "disk info from /proc/diskstats." - type: "gauge" - - title: sysak_proc_pkt_status - from: pkt_status - head: counter - help: "net status info from /proc/net/snmp and /proc/net/status." - type: "gauge" - - title: sysak_fs_stat - from: fs_stat - head: counter - help: "file system information." - type: "gauge" - - title: sysak_sock_stat - from: sock_stat - head: value - help: "sock stat counters from /proc/net/sockstat" - type: "gauge" - - title: sysak_sample_tbl1 - from: sample_tbl1 - head: value - help: "example1 for develop." - type: "gauge" - - title: sysak_sample_tbl2 - from: sample_tbl2 - head: value - help: "example2 for develop." - type: "gauge" - - title: sysak_sample_bpfsample2 - from: bpfsample2 - head: value - help: "example for bpfsample2" - type: "gauge" - - title: sysak_proc_schedstat - from: proc_schedstat - head: value - help: "schedule state of percpu." - type: "gauge" - - title: sysak_proc_loadavg - from: proc_loadavg - head: value - help: "loadavg of system from /proc/loadavg" - type: "gauge" - - title: sysak_proc_buddyinfo - from: buddyinfo - head: value - help: "buddyinfo of system from /proc/buddyinfo" - type: "gauge" - - title: sysak_IOMonIndForDisksIO - from: IOMonIndForDisksIO - head: value - help: "Disk IO indicators and abnormal events" - type: "gauge" - - title: sysak_IOMonIndForSystemIO - from: IOMonIndForSystemIO - head: value - help: "System indicators and abnormal events about IO" - type: "gauge" - - title: sysak_IOMonDiagLog - from: IOMonDiagLog - head: value - help: "Diagnose log for IO exception" - type: "gauge" - - title: sysak_sched_moni_jitter - from: sched_moni_jitter - head: value - help: "nosched/irqoff:sys and irqoff hold cpu and didn't scheduling" - type: "gauge" - - title: sysak_cpu_dist - from: cpu_dist - head: value - help: "task cpu sched dist." - type: "gauge" - - title: sysak_net_health_hist - from: net_health_hist - head: value - help: "net_health_hist" - type: "gauge" - - title: sysak_net_health_count - from: net_health_count - head: value - help: "net_health_count" - type: "gauge" - - title: sysak_net_retrans_count - from: net_retrans_count - head: value - help: "net_retrans_count" - type: "gauge" - #- title: sysak_numainfo - # from: numainfo - # head: value - # help: "numainfo of system from /sys/devices/system/" - # type: "gauge" - #- title: sysak_proc_cpufreq - # from: cpufreq - # head: value - # help: "cpufreq of system from /proc/cpuinfo" - # type: "gauge" - - title: sysak_gpuinfo - from: gpuinfo - head: value - help: "gpuinfo of system from nvidia-smi" - type: "gauge" - - #- title: sysak_pod_alloc - #from: pod_alloc - #head: value - #help: "get pod alloc page used" - #type: "gauge" - - title: sysak_pmu_events - from: pmu_events - head: value - help: "pmu events, such as cycles/instructions, llc events" - type: "gauge" - - title: sysak_pmu_events_percpu - from: pmu_events_percpu - head: value - help: "pmu events of percpu" - type: "gauge" - - title: sysak_cg_memfail_cnt - from: cg_memfail_cnt - head: value - help: "sysak_cg_memFail_cnt" - type: "gauge" - - title: sysak_cg_memdrcm_latency - from: cg_memdrcm_latency - head: value - help: "sysak_cg_memdrcm_latency" - type: "gauge" - - title: sysak_cg_memmcmp_latency - from: cg_memmcmp_latency - head: value - help: "sysak_cg_memmcmp_latency" - type: "gauge" - - title: sysak_cg_wait_latency - from: cg_wait_latency - head: value - help: "sysak_cg_wait_latency" - type: "gauge" - - title: sysak_cg_cpuacct_proc_stat - from: cg_cpuacct_proc_stat - head: value - help: "sysak_cg_cpuacct_proc_stat" - type: "gauge" - - title: sysak_cg_cpu_stat - from: cg_cpu_stat - head: value - help: "sysak_cg_cpu_stat" - type: "gauge" - - title: sysak_cg_cpuacct_stat - from: cg_cpuacct_stat - head: value - help: "cpuacct/cpuacct.stat" - type: "gauge"