diff --git a/source/tools/monitor/unity/etc/daemonset.yaml b/source/tools/monitor/unity/etc/daemonset.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..da1ef1e6565230b813867dffcb962c31af2cc0e8
--- /dev/null
+++ b/source/tools/monitor/unity/etc/daemonset.yaml
@@ -0,0 +1,113 @@
+# ServiceAccount used by the sysom DaemonSet pods.
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: sysom
+  namespace: kube-system
+---
+# Cluster-wide permissions granted to the sysom ServiceAccount.
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: sysom-role
+rules:
+  # Read-only access to pods, nodes and events.
+  - apiGroups:
+      - ''
+    resources:
+      - pods
+      - nodes
+      - nodes/status
+      - nodes/pods
+      - events
+    verbs:
+      - get
+      - list
+      - watch
+  # NOTE(review): all verbs on nodes/proxy is a broad grant; narrow it to
+  # the verbs the agent really needs once that is confirmed.
+  - apiGroups:
+      - ''
+    resources:
+      - nodes/proxy
+    verbs:
+      - '*'
+---
+# Binds sysom-role to the sysom ServiceAccount in kube-system.
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: sysom-binding
+subjects:
+  - kind: ServiceAccount
+    name: sysom
+    namespace: kube-system
+roleRef:
+  kind: ClusterRole
+  name: sysom-role
+  apiGroup: rbac.authorization.k8s.io
+---
+# DaemonSet for the sysom agent. The container is privileged and shares the
+# host network and PID namespaces.
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  labels:
+    app: sysom
+  name: sysom
+  namespace: kube-system
+spec:
+  revisionHistoryLimit: 10
+  selector:
+    matchLabels:
+      app: sysom
+  template:
+    metadata:
+      labels:
+        app: sysom
+    spec:
+      containers:
+        - command:
+            - /bin/sh
+            - '-c'
+            - cd /root/dist/app/beeQ && sh ./run.sh
+          image: 'ackpod-registry.cn-shanghai.cr.aliyuncs.com/mem/sysom:v2.2'
+          imagePullPolicy: IfNotPresent
+          name: sysom
+          ports:
+            - containerPort: 8889  # same value as config.port in k8s.yaml
+              hostPort: 8889
+              name: sysom
+              protocol: TCP
+          resources:
+            requests:
+              cpu: 250m
+              memory: 250Mi
+          securityContext:
+            privileged: true
+          volumeMounts:
+            # Host root filesystem; k8s.yaml sets proc_path to /mnt/host/.
+            - mountPath: /mnt/host
+              name: volume-sysom
+            - mountPath: /sys/kernel/debug
+              name: volume-debugfs
+      hostNetwork: true
+      hostPID: true
+      restartPolicy: Always
+      # serviceAccountName only: the serviceAccount alias is deprecated.
+      serviceAccountName: sysom
+      terminationGracePeriodSeconds: 30
+      volumes:
+        - hostPath:
+            path: /
+            type: ''
+          name: volume-sysom
+        - hostPath:
+            path: /sys/kernel/debug
+            type: ''
+          name: volume-debugfs
+  updateStrategy:
+    rollingUpdate:
+      maxSurge: 0
+      maxUnavailable: 1
+    type: RollingUpdate
diff --git a/source/tools/monitor/unity/etc/k8s.yaml b/source/tools/monitor/unity/etc/k8s.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..19e6059ec0fe78268c4c0c31e6a926f4ff8cff96
--- /dev/null
+++ b/source/tools/monitor/unity/etc/k8s.yaml
@@ -0,0 +1,310 @@
+config:
+  freq: 60  # unit: seconds
+  port: 8889  # bind port
+  bind_addr: 0.0.0.0  # bind ip
+  backlog: 32  # listen backlog
+  identity:  # supports hostip, curl (needs url arg), hostname, file (needs path arg), specify (needs name arg)
+    # mode: curl
+    # url: "http://100.100.100.200/latest/meta-data/instance-id"
+    # name: test_specify
+    mode: curl
+    url: "http://100.100.100.200/latest/meta-data/instance-id"
+  # real_timestamps: true
+  # unix_socket: "/tmp/sysom_unity.sock"
+  proc_path: /mnt/host/  # in container mode (e.g. -v /:/mnt/host), use /mnt/host/
+  db:
+    rotate: 7  # tsdb file retention time, unit: days
+    budget: 200  # max query buffer from tsdb
+  limit:
+    cpu: 90  # unit: %
+    mem: 200  # unit: MB
+    tasks: 10  # monitor at most 10 pids
+
+outline:
+  - /var/sysom/outline
+
+container:
+  mode: "pods"
+  luaPlugin: ["cg_cpu_cfs_quota", "cg_mem_drcm_glob_latency", "cg_memory_util", "cg_cpu_stat_sample", "cg_cpuacct_stat", "cg_memory_drcm_latency", "cg_memory_fail_cnt", "cg_memory_dcmp_latency"]
+  directCgPath:
+    - "/"
+    - "/kubepods.slice"
+    - "/kubepods.slice/kubepods-besteffort.slice"
+    - "/kubepods.slice/kubepods-burstable.slice"
+
+  indirectCgPath:
+    - "kubepods.slice"
+    - "kubepods.slice/kubepods-besteffort.slice"
+    - "kubepods.slice/kubepods-burstable.slice"
+
+  indirectCgPath1:
+    - path: "/kubepods.slice"
+      child1: "/kubepods%-pod"
+      child2: "/cri%-containerd"
+    - path: "/kubepods.slice/kubepods-besteffort.slice"
+      child1: "/kubepods%-besteffort%-pod"
+      child2: "/cri%-containerd"
+    - path: "/kubepods.slice/kubepods-burstable.slice"
+      child1: "/kubepods%-burstable%-pod"  # NOTE(review): no child2 here, unlike the two entries above - confirm
+
+
+luaPlugins: ["podmem", "proc_buddyinfo", "proc_diskstats", "proc_meminfo", "proc_mounts", "proc_netdev",
+  "proc_snmp_stat", "proc_sockstat", "proc_stat", "proc_statm", "proc_vmstat", "pod_allocpage",
+  "proc_uptime", "proc_arp", "proc_cgroups", "proc_softirqs", "proc_softnet_stat"
+]
+
+plugins:
+  - so: kmsg
+    description: "collect dmesg info."
+  -
+    so: proc_schedstat
+    description: "collect schedule stat info of percpu"
+  -
+    so: proc_loadavg
+    description: "collect load avg"
+  - so: net_health
+    description: "tcp net health."
+  - so: net_retrans
+    description: "tcp retrans monitor."
+  - so: cpudist
+    description: "sched delay"
+  # -
+  #   so: unity_irqoff
+  #   description: "irqoff:detect irq turned off and can't response"
+  # -
+  #   so: gpuinfo
+  #   description: "collect gpuinfo"
+
+metrics:
+  -
+    title: sysom_proc_cpu_total
+    from: cpu_total
+    head: mode
+    help: "cpu usage info for total."
+    type: "gauge"
+  - title: sysom_proc_cpus
+    from: cpus
+    head: mode
+    help: "cpu usage info for per-cpu."
+    type: "gauge"
+  - title: sysom_proc_sirq
+    from: sirq
+    head: type
+    help: "system soft irq times."
+    type: "gauge"
+  - title: sysom_proc_stat_counters
+    from: stat_counters
+    head: counter
+    help: "system state counter."
+    type: "gauge"
+  - title: sysom_proc_meminfo
+    from: meminfo
+    head: value
+    help: "meminfo from /proc/meminfo."
+    type: "gauge"
+  - title: sysom_proc_vmstat
+    from: vmstat
+    head: value
+    help: "vmstat info from /proc/vmstat."
+    type: "gauge"
+  - title: sysom_proc_self_statm
+    from: self_statm
+    head: value
+    help: "statm info from /proc/self/statm."
+    type: "gauge"
+  - title: sysom_proc_networks
+    from: networks
+    head: counter
+    help: "networks info from /proc/net/dev."
+    type: "gauge"
+  - title: sysom_proc_disks
+    from: disks
+    head: counter
+    help: "disk info from /proc/diskstats."
+    type: "gauge"
+  - title: sysom_proc_pkt_status
+    from: pkt_status
+    head: counter
+    help: "net status info from /proc/net/snmp and /proc/net/netstat."
+    type: "gauge"
+  - title: sysom_fs_stat
+    from: fs_stat
+    head: counter
+    help: "file system information."
+    type: "gauge"
+  - title: sysom_sock_stat
+    from: sock_stat
+    head: value
+    help: "sock stat counters from /proc/net/sockstat"
+    type: "gauge"
+  - title: sysom_proc_schedstat
+    from: proc_schedstat
+    head: value
+    help: "schedule state of percpu."
+    type: "gauge"
+  - title: sysom_proc_loadavg
+    from: proc_loadavg
+    head: value
+    help: "loadavg of system from /proc/loadavg"
+    type: "gauge"
+  - title: sysom_proc_buddyinfo
+    from: buddyinfo
+    head: value
+    help: "buddyinfo of system from /proc/buddyinfo"
+    type: "gauge"
+  - title: sysom_IOMonIndForDisksIO
+    from: IOMonIndForDisksIO
+    head: value
+    help: "Disk IO indicators and abnormal events"
+    type: "gauge"
+  - title: sysom_IOMonIndForSystemIO
+    from: IOMonIndForSystemIO
+    head: value
+    help: "System indicators and abnormal events about IO"
+    type: "gauge"
+  - title: sysom_IOMonDiagLog
+    from: IOMonDiagLog
+    head: value
+    help: "Diagnose log for IO exception"
+    type: "gauge"
+  - title: sched_moni_jitter  # NOTE(review): only title without the sysom_ prefix - confirm intentional
+    from: sched_moni_jitter
+    head: value
+    help: "nosched/irqoff:sys and irqoff hold cpu and didn't scheduling"
+    type: "gauge"
+  - title: sysom_cpu_dist
+    from: cpu_dist
+    head: value
+    help: "task cpu sched dist."
+    type: "gauge"
+  - title: sysom_net_health_hist
+    from: net_health_hist
+    head: value
+    help: "net_health_hist"
+    type: "gauge"
+  - title: sysom_net_health_count
+    from: net_health_count
+    head: value
+    help: "net_health_count"
+    type: "gauge"
+  - title: sysom_net_retrans_count
+    from: net_retrans_count
+    head: value
+    help: "net_retrans_count"
+    type: "gauge"
+  - title: sysom_gpuinfo
+    from: gpuinfo
+    head: value
+    help: "gpuinfo of system from nvidia-smi"
+    type: "gauge"
+  - title: sysom_uname
+    from: uname
+    head: value
+    help: "uname info"
+    type: "gauge"
+  - title: sysom_uptime
+    from: uptime
+    head: value
+    help: "uptime from /proc/uptime"
+    type: "gauge"
+  - title: sysom_system_release
+    from: system_release
+    head: value
+    help: "system_release from /etc/os-release"
+    type: "gauge"
+  - title: sysom_cgroups
+    from: cgroups
+    head: value
+    help: "cgroup number."
+    type: "gauge"
+  - title: sysom_per_sirqs
+    from: per_sirqs
+    head: value
+    help: "per_sirqs."
+    type: "gauge"
+  - title: sysom_softnets
+    from: softnets
+    head: value
+    help: "softnet stat from /proc/net/softnet_stat."
+    type: "gauge"
+  - title: sysom_interrupts
+    from: interrupts
+    head: value
+    help: "interrupts."
+    type: "gauge"
+  - title: sysom_net_ip_count
+    from: net_ip_count
+    head: value
+    help: "net snmp net_ip_count"
+    type: "gauge"
+  - title: sysom_net_icmp_count
+    from: net_icmp_count
+    head: value
+    help: "net snmp net_icmp_count"
+    type: "gauge"
+  - title: sysom_net_udp_count
+    from: net_udp_count
+    head: value
+    help: "net snmp net_udp_count"
+    type: "gauge"
+  - title: sysom_net_tcp_count
+    from: net_tcp_count
+    head: value
+    help: "net snmp net_tcp_count"
+    type: "gauge"
+  - title: sysom_net_tcp_ext_count
+    from: net_tcp_ext_count
+    head: value
+    help: "net stat net_tcp_ext_count"
+    type: "gauge"
+  - title: sysom_podmem
+    from: podmem
+    head: value
+    help: "file cache for pod"
+    type: "gauge"
+  - title: sysom_alloc_page
+    from: pod_alloc
+    head: value
+    help: "pod tcp memory"
+    type: "gauge"
+  - title: sysom_cg_memfail_cnt
+    from: cg_memfail_cnt
+    head: value
+    help: "sysom_cg_memFail_cnt"
+    type: "gauge"
+  - title: sysom_cg_memUtil
+    from: cg_memory_util
+    head: value
+    help: "sysom_cg_memory_util"
+    type: "gauge"
+  - title: sysom_cg_memgdrcm_latency
+    from: cgGlbDrcmLatency
+    head: value
+    help: "sysom global memory latency"
+    type: "gauge"
+
+  - title: sysom_cg_memdrcm_latency
+    from: cg_memdrcm_latency
+    head: value
+    help: "sysom_cg_memdrcm_latency"
+    type: "gauge"
+  - title: sysom_cg_memmcmp_latency
+    from: cg_memmcmp_latency
+    head: value
+    help: "sysom_cg_memmcmp_latency"
+    type: "gauge"
+  - title: sysom_cg_cpu_stat
+    from: cg_cpu_stat
+    head: value
+    help: "sysom_cg_cpu_stat"
+    type: "gauge"
+  - title: sysom_cg_cpuacct_stat
+    from: cg_cpuacct_stat
+    head: value
+    help: "cpuacct/cpuacct.stat"
+    type: "gauge"
+  - title: sysom_cg_cfs_quota
+    from: cgCpuQuota
+    head: value
+    help: "cfs quota"
+    type: "gauge"