kubernetes版本:1.13.2 背景 由operator创建的redis集群,在kubernetes apiserver重启后被异常删除(包括redis exporter statefulset、redis statefulset)。删除后operator将其重建,重新组建集群,实例IP发生变更(中间件容器化,我们开发了固定IP,当statefulset删除后,IP会被回收),导致创建集群失败,最终集群不可用。 经多次复现,apiserver重启后,通过查询redis operator日志,并没有发现主动去删除redis集群(redis statefulset)、监控实例(redis exporter)。进一步去查看kube-controller-manager的日志,将其日志级别设置为--v=5,继续复现,最终在kube-controller-manager日志中发现如下日志: 在这里插入图片描述 可以看到是garbage collector触发删除操作的。这个问题在apiserver正常的时候是不存在的,要想弄清究竟,就得看看kube-controller-manager内置组件garbage collector这个控制器的逻辑。 由于内容偏长,分为多节来讲: ①、monitors作为生产者将变化的资源放入graphChanges队列;同时restMapper定期检测集群内资源类型,刷新monitors; ②、runProcessGraphChanges从graphChanges队列中取出变化的item,根据情况放入attemptToDelete队列;runAttemptToDeleteWorker取出处理垃圾资源; ③、runProcessGraphChanges从graphChanges队列中取出变化的item,根据情况放入attemptToOrphan队列;runAttemptToOrphanWorker取出处理该孤立的资源; 在这里插入图片描述 正文 想要启用GC,需要在kube-apiserver和kube-controller-manager的启动参数中都设置--enable-garbage-collector为true,1.13.2版本中默认开启GC。 需要注意:两组件该参数必须保持同步。 kube-controller-manager启动入口,app.NewControllerManagerCommand()中加载controller manager默认启动参数,创建*cobra.Command对象: func main() { rand.Seed(time.Now().UnixNano()) //加载controller manager默认启动参数,创建* cobra.Command对象 command := app.NewControllerManagerCommand() //......省略....... 
//执行cobra.command,并启动controller-manager if err := command.Execute(); err != nil { fmt.Fprintf(os.Stderr, "%v\n", err) os.Exit(1) } } 以下代码处去启动kube-controller-manager: 在这里插入图片描述 NewDefaultComponentConfig(ports.InsecureKubeControllerManagerPort)加载各个控制器的配置: //NewKubeControllerManagerOptions使用默认配置创建一个新的KubeControllerManagerOptions func NewKubeControllerManagerOptions() (*KubeControllerManagerOptions, error) { //加载各个控制器的默认配置 componentConfig, err := NewDefaultComponentConfig(ports.InsecureKubeControllerManagerPort) if err != nil { return nil, err } s := KubeControllerManagerOptions{ Generic: cmoptions.NewGenericControllerManagerConfigurationOptions(componentConfig.Generic), //.....省略 GarbageCollectorController: &GarbageCollectorControllerOptions{ ConcurrentGCSyncs: componentConfig.GarbageCollectorController.ConcurrentGCSyncs, EnableGarbageCollector: componentConfig.GarbageCollectorController.EnableGarbageCollector, }, //.....省略 } //gc忽略的资源对象列表 gcIgnoredResources := make([]kubectrlmgrconfig.GroupResource, 0, len(garbagecollector.DefaultIgnoredResources())) for r := range garbagecollector.DefaultIgnoredResources() { gcIgnoredResources = append(gcIgnoredResources, kubectrlmgrconfig.GroupResource{Group: r.Group, Resource: r.Resource}) } s.GarbageCollectorController.GCIgnoredResources = gcIgnoredResources return &s, nil } // NewDefaultComponentConfig返回kube-controller管理器配置对象 func NewDefaultComponentConfig(insecurePort int32) (kubectrlmgrconfig.KubeControllerManagerConfiguration, error) { scheme := runtime.NewScheme() if err := kubectrlmgrschemev1alpha1.AddToScheme(scheme); err != nil { return kubectrlmgrconfig.KubeControllerManagerConfiguration{}, err } if err := kubectrlmgrconfig.AddToScheme(scheme); err != nil { return kubectrlmgrconfig.KubeControllerManagerConfiguration{}, err } versioned := kubectrlmgrconfigv1alpha1.KubeControllerManagerConfiguration{} //加载默认参数 scheme.Default(&versioned) internal := kubectrlmgrconfig.KubeControllerManagerConfiguration{} if err := 
scheme.Convert(&versioned, &internal, nil); err != nil { return internal, err } internal.Generic.Port = insecurePort return internal, nil } // 根据Object,获取提供的默认参数 func (s *Scheme) Default(src Object) { if fn, ok := s.defaulterFuncs[reflect.TypeOf(src)]; ok { fn(src) } } s.defaulterFuncs类型为map[reflect.Type]func(interface{}),用于根据指针类型获取默认值函数。该map中的数据是从哪里来的呢? 代码位于src\k8s.io\kubernetes\pkg\controller\apis\config\v1alpha1\zz_generated.defaults.go 在这里插入图片描述 可以看到默认参数中garbage collector默认开启gc(EnableGarbageCollector),并发数为20(ConcurrentGCSyncs) func SetDefaults_GarbageCollectorControllerConfiguration(obj *kubectrlmgrconfigv1alpha1.GarbageCollectorControllerConfiguration) { if obj.EnableGarbageCollector == nil { obj.EnableGarbageCollector = utilpointer.BoolPtr(true) } if obj.ConcurrentGCSyncs == 0 { obj.ConcurrentGCSyncs = 20 } } 回到Run函数,里面调用了NewControllerInitializers启动所有控制器: 在这里插入图片描述 重点来到启动garbage collector的startGarbageCollectorController函数: func startGarbageCollectorController(ctx ControllerContext) (http.Handler, bool, error) { //k8s 1.13.2中默认为true,可在kube-apiserver和kube-controller-manager的启动参数中加--enable-garbage-collector=false设置 //需保证这两个组件中参数值一致 if !ctx.ComponentConfig.GarbageCollectorController.EnableGarbageCollector { return nil, false, nil } //k8s各种原生资源对象客户端集合(默认启动参数中用SimpleControllerClientBuilder构建) gcClientset := ctx.ClientBuilder.ClientOrDie("generic-garbage-collector") discoveryClient := cacheddiscovery.NewMemCacheClient(gcClientset.Discovery()) //生成rest config config := ctx.ClientBuilder.ConfigOrDie("generic-garbage-collector") dynamicClient, err := dynamic.NewForConfig(config) if err != nil { return nil, true, err } // Get an initial set of deletable resources to prime the garbage collector. 
//获取一组初始可删除资源以填充垃圾收集器。 deletableResources := garbagecollector.GetDeletableResources(discoveryClient) ignoredResources := make(map[schema.GroupResource]struct{}) //忽略gc的资源类型 for _, r := range ctx.ComponentConfig.GarbageCollectorController.GCIgnoredResources { ignoredResources[schema.GroupResource{Group: r.Group, Resource: r.Resource}] = struct{}{} } garbageCollector, err := garbagecollector.NewGarbageCollector( dynamicClient, ctx.RESTMapper, deletableResources, ignoredResources, ctx.InformerFactory, ctx.InformersStarted, ) if err != nil { return nil, true, fmt.Errorf("Failed to start the generic garbage collector: %v", err) } // Start the garbage collector. //启动参数中默认是20个协程 workers := int(ctx.ComponentConfig.GarbageCollectorController.ConcurrentGCSyncs) //启动monitors和deleteWorkers、orphanWorkers go garbageCollector.Run(workers, ctx.Stop) // Periodically refresh the RESTMapper with new discovery information and sync // the garbage collector. //使用新的发现信息定期刷新RESTMapper并同步垃圾收集器。 go garbageCollector.Sync(gcClientset.Discovery(), 30*time.Second, ctx.Stop) //gc提供debug dot graph依赖关系图接口 return garbagecollector.NewDebugHandler(garbageCollector), true, nil } 该函数主要作用有: 1、deletableResources := garbagecollector.GetDeletableResources(discoveryClient)获取集群内所有可删除的资源对象;排除掉忽略的资源对象。 2、构建garbageCollector结构体对象; 3、garbageCollector.Run(workers, ctx.Stop)启动monitors用来监听资源对象的变化(对应的由runProcessGraphChanges死循环处理),和默认20个deleteWorkers协程处理可删除的资源对象、20个orphanWorkers协程处理孤儿对象。 4、garbageCollector.Sync(gcClientset.Discovery(), 30*time.Second, ctx.Stop) 定时检测集群内是否有新类型的资源对象加入,并重新刷新monitors,以监听新类型的资源对象。 5、garbagecollector.NewDebugHandler(garbageCollector)注册debug接口,用来提供获取dot流程图接口: curl http://127.0.0.1:10252/debug/controllers/garbagecollector/graph?uid=11211212edsaddkqedmk12 使用graphviz提供的dot.exe可以生成svg格式的图,可用google浏览器查看如下: 在这里插入图片描述 // curl http://127.0.0.1:10252/debug/controllers/garbagecollector/graph?uid=11211212edsaddkqedmk12 func (h *debugHTTPHandler) ServeHTTP(w http.ResponseWriter, req *http.Request) { 
if req.URL.Path != "/graph" { http.Error(w, "", http.StatusNotFound) return } var graph graph.Directed if uidStrings := req.URL.Query()["uid"]; len(uidStrings) > 0 { uids := []types.UID{} for _, uidString := range uidStrings { uids = append(uids, types.UID(uidString)) } graph = h.controller.dependencyGraphBuilder.uidToNode.ToGonumGraphForObj(uids...) } else { graph = h.controller.dependencyGraphBuilder.uidToNode.ToGonumGraph() } //生成dot流程图数据,用graphviz工具中的dot.exe工具转换为svg图(用google浏览器打开)或者png图 //API参考:https://godoc.org/gonum.org/v1/gonum/graph //graphviz下载地址:https://graphviz.gitlab.io/_pages/Download/Download_windows.html //dot.exe test.dot -T svg -o test.svg data, err := dot.Marshal(graph, "full", "", " ", false) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } w.Write(data) w.WriteHeader(http.StatusOK) } 在这里插入图片描述 GarbageCollector通过restMapper定期重置可删除的资源类型,更新GraphBuilder中的monitors,monitors将创建所有资源类型的变更通知回调函数,将变化的资源对象加入到GraphBuilder的graphChanges队列,GraphBuilder的runProcessGraphChanges()会一直从队列中获取变化,构建一个缓存对象之间依赖关系的图形,以及触发dependencyGraphBuilder将可能被垃圾收集的对象排队到attemptToDelete队列,并将其依赖项需要孤立的对象排队到attemptToOrphan队列。GarbageCollector有消费这两个队列的worker:runAttemptToDeleteWorker和runAttemptToOrphanWorker,它们在死循环中分别从attemptToDelete队列和attemptToOrphan队列取出item,向API服务器发送请求以相应地删除或更新对象。 // GarbageCollector运行反射器来监视托管API对象的更改,将结果汇总到单线程dependencyGraphBuilder, // 构建一个缓存对象之间依赖关系的图形。由图变化触发,dependencyGraphBuilder将可能被垃圾收集的对象 // 排队到`attemptToDelete`队列,并将其依赖项需要孤立的对象排队到`attemptToOrphan`队列。 // GarbageCollector有消费这两个队列的worker,向API服务器发送请求以相应地删除或更新对象。 // 请注意,让dependencyGraphBuilder通知垃圾收集器确保垃圾收集器使用至少与发送通知一样最新的图形进行操作。 type GarbageCollector struct { // resettableRESTMapper是一个RESTMapper,它能够在discovery资源类型时重置自己 restMapper resettableRESTMapper // dynamicClient提供操作集群内所有资源对象的接口方法,包括k8s内置、CRD生成的自定义资源 dynamicClient dynamic.Interface //垃圾收集器尝试在时间成熟时删除attemptToDelete队列中的item attemptToDelete workqueue.RateLimitingInterface //垃圾收集器尝试孤立attemptToOrphan队列中item的依赖项,然后删除item attemptToOrphan 
workqueue.RateLimitingInterface dependencyGraphBuilder *GraphBuilder // 有owner的资源对象,才会给absentOwnerCache填充不存在的Owner信息 absentOwnerCache *UIDCache sharedInformers informers.SharedInformerFactory workerLock sync.RWMutex } // GraphBuilder:基于informers提供的事件,GraphBuilder更新 // uidToNode,一个缓存我们所知的依赖关系的图,并将 // 项放入attemptToDelete和attemptToOrphan队列 type GraphBuilder struct { restMapper meta.RESTMapper //每个monitor对某种资源进行list/watch,结果汇集到dependencyGraphBuilder monitors monitors monitorLock sync.RWMutex // informersStarted is closed after all of the controllers have been initialized and are running. // After that it is safe to start them here, before that it is not. // informersStarted在所有控制器初始化并运行后关闭。之后在这里启动它们是安全的,在此之前则不安全。 informersStarted <-chan struct{} // stopCh drives shutdown. When a receive from it unblocks, monitors will shut down. // This channel is also protected by monitorLock. // stopCh用于驱动关闭:当从它接收的操作解除阻塞时,监视器将关闭。此channel也受monitorLock保护。 stopCh <-chan struct{} // running tracks whether Run() has been called. // it is protected by monitorLock. //running记录Run()是否已被调用,它受monitorLock保护。 running bool dynamicClient dynamic.Interface // monitors are the producer of the graphChanges queue, graphBuilder alters // the in-memory graph according to the changes. // monitors是graphChanges队列的生产者,graphBuilder根据这些变化修改内存中的图。 graphChanges workqueue.RateLimitingInterface // uidToNode doesn't require a lock to protect, because only the // single-threaded GraphBuilder.processGraphChanges() reads/writes it. //uidToNode不需要锁保护,因为只有单线程GraphBuilder.processGraphChanges()读写它。 uidToNode *concurrentUIDToNode // GraphBuilder is the producer of attemptToDelete and attemptToOrphan, GC is the consumer. // GraphBuilder是attemptToDelete和attemptToOrphan的生产者,GC是消费者。 attemptToDelete workqueue.RateLimitingInterface attemptToOrphan workqueue.RateLimitingInterface // GraphBuilder and GC share the absentOwnerCache. Objects that are known to // be non-existent are added to the cache. 
// GraphBuilder和GC共享absentOwnerCache。已知不存在的对象将添加到缓存中。 absentOwnerCache *UIDCache //所有k8s资源对象集的informer sharedInformers informers.SharedInformerFactory //监视器忽略的资源对象集 ignoredResources map[schema.GroupResource]struct{} } 创建NewGarbageCollector结构体: func NewGarbageCollector( dynamicClient dynamic.Interface, mapper resettableRESTMapper, deletableResources map[schema.GroupVersionResource]struct{}, ignoredResources map[schema.GroupResource]struct{}, sharedInformers informers.SharedInformerFactory, informersStarted <-chan struct{}, ) (*GarbageCollector, error) { attemptToDelete := workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "garbage_collector_attempt_to_delete") attemptToOrphan := workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "garbage_collector_attempt_to_orphan") absentOwnerCache := NewUIDCache(500) gc := &GarbageCollector{ dynamicClient: dynamicClient, restMapper: mapper, attemptToDelete: attemptToDelete, attemptToOrphan: attemptToOrphan, absentOwnerCache: absentOwnerCache, } gb := &GraphBuilder{ dynamicClient: dynamicClient, informersStarted: informersStarted, restMapper: mapper, graphChanges: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "garbage_collector_graph_changes"), uidToNode: &concurrentUIDToNode{ uidToNode: make(map[types.UID]*node), }, attemptToDelete: attemptToDelete, attemptToOrphan: attemptToOrphan, absentOwnerCache: absentOwnerCache, sharedInformers: sharedInformers, ignoredResources: ignoredResources