2016-07-11 19:23:53 +08:00
/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package node
import (
"fmt"
"strings"
2017-01-14 01:48:50 +08:00
"k8s.io/apimachinery/pkg/api/errors"
2017-01-11 22:09:48 +08:00
"k8s.io/apimachinery/pkg/types"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
2016-07-11 19:23:53 +08:00
"k8s.io/kubernetes/pkg/api"
2016-11-19 04:50:17 +08:00
"k8s.io/kubernetes/pkg/api/v1"
2016-07-11 19:23:53 +08:00
"k8s.io/kubernetes/pkg/client/cache"
2017-01-06 14:34:29 +08:00
"k8s.io/kubernetes/pkg/client/clientset_generated/clientset"
2016-07-11 19:23:53 +08:00
"k8s.io/kubernetes/pkg/client/record"
"k8s.io/kubernetes/pkg/cloudprovider"
"k8s.io/kubernetes/pkg/fields"
"k8s.io/kubernetes/pkg/kubelet/util/format"
2016-11-02 03:59:06 +08:00
"k8s.io/kubernetes/pkg/util/node"
2016-10-23 00:49:18 +08:00
utilversion "k8s.io/kubernetes/pkg/util/version"
2016-07-11 19:23:53 +08:00
"github.com/golang/glog"
)
2016-07-13 22:57:22 +08:00
// LargeClusterThreshold is the number of Nodes that need to be in the cluster
// for it to be treated as "large".
const LargeClusterThreshold = 20
2016-07-11 19:23:53 +08:00
// deletePods will delete all pods from master running on given node, and return true
2016-08-15 22:21:47 +08:00
// if any pods were deleted, or were found pending deletion.
2016-08-14 09:41:20 +08:00
func deletePods ( kubeClient clientset . Interface , recorder record . EventRecorder , nodeName , nodeUID string , daemonStore cache . StoreToDaemonSetLister ) ( bool , error ) {
2016-07-11 19:23:53 +08:00
remaining := false
2016-11-19 04:50:17 +08:00
selector := fields . OneTermEqualSelector ( api . PodHostField , nodeName ) . String ( )
options := v1 . ListOptions { FieldSelector : selector }
pods , err := kubeClient . Core ( ) . Pods ( v1 . NamespaceAll ) . List ( options )
2016-11-02 03:59:06 +08:00
var updateErrList [ ] error
2016-07-11 19:23:53 +08:00
if err != nil {
return remaining , err
}
if len ( pods . Items ) > 0 {
2016-11-19 04:50:17 +08:00
recordNodeEvent ( recorder , nodeName , nodeUID , v1 . EventTypeNormal , "DeletingAllPods" , fmt . Sprintf ( "Deleting all Pods from Node %v." , nodeName ) )
2016-07-11 19:23:53 +08:00
}
for _ , pod := range pods . Items {
// Defensive check, also needed for tests.
if pod . Spec . NodeName != nodeName {
continue
}
2016-11-02 03:59:06 +08:00
// Set reason and message in the pod object.
if _ , err = setPodTerminationReason ( kubeClient , & pod , nodeName ) ; err != nil {
if errors . IsConflict ( err ) {
updateErrList = append ( updateErrList ,
fmt . Errorf ( "update status failed for pod %q: %v" , format . Pod ( & pod ) , err ) )
continue
}
}
2016-08-15 22:21:47 +08:00
// if the pod has already been marked for deletion, we still return true that there are remaining pods.
2016-07-11 19:23:53 +08:00
if pod . DeletionGracePeriodSeconds != nil {
2016-08-15 22:21:47 +08:00
remaining = true
2016-07-11 19:23:53 +08:00
continue
}
// if the pod is managed by a daemonset, ignore it
_ , err := daemonStore . GetPodDaemonSets ( & pod )
if err == nil { // No error means at least one daemonset was found
continue
}
glog . V ( 2 ) . Infof ( "Starting deletion of pod %v" , pod . Name )
2016-11-19 04:50:17 +08:00
recorder . Eventf ( & pod , v1 . EventTypeNormal , "NodeControllerEviction" , "Marking for deletion Pod %s from Node %s" , pod . Name , nodeName )
2016-07-11 19:23:53 +08:00
if err := kubeClient . Core ( ) . Pods ( pod . Namespace ) . Delete ( pod . Name , nil ) ; err != nil {
return false , err
}
remaining = true
}
2016-11-02 03:59:06 +08:00
if len ( updateErrList ) > 0 {
return false , utilerrors . NewAggregate ( updateErrList )
}
2016-07-11 19:23:53 +08:00
return remaining , nil
}
2016-11-02 03:59:06 +08:00
// setPodTerminationReason attempts to set a reason and message in the pod status, updates it in the apiserver,
// and returns an error if it encounters one.
2016-11-19 04:50:17 +08:00
func setPodTerminationReason ( kubeClient clientset . Interface , pod * v1 . Pod , nodeName string ) ( * v1 . Pod , error ) {
2016-11-02 03:59:06 +08:00
if pod . Status . Reason == node . NodeUnreachablePodReason {
return pod , nil
}
pod . Status . Reason = node . NodeUnreachablePodReason
pod . Status . Message = fmt . Sprintf ( node . NodeUnreachablePodMessage , nodeName , pod . Name )
2016-11-19 04:50:17 +08:00
var updatedPod * v1 . Pod
2016-11-02 03:59:06 +08:00
var err error
if updatedPod , err = kubeClient . Core ( ) . Pods ( pod . Namespace ) . UpdateStatus ( pod ) ; err != nil {
return nil , err
}
return updatedPod , nil
}
2016-11-19 04:50:17 +08:00
func forcefullyDeletePod ( c clientset . Interface , pod * v1 . Pod ) error {
2016-07-11 19:23:53 +08:00
var zero int64
2016-11-16 22:00:01 +08:00
glog . Infof ( "NodeController is force deleting Pod: %v:%v" , pod . Namespace , pod . Name )
2016-11-19 04:50:17 +08:00
err := c . Core ( ) . Pods ( pod . Namespace ) . Delete ( pod . Name , & v1 . DeleteOptions { GracePeriodSeconds : & zero } )
2016-07-11 19:23:53 +08:00
if err == nil {
glog . V ( 4 ) . Infof ( "forceful deletion of %s succeeded" , pod . Name )
}
return err
}
2016-10-29 01:45:04 +08:00
// forcefullyDeleteNode immediately the node. The pods on the node are cleaned
// up by the podGC.
func forcefullyDeleteNode ( kubeClient clientset . Interface , nodeName string ) error {
2016-07-11 19:23:53 +08:00
if err := kubeClient . Core ( ) . Nodes ( ) . Delete ( nodeName , nil ) ; err != nil {
return fmt . Errorf ( "unable to delete node %q: %v" , nodeName , err )
}
return nil
}
// maybeDeleteTerminatingPod non-gracefully deletes pods that are terminating
// that should not be gracefully terminated.
2016-07-21 03:59:33 +08:00
func ( nc * NodeController ) maybeDeleteTerminatingPod ( obj interface { } ) {
2016-11-19 04:50:17 +08:00
pod , ok := obj . ( * v1 . Pod )
2016-07-11 19:23:53 +08:00
if ! ok {
2016-10-14 09:38:15 +08:00
tombstone , ok := obj . ( cache . DeletedFinalStateUnknown )
if ! ok {
glog . Errorf ( "Couldn't get object from tombstone %#v" , obj )
return
}
2016-11-19 04:50:17 +08:00
pod , ok = tombstone . Obj . ( * v1 . Pod )
2016-10-14 09:38:15 +08:00
if ! ok {
glog . Errorf ( "Tombstone contained object that is not a Pod %#v" , obj )
return
}
2016-07-11 19:23:53 +08:00
}
// consider only terminating pods
if pod . DeletionTimestamp == nil {
return
}
2016-07-21 03:59:33 +08:00
nodeObj , found , err := nc . nodeStore . Store . GetByKey ( pod . Spec . NodeName )
2016-07-11 19:23:53 +08:00
if err != nil {
// this can only happen if the Store.KeyFunc has a problem creating
// a key for the pod. If it happens once, it will happen again so
// don't bother requeuing the pod.
utilruntime . HandleError ( err )
return
}
2016-10-26 02:18:58 +08:00
// if there is no such node, do nothing and let the podGC clean it up.
2016-07-11 19:23:53 +08:00
if ! found {
return
}
// delete terminating pods that have been scheduled on
// nodes that do not support graceful termination
// TODO(mikedanese): this can be removed when we no longer
// guarantee backwards compatibility of master API to kubelets with
// versions less than 1.1.0
2016-11-19 04:50:17 +08:00
node := nodeObj . ( * v1 . Node )
2016-10-23 00:49:18 +08:00
v , err := utilversion . ParseSemantic ( node . Status . NodeInfo . KubeletVersion )
2016-07-11 19:23:53 +08:00
if err != nil {
2017-01-03 11:35:22 +08:00
glog . V ( 0 ) . Infof ( "Couldn't parse version %q of node: %v" , node . Status . NodeInfo . KubeletVersion , err )
2016-07-21 03:59:33 +08:00
utilruntime . HandleError ( nc . forcefullyDeletePod ( pod ) )
2016-07-11 19:23:53 +08:00
return
}
2016-10-23 00:49:18 +08:00
if v . LessThan ( gracefulDeletionVersion ) {
2016-07-21 03:59:33 +08:00
utilruntime . HandleError ( nc . forcefullyDeletePod ( pod ) )
2016-07-11 19:23:53 +08:00
return
}
}
// update ready status of all pods running on given node from master
// return true if success
2016-11-19 04:50:17 +08:00
func markAllPodsNotReady ( kubeClient clientset . Interface , node * v1 . Node ) error {
2016-08-18 06:33:35 +08:00
// Don't set pods to NotReady if the kubelet is running a version that
// doesn't understand how to correct readiness.
// TODO: Remove this check when we no longer guarantee backward compatibility
// with node versions < 1.2.0.
if nodeRunningOutdatedKubelet ( node ) {
return nil
}
nodeName := node . Name
2016-07-11 19:23:53 +08:00
glog . V ( 2 ) . Infof ( "Update ready status of pods on node [%v]" , nodeName )
2016-11-19 04:50:17 +08:00
opts := v1 . ListOptions { FieldSelector : fields . OneTermEqualSelector ( api . PodHostField , nodeName ) . String ( ) }
pods , err := kubeClient . Core ( ) . Pods ( v1 . NamespaceAll ) . List ( opts )
2016-07-11 19:23:53 +08:00
if err != nil {
return err
}
errMsg := [ ] string { }
for _ , pod := range pods . Items {
// Defensive check, also needed for tests.
if pod . Spec . NodeName != nodeName {
continue
}
for i , cond := range pod . Status . Conditions {
2016-11-19 04:50:17 +08:00
if cond . Type == v1 . PodReady {
pod . Status . Conditions [ i ] . Status = v1 . ConditionFalse
2016-07-11 19:23:53 +08:00
glog . V ( 2 ) . Infof ( "Updating ready status of pod %v to false" , pod . Name )
_ , err := kubeClient . Core ( ) . Pods ( pod . Namespace ) . UpdateStatus ( & pod )
if err != nil {
glog . Warningf ( "Failed to update status for pod %q: %v" , format . Pod ( & pod ) , err )
errMsg = append ( errMsg , fmt . Sprintf ( "%v" , err ) )
}
break
}
}
}
if len ( errMsg ) == 0 {
return nil
}
return fmt . Errorf ( "%v" , strings . Join ( errMsg , "; " ) )
}
2016-08-18 06:33:35 +08:00
// nodeRunningOutdatedKubelet returns true if the kubeletVersion reported
// in the nodeInfo of the given node is "outdated", meaning < 1.2.0.
// Older versions were inflexible and modifying pod.Status directly through
// the apiserver would result in unexpected outcomes.
2016-11-19 04:50:17 +08:00
func nodeRunningOutdatedKubelet ( node * v1 . Node ) bool {
2016-10-23 00:49:18 +08:00
v , err := utilversion . ParseSemantic ( node . Status . NodeInfo . KubeletVersion )
2016-08-18 06:33:35 +08:00
if err != nil {
glog . Errorf ( "couldn't parse version %q of node %v" , node . Status . NodeInfo . KubeletVersion , err )
return true
}
2016-10-23 00:49:18 +08:00
if v . LessThan ( podStatusReconciliationVersion ) {
2016-08-18 06:33:35 +08:00
glog . Infof ( "Node %v running kubelet at (%v) which is less than the minimum version that allows nodecontroller to mark pods NotReady (%v)." , node . Name , v , podStatusReconciliationVersion )
return true
}
return false
}
2016-07-16 14:10:29 +08:00
func nodeExistsInCloudProvider ( cloud cloudprovider . Interface , nodeName types . NodeName ) ( bool , error ) {
2016-07-11 19:23:53 +08:00
instances , ok := cloud . Instances ( )
if ! ok {
return false , fmt . Errorf ( "%v" , ErrCloudInstance )
}
if _ , err := instances . ExternalID ( nodeName ) ; err != nil {
if err == cloudprovider . InstanceNotFound {
return false , nil
}
return false , err
}
return true , nil
}
2016-08-14 09:41:20 +08:00
func recordNodeEvent ( recorder record . EventRecorder , nodeName , nodeUID , eventtype , reason , event string ) {
2016-11-19 04:50:17 +08:00
ref := & v1 . ObjectReference {
2016-07-11 19:23:53 +08:00
Kind : "Node" ,
Name : nodeName ,
2016-08-14 09:41:20 +08:00
UID : types . UID ( nodeUID ) ,
2016-07-11 19:23:53 +08:00
Namespace : "" ,
}
glog . V ( 2 ) . Infof ( "Recording %s event message for node %s" , event , nodeName )
recorder . Eventf ( ref , eventtype , reason , "Node %s event: %s" , nodeName , event )
}
2016-11-19 04:50:17 +08:00
func recordNodeStatusChange ( recorder record . EventRecorder , node * v1 . Node , new_status string ) {
ref := & v1 . ObjectReference {
2016-07-11 19:23:53 +08:00
Kind : "Node" ,
Name : node . Name ,
2016-08-14 09:41:20 +08:00
UID : node . UID ,
2016-07-11 19:23:53 +08:00
Namespace : "" ,
}
glog . V ( 2 ) . Infof ( "Recording status change %s event message for node %s" , new_status , node . Name )
// TODO: This requires a transaction, either both node status is updated
// and event is recorded or neither should happen, see issue #6055.
2016-11-19 04:50:17 +08:00
recorder . Eventf ( ref , v1 . EventTypeNormal , new_status , "Node %s status is now: %s" , node . Name , new_status )
2016-07-11 19:23:53 +08:00
}