2446 lines
		
	
	
		
			89 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
			
		
		
	
	
			2446 lines
		
	
	
		
			89 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
| #!/bin/sh
 | |
| # Licensed under the Apache License, Version 2.0 (the "License");
 | |
| # you may not use this file except in compliance with the License.
 | |
| # You may obtain a copy of the License at
 | |
| #
 | |
| # https://www.apache.org/licenses/LICENSE-2.0
 | |
| #
 | |
| # Unless required by applicable law or agreed to in writing, software
 | |
| # distributed under the License is distributed on an "AS IS" BASIS,
 | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| # See the License for the specific language governing permissions and
 | |
| # limitations under the License.
 | |
| #
 | |
| # See usage() function below for more details ...
 | |
| #
 | |
| # Note that the script uses an external file to setup RabbitMQ policies
 | |
| # so make sure to create it from an example shipped with the package.
 | |
| #
 | |
| #######################################################################
 | |
# Initialization:
# Load the shared OCF shell function library. OCF_FUNCTIONS_DIR is only
# given a value here when the caller left it completely unset.

[ "${OCF_FUNCTIONS_DIR+set}" = "set" ] || OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
 | |
| 
 | |
| #######################################################################
 | |
| 
 | |
# Fill in some defaults if no values are specified.
#
# Each tunable has an OCF_RESKEY_<name>_default value. The matching
# ": ${OCF_RESKEY_<name>=...}" line assigns that default only when the
# parameter is unset, so a value that is already present in the
# environment is left untouched.

PATH=/sbin:/usr/sbin:/bin:/usr/bin

# Logging identity (only set when not already provided)
: ${HA_LOGTAG="lrmd"}
: ${HA_LOGFACILITY="daemon"}

# Executables
OCF_RESKEY_binary_default="/usr/sbin/rabbitmq-server"
: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}
OCF_RESKEY_ctl_default="/usr/sbin/rabbitmqctl"
: ${OCF_RESKEY_ctl=${OCF_RESKEY_ctl_default}}
OCF_RESKEY_policy_file_default="/usr/local/sbin/set_rabbitmq_policy"
: ${OCF_RESKEY_policy_file=${OCF_RESKEY_policy_file_default}}

# Agent behaviour switches
OCF_RESKEY_debug_default=false
: ${OCF_RESKEY_debug=${OCF_RESKEY_debug_default}}
OCF_RESKEY_max_rabbitmqctl_timeouts_default=3
: ${OCF_RESKEY_max_rabbitmqctl_timeouts=${OCF_RESKEY_max_rabbitmqctl_timeouts_default}}
OCF_RESKEY_rmq_feature_health_check_default=true
: ${OCF_RESKEY_rmq_feature_health_check=${OCF_RESKEY_rmq_feature_health_check_default}}
OCF_RESKEY_rmq_feature_local_list_queues_default=true
: ${OCF_RESKEY_rmq_feature_local_list_queues=${OCF_RESKEY_rmq_feature_local_list_queues_default}}
OCF_RESKEY_avoid_using_iptables_default=false
: ${OCF_RESKEY_avoid_using_iptables=${OCF_RESKEY_avoid_using_iptables_default}}
OCF_RESKEY_allowed_cluster_nodes_default=""
: ${OCF_RESKEY_allowed_cluster_nodes=${OCF_RESKEY_allowed_cluster_nodes_default}}

# Accounts
OCF_RESKEY_username_default="rabbitmq"
: ${OCF_RESKEY_username=${OCF_RESKEY_username_default}}
OCF_RESKEY_groupname_default="rabbitmq"
: ${OCF_RESKEY_groupname=${OCF_RESKEY_groupname_default}}
OCF_RESKEY_admin_user_default="guest"
: ${OCF_RESKEY_admin_user=${OCF_RESKEY_admin_user_default}}
OCF_RESKEY_admin_password_default="guest"
: ${OCF_RESKEY_admin_password=${OCF_RESKEY_admin_password_default}}

# Files and directories
OCF_RESKEY_definitions_dump_file_default="/etc/rabbitmq/definitions"
: ${OCF_RESKEY_definitions_dump_file=${OCF_RESKEY_definitions_dump_file_default}}
OCF_RESKEY_pid_file_default="/var/run/rabbitmq/pid"
: ${OCF_RESKEY_pid_file=${OCF_RESKEY_pid_file_default}}
OCF_RESKEY_log_dir_default="/var/log/rabbitmq"
: ${OCF_RESKEY_log_dir=${OCF_RESKEY_log_dir_default}}
OCF_RESKEY_mnesia_base_default="/var/lib/rabbitmq/mnesia"
: ${OCF_RESKEY_mnesia_base=${OCF_RESKEY_mnesia_base_default}}
OCF_RESKEY_mnesia_schema_base_default="/var/lib/rabbitmq"
: ${OCF_RESKEY_mnesia_schema_base=${OCF_RESKEY_mnesia_schema_base_default}}
OCF_RESKEY_erlang_cookie_default=false
: ${OCF_RESKEY_erlang_cookie=${OCF_RESKEY_erlang_cookie_default}}
OCF_RESKEY_erlang_cookie_file_default="/var/lib/rabbitmq/.erlang.cookie"
: ${OCF_RESKEY_erlang_cookie_file=${OCF_RESKEY_erlang_cookie_file_default}}

# Networking and node naming
# NOTE(review): host_ip has a default value but no matching
# ": ${OCF_RESKEY_host_ip=...}" line in this section, so OCF_RESKEY_host_ip
# itself is not populated here - confirm whether that is intended.
OCF_RESKEY_host_ip_default="127.0.0.1"
OCF_RESKEY_node_port_default=5672
: ${OCF_RESKEY_node_port=${OCF_RESKEY_node_port_default}}
OCF_RESKEY_default_vhost_default="/"
: ${OCF_RESKEY_default_vhost=${OCF_RESKEY_default_vhost_default}}
OCF_RESKEY_use_fqdn_default=false
: ${OCF_RESKEY_use_fqdn=${OCF_RESKEY_use_fqdn_default}}
OCF_RESKEY_fqdn_prefix_default=""
: ${OCF_RESKEY_fqdn_prefix=${OCF_RESKEY_fqdn_prefix_default}}

# Resource limits
OCF_RESKEY_limit_nofile_default=65535
: ${OCF_RESKEY_limit_nofile=${OCF_RESKEY_limit_nofile_default}}
 | |
| 
 | |
| #######################################################################
 | |
| 
 | |
# Operation timing.
#
# OCF_RESKEY_CRM_meta_timeout is the operation timeout handed over by the
# cluster manager (presumably in milliseconds - TODO confirm); dividing it
# by 6000 yields a tenth of that timeout expressed in seconds.
OCF_RESKEY_start_time_default=$((OCF_RESKEY_CRM_meta_timeout / 6000 + 2))
: ${OCF_RESKEY_start_time=${OCF_RESKEY_start_time_default}}
# The stop default mirrors the start default; stop_time is seeded from its
# own *_default variable so the two stay in step with what meta_data prints.
OCF_RESKEY_stop_time_default=${OCF_RESKEY_start_time_default}
: ${OCF_RESKEY_stop_time=${OCF_RESKEY_stop_time_default}}
OCF_RESKEY_command_timeout_default=""
: ${OCF_RESKEY_command_timeout=${OCF_RESKEY_command_timeout_default}}
# Default wrapper prepended to commands run via su_rabbit_cmd():
# "timeout <extra arguments> <seconds>".
TIMEOUT_ARG=$((OCF_RESKEY_CRM_meta_timeout / 6000 + 30))
COMMAND_TIMEOUT="/usr/bin/timeout ${OCF_RESKEY_command_timeout} ${TIMEOUT_ARG}"
# Resource name without the ":<n>" clone instance suffix.
RESOURCE_NAME="${OCF_RESOURCE_INSTANCE%%:*}"
 | |
| 
 | |
| #######################################################################
 | |
| 
 | |
# Print a short help text to stdout.
# The action list matches the actions advertised by meta_data().
usage() {
    cat <<UEND
        usage: $0 (start|stop|status|monitor|promote|demote|notify|validate-all|meta-data)

        $0 manages an ${OCF_RESKEY_binary} process as an HA resource

        The 'start' operation starts the RabbitMQ service.
        The 'stop' operation stops the RabbitMQ service.
        The 'status' operation reports whether the RabbitMQ service is running
        The 'monitor' operation reports whether the RabbitMQ service seems to be working
        The 'promote' operation promotes this instance to the Master role
        The 'demote' operation demotes this instance from the Master role
        The 'notify' operation handles notifications sent by the resource manager
        The 'validate-all' operation reports whether the parameters are valid
        The 'meta-data' operation reports this RA's meta-data information

UEND
}
 | |
| 
 | |
# Print the OCF resource agent meta-data (XML) to stdout.
#
# Every parameter's default="" attribute is filled from the matching
# OCF_RESKEY_<name>_default variable, so the document describes the agent's
# built-in defaults rather than the values configured for this instance.
meta_data() {
    # The EXTENDED_OCF_PARAMS parameter below does not exist by default
    # and hence converted to an empty string unless overridden. It
    # could be used by an extension script to add new parameters. For
    # example see https://review.openstack.org/#/c/249180/10

    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="${OCF_RESKEY_binary}">
<version>1.0</version>

<longdesc lang="en">
Resource agent for ${OCF_RESKEY_binary}
</longdesc>
<shortdesc lang="en">Resource agent for ${OCF_RESKEY_binary}</shortdesc>
<parameters>

<parameter name="binary" unique="0" required="0">
<longdesc lang="en">
RabbitMQ binary
</longdesc>
<shortdesc lang="en">RabbitMQ binary</shortdesc>
<content type="string" default="${OCF_RESKEY_binary_default}" />
</parameter>

<parameter name="ctl" unique="0" required="0">
<longdesc lang="en">
rabbitmqctl binary
</longdesc>
<shortdesc lang="en">rabbitmqctl binary</shortdesc>
<content type="string" default="${OCF_RESKEY_ctl_default}" />
</parameter>

<parameter name="pid_file" unique="0" required="0">
<longdesc lang="en">
RabbitMQ PID file
</longdesc>
<shortdesc lang="en">RabbitMQ PID file</shortdesc>
<content type="string" default="${OCF_RESKEY_pid_file_default}" />
</parameter>

<parameter name="log_dir" unique="0" required="0">
<longdesc lang="en">
RabbitMQ log directory
</longdesc>
<shortdesc lang="en">RabbitMQ log directory</shortdesc>
<content type="string" default="${OCF_RESKEY_log_dir_default}" />
</parameter>

<parameter name="username" unique="0" required="0">
<longdesc lang="en">
RabbitMQ user name
</longdesc>
<shortdesc lang="en">RabbitMQ user name</shortdesc>
<content type="string" default="${OCF_RESKEY_username_default}" />
</parameter>

<parameter name="groupname" unique="0" required="0">
<longdesc lang="en">
RabbitMQ group name
</longdesc>
<shortdesc lang="en">RabbitMQ group name</shortdesc>
<content type="string" default="${OCF_RESKEY_groupname_default}" />
</parameter>

<parameter name="admin_user" unique="0" required="0">
<longdesc lang="en">
RabbitMQ default admin user for API
</longdesc>
<shortdesc lang="en">RabbitMQ admin user</shortdesc>
<content type="string" default="${OCF_RESKEY_admin_user_default}" />
</parameter>

<parameter name="admin_password" unique="0" required="0">
<longdesc lang="en">
RabbitMQ default admin user password for API
</longdesc>
<shortdesc lang="en">RabbitMQ admin password</shortdesc>
<content type="string" default="${OCF_RESKEY_admin_password_default}" />
</parameter>

<parameter name="definitions_dump_file" unique="0" required="0">
<longdesc lang="en">
RabbitMQ default definitions dump file
</longdesc>
<shortdesc lang="en">RabbitMQ definitions dump file</shortdesc>
<content type="string" default="${OCF_RESKEY_definitions_dump_file_default}" />
</parameter>

<parameter name="command_timeout" unique="0" required="0">
<longdesc lang="en">
Timeout command arguments for issued commands termination (value is auto evaluated)
</longdesc>
<shortdesc lang="en">Arguments for timeout wrapping command</shortdesc>
<content type="string" default="${OCF_RESKEY_command_timeout_default}" />
</parameter>

<parameter name="start_time" unique="0" required="0">
<longdesc lang="en">
Timeout for start rabbitmq server
</longdesc>
<shortdesc lang="en">Timeout for start rabbitmq server</shortdesc>
<content type="string" default="${OCF_RESKEY_start_time_default}" />
</parameter>

<parameter name="stop_time" unique="0" required="0">
<longdesc lang="en">
Timeout for stopping rabbitmq server
</longdesc>
<shortdesc lang="en">Timeout for stopping rabbitmq server</shortdesc>
<content type="string" default="${OCF_RESKEY_stop_time_default}" />
</parameter>

<parameter name="debug" unique="0" required="0">
<longdesc lang="en">
The debug flag for agent (${OCF_RESKEY_binary}) instance.
In the /tmp/ directory will be created rmq-* files for log
some operations and ENV values inside OCF-script.
</longdesc>
<shortdesc lang="en">AMQP server (${OCF_RESKEY_binary}) debug flag</shortdesc>
<content type="boolean" default="${OCF_RESKEY_debug_default}" />
</parameter>

<parameter name="mnesia_base" unique="0" required="0">
<longdesc lang="en">
Base directory for storing Mnesia files
</longdesc>
<shortdesc lang="en">Base directory for storing Mnesia files</shortdesc>
<content type="string" default="${OCF_RESKEY_mnesia_base_default}" />
</parameter>

<parameter name="mnesia_schema_base" unique="0" required="0">
<longdesc lang="en">
Parent directory for Mnesia schema directory
</longdesc>
<shortdesc lang="en">Parent directory for Mnesia schema directory</shortdesc>
<content type="string" default="${OCF_RESKEY_mnesia_schema_base_default}" />
</parameter>

<parameter name="host_ip" unique="0" required="0">
<longdesc lang="en">
${OCF_RESKEY_binary} should listen on this IP address
</longdesc>
<shortdesc lang="en">${OCF_RESKEY_binary} should listen on this IP address</shortdesc>
<content type="string" default="${OCF_RESKEY_host_ip_default}" />
</parameter>

<parameter name="node_port" unique="0" required="0">
<longdesc lang="en">
${OCF_RESKEY_binary} should listen on this port
</longdesc>
<shortdesc lang="en">${OCF_RESKEY_binary} should listen on this port</shortdesc>
<content type="integer" default="${OCF_RESKEY_node_port_default}" />
</parameter>

<parameter name="default_vhost" unique="0" required="0">
<longdesc lang="en">
Default virtual host used for monitoring if a node is fully synchronized with
the rest of the cluster. In normal operation, the resource agent will wait for
queues from this virtual host on this node to be synchronized elsewhere before
stopping RabbitMQ. This also means queues in other virtual hosts may not be
fully synchronized on stop operations.
</longdesc>
<shortdesc lang="en">Default virtual host used for waiting for synchronization</shortdesc>
<content type="string" default="${OCF_RESKEY_default_vhost_default}" />
</parameter>

<parameter name="erlang_cookie" unique="0" required="0">
<longdesc lang="en">
Erlang cookie for clustering. If specified, will be updated at the mnesia reset
</longdesc>
<shortdesc lang="en">Erlang cookie</shortdesc>
<content type="string" default="${OCF_RESKEY_erlang_cookie_default}" />
</parameter>

<parameter name="erlang_cookie_file" unique="0" required="0">
<longdesc lang="en">
Erlang cookie file path where the cookie will be put, if requested
</longdesc>
<shortdesc lang="en">Erlang cookie file</shortdesc>
<content type="string" default="${OCF_RESKEY_erlang_cookie_file_default}" />
</parameter>

<parameter name="use_fqdn" unique="0" required="0">
<longdesc lang="en">
Either to use FQDN or a shortname for the rabbitmq node
</longdesc>
<shortdesc lang="en">Use FQDN</shortdesc>
<content type="boolean" default="${OCF_RESKEY_use_fqdn_default}" />
</parameter>

<parameter name="fqdn_prefix" unique="0" required="0">
<longdesc lang="en">
Optional FQDN prefix for RabbitMQ nodes in cluster.
FQDN prefix can be specified to host multiple RabbitMQ instances on a node or
in case of RabbitMQ running in dedicated network/interface.
</longdesc>
<shortdesc lang="en">FQDN prefix</shortdesc>
<content type="string" default="${OCF_RESKEY_fqdn_prefix_default}" />
</parameter>

<parameter name="max_rabbitmqctl_timeouts" unique="0" required="0">
<longdesc lang="en">
If during monitor call rabbitmqctl times out, the timeout is ignored
unless it is Nth timeout in a row. Here N is the value of the current parameter.
If too many timeouts happen in a row, the monitor call will return with error.
</longdesc>
<shortdesc lang="en">Fail only if that many rabbitmqctl timeouts in a row occurred</shortdesc>
<content type="string" default="${OCF_RESKEY_max_rabbitmqctl_timeouts_default}" />
</parameter>

<parameter name="policy_file" unique="0" required="0">
<longdesc lang="en">
A path to the shell script to setup RabbitMQ policies
</longdesc>
<shortdesc lang="en">A policy file path</shortdesc>
<content type="string" default="${OCF_RESKEY_policy_file_default}" />
</parameter>

<parameter name="rmq_feature_health_check" unique="0" required="0">
<longdesc lang="en">
Since rabbit 3.6.4 list_queues/list_channels-based monitoring should
be replaced with "node_health_check" command, as it creates no network
load at all.
</longdesc>
<shortdesc lang="en">Use node_health_check for monitoring</shortdesc>
<content type="boolean" default="${OCF_RESKEY_rmq_feature_health_check_default}" />
</parameter>

<parameter name="rmq_feature_local_list_queues" unique="0" required="0">
<longdesc lang="en">
For rabbit version that implements --local flag for list_queues, this
can greatly reduce network overhead in cases when node is
stopped/demoted.
</longdesc>
<shortdesc lang="en">Use --local option for list_queues</shortdesc>
<content type="boolean" default="${OCF_RESKEY_rmq_feature_local_list_queues_default}" />
</parameter>

<parameter name="limit_nofile" unique="0" required="0">
<longdesc lang="en">
Soft and hard limit for NOFILE
</longdesc>
<shortdesc lang="en">NOFILE limit</shortdesc>
<content type="string" default="${OCF_RESKEY_limit_nofile_default}" />
</parameter>

<parameter name="avoid_using_iptables" unique="0" required="0">
<longdesc lang="en">
When set to true the iptables calls to block client access become
noops. This is useful when we run inside containers.
</longdesc>
<shortdesc lang="en">Disable iptables use entirely</shortdesc>
<content type="boolean" default="${OCF_RESKEY_avoid_using_iptables_default}" />
</parameter>

<parameter name="allowed_cluster_nodes" unique="0" required="0">
<longdesc lang="en">
When set to anything other than the empty string it must contain the list of
cluster node names, separated by spaces, where the rabbitmq resource is allowed to run.
This is needed when rabbitmq is running on a subset of nodes part of a larger
cluster. The default ("") is to assume that all nodes part of the cluster will
run the rabbitmq resource.
</longdesc>
<shortdesc lang="en">List of cluster nodes where rabbitmq is allowed to run</shortdesc>
<content type="string" default="${OCF_RESKEY_allowed_cluster_nodes_default}" />
</parameter>

$EXTENDED_OCF_PARAMS

</parameters>

<actions>
<action name="start" timeout="20" />
<action name="stop" timeout="20" />
<action name="status" timeout="20" />
<action name="monitor" depth="0" timeout="30" interval="5" />
<action name="monitor" depth="0" timeout="30" interval="3" role="Master"/>
<action name="promote" timeout="30" />
<action name="demote"  timeout="30" />
<action name="notify"   timeout="20" />
<action name="validate-all" timeout="5" />
<action name="meta-data" timeout="5" />
</actions>
</resource-agent>
END
}
 | |
| 
 | |
| 
 | |
# Master score bounds.
# NOTE(review): the code consuming these is outside this section;
# presumably they are the lowest and highest scores handed to
# master_score() - confirm against the callers.
BEST_MASTER_SCORE=1000
MIN_MASTER_SCORE=100
 | |
| 
 | |
| 
 | |
| #######################################################################
 | |
| # Functions invoked by resource manager actions
 | |
| 
 | |
| #TODO(bogdando) move proc_kill, proc_stop to shared OCF functions
 | |
| #  to be shipped with HA cluster packages
 | |
###########################################################
# Attempts to kill a process with retries and checks procfs
# to make sure the process is stopped.
#
# With a numeric pid the signal is sent to the whole process
# group of that pid (pkill -g) and /proc/<pid> is polled.
# With the pid "none" the processes are matched by their full
# command line against the service name (pgrep/pkill -f).
#
# Globals:
#   LL
# Arguments:
#   $1 - pid of the process to try and kill, or "none"
#   $2 - service name used for logging and match-based kill, if the pid is "none"
#   $3 - signal to use, defaults to SIGTERM
#   $4 - number of retries, defaults to 5
#   $5 - time to sleep between retries, defaults to 2
# Returns:
#   0 - if successful
#   1 - if process is still running according to procfs
#   2 - if invalid parameters passed in
###########################################################
proc_kill()
{
    local pid="${1}"
    local service_name="${2}"
    local signal="${3:-SIGTERM}"
    local count="${4:-5}"
    local process_sleep="${5:-2}"
    local LH="${LL} proc_kill():"
    local pgrp
    local matched

    # An empty pid would make the procfs checks below look at "/proc/"
    # itself, which always exists.
    if [ -z "${pid}" ] ; then
        ocf_log err "${LH} no pid given, refusing to kill anything"
        return 2
    fi

    pgrp="$(ps -o pgid= "${pid}" 2>/dev/null | tr -d '[:space:]')"

    if [ "${pid}" = "1" ] || [ "${pgrp}" = "1" ] ; then
        ocf_log err "${LH} shall not kill by the bad pid 1 (init)!"
        return 2
    fi

    if [ "${pid}" = "none" ]; then
        # An empty pattern would match every process on the host.
        if [ -z "${service_name}" ] ; then
            ocf_log err "${LH} neither a pid nor a service name given, refusing to kill anything"
            return 2
        fi

        matched="$(pgrep -fla "${service_name}")"
        if [ -z "${matched}" ] ; then
            ocf_log info "${LH} cannot find any processes matching the ${service_name}, considering target process to be already dead"
            return 0
        fi
        ocf_log debug "${LH} no pid provided, will try the ${service_name}, matched list: ${matched}"

        while [ "${count}" -gt 0 ] && [ -n "${matched}" ]; do
            ocf_log debug "${LH} Stopping ${service_name} with ${signal}..."
            ocf_run pkill -f -"${signal}" "${service_name}"
            sleep "${process_sleep}"
            count=$(( count-1 ))
            # Refresh the match list only after the signal had time to act,
            # so the loop ends as soon as nothing matches any more instead
            # of sending one more signal based on a stale list.
            matched="$(pgrep -fla "${service_name}")"
        done

        if [ -z "${matched}" ] ; then
            ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
            return 0
        fi
        ocf_log warn "${LH} Failed to stop ${service_name} with ${signal}"
        return 1
    fi

    # pid is not none
    while [ "${count}" -gt 0 ] && [ -d "/proc/${pid}" ]; do
        ocf_log debug "${LH} Stopping ${service_name} with ${signal}..."
        ocf_run pkill -"${signal}" -g "${pgrp}"
        sleep "${process_sleep}"
        count=$(( count-1 ))
    done

    # Check if the process ended after the last sleep
    if [ ! -d "/proc/${pid}" ] ; then
        ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
        return 0
    fi

    ocf_log warn "${LH} Failed to stop ${service_name} with ${signal}"
    return 1
}
 | |
| 
 | |
###########################################################
# Attempts to kill a process with the given pid or pid file
# using proc_kill and will retry with sigkill if sigterm is
# unsuccessful.
#
# Globals:
#   OCF_ERR_GENERIC
#   OCF_SUCCESS
#   LL
# Arguments:
#   $1 - pidfile or pid or 'none', if stopping by the name matching
#   $2 - service name used for logging or for the failback stopping method
#   $3 - stop process timeout (in sec), used to determine how many times we try
#        SIGTERM and an upper limit on how long this function should try and
#        stop the process. Defaults to 15.
# Returns:
#   OCF_SUCCESS - if successful
#   OCF_ERR_GENERIC - if process is still running according to procfs
###########################################################
proc_stop()
{
    local pid_param="${1}"
    local service_name="${2}"
    local timeout="${3:-15}"
    local LH="${LL} proc_stop():"
    local i
    local pid
    local pidfile
    local stop_count

    case "${pid_param}" in
        none)
            pid="none"
            ;;
        ''|*[!0-9]*)
            # Not a bare number: treat it as a pid file if it exists.
            if [ -e "${pid_param}" ]; then
                pidfile="${pid_param}"
                # Keep only purely numeric entries so that stray content
                # of the file is never handed to proc_kill as a pid.
                pid=$(cat "${pidfile}" 2>/dev/null | tr -s '[:space:]' '\n' | grep -E '^[0-9]+$' | sort -u)
                if [ -z "${pid}" ] ; then
                    ocf_log warn "${LH} unable to get PID from ${pidfile}, try match by ${service_name}"
                    pid="none"
                fi
            else
                ocf_log warn "${LH} pid param ${pid_param} is not a file or a number, try match by ${service_name}"
                pid="none"
            fi
            ;;
        *)
            pid="${pid_param}"
            ;;
    esac

    # number of times to try a SIGTERM is (timeout - 5 seconds) / 2 seconds,
    # but make sure we stop at least once
    stop_count=$(( (timeout - 5) / 2 ))
    if [ "${stop_count}" -le 0 ]; then
        stop_count=1
    fi

    # ${pid} is "none", a single pid, or a newline separated list of pids
    for i in ${pid} ; do
        ocf_log info "${LH} Stopping ${service_name} by PID ${i}"
        if ! proc_kill "${i}" "${service_name}" SIGTERM "${stop_count}" ; then
            # SIGTERM failed, send a single SIGKILL
            if ! proc_kill "${i}" "${service_name}" SIGKILL 1 2 ; then
                ocf_log err "${LH} ERROR: could not stop ${service_name}"
                return "${OCF_ERR_GENERIC}"
            fi
        fi
    done

    # Remove the pid file here which will remove empty pid files as well
    if [ -n "${pidfile}" ]; then
        rm -f "${pidfile}"
    fi

    ocf_log info "${LH} Stopped ${service_name}"
    return "${OCF_SUCCESS}"
}
 | |
| 
 | |
# Runs a command line as the configured RabbitMQ user, wrapped in the
# timeout command.
#
# Usage: su_rabbit_cmd [-t <timeout argument>] [<command line>]
#   -t <arg>  - use "<arg>" as the timeout value instead of the
#               precomputed COMMAND_TIMEOUT wrapper
#   <command> - command line to run, "status" when omitted
# Returns the exit status of the su invocation.
su_rabbit_cmd() {
    local LH="${LL} su_rabbit_cmd():"
    local user="${OCF_RESKEY_username}"
    local mail=/var/spool/mail/rabbitmq
    local pwd=/var/lib/rabbitmq
    local home=/var/lib/rabbitmq
    local timeout
    local cmd
    local rc=1

    case "$1" in
        -t)
            timeout="/usr/bin/timeout ${OCF_RESKEY_command_timeout} $2"
            shift 2
            ;;
        *)
            timeout=$COMMAND_TIMEOUT
            ;;
    esac
    cmd="${1:-status}"

    ocf_log debug "${LH} invoking a command: ${cmd}"
    su $user -s /bin/sh -c "USER=${user} MAIL=${mail} PWD=${pwd} HOME=${home} LOGNAME=${user} \
      ${timeout} ${cmd}"
    rc=$?
    ocf_log info "${LH} the invoked command exited ${rc}: ${cmd}"
    return $rc
}
 | |
| 
 | |
# Print the current time as seconds since the Epoch (date's %s format).
now() {
    date -u '+%s'
}
 | |
| 
 | |
# Raise the open files limit (ulimit -n) of the current shell to
# OCF_RESKEY_limit_nofile when that is higher than the limit reported
# for the RabbitMQ user.
#
# Nothing is changed when OCF_RESKEY_limit_nofile is empty or not a
# number, or when the current limit is not numeric (e.g. "unlimited").
set_limits() {
    local wanted="${OCF_RESKEY_limit_nofile}"
    local current_limit

    case "${wanted}" in
        ''|*[!0-9]*) return 0 ;;
    esac

    current_limit="$(su $OCF_RESKEY_username -s /bin/sh -c "ulimit -n")"
    case "${current_limit}" in
        ''|*[!0-9]*) return 0 ;;
    esac

    if [ "${wanted}" -gt "${current_limit}" ] ; then
        ulimit -n "${wanted}"
    fi
}
 | |
| 
 | |
# Set the master score attribute of this node via crm_master.
#
# Arguments:
#   $1 - score to set, 0 when omitted
# Returns:
#   OCF_SUCCESS - if crm_master succeeded
#   OCF_ERR_GENERIC - otherwise
master_score() {
    local LH="${LL} master_score():"
    local score

    score=$1
    [ -n "${score}" ] || score=0

    ocf_log info "${LH} Updating master score attribute with ${score}"
    if ocf_run crm_master -N $THIS_PCMK_NODE -l reboot -v $score ; then
        return $OCF_SUCCESS
    fi
    return $OCF_ERR_GENERIC
}
 | |
| 
 | |
# Print this host's name: the short name (hostname -s) when
# OCF_RESKEY_use_fqdn is 'false', the FQDN (hostname -f) otherwise.
get_hostname() {
    local name_flag="-f"

    [ "${OCF_RESKEY_use_fqdn}" != 'false' ] || name_flag="-s"
    echo "$(hostname ${name_flag})"
}
 | |
| 
 | |
# Prepend OCF_RESKEY_fqdn_prefix to the given host name and print it.
# When OCF_RESKEY_use_fqdn is 'false' the result is cut down to the part
# before the first dot (the short name); otherwise it is printed as is.
process_fqdn() {
    local prefixed="${OCF_RESKEY_fqdn_prefix}$1"

    case "${OCF_RESKEY_use_fqdn}" in
        false)
            echo "${prefixed}" | cut -d. -f1
            ;;
        *)
            echo "${prefixed}"
            ;;
    esac
}
 | |
| 
 | |
# Return OCF_SUCCESS, if current host is in the list of given hosts.
# Otherwise, return 10
#
# Arguments:
#   $1 - whitespace separated list of host names
# Both the local name and every list entry go through process_fqdn
# before they are compared.
my_host() {
    local hostlist="$1"
    local hostname
    local hn
    # Loop variable; kept local so a caller's own "host" is not clobbered.
    local host
    local rc=10
    local LH="${LL} my_host():"

    hostname=$(process_fqdn $(get_hostname))
    ocf_log info "${LH} hostlist is: $hostlist"
    for host in $hostlist ; do
        hn=$(process_fqdn "${host}")
        ocf_log debug "${LH} comparing '$hostname' with '$hn'"
        if [ "${hostname}" = "${hn}" ] ; then
            rc=$OCF_SUCCESS
            break
        fi
    done

    return $rc
}
 | |
| 
 | |
# Print the value of a transient ("reboot" lifetime) node attribute.
#
# Arguments:
#   $1 - node name
#   $2 - attribute name
# The value is taken from the part after "=" in the third field of the
# crm_attribute query output. 0 is printed when the query yields nothing
# or the value is "(null)".
get_integer_node_attr() {
    local attr_value

    attr_value=$(crm_attribute -N $1 -l reboot --name "$2" --query 2>/dev/null | awk '{ split($3, vals, "="); if (vals[2] != "(null)") print vals[2] }')
    if [ $? -ne 0 ] || [ -z "$attr_value" ] ; then
        attr_value=0
    fi
    echo $attr_value
}
 | |
| 
 | |
# Print the 'rabbit-start-time' attribute of the node given as $1
# (0 when it is not set, see get_integer_node_attr).
get_node_start_time() {
    local attr_name='rabbit-start-time'

    get_integer_node_attr $1 "${attr_name}"
}
 | |
| 
 | |
# Print the "master-<RESOURCE_NAME>" attribute of the node given as $1
# (0 when it is not set, see get_integer_node_attr).
get_node_master_score() {
    local attr_name="master-${RESOURCE_NAME}"

    get_integer_node_attr $1 "${attr_name}"
}
 | |
| 
 | |
# Print the RabbitMQ node name ("rabbit@<host>") for the host given as $1.
# The host is resolved with ocf_attribute_target and then passed through
# process_fqdn, so it is a short name or an FQDN depending on
# OCF_RESKEY_use_fqdn.
rabbit_node_name() {
    local target

    target=$(ocf_attribute_target $1)
    echo "rabbit@$(process_fqdn ${target})"
}
 | |
| 
 | |
# Prepare the environment used by the rest of the agent:
#   - exports RABBITMQ_NODENAME, RABBITMQ_NODE_PORT and RABBITMQ_PID_FILE
#   - sets the globals MNESIA_FILES, RMQ_START_TIME, MASTER_FLAG_FILE,
#     THIS_PCMK_NODE and TOTALVMEM
#   - creates the PID file directory when it is missing
#   - hands the PID, mnesia and log directories over to the RabbitMQ
#     user/group when that user finds something non-writable in them
#   - exports the log prefix LL and finally calls update_cookie
rmq_setup_env() {
    local H
    local dir
    local files
    local PID_DIR

    H="$(get_hostname)"

    RABBITMQ_NODENAME=$(rabbit_node_name $H)
    RABBITMQ_NODE_PORT=$OCF_RESKEY_node_port
    RABBITMQ_PID_FILE=$OCF_RESKEY_pid_file
    export RABBITMQ_NODENAME RABBITMQ_NODE_PORT RABBITMQ_PID_FILE

    MNESIA_FILES="${OCF_RESKEY_mnesia_base}/$(rabbit_node_name $H)"
    RMQ_START_TIME="${MNESIA_FILES}/ocf_server_start_time.txt"
    MASTER_FLAG_FILE="${MNESIA_FILES}/ocf_master_for_${OCF_RESOURCE_INSTANCE}"
    THIS_PCMK_NODE=$(ocf_attribute_target)
    TOTALVMEM=$(free -mt | awk '/Total:/ {print $2}')

    # check and make PID file dir
    PID_DIR=$( dirname $OCF_RESKEY_pid_file )
    [ -d ${PID_DIR} ] || {
        mkdir -p ${PID_DIR}
        chown -R ${OCF_RESKEY_username}:${OCF_RESKEY_groupname} ${PID_DIR}
        chmod 755 ${PID_DIR}
    }

    # Regardless of whether we just created the directory or it
    # already existed, check whether it is writable by the configured
    # user
    for dir in ${PID_DIR} "${OCF_RESKEY_mnesia_base}" "${OCF_RESKEY_log_dir}"; do
        test -e ${dir} || continue
        # Anything listed here is an entry the user cannot write to.
        files=$(su -s /bin/sh - $OCF_RESKEY_username -c "find ${dir} ! -writable")
        [ "${files}" ] || continue
        ocf_log warn "Directory ${dir} is not writable by ${OCF_RESKEY_username}, chowning."
        chown -R ${OCF_RESKEY_username}:${OCF_RESKEY_groupname} "${dir}"
    done

    export LL="${OCF_RESOURCE_INSTANCE}[$$]:"
    update_cookie
}
 | |
| 
 | |
# Bring the local RabbitMQ node back to a blank state.
# "reset" (with "force_reset" as a fallback) is only attempted once the rabbit
# app is stopped. When there is no beam process, the app cannot be stopped, or
# both reset commands fail, the beam is killed and the mnesia files are removed.
# Always returns $OCF_SUCCESS.
reset_mnesia() {
    local LH="${LL} reset_mnesia():"
    local wipe_files=false
    local rc=$OCF_ERR_GENERIC

    get_status
    rc=$?
    if [ $rc -ne 0 ] ; then
        # no beam process: fall through to removing the files
        wipe_files=true
        ocf_log warn "${LH} There is no Beam process running."
    else
        # beam is up; the rabbit app must be stopped before a reset
        get_status rabbit
        rc=$?
        if [ $rc -eq 0 ] ; then
            ocf_log info "${LH} Stopping RMQ-app prior to reset the mnesia."
            if ! stop_rmq_server_app ; then
                 ocf_log warn "${LH} RMQ-app can't be stopped."
                 wipe_files=true
            fi
        fi

        if [ "${wipe_files}" = false ] ; then
            # the app is down: try the regular reset, then the forced one
            ocf_log info "${LH} Execute reset with timeout: ${TIMEOUT_ARG}"
            if ! su_rabbit_cmd "${OCF_RESKEY_ctl} reset" ; then
                ocf_log info "${LH} Execute force_reset with timeout: ${TIMEOUT_ARG}"
                if ! su_rabbit_cmd "${OCF_RESKEY_ctl} force_reset" ; then
                    ocf_log warn "${LH} Mnesia couldn't cleaned, even by force-reset command."
                    wipe_files=true
                fi
            fi
        fi
    fi

    # last resort: kill the beam and delete the mnesia data and schema
    if [ "${wipe_files}" = true ] ; then
        kill_rmq_and_remove_pid
        ocf_run rm -rf "${MNESIA_FILES}"
        mnesia_schema_location="${OCF_RESKEY_mnesia_schema_base}/Mnesia.$(rabbit_node_name $(get_hostname))"
        ocf_run rm -rf "$mnesia_schema_location"
        ocf_log warn "${LH} Mnesia files appear corrupted and have been removed from ${MNESIA_FILES} and $mnesia_schema_location"
    fi

    # the outcome is deliberately not reported to the caller
    return $OCF_SUCCESS
}
 | |
| 
 | |
| 
 | |
# Install a temporary iptables rule that rejects new client connections to
# the RabbitMQ port (OCF_RESKEY_node_port).
# The rule is tagged with the comment 'temporary RMQ block'; if a rule with
# that comment already exists nothing is added. Up to 5 insert attempts are
# made, one second apart.
# Returns $OCF_SUCCESS when the rule is present (or when
# OCF_RESKEY_avoid_using_iptables is 'true', in which case this is a no-op),
# $OCF_ERR_GENERIC when the rule is still missing after the last attempt.
block_client_access()
{
    local rule_tag='temporary RMQ block'
    local tries=5

    # '=' is the POSIX string comparison; '==' is rejected by shells such as dash
    if [ "${OCF_RESKEY_avoid_using_iptables}" = 'true' ] ; then
        return $OCF_SUCCESS
    fi

    while [ $tries -gt 0 ] ; do
        if iptables -nvL --wait | grep -q "${rule_tag}" ; then
            return $OCF_SUCCESS
        fi
        tries=$((tries-1))
        iptables --wait -I INPUT -p tcp -m tcp --dport ${OCF_RESKEY_node_port} -m state --state NEW,RELATED,ESTABLISHED \
            -m comment --comment "${rule_tag}" -j REJECT --reject-with tcp-reset
        sleep 1
    done

    # The verdict depends on whether the rule exists, not on how many attempts
    # were used: an insert that succeeds on the last try is still a success.
    if iptables -nvL --wait | grep -q "${rule_tag}" ; then
        return $OCF_SUCCESS
    fi
    return $OCF_ERR_GENERIC
}
 | |
| 
 | |
# Remove every iptables rule tagged with the comment 'temporary RMQ block'.
# One delete is issued per matching line of the rule listing, so duplicated
# rules are all removed. A no-op when OCF_RESKEY_avoid_using_iptables is 'true'.
unblock_client_access()
{
    # keep the loop variable out of the global namespace
    local rule_line

    # '=' is the POSIX string comparison; '==' is rejected by shells such as dash
    if [ "${OCF_RESKEY_avoid_using_iptables}" = 'true' ] ; then
        return 0
    fi
    # the listed line numbers only drive the iteration count; the rule itself
    # is deleted by its full specification
    for rule_line in $(iptables -nvL --wait --line-numbers | awk '/temporary RMQ block/ {print $1}'); do
      iptables --wait -D INPUT -p tcp -m tcp --dport ${OCF_RESKEY_node_port} -m state --state NEW,RELATED,ESTABLISHED \
      -m comment --comment 'temporary RMQ block' -j REJECT --reject-with tcp-reset
    done
}
 | |
| 
 | |
# Print the node names known to mnesia on this node as a space separated list.
# $1 selects the list: 'nodes' queries db_nodes, 'running' queries
# running_db_nodes.
# Prints an empty line and returns $OCF_ERR_GENERIC when the rabbitmqctl eval
# call fails, otherwise returns $OCF_SUCCESS.
get_nodes__base(){
    local info_key=''
    local rc=$OCF_ERR_GENERIC
    local raw_reply
    local node_names

    case "$1" in
        nodes)   info_key='db_nodes' ;;
        running) info_key='running_db_nodes' ;;
    esac

    raw_reply=`${OCF_RESKEY_ctl} eval "mnesia:system_info(${info_key})." 2>/dev/null`
    rc=$?
    if [ $rc -ne 0 ] ; then
        echo ''
        return $OCF_ERR_GENERIC
    fi

    # turn a reply such as ['rabbit@node-1','rabbit@node-2'] into bare names:
    # keep the comma separated fields containing '@', drop brackets and quotes
    node_names=$(echo "${raw_reply}" | awk -F, '{ for (i=1;i<=NF;i++) { if ($i ~ /@/) { gsub(/[\[\]}{]/,"",$i); print $i; } }}' | tr -d  "\'")
    # unquoted on purpose: joins the names on a single line
    echo $node_names
    return $OCF_SUCCESS
}
 | |
| 
 | |
# Print all nodes recorded in the mnesia cluster db (space separated).
# Returns the exit status of get_nodes__base, so callers can tell a failed
# query from an empty list.
get_nodes() {
    local node_list
    local rc

    node_list=$(get_nodes__base nodes)
    # capture the status before echo overwrites $?
    rc=$?
    echo $node_list
    return $rc
}
 | |
| 
 | |
# Print the nodes mnesia currently reports as running (space separated).
# Returns the exit status of get_nodes__base, so callers can tell a failed
# query from an empty list.
get_running_nodes() {
    local node_list
    local rc

    node_list=$(get_nodes__base running)
    # capture the status before echo overwrites $?
    rc=$?
    echo $node_list
    return $rc
}
 | |
| 
 | |
# Print the pacemaker nodes of the visible partition, optionally without one.
# $1 (optional) - node name to leave out; it is compared as a whole name, so
#                 excluding 'node-1' keeps 'node-10' intact.
# When OCF_RESKEY_allowed_cluster_nodes is set, only nodes that are also in
# that list are printed.
# NOTE(review): tmp_pcmk_node_list and pcmk_node_list are assigned without
# 'local', as before; confirm nothing else reads them before scoping them.
get_alive_pacemaker_nodes_but()
{
    local excluded="$1"
    local member

    tmp_pcmk_node_list=''
    # lines mentioning '(null)' are dropped, then names are filtered one by one
    for member in `crm_node -l -p | sed -e '/(null)/d'` ; do
        if [ -n "${excluded}" ] && [ "${member}" = "${excluded}" ] ; then
            continue
        fi
        tmp_pcmk_node_list="${tmp_pcmk_node_list} ${member}"
    done

    if [ -z "${OCF_RESKEY_allowed_cluster_nodes}" ]; then
      pcmk_node_list=$tmp_pcmk_node_list
    else
      # intersection of both lists: names that occur twice in the merged,
      # sorted stream
      pcmk_node_list=`for i in $tmp_pcmk_node_list ${OCF_RESKEY_allowed_cluster_nodes}; do echo $i; done | sort | uniq -d`
    fi
    echo $pcmk_node_list
}
 | |
| 
 | |
# Print the name of the current master, if any.
# Arguments are handed to get_alive_pacemaker_nodes_but, so a node name given
# here is not examined. Nothing is printed when no examined node is master.
get_master_name_but()
{
    local candidate
    for candidate in $(get_alive_pacemaker_nodes_but "$@")
    do
        ocf_log info "${LH} looking if $candidate is master"

        # keep scanning until the first node flagged as master shows up
        is_master $candidate || continue

        ocf_log info "${LH} master is $candidate"
        echo $candidate
        break
    done
}
 | |
| 
 | |
# Evaluate an erlang expression on the current node through rabbitmqctl.
# $1   - printf format string holding the erlang code (mandatory)
# $2.. - values substituted into the format string
# The call is prefixed with $COMMAND_TIMEOUT.
erl_eval() {
    local template="${1:?}"
    local expression
    shift

    # expand the template first, then hand the result over as one argument
    expression="$(printf "$template" "$@")"
    $COMMAND_TIMEOUT ${OCF_RESKEY_ctl} eval "$expression"
}
 | |
| 
 | |
# Returns 0 if we are clustered with the provided node, 1 otherwise.
# $1 - host name; converted with rabbit_node_name before the checks.
# The node counts as clustered when this node lists it among the running
# cluster nodes and does not list it among its partitions. A failure of either
# local query is logged and ignored, i.e. it does not make the node count as
# unclustered.
is_clustered_with()
{
    local LH="${LH}: is_clustered_with: "
    local node_name
    local rc
    node_name=$(rabbit_node_name $1)

    local seen_as_running
    seen_as_running=$(erl_eval "lists:member('%s', rabbit_mnesia:cluster_nodes(running))." "$node_name")
    rc=$?
    if [ "$rc" -ne 0 ]; then
        ocf_log err "${LH} Failed to check whether '$node_name' is considered running by us"
        # We had a transient local error; that doesn't mean the remote node is
        # not part of the cluster, so ignore this
    elif [ "$seen_as_running" != true ]; then
        ocf_log info "${LH} Node $node_name is not running, considering it not clustered with us"
        return 1
    fi

    local seen_as_partitioned
    seen_as_partitioned=$(erl_eval "lists:member('%s', rabbit_node_monitor:partitions())." "$node_name")
    rc=$?
    if [ "$rc" -ne 0 ]; then
        ocf_log err "${LH} Failed to check whether '$node_name' is partitioned with us"
        # We had a transient local error; that doesn't mean the remote node is
        # partitioned with us, so ignore this
    elif [ "$seen_as_partitioned" != false ]; then
        ocf_log info "${LH} Node $node_name is partitioned from us"
        return 1
    fi

    # Explicit success: 'return $?' here would hand back the status of the
    # last logging call whenever one of the checks above failed.
    return 0
}
 | |
| 
 | |
| 
 | |
# Tell whether this node still has to join the given node.
# $1 - host name of the node to join; converted with rabbit_node_name.
# Returns 1 when that node is already among the running nodes reported by
# get_running_nodes, 0 otherwise.
check_need_join_to() {
    local target
    local member

    target=$(rabbit_node_name $1)
    for member in $(get_running_nodes) ; do
        # already running in our cluster: nothing to join
        [ "${target}" = "${member}" ] && return 1
    done

    return 0
}
 | |
| 
 | |
# Update the erlang cookie file, if a cookie has been specified.
# Nothing is done when OCF_RESKEY_erlang_cookie is 'false'. Otherwise the file
# OCF_RESKEY_erlang_cookie_file is rewritten only when its first line differs
# from the configured cookie, then its owner and mode (600) are enforced.
# Always returns $OCF_SUCCESS.
update_cookie() {
    local cookie_file_content

    if [ "${OCF_RESKEY_erlang_cookie}" = 'false' ] ; then
        return $OCF_SUCCESS
    fi

    if [ -f "${OCF_RESKEY_erlang_cookie_file}" ]; then
        # First line of the cookie file; command substitution already strips
        # the trailing newline, so no extra filter is needed
        cookie_file_content=$(head -n1 "${OCF_RESKEY_erlang_cookie_file}")
    fi
    # As there is a brief period of time when the file is empty
    # (shell redirection has already opened and truncated file,
    # and echo hasn't finished its job), we are doing this write
    # only when cookie has changed.
    if [ "${OCF_RESKEY_erlang_cookie}" != "${cookie_file_content}" ]; then
        # Restrictive umask in a subshell: a newly created cookie file is
        # never readable by others, not even before the chmod below
        ( umask 077; echo "${OCF_RESKEY_erlang_cookie}" > "${OCF_RESKEY_erlang_cookie_file}" )
    fi
    # These are idempotent operations, so we don't have to
    # check any preconditions for running them.
    chown "${OCF_RESKEY_username}:${OCF_RESKEY_groupname}" "${OCF_RESKEY_erlang_cookie_file}"
    chmod 600 "${OCF_RESKEY_erlang_cookie_file}"
    return $OCF_SUCCESS
}
 | |
| 
 | |
# Stop the RMQ beam process, first by pidfile and then by matching the rabbit
# node name. Returns $OCF_SUCCESS or $OCF_ERR_GENERIC depending on the second,
# name based, proc_stop call only.
kill_rmq_and_remove_pid() {
    local LH="${LL} kill_rmq_and_remove_pid():"
    local beam_pattern="beam.*${RABBITMQ_NODENAME}"

    # pidfile based stop with the name match as a fallback; its exit code is
    # deliberately ignored
    proc_stop "${OCF_RESKEY_pid_file}" "${beam_pattern}" "${OCF_RESKEY_stop_time}"

    # make sure no beam matching the node name is left behind
    if proc_stop none "${beam_pattern}" "${OCF_RESKEY_stop_time}" ; then
        return $OCF_SUCCESS
    fi
    return $OCF_ERR_GENERIC
}
 | |
| 
 | |
# Print all arguments joined into one line.
# The unquoted expansion is intentional: word splitting drops leading and
# trailing blanks and squeezes runs of blanks into single spaces.
trim_var(){
    local joined="$*"
    joined="${joined% }"
    echo $joined
}
 | |
| 
 | |
# Normalize the pacemaker notification variables by passing each
# OCF_RESKEY_CRM_meta_notify_* value through trim_var.
# Always returns $OCF_SUCCESS.
action_validate() {
    # todo(sv): validate some incoming parameters
    local suffix

    for suffix in \
        post pre start stop \
        start_resource stop_resource active_resource inactive_resource \
        start_uname stop_uname active_uname \
        master_resource master_uname \
        demote_resource demote_uname \
        slave_resource slave_uname \
        promote_resource promote_uname
    do
        # expands to: OCF_RESKEY_CRM_meta_notify_X=$(trim_var $OCF_RESKEY_CRM_meta_notify_X)
        eval "OCF_RESKEY_CRM_meta_notify_${suffix}=\$(trim_var \$OCF_RESKEY_CRM_meta_notify_${suffix})"
    done
    return $OCF_SUCCESS
}
 | |
| 
 | |
# Record the current time in the 'rabbit-start-time' node attribute, but only
# when the given return code is 0.
# $1 - return code of the preceding start operation
update_rabbit_start_time_if_rc() {
    local started_at
    local rc=$1

    # anything but a clean 0 leaves the attribute untouched
    [ $rc -eq 0 ] || return 0

    started_at="$(now)"
    ocf_log info "${LH} Rabbit app started successfully. Updating start time attribute with ${started_at}"
    ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update "${started_at}"
}
 | |
| 
 | |
# Join this RabbitMQ node to the cluster of the given node.
# $1 - host name of a cluster member; converted with rabbit_node_name.
# Sequence: stop the rabbit app if it runs, run join_cluster, start the app
# again and record the start time. Any failing step triggers action_stop and
# makes the function return $OCF_ERR_GENERIC.
join_to_cluster() {
    local peer_host="$1"
    local peer
    local rc=$OCF_ERR_GENERIC
    local LH="${LL} join_to_cluster():"

    ocf_log info "${LH} start."

    peer=$(rabbit_node_name $peer_host)
    ocf_log info "${LH} Joining to cluster by node '${peer}'."

    # join_cluster is only issued while the rabbit app is stopped
    if get_status rabbit ; then
        ocf_log info "${LH} rabbitmq app will be stopped."
        if ! stop_rmq_server_app ; then
            ocf_log err "${LH} Can't stop rabbitmq app by stop_app command. Stopping."
            action_stop
            return $OCF_ERR_GENERIC
        fi
    fi

    ocf_log info "${LH} Execute join_cluster with timeout: ${TIMEOUT_ARG}"
    if ! su_rabbit_cmd "${OCF_RESKEY_ctl} join_cluster $peer" ; then
        ocf_log err "${LH} Can't join to cluster by node '${peer}'. Stopping."
        action_stop
        return $OCF_ERR_GENERIC
    fi

    sleep 2
    try_to_start_rmq_app
    rc=$?
    if [ $rc -ne 0 ] ; then
        ocf_log err "${LH} Can't start RMQ app after join to cluster. Stopping."
        action_stop
        return $OCF_ERR_GENERIC
    fi

    update_rabbit_start_time_if_rc 0
    ocf_log info "${LH} Joined to cluster succesfully."

    ocf_log info "${LH} end."
    return $rc
}
 | |
| 
 | |
# Remove stopped nodes from the RabbitMQ cluster.
# $1 - space separated host names of the nodes where the resource is being
#      stopped
# Every listed node, except this one, that is present in the cluster db is
# first disconnected from this node and then removed with forget_cluster_node.
# Always returns $OCF_SUCCESS; per-node failures are only logged.
unjoin_nodes_from_cluster() {
    local stopped_hosts="$1"
    local host
    local victim
    local rc=$OCF_ERR_GENERIC
    local member
    # nodes recorded in the rabbit cluster db
    local cluster_members
    local attempt
    local LH="${LL} unjoin_nodes_from_cluster():"

    cluster_members=$(get_nodes)
    rc=$?
    if [ $rc -ne 0 ] ; then
        # no node list available, nothing to do
        return $OCF_SUCCESS
    fi

    # unjoin all cluster nodes which are being stopped (i.e. received the
    # post-stop notify), except *this* node; before a node is unjoined it is
    # disconnected from *this* node
    for host in $stopped_hosts ; do
        victim=$(rabbit_node_name $host)
        [ "${victim}" = "${RABBITMQ_NODENAME}" ] && continue

        for member in $cluster_members ; do
            [ "${victim}" = "${member}" ] || continue

            # disconnect the node being unjoined from this node
            ocf_run ${OCF_RESKEY_ctl} eval "disconnect_node(list_to_atom(\"${victim}\"))." 2>&1
            rc=$?
            if [ $rc -eq $OCF_SUCCESS ] ; then
                ocf_log info "${LH} node '${victim}' disconnected succesfully."
            else
                ocf_log info "${LH} disconnecting node '${victim}' failed."
            fi

            # a rabbit node that went down is still reported as 'running' for
            # a while, so poll up to 5 times before forgetting it
            attempt=0
            while [ $attempt -lt 5 ] ; do
                attempt=$((attempt+1))
                is_clustered_with $victim || break
                ocf_log info "${LH} the ${victim} is alive and cannot be kicked from the cluster yet"
                sleep 10
            done

            ocf_log info "${LH} Execute forget_cluster_node with timeout: ${TIMEOUT_ARG}"
            if su_rabbit_cmd "${OCF_RESKEY_ctl} forget_cluster_node ${victim}" ; then
               ocf_log info "${LH} node '${victim}' unjoined succesfully."
            else
               ocf_log warn "${LH} unjoining node '${victim}' failed."
            fi
        done
    done
    return $OCF_SUCCESS
}
 | |
| 
 | |
# Stop the RMQ beam server process.
# A graceful "rabbitmqctl stop" is tried first (with the pidfile when it could
# be read). If a pidfile or a beam process matching the node name is still
# around afterwards, kill_rmq_and_remove_pid is used and its status returned;
# otherwise $OCF_SUCCESS is returned.
stop_server_process() {
    local pid
    local rc=$OCF_ERR_GENERIC
    local LH="${LL} stop_server_process():"

    if pid=$(cat ${OCF_RESKEY_pid_file}) ; then
        if [ -n "${pid}" ] ; then
            # graceful stop by the known pidfile
            ocf_log info "${LH} Execute stop with timeout: ${TIMEOUT_ARG}"
            if su_rabbit_cmd "${OCF_RESKEY_ctl} stop ${OCF_RESKEY_pid_file} >> \"${OCF_RESKEY_log_dir}/shutdown_log\" 2>&1" ; then
                ocf_log info "${LH} RMQ-server process (PID=${pid}) stopped succesfully."
            fi
        fi
    else
        # the pidfile could not be read: stop without a known PID
        ocf_log err "${LH} RMQ-server process PIDFILE was not found!"
        if su_rabbit_cmd "${OCF_RESKEY_ctl} stop >> \"${OCF_RESKEY_log_dir}/shutdown_log\" 2>&1" ; then
            ocf_log info "${LH} RMQ-server process stopped succesfully, although there was no PIDFILE found."
            ocf_log info "${LH} grant a graceful termintation window ${OCF_RESKEY_stop_time} to end its beam"
            sleep "${OCF_RESKEY_stop_time}"
        else
            kill_rmq_and_remove_pid
        fi
    fi

    # clean exit only when neither the pidfile nor a matching beam is left
    pgrep -f "beam.*${RABBITMQ_NODENAME}" > /dev/null
    rc=$?
    if [ ! -f ${OCF_RESKEY_pid_file} ] && [ $rc -ne 0 ] ; then
        return $OCF_SUCCESS
    fi

    ocf_log warn "${LH} The pidfile or beam's still exist, forcing the RMQ-server cleanup"
    kill_rmq_and_remove_pid
    return $?
}
 | |
| 
 | |
# Stop the rabbit application inside the beam.
# Returns $OCF_SUCCESS when the beam is not running at all or the app is
# confirmed stopped after stop_app; $OCF_ERR_GENERIC when stop_app fails or
# the app is still reported as running afterwards.
# Log lines use the LH prefix of the calling function.
stop_rmq_server_app() {
    local rc=$OCF_ERR_GENERIC

    # without a beam process the rabbit app cannot be running either
    get_status
    rc=$?
    [ $rc -ne 0 ] && return $OCF_SUCCESS

    ocf_log info "${LH} Execute stop_app with timeout: ${TIMEOUT_ARG}"
    su_rabbit_cmd "${OCF_RESKEY_ctl} stop_app >> \"${OCF_RESKEY_log_dir}/shutdown_log\" 2>&1"
    rc=$?
    if [ $rc -ne 0 ] ; then
         ocf_log err "${LH} RMQ-server app cannot be stopped."
         return $OCF_ERR_GENERIC
    fi

    # trust the status query rather than the exit code of stop_app
    get_status rabbit
    rc=$?
    if [ $rc -eq $OCF_SUCCESS ] ; then
        ocf_log err "${LH} RMQ-server app cannot be stopped."
        return $OCF_ERR_GENERIC
    fi

    ocf_log info "${LH} RMQ-server app stopped succesfully."
    return $OCF_SUCCESS
}
 | |
| 
 | |
# Start the RabbitMQ runtime (beam) in the background and wait for it.
# A stale pidfile is removed first; if it points to a live beam process that
# process gets a TERM signal, if it points to any other live process the
# function gives up with $OCF_ERR_GENERIC.
# The server is launched with RABBITMQ_NODE_ONLY=1 as OCF_RESKEY_username.
# Returns $OCF_SUCCESS once the pidfile exists and names a live process,
# $OCF_ERR_GENERIC when that does not happen within OCF_RESKEY_start_time
# seconds.
start_beam_process() {
    local command
    local rc=$OCF_ERR_GENERIC
    local ts_end
    local pf_end
    local pid
    local LH="${LL} start_beam_process():"

    # remove old PID-file if it exists
    if [ -f "${OCF_RESKEY_pid_file}" ] ; then
        ocf_log warn "${LH} found old PID-file '${OCF_RESKEY_pid_file}'."
        pid=$(cat "${OCF_RESKEY_pid_file}")
        if [ -n "${pid}" ] && [ -d "/proc/${pid}" ] ; then
            # Read the command line directly. Piping 'ocf_run cat ...' into
            # grep is unreliable: ocf_run is expected to capture the command
            # output for logging instead of passing it on - TODO confirm
            # against the installed ocf-shellfuncs.
            if tr '\0' ' ' < "/proc/${pid}/cmdline" 2>/dev/null | grep -q 'bin/beam' ; then
                ocf_log warn "${LH} found beam process with PID=${pid}, killing...'."
                ocf_run kill -TERM $pid
            else
                ocf_log err "${LH} found unknown process with PID=${pid} from '${OCF_RESKEY_pid_file}'."
                return $OCF_ERR_GENERIC
            fi
        fi
        ocf_run rm -f "${OCF_RESKEY_pid_file}"
    fi

    [ -f /etc/default/rabbitmq-server ] && . /etc/default/rabbitmq-server

    # RabbitMQ requires high soft and hard limits for NOFILE
    set_limits

    # run beam process as the configured user (defaults to "rabbitmq")
    command="${OCF_RESKEY_binary} >> \"${OCF_RESKEY_log_dir}/startup_log\" 2>/dev/null"
    RABBITMQ_NODE_ONLY=1 su "${OCF_RESKEY_username}" -s /bin/sh -c "${command}"&
    ts_end=$(( $(now) + ${OCF_RESKEY_start_time} ))
    rc=$OCF_ERR_GENERIC
    while [ $(now) -lt ${ts_end} ]; do
        # waiting for normal start of beam
        pid=0
        pf_end=$(( $(now) + 3 ))
        while [ $(now) -lt ${pf_end} ]; do
            # waiting for OCF_RESKEY_pid_file of beam process
            if [ -f "${OCF_RESKEY_pid_file}" ] ; then
                pid=$(cat "${OCF_RESKEY_pid_file}")
                break
            fi
            sleep 1
        done
        if [ "${pid}" != "0" ] && [ -d "/proc/${pid}" ] ; then
            rc=$OCF_SUCCESS
            break
        fi
        sleep 2
    done
    if [ $rc -ne $OCF_SUCCESS ]; then
        if [ "${pid}" = "0" ] ; then
            ocf_log warn "${LH} PID-file '${OCF_RESKEY_pid_file}' not found"
        fi
        ocf_log err "${LH} RMQ-runtime (beam) didn't start succesfully (rc=${rc})."
    fi

    return $rc
}
 | |
| 
 | |
# Ask the broker whether plugins still have to be loaded.
# The erlang snippet raises an error only when plugins are enabled but none is
# active, i.e. when it is safe and necessary to load them. With at least one
# active plugin a reload is not safe, because plugins:setup() would remove
# existing dependency plugins in plugins_expand_dir.
# Returns the exit status of the rabbitmqctl eval call: non-zero therefore
# means "plugins need to be loaded" (or the call itself failed).
check_plugins() {
  local probe='{ok, EnabledFile} = application:get_env(rabbit, enabled_plugins_file), EnabledPlugins = rabbit_plugins:read_enabled(EnabledFile), ActivePlugins = rabbit_plugins:active(), if length(EnabledPlugins)>0 -> if length(ActivePlugins)==0 -> erlang:error("need_to_load_plugins"); true -> false end; true -> false end.'

  ${OCF_RESKEY_ctl} eval "${probe}"
  return $?
}
 | |
| 
 | |
# Load and start the enabled plugins when check_plugins asks for it.
# Returns 0 when check_plugins succeeded (nothing to load), otherwise the exit
# status of the rabbitmqctl eval call that loads and starts the plugins.
load_plugins() {
  local loader='ToBeLoaded = rabbit_plugins:setup(), ok = app_utils:load_applications(ToBeLoaded), StartupApps = app_utils:app_dependency_order(ToBeLoaded,false), app_utils:start_applications(StartupApps).'

  # a zero status from check_plugins means there is nothing to do
  if check_plugins ; then
    return 0
  fi

  ${OCF_RESKEY_ctl} eval "${loader}"
  return $?
}
 | |
| 
 | |
# Print the output of rabbit_plugins:active() as reported by rabbitmqctl eval.
list_active_plugins() {
  local active
  active=$(${OCF_RESKEY_ctl} eval 'rabbit_plugins:active().')
  echo "${active}"
}
 | |
| 
 | |
# Start the rabbit application, starting the beam first when it is not up.
# $1 (optional) - file that receives the start_app output; defaults to
#                 ${OCF_RESKEY_log_dir}/startup_log
# After a successful start_app the function waits for the start to finish and
# then tries to load the enabled plugins; a plugin failure is only logged.
# Returns $OCF_SUCCESS when the app started, $OCF_ERR_GENERIC otherwise.
try_to_start_rmq_app() {
    # the default also covers an empty argument, so no later fallback is needed
    local startup_log="${1:-${OCF_RESKEY_log_dir}/startup_log}"
    local rc=$OCF_ERR_GENERIC
    local LH="${LL} try_to_start_rmq_app():"

    get_status
    rc=$?
    if [ $rc -ne $OCF_SUCCESS ] ; then
        ocf_log info "${LH} RMQ-runtime (beam) not started, starting..."
        start_beam_process
        rc=$?
        if [ $rc -ne $OCF_SUCCESS ]; then
            ocf_log err "${LH} Failed to start beam - returning from the function"
            return $OCF_ERR_GENERIC
        fi
    fi

    ocf_log info "${LH} begin."
    ocf_log info "${LH} Execute start_app with timeout: ${TIMEOUT_ARG}"
    # the log path is quoted like the other su_rabbit_cmd redirections, so a
    # path containing blanks does not break the command line
    su_rabbit_cmd "${OCF_RESKEY_ctl} start_app >> \"${startup_log}\" 2>&1"
    rc=$?
    if [ $rc -eq 0 ] ; then
        ocf_log info "${LH} start_app was successful."
        ocf_log info "${LH} waiting for start to finish with timeout: ${TIMEOUT_ARG}"
        su_rabbit_cmd "${OCF_RESKEY_ctl} wait ${OCF_RESKEY_pid_file}"
        rc=$?
        if [ $rc -ne 0 ] ; then
             ocf_log err "${LH} RMQ-server app failed to wait for start."
             return $OCF_ERR_GENERIC
        fi
        rc=$OCF_SUCCESS
        # Loading enabled modules
        ocf_log info "${LH} start plugins."
        load_plugins
        local mrc=$?
        if [ $mrc -eq 0 ] ; then
          local mlist
          mlist=`list_active_plugins`
          ocf_log info "${LH} Starting plugins: ${mlist}"
        else
          # plugin trouble does not change the result of the app start
          ocf_log info "${LH} Starting plugins: failed."
        fi
    else
        ocf_log info "${LH} start_app failed."
        rc=$OCF_ERR_GENERIC
    fi
    return $rc
}
 | |
| 
 | |
# Initial start check of the RabbitMQ server.
# Client access to the RMQ port is blocked for the duration of the check,
# since the node is not ready to provide service yet. The beam is started if
# needed, then the rabbit app is started and stopped again. If the app cannot
# be started, up to 10 rounds of reset_mnesia plus a start attempt are made.
# On success the master score is set to $MIN_MASTER_SCORE.
# Client access is unblocked again on every exit path after the block.
# Returns $OCF_SUCCESS or $OCF_ERR_GENERIC.
start_rmq_server_app() {
    local rc=$OCF_ERR_GENERIC
    local startup_log="${OCF_RESKEY_log_dir}/startup_log"
    local startup_output
    local LH="${LL} start_rmq_server_app():"
    local attempt

    ocf_log info "${LH} begin."
    # Safe-unblock the rules, if there are any, then apply the blocking rule
    unblock_client_access
    if ! block_client_access ; then
      ocf_log err "${LH} cannot block access to RMQ port!"
      return $OCF_ERR_GENERIC
    fi
    ocf_log info "${LH} blocked access to RMQ port"

    if ! get_status ; then
        ocf_log info "${LH} RMQ-runtime (beam) not started, starting..."
        if ! start_beam_process ; then
            unblock_client_access
            ocf_log info "${LH} unblocked access to RMQ port"
            return $OCF_ERR_GENERIC
        fi
    fi

    ocf_log info "${LH} RMQ-server app not started, starting..."
    try_to_start_rmq_app "$startup_log"
    rc=$?
    if [ $rc -ne $OCF_SUCCESS ] ; then
       # the app does not come up: clean mnesia and retry
       ocf_log warn "${LH} RMQ-server app can't start without Mnesia cleaning."
       for attempt in $(seq 1 10) ; do
            rc=$OCF_ERR_GENERIC
            reset_mnesia || break
            try_to_start_rmq_app "$startup_log"
            rc=$?
            [ $rc -eq $OCF_SUCCESS ] || continue

            stop_rmq_server_app
            rc=$?
            if [ $rc -ne $OCF_SUCCESS ]; then
                ocf_log err "${LH} RMQ-server app can't be stopped during Mnesia cleaning. Beam will be killed."
                kill_rmq_and_remove_pid
                unblock_client_access
                ocf_log info "${LH} unblocked access to RMQ port"
                return $OCF_ERR_GENERIC
            fi
            ocf_log info "${LH} RMQ-server app Mnesia cleaned successfully."
            master_score $MIN_MASTER_SCORE
            break
        done
    else
        # rabbitmq-server started successfully as master of cluster
        master_score $MIN_MASTER_SCORE
        stop_rmq_server_app
        rc=$?
        if [ $rc -ne 0 ] ; then
            ocf_log err "${LH} RMQ-server app can't be stopped. Beam will be killed."
            kill_rmq_and_remove_pid
            unblock_client_access
            ocf_log info "${LH} unblocked access to RMQ port"
            return $OCF_ERR_GENERIC
        fi
    fi

    if [ $rc -eq $OCF_ERR_GENERIC ] ; then
         ocf_log err "${LH} RMQ-server can't be started while many tries. Beam will be killed."
         kill_rmq_and_remove_pid
    fi
    ocf_log info "${LH} end."
    unblock_client_access
    ocf_log info "${LH} unblocked access to RMQ port"
    return $rc
}
 | |
| 
 | |
# Check the status of the rabbit beam process or of one application in it.
# $1 (optional) - application to look for in the which_applications() output;
#                 defaults to 'kernel'
# Returns:
#   $OCF_SUCCESS     - the query succeeded and the application is listed
#   $OCF_NOT_RUNNING - the application is not listed, or the query failed and
#                      no beam process matches the node name
#   $OCF_ERR_GENERIC - the query failed although a matching beam process runs
get_status() {
    local what="${1:-kernel}"
    local rc=$OCF_NOT_RUNNING
    local LH="${LL} get_status():"
    local body
    local beam_running

    body=$( ${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} eval 'rabbit_misc:which_applications().' 2>&1 )
    rc=$?

    pgrep -f "beam.*${RABBITMQ_NODENAME}" > /dev/null
    beam_running=$?

    if [ $rc -ne 0 ] ; then
        if [ $beam_running -ne 0 ] ; then
            # the query failed AND there is no beam: plainly not running
            ocf_log info "${LH} failed with code ${rc}. Command output: ${body}"
            return $OCF_NOT_RUNNING
        fi
        # the query failed while a beam is alive: report a generic error
        ocf_log info "${LH} found the beam process running but failed with code ${rc}. Command output: ${body}"
        return $OCF_ERR_GENERIC
    fi

    # the query went through: look for the application tuple in its output
    if echo "$body" | grep "\{${what}," > /dev/null 2>&1 ; then
        return $OCF_SUCCESS
    fi

    ocf_log info "${LH} app ${what} was not found in command output: ${body}"
    return $OCF_NOT_RUNNING
}
 | |
| 
 | |
# "status" action: hand back whatever get_status() reports for its default
# application check.
action_status() {
    get_status
    return $?
}
 | |
| 
 | |
# Tell whether the given node carries the 'rabbit-master' attribute in CIB.
#
# Arguments:
#   $1 - pacemaker node name to query
# Returns:
#   0 when the attribute value is exactly "true", 1 in every other case
#   (attribute missing, "(null)", or the query failed).
is_master() {
    local flag

    # The third whitespace-separated field of the query output is expected
    # to look like value=<something>; keep only what follows the '='.
    flag=$(crm_attribute -N "${1}" -l reboot --name 'rabbit-master' --query 2>/dev/null |
           awk '{print $3}' | awk -F "=" '{print $2}' | sed -e '/(null)/d')

    [ "${flag}" = 'true' ]
}
 | |
| 
 | |
# Decide, from the exit code of a su_rabbit_cmd call, whether the call timed
# out and whether that timeout must already be treated as a failure.
#
# Arguments:
#   $1 - exit code of the command that was run
#   $2 - name of the private attribute counting consecutive timeouts of this
#        particular operation (every operation is tracked separately)
#   $3 - operation name, used in log messages only
# Returns:
#   0 - the command did not time out; the counter is reset to 0
#   1 - the command timed out, but fewer than
#       $OCF_RESKEY_max_rabbitmqctl_timeouts times in a row: ignore it
#   2 - the command timed out and the limit of consecutive timeouts has been
#       reached: the caller should fail
#
# ${LH} is not set here; the log prefix of the calling function is used.
check_timeouts() {
    local op_rc=$1
    local timeouts_attr_name=$2
    local op_name=$3
    local count

    # 75 is EX_TEMPFAIL from sysexits, and is used by rabbitmqctl to signal
    # about timeout. 124 and 137 are presumably the codes produced by the
    # timeout wrapper (timed out / killed) - confirm against su_rabbit_cmd.
    if [ $op_rc -ne 124 ] && [ $op_rc -ne 137 ] && [ $op_rc -ne 75 ]; then
        ocf_update_private_attr $timeouts_attr_name 0
        return 0
    fi

    count=$(ocf_get_private_attr $timeouts_attr_name 0)
    count=$((count+1))

    # The read-increment-write sequence is not atomic: two simultaneous runs
    # may store a value that is one less than it should be. Precise counting
    # is not required here.
    ocf_update_private_attr $timeouts_attr_name $count

    if ! [ $count -lt $OCF_RESKEY_max_rabbitmqctl_timeouts ]; then
        ocf_log err "${LH} 'rabbitmqctl $op_name' timed out $count of max. $OCF_RESKEY_max_rabbitmqctl_timeouts time(s) in a row and is not responding. The resource is failed."
        return 2
    fi

    ocf_log warn "${LH} 'rabbitmqctl $op_name' timed out $count of max. $OCF_RESKEY_max_rabbitmqctl_timeouts time(s) in a row. Doing nothing for now."
    return 1
}
 | |
| 
 | |
# Block until no queue of the default vhost is reported in a "syncing" state,
# or until the given time budget is used up.
#
# Arguments:
#   $1 - value handed to su_rabbit_cmd via its -t option
# Returns:
#   the exit status of su_rabbit_cmd
wait_sync() {
    local wait_time=$1
    local queues
    local opt_arg=""

    # Restrict the listing to local queues when that feature is enabled
    [ "$OCF_RESKEY_rmq_feature_local_list_queues" = "true" ] && opt_arg="--local"

    queues="${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} -p ${OCF_RESKEY_default_vhost} list_queues $opt_arg name state"

    # Poll every 2 seconds for as long as the listing still mentions a
    # syncing queue. The status of this call is the function's result.
    su_rabbit_cmd -t "${wait_time}" "sh -c \"while ${queues} | grep -q 'syncing,'; \
          do sleep 2; done\""
}
 | |
| 
 | |
# Core of the "monitor" action.
#
# Steps, in order:
#   1. Probe the beam VM with get_status; return at once when it is down.
#   2. Work out whether this node is the running master (master attribute in
#      CIB plus a running rabbit app).
#   3. On a non-master, honour a recent restart order from the master unless
#      this node has only just been started.
#   4. Recompute and, if changed, publish this node's master score.
#   5. Run node_health_check when the rabbit app is running.
#   6. On a healthy master, order restart of members that are not clustered
#      with us or that report partitions.
#
# Returns:
#   OCF_NOT_RUNNING    - the beam VM is not running
#   OCF_ERR_GENERIC    - get_status failed, a restart order was honoured, or
#                        the health check failed
#   OCF_SUCCESS        - running, not a master
#   OCF_RUNNING_MASTER - running and master
# Exits the whole script with OCF_FAILED_MASTER when this node is flagged as
# master but the rabbit app is not running.
get_monitor() {
    local rc=$OCF_ERR_GENERIC
    local LH="${LL} get_monitor():"
    local status_master=1
    local rabbit_running
    # 'name' and 'node' are also assigned by the legacy health check, which
    # is reached from here; declaring them in this scope keeps them from
    # ending up as globals.
    local name
    local node
    local node_start_time
    local nowtime
    local partitions_report
    local node_partitions
    # These two used to be assigned without a declaration and leaked into
    # the global scope.
    local start_time
    local restart_order_time
    local our_start_time
    local new_score
    local node_score

    ocf_log info "${LH} CHECK LEVEL IS: ${OCF_CHECK_LEVEL}"
    get_status
    rc=$?
    if [ $rc -eq $OCF_NOT_RUNNING ] ; then
        ocf_log info "${LH} get_status() returns ${rc}."
        ocf_log info "${LH} ensuring this slave does not get promoted."
        master_score 0
        return $OCF_NOT_RUNNING
    elif [ $rc -eq $OCF_SUCCESS ] ; then
        ocf_log info "${LH} get_status() returns ${rc}."
        ocf_log info "${LH} also checking if we are master."
        get_status rabbit
        rabbit_running=$?
        is_master $THIS_PCMK_NODE
        status_master=$?
        ocf_log info "${LH} master attribute is ${status_master}"
        if [ $status_master -eq 0 -a $rabbit_running -eq $OCF_SUCCESS ]
        then
            ocf_log info "${LH} We are the running master"
            rc=$OCF_RUNNING_MASTER
        elif [ $status_master -eq 0 -a $rabbit_running -ne $OCF_SUCCESS ] ; then
            ocf_log err "${LH} We are the master and RMQ-runtime (beam) is not running. this is a failure"
            exit $OCF_FAILED_MASTER
        fi
    fi

    # The app state is sampled again for every path that gets here, including
    # the one where get_status returned a generic error and the branch above
    # was skipped.
    get_status rabbit
    rabbit_running=$?
    ocf_log info "${LH} checking if rabbit app is running"

    if [ $rc -eq $OCF_RUNNING_MASTER ]; then
        if [ $rabbit_running -eq $OCF_SUCCESS ]; then
            ocf_log info "${LH} rabbit app is running and is master of cluster"
        else
            ocf_log err "${LH} we are the master and rabbit app is not running. This is a failure"
            exit $OCF_FAILED_MASTER
        fi
    else
        start_time=$((180 + $(ocf_get_private_attr 'rabbit-start-phase-1-time' 0)))
        restart_order_time=$((60 + $(ocf_get_private_attr 'rabbit-ordered-to-restart' 0)))
        nowtime=$(now)

        # If we started more than 3 minutes ago, and
        # we got order to restart less than 1 minute ago
        if [ $nowtime -lt $restart_order_time ]; then
            if [ $nowtime -gt $start_time ]; then
                ocf_log err "${LH} failing because we have received an order to restart from the master"
                stop_server_process
                rc=$OCF_ERR_GENERIC
            else
                ocf_log warn "${LH} received an order to restart from the master, ignoring it because we have just started"
            fi
        fi
    fi

    if [ $rc -eq $OCF_ERR_GENERIC ]; then
        ocf_log err "${LH} get_status() returns generic error ${rc}"
        ocf_log info "${LH} ensuring this slave does not get promoted."
        master_score 0
        return $OCF_ERR_GENERIC
    fi

    # Recounting our master score
    ocf_log info "${LH} preparing to update master score for node"

    our_start_time=$(get_node_start_time $THIS_PCMK_NODE)

    if [ $our_start_time -eq 0 ]; then
        new_score=$MIN_MASTER_SCORE
    else
        # Start from the best score and stay 10 points below every scored
        # node that has been up for longer than us.
        new_score=$BEST_MASTER_SCORE
        for node in $(get_alive_pacemaker_nodes_but $THIS_PCMK_NODE)
        do
            node_start_time=$(get_node_start_time $node)
            node_score=$(get_node_master_score $node)

            ocf_log info "${LH} comparing us (start time: $our_start_time, score: $new_score) with $node (start time: $node_start_time, score: $node_score)"
            if [ $node_start_time -ne 0 -a $node_score -ne 0 -a $node_start_time -lt $our_start_time ]; then
                new_score=$((node_score - 10 < new_score ? node_score - 10 : new_score ))
            elif [ $node_start_time -ne 0 -a $node_score -ne 0 -a $node_start_time -eq $our_start_time ]; then
                # Do not get promoted if the other node is already master and we have the same start time
                if is_master $node; then
                    new_score=$((node_score - 10 < new_score ? node_score - 10 : new_score ))
                fi
            fi
        done
    fi

    # Only touch the score when it actually changed
    if [ "$new_score" -ne "$(get_node_master_score $THIS_PCMK_NODE)" ]; then
        master_score $new_score
    fi
    ocf_log info "${LH} our start time is $our_start_time and score is $new_score"

    # Skip all other checks if rabbit app is not running
    if [ $rabbit_running -ne $OCF_SUCCESS ]; then
        ocf_log info "${LH} RabbitMQ is not running, get_monitor function ready to return ${rc}"
        return $rc
    fi

    # rc can be SUCCESS or RUNNING_MASTER, don't touch it unless there
    # is some error uncovered by node_health_check
    if ! node_health_check; then
        rc=$OCF_ERR_GENERIC
    fi

    if [ $rc -eq $OCF_RUNNING_MASTER ] ; then
        # If we are the master and healthy, perform various
        # connectivity checks for other nodes in the cluster.
        # Order a member to restart if something fishy happens with it.
        # All cross-node checks MUST happen only here.

        partitions_report="$(partitions_report)"

        for node in $(get_alive_pacemaker_nodes_but $THIS_PCMK_NODE); do
            # Restart node if we don't consider ourselves clustered with it
            if ! is_clustered_with $node; then
                ocf_log warn "${LH} node $node is not connected with us"
                order_node_restart "$node"
                continue
            fi

            # Restart node if it has any unresolved partitions
            node_partitions=$(grep_partitions_report $node "$partitions_report")
            if [ ! -z "$node_partitions" ]; then
                ocf_log warn "${LH} Node $node thinks that it is partitoned with $node_partitions"
                order_node_restart "$node"
                continue
            fi
        done
    fi

    ocf_log info "${LH} get_monitor function ready to return ${rc}"
    return $rc
}
 | |
| 
 | |
# Order a cluster member to restart by stamping the current time into the
# 'rabbit-ordered-to-restart' private attribute of that node.
#
# Arguments:
#   $1 - node to be restarted (mandatory)
#
# ${LH} is not set here; the log prefix of the calling function is used.
order_node_restart() {
    local node=${1:?}
    local stamp

    ocf_log warn "${LH} Ordering node '$node' to restart"
    stamp="$(now)"
    ocf_update_private_attr 'rabbit-ordered-to-restart' "$stamp" "$node"
}
 | |
| 
 | |
# Extract, from a report produced by partitions_report(), the list of nodes
# that the given node considers itself partitioned with.
#
# Arguments:
#   $1 - pacemaker node name (mandatory); translated with rabbit_node_name
#   $2 - report text; may be empty, which is what partitions_report yields
#        when no node sees a partition
# Output:
#   the text following "PARTITIONED <rabbit node>: " for every matching
#   report line; nothing when the node is not mentioned
grep_partitions_report() {
    local node="${1:?}"
    local report="${2-}"
    local rabbit_node

    # An empty report is the healthy case, not a usage error: print nothing
    # instead of aborting with a "parameter not set" message.
    [ -n "$report" ] || return 0

    rabbit_node=$(rabbit_node_name "$node")

    # The node name is matched as a fixed string, so dots in host names
    # cannot match arbitrary characters.
    printf '%s\n' "$report" | grep -F "PARTITIONED $rabbit_node:" | sed -e 's/^[^:][^:]*: //'
}
 | |
| 
 | |
# Print the partitions seen by every running node of the cluster, in a form
# that is easy to grep (unlike `rabbitmqctl cluster_status`).
#
# Every node that sees a partition contributes one line:
#     PARTITIONED node-name: list-of-nodes, which-node-name-considers, itself-partitioned-with
# Nodes that see no partition contribute nothing.
#
# The Erlang snippet below is fed to `rabbitmqctl eval` through xargs -0, so
# that the whole text is passed as one single argument. It contains nothing
# for the shell to expand, hence the quoted here-document delimiter.
partitions_report() {
    ${COMMAND_TIMEOUT} xargs -0 ${OCF_RESKEY_ctl} eval <<'EOF'
RpcTimeout = 10,

Nodes = rabbit_mnesia:cluster_nodes(running),

{Replies, _BadNodes} = gen_server:multi_call(Nodes, rabbit_node_monitor, partitions, RpcTimeout * 1000),

lists:foreach(fun ({_, []}) -> ok;
                  ({Node, Partitions}) ->
                      PartitionsStr = string:join([atom_to_list(Part) || Part <- Partitions],
                                                  ", "),
                      io:format("PARTITIONED ~s: ~s~n",
                                [Node, PartitionsStr])
              end, Replies),

ok.
EOF
}
 | |
| 
 | |
# Check if the rabbitmqctl control plane is alive, using either the
# node_health_check based probe or the legacy set of probes, depending on
# the rmq_feature_health_check resource parameter.
#
# Returns the status of whichever probe was run.
node_health_check() {
    # This declaration has to stay: the legacy probe assigns 'rc' without
    # declaring it, and this local is what receives those writes instead of
    # the caller's variable of the same name.
    local rc

    case "$OCF_RESKEY_rmq_feature_health_check" in
        true)
            node_health_check_local
            rc=$?
            ;;
        *)
            node_health_check_legacy
            rc=$?
            ;;
    esac

    return $rc
}
 | |
| 
 | |
# Health probe built on `rabbitmqctl node_health_check`.
#
# Returns:
#   OCF_SUCCESS     - the check passed, or it timed out but the limit of
#                     consecutive timeouts has not been reached yet
#   OCF_ERR_GENERIC - the check failed, or it timed out once too often (the
#                     master score is zeroed in that case)
node_health_check_local() {
    local LH="${LH} node_health_check_local():"
    local rc
    local rc_timeouts

    # Give node_health_check some time to handle timeout by itself.
    # By using internal rabbitmqctl timeouts, we allow it to print
    # more useful diagnostics
    local timeout=$((TIMEOUT_ARG - 2))

    su_rabbit_cmd "${OCF_RESKEY_ctl} node_health_check -t $timeout"
    rc=$?

    check_timeouts $rc "rabbit_node_health_check_timeouts" "node_health_check"
    rc_timeouts=$?

    case "$rc_timeouts" in
        2)
            master_score 0
            ocf_log info "${LH} node_health_check timed out, retry limit reached"
            return $OCF_ERR_GENERIC
            ;;
        1)
            ocf_log info "${LH} node_health_check timed out, going to retry"
            return $OCF_SUCCESS
            ;;
    esac

    # No timeout was involved: the plain exit code decides
    if [ "$rc" -eq 0 ]; then
        return $OCF_SUCCESS
    fi

    ocf_log err "${LH} rabbitmqctl node_health_check exited with errors."
    return $OCF_ERR_GENERIC
}
 | |
| 
 | |
# Legacy health probe, made of several rabbitmqctl calls run one after
# another: list_channels, rabbit_alarm:get_alarms(), cluster_status and
# list_queues. Timeouts of every call are counted separately through
# check_timeouts.
#
# Returns:
#   OCF_SUCCESS     - no probe reported an error
#   OCF_ERR_GENERIC - a probe failed, a memory alarm is raised for this
#                     node, or a probe hit its limit of consecutive timeouts
#                     (the master score is zeroed in that last case)
#
# ${LH} is not set here; the log prefix of the calling function is used.
node_health_check_legacy() {
    # The result used to live in an undeclared 'rc', i.e. in whatever 'rc'
    # the callers happened to have in scope: the function could hand back a
    # stale value of theirs and overwrote their variable. It now owns its
    # result, and its loop variables as well.
    local rc=$OCF_SUCCESS
    local node
    local name
    local rc_alive
    local timeout_alive

    su_rabbit_cmd "${OCF_RESKEY_ctl} list_channels > /dev/null 2>&1"
    rc_alive=$?
    # On a timeout, collect a per-node explanation for the log
    [ $rc_alive -eq 137 -o $rc_alive -eq 124 ] && ocf_log err "${LH} 'rabbitmqctl list_channels' timed out, per-node explanation: $(enhanced_list_channels)"
    check_timeouts $rc_alive "rabbit_list_channels_timeouts" "list_channels"
    timeout_alive=$?

    if [ $timeout_alive -eq 2 ]; then
        master_score 0
        return $OCF_ERR_GENERIC
    elif [ $timeout_alive -eq 0 ]; then
        if [ $rc_alive -ne 0 ]; then
            ocf_log err "${LH} rabbitmqctl list_channels exited with errors."
            rc=$OCF_ERR_GENERIC
        fi
    fi

    # Check for memory alarms for this Master or Slave node.
    # If alert found, reset the alarm
    # and restart the resource as it likely means a dead end situation
    # when rabbitmq cluster is running with blocked publishing due
    # to high memory watermark exceeded.
    local alarms
    local rc_alarms
    local timeout_alarms
    alarms=$(su_rabbit_cmd "${OCF_RESKEY_ctl} -q eval 'rabbit_alarm:get_alarms().'")
    rc_alarms=$?
    check_timeouts $rc_alarms "rabbit_get_alarms_timeouts" "get_alarms"
    timeout_alarms=$?

    if [ $timeout_alarms -eq 2 ]; then
        master_score 0
        return $OCF_ERR_GENERIC

    elif [ $timeout_alarms -eq 0 ]; then
        if [ $rc_alarms -ne 0 ]; then
            ocf_log err "${LH} rabbitmqctl get_alarms exited with errors."
            rc=$OCF_ERR_GENERIC

        elif [ -n "${alarms}" ]; then
            # Each whitespace-separated chunk of the alarm list is scanned
            # for a memory alarm that names this very node.
            for node in ${alarms}; do
                name=`echo ${node} | perl -n -e "m/memory,'(?<n>\S+)+'/ && print \"$+{n}\n\""`
                if [ "${name}" = "${RABBITMQ_NODENAME}" ] ; then
                    ocf_log err "${LH} Found raised memory alarm. Erasing the alarm and restarting."
                    su_rabbit_cmd "${OCF_RESKEY_ctl} set_vm_memory_high_watermark 10 > /dev/null 2>&1"
                    rc=$OCF_ERR_GENERIC
                    break
                fi
            done
        fi
    fi

    if ! is_cluster_status_ok ; then
        rc=$OCF_ERR_GENERIC
    fi

    # Check if the list of all queues is available,
    # Also report some queues stats and total virtual memory.
    local queues
    local rc_queues
    local timeout_queues
    queues=$(su_rabbit_cmd "${OCF_RESKEY_ctl} -q -p ${OCF_RESKEY_default_vhost} list_queues memory messages consumer_utilisation")
    rc_queues=$?
    check_timeouts $rc_queues "rabbit_list_queues_timeouts" "list_queues"
    timeout_queues=$?

    if [ $timeout_queues -eq 2 ]; then
        master_score 0
        return $OCF_ERR_GENERIC

    elif [ $timeout_queues -eq 0 ]; then
        if [ $rc_queues -ne 0 ]; then
            ocf_log err "${LH} rabbitmqctl list_queues exited with errors."
            rc=$OCF_ERR_GENERIC

        elif [ -n "${queues}" ]; then
            # Columns of the listing: memory, messages, consumer_utilisation
            local q_c
            q_c=$(printf %b "${queues}\n" | wc -l)
            local mem
            mem=$(printf %b "${queues}\n" | awk -v sum=0 '{sum+=$1} END {print (sum/1048576)}')
            local mes
            mes=$(printf %b "${queues}\n" | awk -v sum=0 '{sum+=$2} END {print sum}')
            local c_u
            c_u=$(printf %b "${queues}\n" | awk -v sum=0 -v cnt=${q_c} '{sum+=$3} END {print (sum+1)/(cnt+1)}')
            local status
            # The unquoted inner substitution folds the status report into a
            # single line.
            status=$(echo $(su_rabbit_cmd "${OCF_RESKEY_ctl} -q status"))
            ocf_log info "${LH} RabbitMQ is running ${q_c} queues consuming ${mem}m of ${TOTALVMEM}m total, with ${mes} queued messages, average consumer utilization ${c_u}"
            ocf_log info "${LH} RabbitMQ status: ${status}"
        fi
    fi

    return $rc
}
 | |
| 
 | |
# Print the value of a private attribute kept by attrd.
#
# Arguments:
#   $1 - attribute name (mandatory)
#   $2 - value to print when the query fails or yields an empty value
#        (mandatory)
#   $3 - node to query; defaults to $THIS_PCMK_NODE
ocf_get_private_attr() {
    local attr_name="${1:?}"
    local attr_default_value="${2:?}"
    local nodename="${3:-$THIS_PCMK_NODE}"
    local reply

    if reply=$(attrd_updater -p --name "$attr_name" --node "$nodename" --query); then
        # The third field of the reply is taken to be value="..."; the quotes
        # are dropped and whatever follows the '=' is printed, unless it is
        # empty.
        echo "$reply" | awk -vdef_val="$attr_default_value" '{ gsub(/"/, "", $3); split($3, vals, "="); if (vals[2] != "") print vals[2]; else print def_val }'
    else
        echo $attr_default_value
    fi
}
 | |
| 
 | |
# Store a value in a private attribute kept by attrd.
#
# Arguments:
#   $1 - attribute name (mandatory)
#   $2 - new value (mandatory, must not be empty)
#   $3 - node whose attribute is updated; defaults to $THIS_PCMK_NODE
# Returns:
#   the status of the attrd_updater run performed through ocf_run
ocf_update_private_attr() {
    local attr_name="${1:?}"
    local attr_value="${2:?}"
    local nodename="${3:-$THIS_PCMK_NODE}"

    ocf_run attrd_updater -p \
        --name "$attr_name" \
        --node "$nodename" \
        --update "$attr_value"
}
 | |
| 
 | |
# Run a rabbitmqctl command through su_rabbit_cmd and fold the timeout
# accounting of check_timeouts into a single status.
#
# Arguments:
#   $1 - rabbitmqctl sub-command with its arguments (mandatory)
#   $2 - name of the private attribute counting this command's consecutive
#        timeouts (mandatory)
# Returns:
#   the command's own exit code when it did not time out,
#   0 when it timed out but the timeout is still to be ignored,
#   1 when it timed out and the limit of consecutive timeouts is reached.
rabbitmqctl_with_timeout_check() {
    local command="${1:?}"
    local timeout_attr_name="${2:?}"

    su_rabbit_cmd "${OCF_RESKEY_ctl} $command"
    local rc=$?

    check_timeouts $rc $timeout_attr_name "$command"
    local has_timed_out=$?

    if [ "$has_timed_out" = 0 ]; then
        return $rc
    elif [ "$has_timed_out" = 2 ]; then
        return 1
    fi

    # An ignorable timeout is reported as success
    return 0
}
 | |
| 
 | |
# Succeed when `rabbitmqctl cluster_status` can be run, with timeouts
# accounted for by rabbitmqctl_with_timeout_check. All output of the command
# is discarded; only the status matters.
is_cluster_status_ok() {
    local LH="${LH}: is_cluster_status_ok:"

    rabbitmqctl_with_timeout_check cluster_status rabbit_cluster_status_timeouts > /dev/null 2>&1
    return $?
}
 | |
| 
 | |
# "monitor" action: thin wrapper around get_monitor that adds debug logging.
#
# With the debug resource parameter set to 'true', a timestamp and the whole
# environment are appended to /tmp/rmq-monitor.log, and a one-line summary of
# the notify variables to /tmp/rmq-ocf.log.
#
# Returns the status of get_monitor.
action_monitor() {
    local rc=$OCF_ERR_GENERIC
    local LH="${LL} monitor:"

    ocf_log debug "${LH} action start."

    if [ "${OCF_RESKEY_debug}" = 'true' ] ; then
        # NOTE: 'd' is not declared local, as before
        d=$(date '+%Y%m%d %H:%M:%S')
        {
            echo $d
            env
        } >> /tmp/rmq-monitor.log
        echo "$d  [monitor] start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log
    fi

    get_monitor
    rc=$?

    ocf_log debug "${LH} role: ${OCF_RESKEY_CRM_meta_role}"
    ocf_log debug "${LH} result: $rc"
    ocf_log debug "${LH} action end."
    return $rc
}
 | |
| 
 | |
| 
 | |
# "start" action.
#
# Does nothing but report success when get_status already succeeds.
# Otherwise it resets all timeout counters, records the phase 1 start time,
# removes the 'rabbit-start-time' and 'rabbit-master' attributes of this node
# and finally calls start_rmq_server_app.
#
# Returns OCF_SUCCESS when already running, else the status of
# start_rmq_server_app.
action_start() {
    local rc=$OCF_ERR_GENERIC
    local LH="${LL} start:"
    local nowtime
    local attr_name_to_reset

    if [ "${OCF_RESKEY_debug}" = 'true' ] ; then
        # NOTE: 'd' is not declared local, as before
        d=$(date '+%Y%m%d %H:%M:%S')
        {
            echo $d
            env
        } >> /tmp/rmq-start.log
        echo "$d  [start]  start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log
    fi

    ocf_log info "${LH} action begin."

    get_status
    rc=$?
    if [ $rc -eq $OCF_SUCCESS ] ; then
        ocf_log warn "${LH} RMQ-runtime (beam) already started."
        return $OCF_SUCCESS
    fi

    # A fresh start begins with clean consecutive-timeout counters
    for attr_name_to_reset in \
        rabbit_list_channels_timeouts \
        rabbit_get_alarms_timeouts \
        rabbit_list_queues_timeouts \
        rabbit_cluster_status_timeouts \
        rabbit_node_health_check_timeouts
    do
        ocf_update_private_attr $attr_name_to_reset 0
    done

    nowtime=$(now)
    ocf_log info "${LH} Setting phase 1 one start time to $nowtime"
    ocf_update_private_attr 'rabbit-start-phase-1-time' "$nowtime"
    ocf_log info "${LH} Deleting start time attribute"
    ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
    ocf_log info "${LH} Deleting master attribute"
    ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete

    ocf_log info "${LH} RMQ going to start."
    start_rmq_server_app
    rc=$?
    [ $rc -eq $OCF_SUCCESS ] && ocf_log info "${LH} RMQ prepared for start succesfully."

    ocf_log info "${LH} action end."
    return $rc
}
 | |
| 
 | |
| 
 | |
# "stop" action.
#
# Removes the master and start time attributes of this node, zeroes its
# master score, gives queues up to half of the stop_time parameter to finish
# syncing and then stops the server process.
#
# Returns OCF_SUCCESS once the process is stopped. When it cannot be stopped
# the whole script exits with OCF_ERR_GENERIC.
action_stop() {
    local rc=$OCF_ERR_GENERIC
    local LH="${LL} stop:"
    local stop_rc

    if [ "${OCF_RESKEY_debug}" = 'true' ] ; then
        # NOTE: 'd' is not declared local, as before
        d=$(date '+%Y%m%d %H:%M:%S')
        {
            echo $d
            env
        } >> /tmp/rmq-stop.log
        echo "$d  [stop]  start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log
    fi

    ocf_log info "${LH} action begin."

    ocf_log info "${LH} Deleting master attribute"
    ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
    master_score 0
    ocf_log info "${LH} Deleting start time attribute"
    ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete

    # Wait for synced state first; the outcome of the wait is not checked
    ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync"
    wait_sync $((OCF_RESKEY_stop_time/2))

    ocf_log info "${LH} RMQ-runtime (beam) going to down."
    stop_server_process
    stop_rc=$?

    if [ $stop_rc -eq $OCF_SUCCESS ] ; then
        ocf_log info "${LH} RMQ-runtime (beam) not running."
        ocf_log info "${LH} action end."
        return $OCF_SUCCESS
    fi

    ocf_log err "RMQ-runtime (beam) couldn't be stopped and will likely became unmanaged. Take care of it manually!"
    ocf_log info "${LH} action end."
    exit $OCF_ERR_GENERIC
}
 | |
| 
 | |
#######################################################################
# Enhanced list_channels:
# - nodes are processed in parallel
# - the report tells which nodes timed out
#
# 'list_channels' is used as a health-check for the current node, but it
# actually checks the overall health of all nodes in the cluster. There were
# bugs where a single (non-local) channel became stuck and the OCF script
# wrongfully killed the local node.
#
# Hopefully all such bugs are fixed; if not, this report allows such
# conditions to be detected.
#
# The somewhat strange implementation has the following reasons:
# - a single version of the script has to support older versions of
#   RabbitMQ which have reached end-of-life
# - zero dependencies - for older versions this functionality could be
#   implemented as a plugin, but that would require installing the plugin
#
# Output: whatever `rabbitmqctl eval` prints for the final expression, a list
# with one {Node, Status, OkChannelsCount} tuple per running cluster node.
enhanced_list_channels() {
    # One second less than timeout of su_rabbit_cmd
    local timeout
    timeout=$(( ${TIMEOUT_ARG:-5} - 1 ))

    # The here-document is left unquoted on purpose: $timeout is expanded by
    # the shell before the Erlang text reaches `rabbitmqctl eval`.
    su_rabbit_cmd "xargs -0 ${OCF_RESKEY_ctl} eval" <<EOF
SecondsToCompletion = $timeout,

%% Milliseconds since unix epoch
Now = fun() ->
              {Mega, Secs, Micro} = os:timestamp(),
              Mili = Micro div 1000,
              Mili + 1000 * (Secs + 1000000 * Mega)
      end,

%% We shouldn't continue execution past this time
ShouldEndAt = Now() + SecondsToCompletion * 1000,

%% How many milliseconds we still have
Timeout = fun() ->
                  case ShouldEndAt - Now() of
                      Past when Past =< 0 ->
                          0;
                      Timeout ->
                          Timeout
                  end
          end,

%% Lambda combinator - for defining anonymous recursive functions
Y = fun(F) ->
            (fun (X) -> F(fun(Y) -> (X(X))(Y) end) end)(
              fun (X) -> F(fun(Y) -> (X(X))(Y) end) end)
    end,

Parent = self(),

ListChannels = Y(fun(Rec) ->
                         fun (({Node, [], OkChannelsCount})) ->
                                 Parent ! {Node, ok, OkChannelsCount};
                             ({Node, [Chan|Rest], OkChannelsCount}) ->
                                 case catch rpc:call(Node, rabbit_channel, info, [Chan], Timeout()) of
                                     Infos when is_list(Infos) ->
                                         Rec({Node, Rest, OkChannelsCount + 1});
                                     {badrpc, {'EXIT', {noproc, _}}} ->
                                         %% Channel became dead before we could request it's status, don't care
                                         Rec({Node, Rest, OkChannelsCount});
                                     Err ->
                                         Parent ! {Node, Err, OkChannelsCount}
                                 end
                         end
                 end),

SingleNodeListing = fun(Node) ->
                            case catch rpc:call(Node, pg_local, get_members, [rabbit_channels], Timeout()) of
                                LocalChannels when is_list(LocalChannels) ->
                                    ListChannels({Node, LocalChannels, 0});
                                Err ->
                                    Parent ! {Node, Err, 0}
                            end
                    end,

AllNodes = rabbit_mnesia:cluster_nodes(running),
[ spawn(fun() -> SingleNodeListing(Node) end) || Node <- AllNodes ],

WaitForNodes = Y(fun(Rec) ->
                  fun ({[], Acc}) ->
                          Acc;
                      ({RemainingNodes, Acc}) ->
                          receive
                              {Node, _Status, _ChannelCount} = Smth ->
                                  RemainingNodes1 = lists:delete(Node, RemainingNodes),
                                  Rec({RemainingNodes1, [Smth|Acc]})
                              after Timeout() + 100 ->
                                      Acc
                              end
                  end
          end),

Result = WaitForNodes({AllNodes, []}),

ExpandedResult = [ case lists:keysearch(Node, 1, Result) of
                       {value, NodeResult} ->
                           NodeResult;
                       false ->
                           {Node, no_data_collected, 0}
                   end || Node <- AllNodes ],

ExpandedResult.
EOF
}
 | |
| 
 | |
#######################################################################
# Join the cluster through the given node.
#
# Arguments:
#   $1 - node to join to
# Returns:
#   the status of my_host, untouched, when no join is attempted: that is 0
#   when my_host recognises the node as this host, or my_host's non-zero
#   status when the destination is empty;
#   OCF_SUCCESS when the join succeeded;
#   OCF_ERR_GENERIC when the join failed (mnesia is reset in that case).
jjj_join () {
    local join_to="$1"
    local rc=$OCF_ERR_GENERIC
    local LH="${LL} jjj_join:"

    my_host ${join_to}
    rc=$?
    ocf_log debug "${LH} node='${join_to}' rc='${rc}'"

    # Nothing to do when we would be joining to ourselves
    # or when the master host is not given
    if [ $rc -eq 0 ] || [ -z "${join_to}" ] ; then
        return $rc
    fi

    ocf_log info "${LH} Joining to cluster by node '${join_to}'"
    join_to_cluster "${join_to}"
    rc=$?
    if [ $rc -ne $OCF_SUCCESS ] ; then
        ocf_log err "${LH} Failed to join the cluster. The mnesia will be reset."
        reset_mnesia
        rc=$OCF_ERR_GENERIC
    fi

    return $rc
}
 | |
| 
 | |
# Handle the Pacemaker "notify" action.
#
# Only post-* notifications are processed; pre-* notifications and
# unrecognised operations fall through and report OCF_SUCCESS.
#
#   post-promote: unless the promoted node is this host (or the list is
#                 empty), either start the RabbitMQ app when we are already
#                 clustered with the promoted node, or call jjj_join for it.
#   post-start:   when this host is among the started nodes and a master is
#                 reported, join the master (or just start the app when no
#                 join is needed), then import the definitions dump, if any.
#   post-stop:    on the stopped node reset the master score; on every other
#                 node wait for sync and forget the stopped node.
#
# Reads:    OCF_RESKEY_CRM_meta_notify_* (type, operation, *_uname lists),
#           OCF_RESKEY_debug, OCF_RESKEY_definitions_dump_file,
#           OCF_RESKEY_admin_user, OCF_RESKEY_admin_password,
#           OCF_RESKEY_host_ip, OCF_RESKEY_stop_time.
# Returns:  OCF_SUCCESS, OCF_ERR_GENERIC, or (post-promote only) whatever
#           jjj_join returned.
action_notify() {
    local rc_join=$OCF_SUCCESS
    local rc=$OCF_ERR_GENERIC
    local rc2=$OCF_ERR_GENERIC
    local LH="${LL} notify:"
    local d

    # Debug mode: dump a timestamp, the environment and a one-line summary
    # of the notification into files under /tmp.
    if [ "${OCF_RESKEY_debug}" = 'true' ] ; then
        d=$(date '+%Y%m%d %H:%M:%S')
        echo "$d" >> /tmp/rmq-notify.log
        env >> /tmp/rmq-notify.log
        echo "$d  [notify]  ${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation} promote='${OCF_RESKEY_CRM_meta_notify_promote_uname}' demote='${OCF_RESKEY_CRM_meta_notify_demote_uname}' master='${OCF_RESKEY_CRM_meta_notify_master_uname}' slave='${OCF_RESKEY_CRM_meta_notify_slave_uname}' start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log
    fi

    if [ "${OCF_RESKEY_CRM_meta_notify_type}" = 'post' ] ; then
        # POST- anything notify section
        case "$OCF_RESKEY_CRM_meta_notify_operation" in
            promote)
                ocf_log info "${LH} post-promote begin."

                rc=$OCF_SUCCESS

                # Do nothing, if the list of nodes being promoted reported empty.
                # Delegate recovery, if needed, to the "running out of the cluster" monitor's logic
                if [ -z "${OCF_RESKEY_CRM_meta_notify_promote_uname}" ] ; then
                    ocf_log warn "${LH} there are no nodes to join to reported on post-promote. Nothing to do."

                elif my_host "${OCF_RESKEY_CRM_meta_notify_promote_uname}"; then
                    ocf_log info "${LH} ignoring post-promote of self"

                elif is_clustered_with "${OCF_RESKEY_CRM_meta_notify_promote_uname}"; then
                    if get_status rabbit; then
                        ocf_log info "${LH} we are already clustered with master - ${OCF_RESKEY_CRM_meta_notify_promote_uname}. Nothing to do."
                    else
                        ocf_log info "${LH} we are already clustered with master - ${OCF_RESKEY_CRM_meta_notify_promote_uname}. We only need to start the app."

                        # The outcome is only fed to the start-time
                        # bookkeeping; it does not change the return code.
                        try_to_start_rmq_app
                        rc2=$?
                        update_rabbit_start_time_if_rc $rc2
                    fi

                else
                    # Note, this should fail when the mnesia is inconsistent.
                    # For example, when the "old" master processing the promotion of the new one.
                    # Later this ex-master node will rejoin the cluster at post-start.
                    jjj_join "${OCF_RESKEY_CRM_meta_notify_promote_uname}"
                    rc=$?
                    if [ $rc -eq $OCF_ERR_GENERIC ] ; then
                        ocf_log err "${LH} Failed to join the cluster on post-promote. The resource will be restarted."
                    fi
                fi

                ocf_log info "${LH} post-promote end."
                return $rc
                ;;
            start)
                ocf_log info "${LH} post-start begin."
                # Do nothing, if the list of nodes being started or running reported empty
                # Delegate recovery, if needed, to the "running out of the cluster" monitor's logic
                if [ -z "${OCF_RESKEY_CRM_meta_notify_start_uname}" ] && [ -z "${OCF_RESKEY_CRM_meta_notify_active_uname}" ] ; then
                  ocf_log warn "${LH} I'm a last man standing and I must survive!"
                  ocf_log info "${LH} post-start end."
                  return $OCF_SUCCESS
                fi
                # Check whether this host is among the nodes that were started
                my_host "${OCF_RESKEY_CRM_meta_notify_start_uname}"
                rc=$?
                # Do nothing, if there is no master reported
                # Delegate recovery, if needed, to the "running out of the cluster" monitor's logic
                if [ -z "${OCF_RESKEY_CRM_meta_notify_master_uname}" ] ; then
                  ocf_log warn "${LH} there are no nodes to join to reported on post-start. Nothing to do."
                  ocf_log info "${LH} post-start end."
                  return $OCF_SUCCESS
                fi
                if [ $rc -eq $OCF_SUCCESS ] ; then
                    # Now we need to:
                    # a. join to the cluster if we are not joined yet
                    # b. start the RabbitMQ application, which is always
                    #    stopped after start action finishes
                    check_need_join_to ${OCF_RESKEY_CRM_meta_notify_master_uname}
                    rc_join=$?
                    if [ $rc_join -eq $OCF_SUCCESS ]; then
                      ocf_log warn "${LH} Going to join node ${OCF_RESKEY_CRM_meta_notify_master_uname}"
                      jjj_join "${OCF_RESKEY_CRM_meta_notify_master_uname}"
                      rc2=$?
                    else
                      ocf_log warn "${LH} We are already clustered with node ${OCF_RESKEY_CRM_meta_notify_master_uname}"

                      try_to_start_rmq_app
                      rc2=$?
                      update_rabbit_start_time_if_rc $rc2
                    fi
                    # Import the definitions dump through the management API
                    # when a non-empty dump file is present.
                    if [ -s "${OCF_RESKEY_definitions_dump_file}" ] ; then
                        ocf_log info "File ${OCF_RESKEY_definitions_dump_file} exists"
                        # --fail makes curl exit non-zero on HTTP errors, so
                        # a rejected import is not reported as a success.
                        # Arguments are quoted so that credentials or paths
                        # containing spaces/globs stay single words.
                        # NOTE(review): the admin password is passed on the
                        # command line and is visible in the process list.
                        ocf_run curl --silent --show-error --fail --request POST \
                            --user "${OCF_RESKEY_admin_user}:${OCF_RESKEY_admin_password}" \
                            "${OCF_RESKEY_host_ip}:15672/api/definitions" \
                            --header "Content-Type:application/json" \
                            --data "@${OCF_RESKEY_definitions_dump_file}"
                        rc=$?
                        # The import result is only logged; it does not
                        # affect the notify return code.
                        if [ $rc -eq $OCF_SUCCESS ] ; then
                            ocf_log info "RMQ definitions have imported succesfully."
                        else
                            ocf_log err "RMQ definitions have not imported."
                        fi
                    fi
                    if [ $rc2 -eq $OCF_ERR_GENERIC ] ; then
                        ocf_log warn "${LH} Failed to join the cluster on post-start. The resource will be restarted."
                        ocf_log info "${LH} post-start end."
                        return $OCF_ERR_GENERIC
                    fi
                fi
                ocf_log info "${LH} post-start end."
                ;;
            stop)
                # if rabbitmq-server stops on any another node, we should remove it from cluster (as ordinary operation)
                ocf_log info "${LH} post-stop begin."
                # Report not running, if there are no nodes being stopped reported
                if [ -z "${OCF_RESKEY_CRM_meta_notify_stop_uname}" ] ; then
                  ocf_log warn "${LH} there are no nodes being stopped reported on post-stop. The resource will be restarted."
                  ocf_log info "${LH} post-stop end."
                  return $OCF_ERR_GENERIC
                fi
                my_host "${OCF_RESKEY_CRM_meta_notify_stop_uname}"
                rc=$?
                if [ $rc -ne $OCF_SUCCESS ] ; then
                    # Wait for synced state first
                    ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync"
                    wait_sync $((OCF_RESKEY_stop_time/2))
                    # On other nodes processing the post-stop, make sure the stopped node will be forgotten
                    unjoin_nodes_from_cluster "${OCF_RESKEY_CRM_meta_notify_stop_uname}"
                else
                    # On the nodes being stopped, reset the master score
                    ocf_log info "${LH} resetting the master score."
                    master_score 0
                fi
                # always returns OCF_SUCCESS
                ocf_log info "${LH} post-stop end."
                ;;
            *)  ;;
        esac
    fi

    return $OCF_SUCCESS
}
 | |
| 
 | |
| 
 | |
# Handle the Pacemaker "promote" action.
#
# Dispatches on the result of get_monitor:
#   OCF_SUCCESS         - running as slave: set the 'rabbit-master' node
#                         attribute and, if the rabbit app is not running,
#                         start the runtime and the app, source the policy
#                         file and re-check that we are now master.
#   OCF_RUNNING_MASTER  - already master: report success.
#   OCF_FAILED_MASTER   - exit with OCF_FAILED_MASTER.
#   OCF_NOT_RUNNING     - report OCF_NOT_RUNNING.
#   anything else       - exit with that code.
#
# Reads:   OCF_RESKEY_debug, OCF_RESKEY_policy_file, THIS_PCMK_NODE and,
#          for the debug dump only, OCF_RESKEY_CRM_meta_notify_*_uname.
# Returns: OCF_SUCCESS, OCF_NOT_RUNNING or OCF_ERR_GENERIC; on master
#          failure or an unexpected monitor result the script exits instead
#          of returning.
action_promote() {
    local rc=$OCF_ERR_GENERIC
    local LH="${LL} promote:"
    local d

    # Debug mode: dump a timestamp, the environment and a one-line summary
    # into files under /tmp.
    if [ "${OCF_RESKEY_debug}" = 'true' ] ; then
        d=$(date '+%Y%m%d %H:%M:%S')
        echo "$d" >> /tmp/rmq-promote.log
        env >> /tmp/rmq-promote.log
        echo "$d  [promote]  start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log
    fi

    ocf_log info "${LH} action begin."

    get_monitor
    rc=$?
    ocf_log info "${LH} get_monitor returns ${rc}"
    case "$rc" in
        "$OCF_SUCCESS")
            # Running as slave. Normal, expected behavior.
            ocf_log info "${LH} Resource is currently running as Slave"
            # rabbitmqctl start_app if need
            get_status rabbit
            rc=$?
            ocf_log info "${LH} Updating cluster master attribute"
            ocf_run crm_attribute -N "$THIS_PCMK_NODE" -l reboot --name 'rabbit-master' --update 'true'
            if [ $rc -ne $OCF_SUCCESS ] ; then
                ocf_log info "${LH} RMQ app is not started. Starting..."
                start_rmq_server_app
                rc=$?
                if [ $rc -eq 0 ] ; then
                    try_to_start_rmq_app
                    rc=$?
                    if [ $rc -ne 0 ] ; then
                        ocf_log err "${LH} Can't start RMQ app. Master resource is failed."
                        ocf_log info "${LH} action end."
                        exit $OCF_FAILED_MASTER
                    fi

                    # Source the policy file, when present, in this shell.
                    [ -f "${OCF_RESKEY_policy_file}" ] && . "${OCF_RESKEY_policy_file}"

                    update_rabbit_start_time_if_rc $rc

                    # Promotion only counts as successful if the monitor
                    # now reports this node as the running master.
                    ocf_log info "${LH} Checking master status"
                    get_monitor
                    rc=$?
                    ocf_log info "${LH} Master status is $rc"
                    if [ "$rc" -eq "$OCF_RUNNING_MASTER" ]
                    then
                       rc=$OCF_SUCCESS
                    else
                       ocf_log err "${LH} Master resource is failed."
                       ocf_log info "${LH} action end."
                       exit $OCF_FAILED_MASTER
                    fi
                else
                    ocf_log err "${LH} Can't start RMQ-runtime."
                    rc=$OCF_ERR_GENERIC
                fi
            fi
            # Fall through to the common epilogue so that "action end." is
            # logged on this path as well; the return value is still $rc.
            ;;
        "$OCF_RUNNING_MASTER")
            # Already a master. Unexpected, but not a problem.
            ocf_log warn "${LH} Resource is already running as Master"
            rc=$OCF_SUCCESS
            ;;

        "$OCF_FAILED_MASTER")
            # Master failed.
            ocf_log err "${LH} Master resource is failed and not running"
            ocf_log info "${LH} action end."
            exit $OCF_FAILED_MASTER
            ;;

        "$OCF_NOT_RUNNING")
            # Currently not running.
            ocf_log err "${LH} Resource is currently not running"
            rc=$OCF_NOT_RUNNING
            ;;
        *)
            # Failed resource. Let the cluster manager recover.
            ocf_log err "${LH} Unexpected error, cannot promote"
            ocf_log info "${LH} action end."
            exit $rc
            ;;
    esac

    # transform slave RMQ-server to master

    ocf_log info "${LH} action end."
    return $rc
}
 | |
| 
 | |
| 
 | |
# Handle the Pacemaker "demote" action.
#
# Deletes the reboot-lifetime 'rabbit-master' attribute of this node and
# always reports OCF_SUCCESS; the crm_attribute result is not checked.
action_demote() {
    local log_prefix="${LL} demote:"

    ocf_log info "${log_prefix} action begin."
    ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
    ocf_log info "${log_prefix} action end."

    return $OCF_SUCCESS
}
 | |
| #######################################################################
 | |
| 
 | |
# Run the agent's environment setup before dispatching any action.
rmq_setup_env

# meta-data and usage/help are answered without validating the configuration.
case "$1" in
  meta-data)    meta_data
                exit $OCF_SUCCESS;;
  usage|help)   usage
                exit $OCF_SUCCESS;;
esac

# Anything except meta-data and help must pass validation
action_validate || exit $?

# What kind of method was invoked?
# The status of the selected action becomes the exit status of the script.
case "$1" in
  start)        action_start;;
  stop)         action_stop;;
  status)       action_status;;
  monitor)      action_monitor;;
  validate)     action_validate;;
  promote)      action_promote;;
  demote)       action_demote;;
  notify)       action_notify;;
  validate-all) action_validate;;
  # Unknown action: show usage and report it as unimplemented instead of
  # ending with whatever status usage happens to return.
  *)            usage
                exit $OCF_ERR_UNIMPLEMENTED;;
esac
 | |
| ###
 |