#!/bin/sh
#
#
# Resource Agent for handling split-brain situations in two-node clusters.
#
#  License:      GNU General Public License (GPL)
#  (c) 2022-2026 Vivi, Uniontech
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
#
#
#######################################################################
# Initialization:
# Defaults for the tunables. Each value is applied only when the variable is
# not already set in the environment; a set-but-empty value is left untouched.
OCF_RESKEY_interval_default=${OCF_RESKEY_interval_default-10}
OCF_RESKEY_log_default=${OCF_RESKEY_log_default-/var/log/split_brain_fence.log}
OCF_FUNCTIONS_DIR=${OCF_FUNCTIONS_DIR-${OCF_ROOT}/lib/heartbeat}
OCF_RESKEY_check_sh=${OCF_RESKEY_check_sh-${OCF_FUNCTIONS_DIR}/split_brain_check.sh}
OCF_RESKEY_pid=${OCF_RESKEY_pid-/var/chkspl/chkspl.pid}
OCF_RESKEY_log=${OCF_RESKEY_log-${OCF_RESKEY_log_default}}
OCF_RESKEY_interval=${OCF_RESKEY_interval-${OCF_RESKEY_interval_default}}

# Load the shared OCF shell library. This script uses ocf_log, ocf_exit_reason
# and the OCF_* return codes without defining them, so they are presumably
# provided by this file.
. "${OCF_FUNCTIONS_DIR}/ocf-shellfuncs"
#######################################################################

# Print the agent's OCF meta-data (XML) on stdout.
# Globals: OCF_RESKEY_log_default, OCF_RESKEY_interval_default (read) - they
#          are interpolated into the advertised parameter defaults.
meta_data() {
	cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="chkspl">
<version>1.0</version>

<longdesc lang="zh">
这是一个用于集群节点状态监测的资源代理。可以帮助双节点集群处理脑裂相关问题。
</longdesc>
<longdesc lang="en">
This is a node status monitoring Resource Agent. It can help two-node clusters 
to handle Split-brain issues.

NB: Please pay attention to the timeouts specified in the actions
section below. They should be meaningful for the kind of resource
the agent manages. They should be the minimum advised timeouts,
but they shouldn't/cannot cover _all_ possible resource
instances. So, try to be neither overly generous nor too stingy,
but moderate. The minimum timeouts should never be below 10 seconds.
</longdesc>

<shortdesc lang="zh">双节点集群脑裂问题处理脚本</shortdesc>
<shortdesc lang="en">Two-node cluster Split-brain problem solving instance</shortdesc>

<parameters>
<parameter name="log" unique="1" required="0">
<longdesc lang="zh">
用于记录chkspl的log
</longdesc>
<longdesc lang="en">
The logfile to be used for chkspl.
</longdesc>
<shortdesc lang="zh">chkspl log 文件</shortdesc>
<shortdesc lang="en">chkspl log file</shortdesc>
<content type="string" default="${OCF_RESKEY_log_default}"/>
</parameter>

<parameter name="interval" unique="1" required="0">
<longdesc lang="zh">
用于运行脑裂的检测的时间间隔(单位s)
</longdesc>
<longdesc lang="en">
The time interval (in seconds) used to run the split-brain check.
</longdesc>
<shortdesc lang="zh">chkspl 运行间隔</shortdesc>
<shortdesc lang="en">chkspl interval</shortdesc>
<content type="integer" default="${OCF_RESKEY_interval_default}"/>
</parameter>

</parameters>


<actions>
<action name="start"        timeout="20s" />
<action name="stop"         timeout="20s" />
<action name="monitor"      timeout="20s" interval="10s" depth="0" />
<action name="meta-data"    timeout="5s" />
<action name="validate-all"   timeout="20s" />
</actions>

</resource-agent>
END
}

#######################################################################

# Print a short usage summary on stdout; $0 is the path the agent was run as.
chkspl_usage() {
	printf '%s\n' \
		"usage: $0 {start|stop|monitor|validate-all|meta-data}" \
		"" \
		"Expects to have a fully populated OCF RA-compliant environment set."
}

# Start the background split-brain check script.
# Globals: OCF_RESKEY_check_sh, OCF_RESKEY_interval, OCF_RESKEY_log,
#          OCF_RESKEY_state, OCF_RESKEY_pid (read)
# Effects: writes "started" to the state file, launches the check script in
#          the background and records its PID in the PID file.
# Returns: OCF_SUCCESS when the checker was launched or is already running;
#          the failing code of chkspl_validate_all / chkspl_prepare_dir;
#          OCF_ERR_GENERIC when the PID file cannot be written.
chkspl_start() {
    local pid
    local rc

    chkspl_validate_all || return $?
    ocf_log info "Split-brain monitor starting"
    chkspl_prepare_dir || return $?

    chkspl_status
    rc=$?
    if [ "$rc" = "$OCF_SUCCESS" ]; then
        # start has to be idempotent: an instance that is already running is
        # a success, not a (fatal) configuration error.
        ocf_log info "chkspl already running"
        return $OCF_SUCCESS
    fi

    echo "started" >"$OCF_RESKEY_state"
    sh "$OCF_RESKEY_check_sh" "$OCF_RESKEY_interval" "$OCF_RESKEY_log" "$OCF_RESKEY_state" >/dev/null &
    pid=$!

    # Without a PID file neither monitor nor stop can find the checker again,
    # so do not leave an untracked process behind.
    if ! echo "$pid" >"$OCF_RESKEY_pid"; then
        ocf_exit_reason "Unable to write PID file $OCF_RESKEY_pid"
        kill "$pid" >/dev/null 2>&1
        return $OCF_ERR_GENERIC
    fi
    #ocf_release_lock_on_exit $LOCKFILE
    #ocf_take_lock  $LOCKFILE
    return $OCF_SUCCESS
}

# Stop the background check script.
# Globals: OCF_RESKEY_pid, OCF_RESKEY_state (read; both files are modified)
# Flow:    no PID file            -> nothing to do, drop the state file
#          PID file, process dead -> drop the PID file, mark state "stopped"
#          process alive          -> SIGTERM, wait up to shutdown_timeout
#                                    seconds, then SIGKILL and wait up to
#                                    kill_timeout seconds more
# Returns: OCF_SUCCESS once the process is gone; OCF_ERR_GENERIC if it is
#          still alive after SIGKILL.
chkspl_stop() {
    local pid
    local count
    # 15s + 3s stays below the 20s stop timeout advertised in the meta-data.
    local shutdown_timeout=15
    local kill_timeout=3

    if [ ! -f "$OCF_RESKEY_pid" ]; then
        ocf_log info "chkspl is not running"
        # -f: the state file may legitimately be absent already.
        rm -f "$OCF_RESKEY_state"
        return $OCF_SUCCESS
    fi

    pid=$(cat "$OCF_RESKEY_pid" 2>/dev/null)

    if ! chkspl_check_pid "$pid"; then
        rm -f "$OCF_RESKEY_pid"
        echo "stopped" >"$OCF_RESKEY_state"
        # This is a successful outcome, so it is logged rather than reported
        # through ocf_exit_reason.
        ocf_log info "chkspl is already stopped"
        return $OCF_SUCCESS
    fi

    if ! kill "$pid" >/dev/null 2>&1; then
        ocf_log err "chkspl couldn't be stopped"
    fi

    count=0
    while [ "$count" -lt "$shutdown_timeout" ]; do
        chkspl_check_pid "$pid" || break
        count=$((count + 1))
        sleep 1
        ocf_log debug "chkspl still hasn't stopped yet. Waiting..."
    done

    if chkspl_check_pid "$pid"; then
        ocf_log info "chkspl failed to stop after ${shutdown_timeout}s using SIGTERM. Trying SIGKILL..."
        kill -KILL "$pid" >/dev/null 2>&1
        # Never report a successful stop while the process is still around.
        count=0
        while chkspl_check_pid "$pid"; do
            if [ "$count" -ge "$kill_timeout" ]; then
                ocf_exit_reason "chkspl (pid $pid) is still running after SIGKILL"
                return $OCF_ERR_GENERIC
            fi
            count=$((count + 1))
            sleep 1
        done
    fi

    ocf_log info "chkspl stopped"
    rm -f "$OCF_RESKEY_pid" "$OCF_RESKEY_state"
    return $OCF_SUCCESS
}

# Report the health of the check script. Every decided outcome terminates the
# agent via exit:
#   OCF_ERR_CONFIGURED - the state file said "Failed"
#   OCF_NOT_RUNNING    - chkspl_status found no running checker
#   OCF_SUCCESS        - the checker is running
# Globals: OCF_RESKEY_state (read)
chkspl_monitor() {
    local last_state
    local status_rc

    # Read the state file before probing: chkspl_status rewrites or removes it.
    if [ -f "$OCF_RESKEY_state" ]; then
        last_state=$(cat "$OCF_RESKEY_state")
        [ -n "$last_state" ] || last_state="null"
    fi

    chkspl_status
    status_rc=$?

    if [ "$status_rc" = "$OCF_NOT_RUNNING" ]; then
        ocf_log info "Chkspl is not running"
        if [ "$last_state" = "Failed" ]; then
            ocf_exit_reason "chkspl service failed"
            exit $OCF_ERR_CONFIGURED
        fi
        exit $OCF_NOT_RUNNING
    fi

    # A recorded failure wins even while the process is still alive.
    if [ "$last_state" = "Failed" ]; then
        ocf_exit_reason "chkspl service failed"
        exit $OCF_ERR_CONFIGURED
    fi

    if [ "$status_rc" = "$OCF_SUCCESS" ]; then
        ocf_log info "chkspl already running"
        exit $OCF_SUCCESS
    fi
}

# Determine whether the check script is running and sync the bookkeeping files.
# Arguments: $1 - optional PID to probe; when empty the PID file is consulted
# Globals:   OCF_RESKEY_pid, OCF_RESKEY_state (read; files are modified)
# Effects:   running          -> state file is set to "started"
#            no PID file      -> state file is recreated holding "stopped"
#            PID not running  -> PID file and state file are removed
# Returns:   OCF_SUCCESS when the process exists, OCF_NOT_RUNNING otherwise
chkspl_status() {
    local target_pid=$1

    if [ -z "$target_pid" ]; then
        if [ -e "$OCF_RESKEY_pid" ]; then
            target_pid=$(cat "$OCF_RESKEY_pid")
        else
            if [ -e "$OCF_RESKEY_state" ]; then
                rm -r "$OCF_RESKEY_state"
            fi
            echo "stopped" >"$OCF_RESKEY_state"
            return $OCF_NOT_RUNNING
        fi
    fi

    # Deliberately unquoted: an empty value is passed as "no argument".
    if chkspl_check_pid $target_pid; then
        echo "started" >"$OCF_RESKEY_state"
        return $OCF_SUCCESS
    fi

    # Stale bookkeeping: the recorded process is gone.
    if [ -e "$OCF_RESKEY_pid" ]; then
        rm -f "$OCF_RESKEY_pid"
    fi
    if [ -e "$OCF_RESKEY_state" ]; then
        rm -r "$OCF_RESKEY_state"
    fi
    return $OCF_NOT_RUNNING
}

# Validate the environment the agent is meant to run in.
# Checks, in order:
#   1. `pcs status` reports exactly two configured nodes;
#   2. every corosync link address belongs to a NIC whose ifcfg file under
#      /etc/sysconfig/network-scripts has ONBOOT set to yes;
#   3. the check script (OCF_RESKEY_check_sh) exists.
# Returns: OCF_SUCCESS, OCF_ERR_GENERIC (node count could not be determined),
#          OCF_ERR_CONFIGURED (checks 1 or 2 failed) or OCF_ERR_INSTALLED
#          (check script missing).
chkspl_validate_all() {
    local rc
    local ip
    local ip_all
    local net
    local node_num

    node_num=$(pcs status | grep "nodes configured" | cut -d '*' -f2 | awk '{print $1}')
    # An empty or non-numeric value would make the numeric test below error
    # out and let the validation pass by accident.
    case "$node_num" in
        '' | *[!0-9]*)
            ocf_exit_reason "Unable to determine the number of configured cluster nodes"
            return $OCF_ERR_GENERIC
            ;;
    esac
    if [ "$node_num" -ne 2 ]; then
        ocf_exit_reason "The current cluster is not a two-node cluster"
        return $OCF_ERR_CONFIGURED
    fi

    ip_all=$(corosync-cfgtool -s | grep -A1 "LINK ID" | grep addr | awk -F '=' '{print $2}' | tr -d ' ')
    for ip in $ip_all; do
        # -F/-w: match the address literally and as a whole word, so that
        # 10.0.0.1 neither matches 10.0.0.10 nor treats '.' as a wildcard.
        net=$(ifconfig | grep -Fw -B1 -- "$ip" | grep '^[a-z]' | awk -F: '{print $1}')
        if [ -z "$net" ]; then
            ocf_exit_reason "Unable to find the NIC that carries corosync address $ip"
            return $OCF_ERR_CONFIGURED
        fi
        # Plain POSIX redirection: '&>' is a bashism that backgrounds the
        # pipeline under a strict /bin/sh and always yields rc=0.
        grep -i "onboot" "/etc/sysconfig/network-scripts/ifcfg-$net" 2>/dev/null \
            | grep -i "yes" >/dev/null 2>&1
        rc=$?
        if [ "$rc" -ne 0 ]; then
            ocf_exit_reason "The onboot parameter of the current NIC is abnormal"
            return $OCF_ERR_CONFIGURED
        fi
    done

    if [ ! -f "$OCF_RESKEY_check_sh" ]; then
        ocf_exit_reason "split_brain_check.sh file missing"
        return $OCF_ERR_INSTALLED
    fi
    return $OCF_SUCCESS
}

# Check whether a process with the given PID exists.
# Arguments: $1 - PID to probe
# Returns:   0 when the process exists; non-zero when it does not or when $1
#            is empty or not a plain decimal number.
chkspl_check_pid(){
    local pid=$1

    # Reject anything that is not a number: a corrupted PID file holding e.g.
    # "self" or "." would otherwise match an existing /proc entry.
    case "$pid" in
        '' | *[!0-9]*) return 1 ;;
    esac

    if [ -d /proc/1 ]; then
        # procfs is mounted: a directory per live process.
        [ -d "/proc/$pid" ]
    else
        # No procfs: signal 0 only probes for existence.
        kill -s 0 "$pid" >/dev/null 2>&1
    fi
}

# Make sure the directories holding the PID file and the state file exist.
# Globals: OCF_RESKEY_pid, OCF_RESKEY_state (read)
# Effects: missing directories are created and handed to root:root; a freshly
#          created state directory also gets an empty state file.
# Returns: OCF_SUCCESS, or OCF_ERR_GENERIC when a directory cannot be created.
chkspl_prepare_dir(){
    local pid_dir
    local state_dir

    pid_dir=$(dirname "$OCF_RESKEY_pid")
    if [ ! -d "$pid_dir" ]; then
        ocf_log info "Creating PID dir: $pid_dir"
        mkdir -p "$pid_dir" || return $OCF_ERR_GENERIC
        chown root:root "$pid_dir"
    fi

    state_dir=$(dirname "$OCF_RESKEY_state")
    if [ ! -d "$state_dir" ]; then
        ocf_log info "Creating state dir: $state_dir"
        mkdir -p "$state_dir" || return $OCF_ERR_GENERIC
        # Ownership is set on the state directory itself; the earlier code
        # re-applied it to the PID directory by mistake.
        chown root:root "$state_dir"
        touch "$OCF_RESKEY_state"
    fi

    return $OCF_SUCCESS
}


# Late defaults, applied only when the variable is not already set. Defining
# them after the functions is fine: the functions run only from the dispatcher
# below.
OCF_RESKEY_fake=${OCF_RESKEY_fake-chkspl}
OCF_RESKEY_state=${OCF_RESKEY_state-${HA_RSCTMP}/chkspl.state}

#: ${LOCKFILE=${HA_RSCTMP}/chkspl}
#ocf_release_lock_on_exit $LOCKFILE

# Route the requested OCF action to its handler.
case "$__OCF_ACTION" in
    meta-data)
        meta_data
        exit $OCF_SUCCESS
        ;;
    start)
        chkspl_start
        ;;
    stop)
        chkspl_stop
        ;;
    monitor)
        chkspl_monitor
        ;;
    validate-all)
        chkspl_validate_all
        ;;
    usage|help)
        chkspl_usage
        exit $OCF_SUCCESS
        ;;
    *)
        chkspl_usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac
# Reached only when the selected handler returned instead of exiting itself;
# its return code becomes the agent's exit status.
rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
exit $rc

