#!/bin/bash
#
# NIC self-healing check.
#
# Looks up selected ZTE network cards with lspci, reads the "fatal" (and, when
# set, "synd") attribute each card exposes under /sys/bus/pci/devices, and
# power-cycles the host with `ipmitool power cycle` when a card reports a
# fatal error. Power cycles are rate-limited through the timestamp stored in
# /etc/dhselfhealing/lastreboot.log; logs are written to /var/log/dhselfhealing.
#
# NOTE: the "#!" line must be the very first line of the file (no blank line
# above it), otherwise the kernel ignores it and the script may be run by a
# shell other than bash, which this script requires (arrays, (( )), [[ ]]).

# ----- mutable global state shared by the functions below -----
# Logging verbosity: 0 = debug (every message is logged), 1 = release (errors only).
log_level=0
# Most recent contents of a device's "fatal" sysfs file (set by read_fatal_value).
fatal_value=0
# Most recent contents of a device's "synd" sysfs file (set by read_synd_value).
synd_value=0
# PCI address of the last device found in fatal state (set by self_healing_process).
reboot_bus_id=0
# Next free slot in dh_sbdf_list while devices are being enumerated.
dh_node_idx=0
# Overall decision flag; self_healing_process sets it to ENABLE or DISABLE.
powercycle_flag=0

# Indexed arrays:
#   reboot_flag_list      - per-device ENABLE/DISABLE reboot request
#   dh_sbdf_list          - PCI addresses (domain:bus:dev.fn) of the cards to check
#   dh_health_check_list  - PCI device IDs that are subject to the health check
#   dh_netcard_type_list  - card model name for each entry of dh_health_check_list
declare -a reboot_flag_list dh_sbdf_list dh_health_check_list dh_netcard_type_list

# ----- PCI identifiers (hex strings, matched against `lspci -n` output) -----
# Device IDs; the trailing note is the card model, the function and the
# software branch/release the ID belongs to.
ZTE_NICB_NET_PF='8061'           # zxnic_e310       pf0/1  feature_V2.24.20.02_upf branch
ZTE_NICB_NET_VF='8062'           # zxnic_e310       vf
ZTE_NICA_NET_PF='8049'           # zxnic_e312       pf0/1  feature_V2.24.20.02_upf branch
ZTE_NICA_NET_VF='8060'           # zxnic_e312       vf
ZTE_INICA_UPF_BOND_PF='806f'     # zxinic_i512_upf  pf0    release_V2.24.30.01
ZTE_INICA_UPF_SRIOV_PF='804e'    # zxinic_i512_upf  pf1    release_V2.24.30.01
ZTE_INICA_UPF_SRIOV_VF='804f'    # zxinic_i512_upf  vf     release_V2.24.30.01
# Vendor ID of the cards listed above.
ZTE_VENDER_ID='1cf2'
# MPF device ID and its vendor ID.
ZTE_MPF='1041'
ZTE_MPF_VENDER_ID='1111'
# PCI class code used for network devices.
NET_CLASSCODE='0200'

# ----- directories -----
WORK_PATH='/var/log/dhselfhealing'        # log directory
TIME_PATH='/etc/dhselfhealing'            # holds the last-reboot timestamp file
PCI_DEVICE_PATH='/sys/bus/pci/devices'    # sysfs directory with one entry per PCI device

# ----- file names (relative to the directories above) -----
MESSAGE_LOG='message.log'                    # per-run message log, in WORK_PATH
TIME_FILE='lastreboot.log'                   # epoch seconds of the last power cycle, in TIME_PATH
REBOOT_INFO_LOG='rebootinfo.log'             # details of the latest power cycle, in WORK_PATH
REBOOT_INFO_OLD_LOG='rebootinfo_old.log'     # previous copy of REBOOT_INFO_LOG

# Macro definitions

# Message levels passed as the first argument of shell_printf.
PL_ERROR=0
PL_INFO=1

# Values of log_level: in debug mode everything is logged, otherwise errors only.
PL_DEBUG=0
PL_RELEASE=1

# Flag values stored in reboot_flag_list / powercycle_flag.
ENABLE=0
DISABLE=1

# Minimum uptime in seconds (20 minutes) before the health check is performed.
OS_START_TIME=1200
# Minimum number of seconds (one hour) between two power cycles.
SELF_HEALING_INTERVAL=3600

# Print a log message on stdout and append it to $WORK_PATH/$MESSAGE_LOG.
#
# In debug mode (log_level equals PL_DEBUG) every message is emitted; in any
# other mode only messages whose level is PL_ERROR are emitted.
#
# Globals:   log_level, PL_DEBUG, PL_ERROR, WORK_PATH, MESSAGE_LOG (read)
# Arguments: $1 - message level: 0 = error level, 1 = info level
#            $2 - message text
# Outputs:   the message, on stdout and in the log file
shell_printf()
{
    local level=$1
    local message=$2

    if [ "$log_level" -eq "$PL_DEBUG" ] || [ "$level" -eq "$PL_ERROR" ]; then
        # The message is quoted and printed with printf so that it is logged
        # verbatim: no word splitting (runs of whitespace are kept) and no
        # pathname expansion of characters such as '*'.
        printf '%s\n' "$message" | tee -a "$WORK_PATH/$MESSAGE_LOG"
    fi
}

# Read the "fatal" sysfs attribute of one PCI device into the global
# fatal_value and log it.
#
# Globals:   PCI_DEVICE_PATH, PL_INFO, PL_ERROR (read); fatal_value (written)
# Arguments: $1 - PCI address of the device (directory name under PCI_DEVICE_PATH)
# Returns:   1 when the attribute file exists and was read,
#            0 when it does not exist (fatal_value is left untouched).
#            NOTE: this is the reverse of the usual shell convention; the
#            caller tests for "$? -eq 1".
read_fatal_value()
{
    local attr_file="$PCI_DEVICE_PATH/$1/fatal"

    if [ ! -f "$attr_file" ]; then
        # Logged text means "file does not exist".
        shell_printf $PL_ERROR "$attr_file 文件不存在"
        return 0
    fi

    fatal_value=$(cat "$attr_file")
    shell_printf $PL_INFO "$attr_file: $fatal_value"
    return 1
}

# Read the "synd" sysfs attribute of one PCI device into the global
# synd_value, log it, and log the name of every bit (0-7) that is set.
#
# Globals:   PCI_DEVICE_PATH, PL_INFO, PL_ERROR (read); synd_value (written)
# Arguments: $1 - PCI address of the device (directory name under PCI_DEVICE_PATH)
# Returns:   1 when the attribute file exists and was read,
#            0 when it does not exist.
#            NOTE: this is the reverse of the usual shell convention.
read_synd_value()
{
    local synd_file_path="$PCI_DEVICE_PATH/$1/synd"
    # Loop counter; kept local so it neither leaks into nor is influenced by
    # the caller's environment.
    local bit
    # Name of the error reported by each syndrome bit, indexed by bit number.
    local -a synd_bit_names=(
        riscv_fw_exception
        riscv_core_exception
        riscv_counter_missed
        pcie_comm_error
        vqm_fatal
        bttl_fatal
        ddr_fatal
        ocm_fatal
    )

    if [ ! -f "$synd_file_path" ]; then
        # Logged text means "file does not exist".
        shell_printf "$PL_ERROR" "$synd_file_path 文件不存在"
        return 0
    fi

    synd_value=$(cat "$synd_file_path")
    shell_printf "$PL_INFO" "$synd_file_path: $synd_value"

    # A value of exactly 0xff is discarded (treated as "no bit set").
    # This used to be written as "(synd_value >> bit) == 0xff" with "bit" not
    # yet assigned: on the first call that meant "synd_value == 0xff", but the
    # loop below left bit=8 behind as a global, so on later calls the check
    # silently stopped working. Compare the value itself instead.
    # NOTE(review): presumably 0xff marks an invalid read - confirm with the driver.
    if ((synd_value == 0xff)); then
        synd_value=0
    fi

    for ((bit = 0; bit < ${#synd_bit_names[@]}; bit++)); do
        if (( (synd_value >> bit) & 1 )); then
            shell_printf "$PL_INFO" "synd bit$bit: ${synd_bit_names[bit]}"
        fi
    done
    return 1
}

# Record one PCI device in dh_sbdf_list[dh_node_idx] and log its device ID.
#
# An address with a single ':' ("bus:dev.fn") has no PCI domain, so the
# domain "0000" is prepended; any other address is stored unchanged.
#
# Globals:   dh_node_idx, PCI_DEVICE_PATH, PL_INFO (read); dh_sbdf_list (written)
# Arguments: $1 - PCI address as printed by lspci
get_device_info()
{
    local slot=$1
    local colons=${slot//[^:]/}     # only the ':' characters of the address
    local sbdf=$slot
    local device_id

    if [ ${#colons} -eq 1 ]; then
        sbdf="0000:$slot"
    fi

    dh_sbdf_list[$dh_node_idx]=$sbdf
    device_id=$(cat "$PCI_DEVICE_PATH/$sbdf/device")
    shell_printf $PL_INFO "$sbdf:$device_id"
}

# Find every PCI device whose vendor is ZTE_VENDER_ID and whose device ID is
# $1, and register function 0 of each of them through get_device_info.
#
# Globals:   ZTE_VENDER_ID (read); dh_node_idx (incremented once per device
#            registered); dh_sbdf_list (written by get_device_info)
# Arguments: $1 - PCI device ID (hex, as printed by `lspci -n`)
search_bus_id()
{
    local device_id=$1
    local busid

    # `lspci -n` prints "<slot> <class>: <vendor>:<device> ...". Compare the
    # third field exactly instead of grepping the whole line for the two IDs,
    # which could also match lines where the same hex digits show up in
    # another position (for example a device "8061:1cf2").
    # The loop reads from a process substitution, not a pipe, so that
    # dh_node_idx and dh_sbdf_list are updated in the current shell.
    while IFS= read -r busid; do
        # Only function 0 of a card is health-checked.
        if [[ "${busid##*.}" == '0' ]]; then
            get_device_info "$busid"
            dh_node_idx=$((dh_node_idx + 1))
        fi
    done < <(lspci -n | awk -v id="$ZTE_VENDER_ID:$device_id" '$3 == id { print $1 }')
}

# Build the list of devices to health-check.
#
# Fills dh_health_check_list with the device IDs that are checked and
# dh_netcard_type_list with the matching card model names, resets dh_node_idx
# and then lets search_bus_id register the devices found for each ID.
#
# Globals:   ZTE_NICB_NET_PF, ZTE_NICA_NET_PF, ZTE_INICA_UPF_BOND_PF, PL_INFO (read);
#            dh_health_check_list, dh_netcard_type_list, dh_node_idx (written)
get_health_check_node_info()
{
    local idx

    dh_node_idx=0
    dh_netcard_type_list=('zxnic_e310' 'zxnic_e312' 'zxinic_i512_upf')
    dh_health_check_list=("$ZTE_NICB_NET_PF" "$ZTE_NICA_NET_PF" "$ZTE_INICA_UPF_BOND_PF")

    for idx in "${!dh_health_check_list[@]}"; do
        shell_printf $PL_INFO "$idx# health check did: ${dh_health_check_list[$idx]} ${dh_netcard_type_list[$idx]}"
        search_bus_id "${dh_health_check_list[$idx]}"
    done
}

# Check every device in dh_sbdf_list and decide whether a power cycle is needed.
#
# For each device the "fatal" attribute is read. A device whose attribute
# could be read and equals 1 gets reboot_flag_list[i]=ENABLE, is remembered in
# reboot_bus_id and has its "synd" attribute read and logged; every other
# device gets DISABLE. powercycle_flag becomes ENABLE when at least one entry
# of reboot_flag_list is ENABLE, otherwise DISABLE.
#
# Globals:   dh_sbdf_list, fatal_value, ENABLE, DISABLE, PL_INFO (read);
#            reboot_flag_list, reboot_bus_id, powercycle_flag (written)
self_healing_process()
{
    local n sbdf flag

    for ((n = 0; n < ${#dh_sbdf_list[@]}; n++)); do
        sbdf=${dh_sbdf_list[$n]}
        shell_printf $PL_INFO "------------------------ DH net_card $n health process ------------------------"
        shell_printf $PL_INFO "Index $n: $sbdf"

        # Assume the device is healthy until its fatal attribute says otherwise.
        reboot_flag_list[$n]=$DISABLE

        # read_fatal_value returns 1 (not 0) when the attribute was read.
        read_fatal_value "$sbdf"
        if [ $? -eq 1 ] && [ "$fatal_value" -eq 1 ]; then
            reboot_bus_id=$sbdf
            reboot_flag_list[$n]=$ENABLE
            read_synd_value "$sbdf"
        fi
    done

    # One device in fatal state is enough to request a power cycle.
    powercycle_flag=$DISABLE
    for flag in "${reboot_flag_list[@]}"; do
        if [ "$flag" -eq "$ENABLE" ]; then
            powercycle_flag=$ENABLE
            break
        fi
    done

    return
}

# Terminate the script when the system has been up for less than
# OS_START_TIME seconds.
#
# Globals: OS_START_TIME, PL_ERROR (read)
check_reboot_time()
{
    local uptime_raw uptime_sec

    # First field of /proc/uptime is the uptime as "<seconds>.<fraction>";
    # keep only the whole seconds.
    read -r uptime_raw _ < /proc/uptime
    uptime_sec=${uptime_raw%%.*}

    if [ "$uptime_sec" -lt "$OS_START_TIME" ]; then
        shell_printf $PL_ERROR 'The system does not need to be checked within 20 minutes after startup.'
        exit
    fi
}

# Terminate the script when the last-reboot timestamp file
# $TIME_PATH/$TIME_FILE is missing.
#
# Globals: TIME_PATH, TIME_FILE, PL_ERROR (read)
check_time_file()
{
    # Nothing to do when the file is present.
    [ -f "$TIME_PATH/$TIME_FILE" ] && return

    shell_printf $PL_ERROR "time file not exist, please redeploy dhselfhealingrun.sh script."
    exit
}

# Terminate the script unless ipmitool is available and usable.
#
# Two checks are made: the command must be found, and `ipmitool power status`
# must succeed (a failure is reported as a missing-permission problem).
#
# Globals: PL_ERROR (read)
check_ipmitool()
{
    command -v ipmitool > /dev/null 2>&1 || {
        shell_printf $PL_ERROR "The ipmitool tool is not installed. Please install the ipmitool tool first."
        exit
    }

    if ! ipmitool power status > /dev/null 2>&1; then
        shell_printf $PL_ERROR "Running the ipmitool tool requires root permission. Please configure the permission first."
        exit
    fi
}

# Start a fresh message log for this run: remove the previous
# $WORK_PATH/$MESSAGE_LOG and write the current date as its first line.
#
# Globals: WORK_PATH, MESSAGE_LOG (read)
# Outputs: the current date, on stdout and in the new log file
clear_message_file()
{
    if [ -d "$WORK_PATH" ]; then
        sudo rm -f "$WORK_PATH/$MESSAGE_LOG"
    else
        # Without the directory neither the tee below nor any later
        # shell_printf call could write the log, so create it.
        mkdir -p "$WORK_PATH"
    fi

    date | tee -a "$WORK_PATH/$MESSAGE_LOG"
}

# Rotate the reboot-info log and write a new one describing the device that
# triggers the power cycle.
#
# The previous $REBOOT_INFO_OLD_LOG is deleted, the current $REBOOT_INFO_LOG
# (if any) becomes the new old log, and a fresh $REBOOT_INFO_LOG is written
# with the device address, its "fatal" and "synd" values and the date.
#
# Globals: reboot_bus_id, PCI_DEVICE_PATH, WORK_PATH, REBOOT_INFO_LOG,
#          REBOOT_INFO_OLD_LOG (read)
# Outputs: the new log content, on stdout and in $WORK_PATH/$REBOOT_INFO_LOG
rename_reboot_info_log()
{
    local dev_dir="$PCI_DEVICE_PATH/$reboot_bus_id"
    local info_log="$WORK_PATH/$REBOOT_INFO_LOG"
    local old_log="$WORK_PATH/$REBOOT_INFO_OLD_LOG"
    local fatal_reg synd_reg

    # Capture the register values before touching any log file.
    fatal_reg=$(cat "$dev_dir/fatal")
    synd_reg=$(cat "$dev_dir/synd")

    # Keep exactly one generation of history.
    if [ -f "$old_log" ]; then
        rm -f "$old_log"
    fi
    if [ -f "$info_log" ]; then
        mv "$info_log" "$old_log"
    fi

    {
        echo "$reboot_bus_id save fatal message:"
        echo "$dev_dir/fatal : $fatal_reg"
        echo "$dev_dir/synd : $synd_reg"
        echo "$reboot_bus_id smartnic fatal error ,need power cycle now!!!!"
        date
    } | tee "$info_log"
}

# Announce the power cycle and perform it with `ipmitool power cycle`.
#
# A line with the current date is appended to /var/log/messages, the action
# is logged through shell_printf, file system buffers are flushed with sync,
# and the host is power-cycled.
#
# Globals: PL_ERROR (read)
# Outputs: the announcement on stdout and in /var/log/messages
# Returns: exit status of `ipmitool power cycle`
execute_ipmitool_command()
{
    local curtime
    local rc=0

    curtime=$(date)
    # Quoted so the date is written exactly as produced; unquoted expansion
    # collapsed the padding of single-digit days ("Jan  1" -> "Jan 1").
    printf '%s\n' "$curtime smartnic fatal error,need power cycle " | tee -a /var/log/messages
    shell_printf "$PL_ERROR" "power cycle: Restarting system."

    sync
    ipmitool power cycle || rc=$?
    if [ "$rc" -ne 0 ]; then
        # Leave a trace when the power cycle could not be issued; otherwise
        # the failure would go unnoticed.
        shell_printf "$PL_ERROR" "power cycle: ipmitool power cycle failed (exit status $rc)."
    fi
    return "$rc"
}

# Power-cycle the host unless the previous power cycle is too recent.
#
# The time of the last power cycle (epoch seconds) is read from
# $TIME_PATH/$TIME_FILE. When more than SELF_HEALING_INTERVAL seconds have
# passed, the current time is stored in that file, the reboot-info log is
# written and the power cycle is executed; otherwise only a message is logged.
#
# Globals: TIME_PATH, TIME_FILE, SELF_HEALING_INTERVAL, PL_ERROR (read)
judge_power_cycle()
{
    local time_file="$TIME_PATH/$TIME_FILE"
    local -a stamp_fields=()
    local last_reboot_time current_time time_interval

    # Split the file content on whitespace; a valid file holds exactly one
    # unsigned decimal number. (read returns non-zero at end of file, which
    # is expected here.)
    read -r -d '' -a stamp_fields < "$time_file"
    last_reboot_time=${stamp_fields[0]-}
    if [[ ${#stamp_fields[@]} -ne 1 || ! "$last_reboot_time" =~ ^[0-9]+$ ]]; then
        # An empty or corrupt file used to end up in the "less than one hour"
        # branch after expr/test errors; still skip the power cycle, but say why.
        shell_printf "$PL_ERROR" "Invalid last reboot time in $time_file, power cycle skipped."
        return
    fi

    current_time=$(date '+%s')
    # 10# forces base 10 so that leading zeros are not read as octal.
    time_interval=$((current_time - 10#$last_reboot_time))
    if ((time_interval > SELF_HEALING_INTERVAL)); then
        echo "$current_time" > "$time_file"
        rename_reboot_info_log
        execute_ipmitool_command
    else
        shell_printf "$PL_ERROR" "The time interval for repeated reboots is less than one hour."
    fi

    return
}

# Run one self-healing pass.
#
# Order matters: the pre-flight checks may terminate the script, and
# clear_message_file starts a new message log, so anything logged before it
# is not kept.
#
# Globals: powercycle_flag, ENABLE, PL_INFO (read)
main()
{
    local preflight

    # Pre-flight: uptime gate, timestamp file, fresh log, usable ipmitool.
    for preflight in check_reboot_time check_time_file clear_message_file check_ipmitool; do
        "$preflight"
    done

    shell_printf $PL_INFO '----------------------------------- starts -----------------------------------'

    # Enumerate the cards, then evaluate their health.
    get_health_check_node_info
    self_healing_process

    # self_healing_process leaves its verdict in powercycle_flag.
    if [ "$powercycle_flag" -eq "$ENABLE" ]; then
        judge_power_cycle
    fi

    shell_printf $PL_INFO '----------------------------------- finish -----------------------------------'
}

# Script entry point: run one self-healing pass. main takes no arguments.
main
