1. 需求场景
在自动化邮件报表中,经常会出现邮件发送不及时、邮件发送出错等问题。因此开发了此系统:当邮件任务执行出错或者未执行时,发送短信报警。
2. 基本思路
在执行邮件任务之前,即在系统调用发送邮件的方法之前,先在邮件检测系统表中插入一条记录,状态为失败;发送成功之后再更新数据库,将状态置为成功。
Shell 邮件检测系统每 1 分钟执行一次,遍历邮件任务表和检测表,将两者对比,找出失败的任务和未执行的任务。一旦发现,就调用短信发送脚本发送短信报警。
3. 整体代码
#!/bin/sh
### DOUGUO JOB INSPECTOSCOPE SNIFFER ###
### @author zjf ###
### @date 2017-12-27 ###
# INITALIZE SHELL
. /etc/profile
. ~/.bash_profile
# DEFINE ERROE CODE
ERROR_CODE_001="job non-executed"
ERROR_CODE_002="data result non-compliant formats"
ERROR_CODE_003="send-mail/exec-job non-succeed"
# INITALIZE WORKING DIR
_SCRIPT_NAME=$0
_WORK_DIR=`dirname ${_SCRIPT_NAME}`
cd ${_WORK_DIR}
# TIME ARGS
CUR_DATE=`date -d "-0 day" "+%Y-%m-%d"`
JUDGE_MINUTE=`date -d "-5 minute" +%H:%M`
#JUDGE_MINUTE="08:05"
echo "CUR_DATE:$CUR_DATE"
echo "JUDGE_MINUTE:$JUDGE_MINUTE"
# DB ARGS
HOSTNAME="192.168.1.135"
USERNAME="hadooper"
PASSWORD="hadoop@K+IhBOS"
DBNAME="douguo_data"
# DEFINE _FUN RETURN RESULT
g_contain_rs=""
# Function definitions.
# Functions must be defined before they are used.
# _FUN
#######################################
# Check whether a job is present in a list of job records.
# Only the first comma-separated field of every record is compared.
# Globals:
#   g_contain_rs (written) - "1" when the job was found, otherwise "0"
# Arguments:
#   $1 - list of records separated by whitespace, each record being a
#        comma-separated string
#   $2 - record (or bare key) to look for
#######################################
function contain() {
  local -a records=()
  local record wanted_key

  g_contain_rs="0"
  wanted_key=${2%%,*}

  # Split the list on whitespace with `read` instead of an unquoted
  # expansion, so entries containing * or ? are not expanded to file names.
  # read returns non-zero at end of input, which is expected here.
  read -r -d '' -a records <<< "$1" || true

  for record in "${records[@]}"; do
    if [ "${record%%,*}" = "${wanted_key}" ]; then
      # already recorded
      g_contain_rs="1"
      break
    fi
  done
}
# _FUN
#######################################
# Set one column of a job's monitor record for a given day.
# Globals:
#   HOSTNAME, USERNAME, PASSWORD, DBNAME (read) - connection settings
#   MYSQL_BIN (read, optional) - mysql client to run
# Arguments:
#   $1 - job id (digits only)
#   $2 - name of the column to change
#   $3 - new value of the column
#   $4 - stat date of the record, compared with date(statdate)
# Returns:
#   1 without contacting the database when the job id or the column name is
#   not well formed, otherwise the exit status of the mysql client.
#######################################
function update_job_status() {
  local job_id=$1
  local column=$2
  local new_value=$3
  local stat_date=$4
  local sq="'"
  local mysql_bin sql

  # NOTE(review): the mixed-case default path looks like a transcription
  # artifact - confirm the real location of the mysql client.
  mysql_bin=${MYSQL_BIN:-/usr/local/MysqL/bin/MysqL}

  # The id and the column name are placed into the statement unquoted, so
  # refuse anything that is not a plain number / identifier.
  if ! [[ "${job_id}" =~ ^[0-9]+$ ]]; then
    echo "update_job_status: invalid job id '${job_id}'" >&2
    return 1
  fi
  if ! [[ "${column}" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
    echo "update_job_status: invalid column name '${column}'" >&2
    return 1
  fi

  # Double single quotes so a value cannot terminate its SQL string literal.
  new_value=${new_value//$sq/$sq$sq}
  stat_date=${stat_date//$sq/$sq$sq}

  sql="update dd_job_monitor set ${column}='${new_value}' where job_id=${job_id} and date(statdate)='${stat_date}'"
  "${mysql_bin}" -h"${HOSTNAME}" -u"${USERNAME}" -p"${PASSWORD}" -D "${DBNAME}" -e "${sql}" > /dev/null
}
# _FUN
#######################################
# Insert a monitor record for a job; without a record the alarm module would
# keep alarming for that job.
# The statement is printed to stdout before it is executed.
# Globals:
#   HOSTNAME, USERNAME, PASSWORD, DBNAME (read) - connection settings
#   MYSQL_BIN (read, optional) - mysql client to run
# Arguments:
#   $1 - job id (digits only)
#   $2 - job name
#   $3 - stat date of the record
# Returns:
#   1 without contacting the database when the job id is not numeric,
#   otherwise the exit status of the mysql client.
#######################################
function insert_job() {
  local job_id=$1
  local job_name=$2
  local stat_date=$3
  local sq="'"
  local mysql_bin sql

  # NOTE(review): the mixed-case default path looks like a transcription
  # artifact - confirm the real location of the mysql client.
  mysql_bin=${MYSQL_BIN:-/usr/local/MysqL/bin/MysqL}

  # The id is placed into the statement unquoted, so it has to be a number.
  if ! [[ "${job_id}" =~ ^[0-9]+$ ]]; then
    echo "insert_job: invalid job id '${job_id}'" >&2
    return 1
  fi

  # Double single quotes so a job name such as "today's report" cannot
  # terminate its SQL string literal.
  job_name=${job_name//$sq/$sq$sq}
  stat_date=${stat_date//$sq/$sq$sq}

  # NOTE(review): the column list names ten columns but only six values are
  # supplied, which MySQL rejects. The statement is kept as found because the
  # intended values for the remaining columns cannot be derived from this
  # file - confirm and complete the VALUES list.
  sql="INSERT INTO dd_job_monitor(job_id,job_type,send_type,job_name,run_time,msg_reciver,data_status,job_status,alarm_status,statdate) VALUES(${job_id},'alarm','${job_name}','',1,'${stat_date}')"
  echo "${sql}"
  "${mysql_bin}" -h"${HOSTNAME}" -u"${USERNAME}" -p"${PASSWORD}" -D "${DBNAME}" -e "${sql}" > /dev/null
}
# ---------------------------------------------------------------------------
# MAIN
# Loads the configured daily mail jobs (list A) and today's monitor records
# (list B). For every job in A whose send time equals JUDGE_MINUTE:
#   - no record in B           -> alarm 001 and a record is inserted
#   - record with data_status -1 and alarm_status 0 -> alarm 002
#   - record with job_status  -1 and alarm_status 0 -> alarm 003
#   and the record's alarm_status is set to 1 afterwards.
# Limitation: contain() receives list B as one whitespace-joined string, and
# records are split on commas, so job names must not contain whitespace or
# commas.
# ---------------------------------------------------------------------------

# NOTE(review): the mixed-case default path looks like a transcription
# artifact - confirm the real location of the mysql client.
MYSQL_BIN="${MYSQL_BIN:-/usr/local/MysqL/bin/MysqL}"
SMS_SEND_SCRIPT="/opt/DATA/goldmine/src/utils/sms/sms_send.sh"
# Separator between the id and the name inside the first field of a record.
# It is produced by the queries below and consumed only inside this script.
NAME_SEP="|"

# Run a query and print its rows without the column header.
function _run_query() {
  "${MYSQL_BIN}" -h"${HOSTNAME}" -u"${USERNAME}" -p"${PASSWORD}" -D "${DBNAME}" -e "$1" --skip-column-name
}

# Append an alarm text to the job log and send it as an SMS.
# The sender is called directly: wrapping the call in backticks would run
# whatever the sender prints as a second command.
# NOTE(review): assumes sms_send.sh does the sending itself and its output
# is not meant to be executed - confirm.
function _raise_alarm() {
  echo "$1" >> "${_WORK_DIR}/job.log"
  sh "${SMS_SEND_SCRIPT}" "$1"
}

# query languages
## !modify log 2018-02-26 :
# QUERYA skips the mails with id 36 and 12: both are sent in the last five
# minutes of the day and, because this checker runs five minutes behind, they
# caused repeated alarms, so they are no longer checked.
QUERYA="select concat_ws(',',concat_ws('${NAME_SEP}',id,subject),right(send_time,5)) from dd_sys_mail_set where send_type='day' and id not in (36,12);"
# Variant without the exclusion:
#QUERYA="select concat_ws(',',concat_ws('${NAME_SEP}',id,subject),right(send_time,5)) from dd_sys_mail_set where send_type='day';"
# The loop below reads five fields from every B record: name, run minute,
# data_status, job_status and alarm_status, so all five are selected here.
# NOTE(review): this statement was rebuilt from the way its result is
# consumed; the text found in the file was not valid SQL - confirm against
# the table definition.
QUERYB="select concat_ws(',',concat_ws('${NAME_SEP}',job_id,job_name),right(run_time,5),data_status,job_status,alarm_status) from dd_job_monitor where date(statdate)='${CUR_DATE}'"

# query from mysql
count_a=$(_run_query "${QUERYA}")
count_b=$(_run_query "${QUERYB}")

# Write the rows to data files (tabs become commas) and delete the first
# line, which the original author describes as a 'log in file
# /var/mysql/query.log' notice printed by the client.
# NOTE(review): if the client prints no such notice, the first record is
# dropped here - confirm on the target host.
mkdir -p "${_WORK_DIR}/tmp"
printf '%s\n' "${count_a}" | sed 's/\t/,/g' > "${_WORK_DIR}/tmp/queryloga.log"
printf '%s\n' "${count_b}" | sed 's/\t/,/g' > "${_WORK_DIR}/tmp/querylogb.log"
sed -i '1d' "${_WORK_DIR}/tmp/queryloga.log"
sed -i '1d' "${_WORK_DIR}/tmp/querylogb.log"

# read log a and create array a
ARR_RS_A=()
while IFS= read -r line; do
  ARR_RS_A+=("${line}")
done < "${_WORK_DIR}/tmp/queryloga.log"
LOOP_FLAG_A=${#ARR_RS_A[@]}

# read log b and create array b
ARR_RS_B=()
while IFS= read -r line; do
  ARR_RS_B+=("${line}")
done < "${_WORK_DIR}/tmp/querylogb.log"
LOOP_FLAG_B=${#ARR_RS_B[@]}

# loop over the configured jobs and decide whether an alarm has to be sent
for SYSJOB in "${ARR_RS_A[@]}"; do
  # split the job info: "<id><sep><subject>,<HH:MM>"
  IFS=',' read -r SYSJOB_NAME SYSJOB_MINUTE _rest <<< "${SYSJOB}"

  # Only jobs whose send time was exactly five minutes ago are judged; by now
  # they should have a record in ARR_RS_B.
  if [ "${SYSJOB_MINUTE}" != "${JUDGE_MINUTE}" ]; then
    continue
  fi

  SYSJOB_NAME_ID=${SYSJOB_NAME%%"${NAME_SEP}"*}
  SYSJOB_NAME_SUBJECT=${SYSJOB_NAME#*"${NAME_SEP}"}

  # is the job logged in the job table ? 1 : 0
  contain "${ARR_RS_B[*]}" "${SYSJOB_NAME}"

  if [ "$g_contain_rs" = "0" ]; then
    # 0: job not in the table -> it never started
    error_msg_001="DC_WEB: MAIL: ERROR:001: ${SYSJOB_NAME_ID}: ${SYSJOB_NAME_SUBJECT}: ${ERROR_CODE_001}"
    _raise_alarm "${error_msg_001}"
    insert_job "${SYSJOB_NAME_ID}" "${SYSJOB_NAME_SUBJECT}" "${CUR_DATE}"
    continue
  fi

  # test...
  echo "logging...<${CUR_DATE}> the inspectoscope has logged the job:${SYSJOB_NAME_ID}:${SYSJOB_NAME_SUBJECT}" >> "${_WORK_DIR}/job.log"

  for _LOGTMPJOB in "${ARR_RS_B[@]}"; do
    # "<id><sep><name>,<HH:MM>,<data_status>,<job_status>,<alarm_status>"
    IFS=',' read -r LOGJOB_NAME LOGJOB_MINUTE LOGJOB_DATA_STATUS LOGJOB_JOB_STATUS LOGJOB_ALARM_STATUS _rest <<< "${_LOGTMPJOB}"
    if [ "${SYSJOB_NAME}" != "${LOGJOB_NAME}" ]; then
      continue
    fi

    LOGJOB_JOB_ID=${LOGJOB_NAME%%"${NAME_SEP}"*}
    LOGJOB_JOB_NAME=${LOGJOB_NAME#*"${NAME_SEP}"}

    # LOGJOB_ALARM_STATUS 1: already warned, 0: not warned yet
    if [[ "${LOGJOB_DATA_STATUS}" = "-1" && "${LOGJOB_ALARM_STATUS}" = "0" ]]; then
      error_msg_002="DC_WEB: MAIL: ERROR:002: ${LOGJOB_JOB_ID}: ${LOGJOB_JOB_NAME}: ${ERROR_CODE_002}"
      _raise_alarm "${error_msg_002}"
    fi
    if [[ "${LOGJOB_JOB_STATUS}" = "-1" && "${LOGJOB_ALARM_STATUS}" = "0" ]]; then
      error_msg_003="DC_WEB: MAIL: ERROR:003: ${LOGJOB_JOB_ID}: ${LOGJOB_JOB_NAME}: ${ERROR_CODE_003}"
      _raise_alarm "${error_msg_003}"
    fi

    # update job alarm status
    _fun_param_schema_name="alarm_status"
    _fun_param_schema_data="1"
    update_job_status "${LOGJOB_JOB_ID}" "${_fun_param_schema_name}" "${_fun_param_schema_data}" "${CUR_DATE}"

    # test...
    echo "logging...<${CUR_DATE}> the inspectoscope has alarm the job:${LOGJOB_JOB_ID}:${LOGJOB_JOB_NAME}" >> "${_WORK_DIR}/job.log"
  done
done