五、测试
5.1备节点失效
在node2上杀死postgres数据库进程,模拟备节点上数据库崩溃:
[root@node2 ~]# killall -9 postgres
查看此时集群状态:
[root@node1~]#crm_mon-Afr-1 Lastupdated:WedJan2202:15:062014 Lastchange:WedJan2202:15:332014viacrm_attributeonnode1 Stack:classicopenais(withplugin) CurrentDC:node1-partitionwithquorum Version:1.1.10-14.el6_5.2-368c726 2Nodesconfigured,2expectedvotes 7Resourcesconfigured Online:[node1node2] Fulllistofresources: vip-slave(ocf::heartbeat:IPaddr2):Startednode1 ResourceGroup:master-group vip-master(ocf::heartbeat:IPaddr2):Startednode1 vip-rep(ocf::heartbeat:IPaddr2):Startednode1 Master/SlaveSet:msPostgresql[pgsql] Masters:[node1] Stopped:[node2] CloneSet:clnPingCheck[pingCheck] Started:[node1node2] NodeAttributes: *Nodenode1: +default_ping_set:100 +master-pgsql:1000 +pgsql-data-status:LATEST +pgsql-master-baseline:0000000006000078 +pgsql-status:PRI *Nodenode2: +default_ping_set:100 +master-pgsql:-INFINITY +pgsql-data-status:DISCONNECT +pgsql-status:STOP Migrationsummary: *Nodenode2: pgsql:migration-threshold=1fail-count=1last-failure='WedJan2202:15:352014' *Nodenode1: Failedactions: pgsql_monitor_7000onnode2'notrunning'(7):call=42,status=complete,last-rc-change='WedJan2202:14:582014',queued=0ms,exec=0ms
{vip-slave资源已成功切换到了node1上}
重启node2上的corosync,数据库将重新伴随启动:
[root@node2~]#servicecorosyncrestart [root@node1~]#crm_mon-Afr-1 Lastupdated:WedJan2202:16:242014 Lastchange:WedJan2202:16:552014viacrm_attributeonnode1 Stack:classicopenais(withplugin) CurrentDC:node1-partitionwithquorum Version:1.1.10-14.el6_5.2-368c726 2Nodesconfigured,2expectedvotes 7Resourcesconfigured Online:[node1node2] Fulllistofresources: vip-slave(ocf::heartbeat:IPaddr2):Startednode2 ResourceGroup:master-group vip-master(ocf::heartbeat:IPaddr2):Startednode1 vip-rep(ocf::heartbeat:IPaddr2):Startednode1 Master/SlaveSet:msPostgresql[pgsql] Masters:[node1] Slaves:[node2] CloneSet:clnPingCheck[pingCheck] Started:[node1node2] NodeAttributes: *Nodenode1: +default_ping_set:100 +master-pgsql:1000 +pgsql-data-status:LATEST +pgsql-master-baseline:0000000006000078 +pgsql-status:PRI *Nodenode2: +default_ping_set:100 +master-pgsql:100 +pgsql-data-status:STREAMING|SYNC +pgsql-status:HS:sync Migrationsummary: *Nodenode2: *Nodenode1:
{vip-slave又重新回到了node2上}
5.2主节点失效切换
在node1上杀死postgres数据库进程,模拟主节点上数据库崩溃:
[root@node1 ~]# killall -9 postgres
稍等片刻后查看集群状态:
[root@node2 ~]# crm_mon -Afr -1
Last updated: Wed Jan 22 02:17:50 2014
Last change: Wed Jan 22 02:18:16 2014 via crm_attribute on node2
Stack: classic openais (with plugin)
Current DC: node1 - partition with quorum
Version: 1.1.10-14.el6_5.2-368c726
2 Nodes configured, 2 expected votes
7 Resources configured
Online: [ node1 node2 ]
Full list of resources:
vip-slave (ocf::heartbeat:IPaddr2): Started node2
Resource Group: master-group
    vip-master (ocf::heartbeat:IPaddr2): Started node2
    vip-rep (ocf::heartbeat:IPaddr2): Started node2
Master/Slave Set: msPostgresql [pgsql]
    Masters: [ node2 ]
    Stopped: [ node1 ]
Clone Set: clnPingCheck [pingCheck]
    Started: [ node1 node2 ]
Node Attributes:
* Node node1:
    + default_ping_set : 100
    + master-pgsql : -INFINITY
    + pgsql-data-status : DISCONNECT
    + pgsql-status : STOP
* Node node2:
    + default_ping_set : 100
    + master-pgsql : 1000
    + pgsql-data-status : LATEST
    + pgsql-master-baseline : 0000000008014A70
    + pgsql-status : PRI
Migration summary:
* Node node2:
* Node node1:
    pgsql: migration-threshold=1 fail-count=1 last-failure='Wed Jan 22 02:18:11 2014'
Failed actions:
    pgsql_monitor_2000 on node1 'not running' (7): call=2435, last-rc-change='Wed Jan 22 02:18:11 2014', ……
{vip-master/vip-rep都已成功切换到node2上,且node2已变为master,node2上pg数据库状态已切换为PRI}
停止node1上的corosync:
[root@node1 ~]# service corosync stop
执行一次基础同步:
[postgres@node1 data]$ pwd
/opt/pgsql/data
[postgres@node1 data]$ rm -rf *
[postgres@node1 data]$ pg_basebackup -h 192.168.1.3 -U postgres -D /opt/pgsql/data/ -P
19172/19172 kB (100%), 1/1 tablespace
NOTICE: pg_stop_backup complete, all required WAL segments have been archived
[postgres@node1 data]$ ls
backup_label base pg_clog pg_ident.conf pg_notify pg_stat_tmp pg_tblspc PG_VERSION postgresql.conf
backup_label.old global pg_hba.conf pg_multixact pg_serial pg_subtrans pg_twophase pg_xlog recovery.done
启动node1上的corosync:
[root@node1 ~]# service corosync start
5.3主节点恢复
修复原主节点后将其恢复为当前备节点
在node1上执行一次基础同步:
[postgres@node1 data]$ pwd
/opt/pgsql/data
[postgres@node1 data]$ rm -rf *
[postgres@node1 data]$ pg_basebackup -h 192.168.2.3 -U postgres -D /opt/pgsql/data/ -P
19172/19172 kB (100%), ……
启动heartbeat之前必须删除资源锁文件,否则资源将不会随heartbeat一起启动:
[root@node1 ~]# rm -rf /var/lib/pgsql/tmp/PGsql.lock
{该锁文件在节点成为主节点时创建,但不会因为heartbeat的异常停止或数据库/系统的异常终止而自动删除,所以在恢复一个节点时,只要该节点充当过主节点,就需要手动清理该锁文件}
重启node1上的heartbeat:
[root@node1 ~]# service heartbeat restart
过段时间后查看集群状态:
[root@node2~]#crm_mon-Afr1 ============ Lastupdated:MonJan2708:50:432014 Stack:Heartbeat CurrentDC:node2(f2dcd1df-7429-42f5-82e9-b73921f97cab)-partitionwithquorum Version:1.0.12-unknown 2Nodesconfigured,unknownexpectedvotes 4Resourcesconfigured. ============ Online:[node1node2] Fulllistofresources: vip-slave(ocf::heartbeat:IPaddr2):Startednode1 ResourceGroup:master-group vip-master(ocf::heartbeat:IPaddr2):Startednode2 vip-rep(ocf::heartbeat:IPaddr2):Startednode2 Master/SlaveSet:msPostgresql Masters:[node2] Slaves:[node1] CloneSet:clnPingCheck Started:[node1node2] NodeAttributes: *Nodenode1: +default_ping_set:100 +master-pgsql:0:100 +pgsql-data-status:STREAMING|SYNC +pgsql-status:HS:sync *Nodenode2: +default_ping_set:100 +master-pgsql:1:1000 +pgsql-data-status:LATEST +pgsql-master-baseline:00000000120000B0 +pgsql-status:PRI Migrationsummary: *Nodenode1: *Nodenode2:{vip-slave已成功切到node1上,node1成功成为流复制备节点}
六、管理
6.1启动关闭corosync
[root@node1 ~]# service corosync start
[root@node1 ~]# service corosync stop
6.2查看HA状态
[root@node1 ~]# crm status
Last updated: Tue Jan 21 23:55:13 2014
Last change: Tue Jan 21 23:37:36 2014 via crm_attribute on node1
Stack: classic openais (with plugin)
Current DC: node1 - partition with quorum
Version: 1.1.10-14.el6_5.2-368c726
2 Nodes configured, 2 expected votes
7 Resources configured
Online: [ node1 node2 ]
vip-slave (ocf::heartbeat:IPaddr2): Started node2
Resource Group: master-group
    vip-master (ocf::heartbeat:IPaddr2): Started node1
    vip-rep (ocf::heartbeat:IPaddr2): Started node1
Master/Slave Set: msPostgresql [pgsql]
    Masters: [ node1 ]
    Slaves: [ node2 ]
Clone Set: clnPingCheck [pingCheck]
    Started: [ node1 node2 ]
6.3查看资源状态及节点属性
[root@node1 ~]# crm_mon -Afr -1
Last updated: Tue Jan 21 23:37:20 2014
Last change: Tue Jan 21 23:37:36 2014 via crm_attribute on node1
Stack: classic openais (with plugin)
Current DC: node1 - partition with quorum
Version: 1.1.10-14.el6_5.2-368c726
2 Nodes configured, ……
6.4查看配置
[root@node1 ~]# crm configure show
node node1 \
    attributes pgsql-data-status="LATEST"
node node2 \
    attributes pgsql-data-status="STREAMING|SYNC"
primitive pgsql ocf:heartbeat:pgsql \
    params pgctl="/opt/pgsql/bin/pg_ctl" psql="/opt/pgsql/bin/psql" pgdata="/opt/pgsql/data/" start_opt="-p 5432" rep_mode="sync" node_list="node1 node2" restore_command="cp /opt/archivelog/%f %p" primary_conninfo_opt="keepalives_idle=60 keepalives_interval=5 keepalives_count=5" master_ip="192.168.1.3" stop_escalate="0" \
    op start timeout="60s" interval="0s" on-fail="restart" \
    op monitor timeout="60s" interval="7s" on-fail="restart" \
    op monitor timeout="60s" interval="2s" on-fail="restart" role="Master" \
    op promote timeout="60s" interval="0s" on-fail="restart" \
    op demote timeout="60s" interval="0s" on-fail="stop" \
……
……
6.5实时监控HA
[root@node1 ~]# crm_mon -Afr
Last updated: Wed Jan 22 00:40:12 2014
Last change: Tue Jan 21 23:37:36 2014 via crm_attribute on node1
Stack: classic openais (with plugin)
Current DC: node1 - partition with quorum
Version: 1.1.10-14.el6_5.2-368c726
2 Nodes configured, 2 expected votes
7 Resources configured
Online: [ node1 node2 ]
Full list of resources:
vip-slave (ocf::heartbeat:IPaddr2): Started node2
Resource Group: master-group
    vip-master (ocf::heartbeat:IPaddr2): Started node1
    vip-rep (ocf::heartbeat:IPaddr2): Started node1
Master/Slave Set: msPostgresql [pgsql]
    Masters: [ node1 ]
    Slaves: [ node2 ]
Clone Set: clnPingCheck [pingCheck]
    Started: [ node1 node2 ]
Node Attributes:
* Node node1:
    + default_ping_set : 100
    + master-pgsql : 1000
    + pgsql-data-status : LATEST
    + pgsql-master-baseline : 0000000006000078
    + pgsql-status : PRI
* Node node2:
    + default_ping_set : 100
    + master-pgsql : 100
    + pgsql-data-status : STREAMING|SYNC
    + pgsql-status : HS:sync
Migration summary:
* Node node2:
* Node node1:
6.6 crm_resource命令
资源启动/关闭:
[root@node1 ~]# crm_resource -r vip-master -v started
[root@node1 ~]# crm_resource -r vip-master -v stopped
列举资源:
[root@node1 ~]# crm_resource -L
vip-slave (ocf::heartbeat:IPaddr2): Started
Resource Group: master-group
    vip-master (ocf::heartbeat:IPaddr2): Started
    vip-rep (ocf::heartbeat:IPaddr2): Started
Master/Slave Set: msPostgresql [pgsql]
    Masters: [ node1 ]
    Slaves: [ node2 ]
Clone Set: clnPingCheck [pingCheck]
    Started: [ node1 node2 ]
查看资源位置:
[root@node1 ~]# crm_resource -W -r pgsql
resource pgsql is running on: node2
迁移资源:
[root@node1 ~]# crm_resource -M -r vip-slave -N node2
删除资源:
[root@node1 ~]# crm_resource -D -r vip-slave -t primitive
6.7 crm命令
列举指定的RA:
[root@node1 ~]# crm ra list ocf pacemaker
ClusterMon Dummy Healthcpu HealthSMART Stateful SysInfo SystemHealth controld ping pingd remote
删除节点:
[root@node1 ~]# crm node delete node2
停用节点:
[root@node1 ~]# crm node standby node2
启用节点:
[root@node1 ~]# crm node online node2
配置pacemaker:
[root@node1 ~]# crm configure
crm(live)configure# ……
……
crm(live)configure# commit
crm(live)configure# quit
6.8重置failcount
[root@node1 ~]# crm resource
crm(live)resource# failcount pgsql set node1 0
crm(live)resource# failcount pgsql show node1
scope=status name=fail-count-pgsql value=0
[root@node1 ~]# crm resource cleanup pgsql
Cleaning up pgsql:0 on node1
Waiting for 1 replies from the CRMd. OK
[root@node1 ~]# crm_failcount -G -U node1 -r pgsql
scope=status name=fail-count-pgsql value=INFINITY
[root@node1 ~]# crm_failcount -D -U node1 -r pgsql
七、问题记录
7.1 Q1
问题现象:
corosync.log日志中报错:
Jan 15 10:23:57 node1 lrmd: [6327]: info: RA output: (pgsql:0:monitor:stderr) /usr/lib/ocf/resource.d//heartbeat/pgsql: line 1749: ocf_local_nodename: command not found
Jan 15 10:23:57 node1 crm_attribute: [11094]: info: Invoked: /usr/sbin/crm_attribute -l reboot -N -n -v 0000000006000090 pgsql-xlog-loclrm_get_rsc_type_Metadata(578)
Jan 15 10:23:57 node1 lrmd: [6327]: info: RA output: (pgsql:0:monitor:stderr) Could not map uname=-n to a UUID: The object/attribute does not exist
解决方式:
查看pgsql脚本,发现其中使用了ocf_local_nodename,该函数本该在ocf-shellfuncs.in中有定义,但却没有这个函数,上网查看相关论坛
http://www.gossamer-threads.com/lists/linuxha/users/89379?do=post_view_threaded
指出此时需要相关补丁,解决ocf_local_nodename函数的补丁:
https://github.com/ClusterLabs/resource-agents/commit/abc1c3f6464f6e5e7a1e41cd7c9b8179896c1903
最新的版本没有ocf_local_nodename函数,所以使用以下版本:
{注:确保pacemaker版本>1.1.8,不然crm_node -n命令无法使用}
https://github.com/ClusterLabs/resource-agents/blob/a6f4ddf76cb4bbc1b3df4c9b6632a6351b63c19e/heartbeat/pgsql
https://github.com/ClusterLabs/resource-agents/tree/abc1c3f6464f6e5e7a1e41cd7c9b8179896c1903/heartbeat
不含有ocf_local_nodename函数的pgsql脚本:
https://raw.github.com/ClusterLabs/resource-agents/a6f4ddf76cb4bbc1b3df4c9b6632a6351b63c19e/heartbeat/pgsql
7.2 Q2
问题现象:
[root@node1 ~]# crm configure load update pgsql.crm
WARNING: pingCheck: specified timeout 60s for start is smaller than the advised 90
WARNING: pingCheck: specified timeout 60s for stop is smaller than the advised 100
WARNING: pgsql: specified timeout 60s for stop is smaller than the advised 120
WARNING: pgsql: specified timeout 60s for start is smaller than the advised 120
WARNING: pgsql: specified timeout 60s for notify is smaller than the advised 90
WARNING: pgsql: specified timeout 60s for demote is smaller than the advised 120
WARNING: pgsql: specified timeout 60s for promote is smaller than the advised 120
ERROR: master-group: attribute ordered does not exist
Do you still want to commit? no
错误提示:在定义的master-group中ordered属性不存在
(1)该问题是pacemaker版本所致,在pacemaker-1.1版本中不支持ordered,colocated属性,通过以下方法以1.0版本的cibconfig.py替换当前新版本试图解决此问题,结果失败:
[root@node1 ~]# vim /usr/lib64/python2.6/site-packages/crmsh/cibconfig.py
[root@node1 ~]# cd /usr/lib64/python2.6/site-packages/crmsh/
[root@node1 crmsh]# mv cibconfig.py cibconfig.py.bak
[root@node1 crmsh]# wget https://github.com/ClusterLabs/pacemaker-1.0/blob/fa1a99ab36e0ed015f1bcbbb28f7db962a9d1abc/shell/modules/cibconfig.py
(2)从配置脚本中去除关于ordered的定义(成功):
group master-group \
vip-master \
vip-rep \
meta \
ordered="false"
改为(删除meta、ordered两行,并去掉vip-rep行尾的续行符“\”):
vip-rep
7.3 Q3
安装pacemaker时报错:
#yuminstallpacemaker* …… -->ProcessingDependency:libesmtp.so.5()(64bit)forpackage:pacemaker -->FinishedDependencyResolution pacemaker-1.0.12-1.el5.centos.i386fromclusterlabshasdepsolvingproblems -->MissingDependency:libesmtp.so.5isneededbypackagepacemaker-1.0.12-1.el5.centos.i386(clusterlabs) pacemaker-1.0.12-1.el5.centos.x86_64fromclusterlabshasdepsolvingproblems -->MissingDependency:libesmtp.so.5()(64bit)isneededbypackagepacemaker-1.0.12-1.el5.centos.x86_64(clusterlabs) Error:MissingDependency:libesmtp.so.5isneededbypackagepacemaker-1.0.12-1.el5.centos.i386(clusterlabs) Error:MissingDependency:libesmtp.so.5()(64bit)isneededbypackagepacemaker-1.0.12-1.el5.centos.x86_64(clusterlabs) Youcouldtryusing--skip-brokentoworkaroundtheproblem Youcouldtryrunning:package-cleanup--problems package-cleanup--dupes rpm-Va--nofiles--nodigest Theprogrampackage-cleanupisfoundintheyum-utilspackage.提示缺少libesmtp,安装即可
# wget ftp://ftp.univie.ac.at/systems/linux/fedora/epel/5/x86_64/libesmtp-1.0.4-5.el5.x86_64.rpm
# wget ftp://ftp.univie.ac.at/systems/linux/fedora/epel/5/i386/libesmtp-1.0.4-5.el5.i386.rpm
# rpm -ivh libesmtp-1.0.4-5.el5.x86_64.rpm
# rpm -ivh libesmtp-1.0.4-5.el5.i386.rpm
7.4 Q4
加载crm配置时报错:
[root@node1 ~]# crm configure load update pgsql.crm
ERROR: pgsql: parameter rep_mode does not exist
ERROR: pgsql: parameter node_list does not exist
ERROR: pgsql: parameter master_ip does not exist
ERROR: pgsql: parameter restore_command does not exist
ERROR: pgsql: parameter primary_conninfo_opt does not exist
WARNING: pgsql: specified timeout 60s for stop is smaller than the advised 120
WARNING: pgsql: action monitor_Master not advertised in meta-data, it may not be supported by the RA
WARNING: pgsql: specified timeout 60s for start is smaller than the advised 120
WARNING: pgsql: action notify not advertised in meta-data, it may not be supported by the RA
WARNING: pgsql: action demote not advertised in meta-data, it may not be supported by the RA
WARNING: pgsql: action promote not advertised in meta-data, it may not be supported by the RA
WARNING: pingCheck: specified timeout 60s for start is smaller than the advised 90
WARNING: pingCheck: specified timeout 60s for stop is smaller than the advised 100
Do you still want to commit? no
参数不存在是因为pgsql脚本太旧,需要替换
scp pgsql root@192.168.100.201:/usr/lib/ocf/resource.d/heartbeat/
scp ocf-shellfuncs.in root@192.168.100.201:/usr/lib/ocf/lib/heartbeat/
scp pgsql root@192.168.100.202:/usr/lib/ocf/resource.d/heartbeat/
scp ocf-shellfuncs.in root@192.168.100.202:/usr/lib/ocf/lib/heartbeat/
7.5 Q5
[root@node1 ~]# crm_mon -Afr -1
Last updated: Tue Jan 21 05:10:56 2014
Last change: Tue Jan 21 05:10:08 2014 via cibadmin on node1
Stack: classic openais (with plugin)
Current DC: node1 - partition with quorum
Version: 1.1.10-14.el6_5.2-368c726
2 Nodes configured, 2 expected votes
7 Resources configured
Online: [ node1 node2 ]
Full list of resources:
vip-slave (ocf::heartbeat:IPaddr2): Stopped
Resource Group: master-group
    vip-master (ocf::heartbeat:IPaddr2): Stopped
    vip-rep (ocf::heartbeat:IPaddr2): Stopped
Master/Slave Set: msPostgresql [pgsql]
    Stopped: [ node1 node2 ]
Clone Set: clnPingCheck [pingCheck]
    Stopped: [ node1 node2 ]
Node Attributes:
* Node node1:
* Node node2:
Migration summary:
* Node node1:
* Node node2:
Failed actions:
    pingCheck_monitor_0 on node1 'invalid parameter' (2): call=23, last-rc-change='Tue Jan 21 05:10:10 2014', queued=200ms, exec=0ms
    pingCheck_monitor_0 on node2 'invalid parameter' (2): call=23, last-rc-change='Tue Jan 21 05:09:36 2014', queued=281ms, ……
该错误是因为配置脚本中pingCheck资源调用的pingd脚本收到了未知参数;经查,ocf/pacemaker/pingd中不存在multiplier参数:
primitive pingCheck ocf:pacemaker:pingd \
params\
name="default_ping_set"\
host_list="192.168.100.1"\
multiplier="100"\
op start timeout="60s" interval="0s" on-fail="restart" \
op monitor timeout="60s" interval="10s" on-fail="restart" \
op stop timeout="60s" interval="0s" on-fail="ignore"
因此将调用改为ocf:heartbeat:pingd
7.6 Q6
corosync日志中报错:
Jan 21 04:36:02 corosync [TOTEM ] Received message has invalid digest... ignoring.
Jan 21 04:36:02 corosync [TOTEM ] Invalid packet data
说明网络中存在使用相同多播地址的其他集群,更改本集群的多播地址即可。
八、参考资源
脚本:
https://github.com/ClusterLabs/resource-agents/blob/master/heartbeat/pgsql
脚本使用说明:
https://github.com/t-matsuo/resource-agents/wiki/Resource-Agent-for-Postgresql-9.1-streaming-replication
crm_resource命令:
http://www.novell.com/zh-cn/documentation/sle_ha/book_sleha/data/man_crmresource.html
crm_failcount命令:
http://www.novell.com/zh-cn/documentation/sle_ha/book_sleha/data/man_crmfailcount.html