pg初始化完shmem,给其加上索引"ShmemIndex"后,接着就在shmem里初始化xlog。
2初始化xlog相关结构
话说main()->…->PostmasterMain()->…->reset_shared()->CreateSharedMemoryAndSemaphores()->…->XLOGShmemInit(),初始化控制文件data/global/pg_control相关数据结构及事务日志xlog相关数据结构,相关结构定义在下面。
/*
 * In-memory layout of the control file (data/global/pg_control).
 *
 * Member order is significant: the version fields must stay at a fixed
 * offset, and crc must remain the last member because it covers everything
 * that precedes it.
 */
typedef struct ControlFileData
{
/*
 * Unique system identifier --- to ensure we match up xlog files with the
 * installation that produced them.
 */
uint64 system_identifier;
/*
 * Version identifier information. Keep these fields at the same offset,
 * especially pg_control_version; they won't be real useful if they move
 * around. (For historical reasons they must be 8 bytes into the file
 * rather than immediately at the front.)
 *
 * pg_control_version identifies the format of pg_control itself.
 * catalog_version_no identifies the format of the system catalogs.
 *
 * There are additional version identifiers in individual files; for
 * example, WAL logs contain per-page magic numbers that can serve as
 * version cues for the WAL log.
 */
uint32 pg_control_version; /* PG_CONTROL_VERSION */
uint32 catalog_version_no; /* see catversion.h */
/*
 * System status data
 */
DBState state; /* see enum above */
pg_time_t time; /* time stamp of last pg_control update */
XLogRecPtr checkPoint; /* last check point record ptr */
XLogRecPtr prevCheckPoint; /* previous check point record ptr */
CheckPoint checkPointCopy; /* copy of last check point record */
/*
 * These two values determine the minimum point we must recover up to
 * before starting up:
 *
 * minRecoveryPoint is updated to the latest replayed LSN whenever we
 * flush a data change during archive recovery. That guards against
 * starting archive recovery, aborting it, and restarting with an earlier
 * stop location. If we've already flushed data changes from WAL record X
 * to disk, we mustn't start up until we reach X again. Zero when not
 * doing archive recovery.
 *
 * backupStartPoint is the redo pointer of the backup start checkpoint, if
 * we are recovering from an online backup and haven't reached the end of
 * backup yet. It is reset to zero when the end of backup is reached, and
 * we mustn't start up before that. A boolean would suffice otherwise, but
 * we use the redo pointer as a cross-check when we see an end-of-backup
 * record, to make sure the end-of-backup record corresponds to the base
 * backup we're recovering from.
 */
XLogRecPtr minRecoveryPoint;
XLogRecPtr backupStartPoint;
/*
 * Parameter settings that determine if the WAL can be used for archival
 * or hot standby.
 */
int wal_level;
int MaxConnections;
int max_prepared_xacts;
int max_locks_per_xact;
/*
 * This data is used to check for hardware-architecture compatibility of
 * the database and the backend executable. We need not check endianness
 * explicitly, since the pg_control version will surely look wrong to a
 * machine of different endianness, but we do need to worry about MAXALIGN
 * and floating-point format. (Note: storage layout nominally also
 * depends on SHORTALIGN and INTALIGN, but in practice these are the same
 * on all architectures of interest.)
 *
 * Testing just one double value is not a very bulletproof test for
 * floating-point compatibility, but it will catch most cases.
 */
uint32 maxAlign; /* alignment requirement for tuples */
double floatFormat; /* constant 1234567.0 */
#define FLOATFORMAT_VALUE 1234567.0
/*
 * This data is used to make sure that configuration of this database is
 * compatible with the backend executable.
 */
uint32 blcksz; /* data block size for this DB */
uint32 relseg_size; /* blocks per segment of large relation */
uint32 xlog_blcksz; /* block size within WAL files */
uint32 xlog_seg_size; /* size of each WAL segment */
uint32 nameDataLen; /* catalog name field width */
uint32 indexMaxKeys; /* max number of columns in an index */
uint32 toast_max_chunk_size; /* chunk size in TOAST tables */
/* flag indicating internal format of timestamp, interval, time */
bool enableIntTimes; /* int64 storage enabled? */
/* flags indicating pass-by-value status of various types */
bool float4ByVal; /* float4 pass-by-value? */
bool float8ByVal; /* float8, int8, etc pass-by-value? */
/* CRC of all above ... MUST BE LAST! */
pg_crc32 crc;
} ControlFileData;
@H_502_17@/*
* Bodyof CheckPoint XLOG records. This isdeclared here because we keep
* acopy of the latest one in pg_control for possible disaster recovery.
*Changing this struct requires a PG_CONTROL_VERSION bump.
*/
typedef struct CheckPoint
{
XLogRecPtr redo; /*next RecPtr available when we began to
* create CheckPoint (i.e. REDO start point) */
TimeLineID ThisTimeLineID; /* current TLI */
uint32 nextXidEpoch; /* higher-order bits of nextXid */
TransactionIdnextXid; /* next free XID */
Oid nextOid; /* next free OID */
MultiXactIdnextMulti; /* next freeMultiXactId */
MultiXactOffsetnextMultiOffset; /* next free MultiXactoffset */
TransactionIdoldestXid; /* cluster-wide minimumdatfrozenxid */
Oid oldestXidDB; /* database with minimum datfrozenxid */
pg_time_t time; /*time stamp of checkpoint */
/*
* Oldest XID still running. This is onlyneeded to initialize hot standby
* mode from an online checkpoint,so we onlybother calculating this for
* online checkpoints and only when wal_levelis hot_standby. Otherwise
* it's set to InvalidTransactionId.
*/
TransactionIdoldestActiveXid;
} CheckPoint;
@H_502_17@/* @H_502_17@* Total shared-memorystate for XLOG. @H_502_17@*/ @H_502_17@typedef struct XLogCtlData @H_502_17@{ @H_502_17@ /* Protected byWALInsertLock: */ @H_502_17@ XLogCtlInsertInsert; @H_502_17@ @H_502_17@ /* Protected byinfo_lck: */ @H_502_17@ XLogwrtRqstLogwrtRqst; @H_502_17@ XLogwrtResultLogwrtResult; @H_502_17@ uint32 ckptXidEpoch; /* nextXID & epoch of latest checkpoint */ @H_502_17@ TransactionIdckptXid; @H_502_17@ XLogRecPtr asyncXactLSN; /*LSN of newest async commit/abort */ @H_502_17@ uint32 lastRemovedLog; /* latest removed/recycledXLOG segment */ @H_502_17@ uint32 lastRemovedSeg; @H_502_17@ @H_502_17@ /* Protected byWALWriteLock: */ @H_502_17@ XLogCtlWrite Write; @H_502_17@ @H_502_17@ /* @H_502_17@ * These values do not change after startup,although the pointed-to pages @H_502_17@ * and xlblocks values certainly do. Permission to read/write the pages @H_502_17@ * and xlblocks values depends on WALInsertLockand WALWriteLock. @H_502_17@ */ @H_502_17@ char *pages; /* buffers forunwritten XLOG pages */ @H_502_17@ XLogRecPtr*xlblocks; /* 1st byte ptr-s +XLOG_BLCKSZ */ @H_502_17@ int XLogCacheBlck; /* highest allocated xlog buffer index */ @H_502_17@ TimeLineID ThisTimeLineID; @H_502_17@ TimeLineID RecoveryTargetTLI; @H_502_17@ @H_502_17@ /* @H_502_17@ * archiveCleanupCommand is read fromrecovery.conf but needs to be in @H_502_17@ * shared memory so that the bgwriter processcan access it. @H_502_17@ */ @H_502_17@ char archiveCleanupCommand[MAXPGPATH]; @H_502_17@ @H_502_17@ /* @H_502_17@ * SharedRecoveryInProgress indicates if we'restill in crash or archive @H_502_17@ * recovery.Protected by info_lck. @H_502_17@ */ @H_502_17@ bool SharedRecoveryInProgress; @H_502_17@ @H_502_17@ /* @H_502_17@ * SharedHotStandbyActive indicates if we'restill in crash or archive @H_502_17@ * recovery.Protected by info_lck. 
@H_502_17@ */ @H_502_17@ bool SharedHotStandbyActive; @H_502_17@ @H_502_17@ /* @H_502_17@ * recoveryWakeupLatch is used to wake up thestartup process to continue @H_502_17@ * WAL replay,if it is waiting for WAL toarrive or failover trigger file @H_502_17@ * to appear. @H_502_17@ */ @H_502_17@ Latch recoveryWakeupLatch; @H_502_17@ @H_502_17@ /* @H_502_17@ * During recovery,we keep a copy of thelatest checkpoint record here. @H_502_17@ * Used by the background writer when it wantsto create a restartpoint. @H_502_17@ * @H_502_17@ * Protected by info_lck. @H_502_17@ */ @H_502_17@ XLogRecPtr lastCheckPointRecPtr; @H_502_17@ CheckPoint lastCheckPoint; @H_502_17@ @H_502_17@ /* end+1 of the lastrecord replayed (or being replayed) */ @H_502_17@ XLogRecPtr replayEndRecPtr; @H_502_17@ /* end+1 of the lastrecord replayed */ @H_502_17@ XLogRecPtr recoveryLastRecPtr; @H_502_17@ /* timestamp of lastCOMMIT/ABORT record replayed (or being replayed) */ @H_502_17@ TimestampTzrecoveryLastXTime; @H_502_17@ /* Are we requestedto pause recovery? */ @H_502_17@ bool recoveryPause; @H_502_17@ @H_502_17@ slock_t info_lck; /*locks shared variables shown above */ @H_502_17@} XLogCtlData; @H_502_17@ @H_502_17@/* @H_502_17@* Shared state datafor XLogInsert. @H_502_17@*/ @H_502_17@typedef struct XLogCtlInsert @H_502_17@{ @H_502_17@ XLogwrtResultLogwrtResult; /* a recent value of LogwrtResult */ @H_502_17@ XLogRecPtr PrevRecord; /*start of prevIoUsly-inserted record */ @H_502_17@ int curridx; /* current block index in cache */ @H_502_17@ XLogPageHeadercurrpage; /* points to header of blockin cache */ @H_502_17@ char *currpos; /* currentinsertion point in cache */ @H_502_17@ XLogRecPtr RedoRecPtr; /*current redo point for insertions */ @H_502_17@ bool forcePageWrites; /* forcing full-page writes for PITR? 
*/ @H_502_17@ @H_502_17@ /* @H_502_17@ * exclusiveBackup is true if a backup startedwith pg_start_backup() is @H_502_17@ * in progress,and nonExclusiveBackups is acounter indicating the number @H_502_17@ * of streaming base backups currently inprogress. forcePageWrites is set @H_502_17@ * to true when either of these is non-zero.lastBackupStart is the latest @H_502_17@ * checkpoint redo location used as a startingpoint for an online backup. @H_502_17@ */ @H_502_17@ bool exclusiveBackup; @H_502_17@ int nonExclusiveBackups; @H_502_17@ XLogRecPtr lastBackupStart; @H_502_17@} XLogCtlInsert;
在XLOGShmemInit()函数里,首先在shmem的哈希表索引"ShmemIndex"上给控制文件pg_control增加一个HashElement和ShmemIndexEnt(entry),在shmem里根据ControlFileData大小调用ShmemAlloc()分配内存空间,使ShmemIndexEnt的成员location指向该空间,size成员记录该空间大小。
XLOGShmemInit()调用ShmemInitStruct(),在其中调用hash_search()在哈希表索引"ShmemIndex"中查找"XLOG Ctl",如果没有,就在"ShmemIndex"中给"XLOG Ctl"分一个HashElement和ShmemIndexEnt(entry),在其中的Entry中写上"XLOG Ctl"。返回ShmemInitStruct(),再调用ShmemAlloc()在共享内存上给"XLOG Ctl"相关结构(见下面“XLog相关结构图”)分配空间,设置entry(在这儿即ShmemIndexEnt类型变量)的成员location指向该空间,size成员记录该空间大小,最后返回XLOGShmemInit(),让XLogCtlData *类型静态全局变量XLogCtl指向在shmem里给"XLOG Ctl"相关结构分配的内存地址,设置其中XLogCtlData结构类型的成员值。初始化完成后数据结构如下图。
初始化完xlog的内存结构图
为了精简上图,把创建shmem的哈希表索引"ShmemIndex"时创建的HCTL结构删掉了,这个结构的作用是记录创建可扩展哈希表的相关信息。增加了左边灰色底的部分,描述共享内存/shmem里各变量物理布局概览,由下往上,由低地址到高地址。其中的"Control File"即ControlFileData,"XLOG Ctl"即xlog的相关结构,二者的结构图在下面分别给出,要不上面的图太大了。
控制文件结构图
上图中ControlFileData结构中的XLogRecPtr和CheckPoint不是指针,因此应该用右边的相应结构图代替,把这两个合进去有点费劲,将就着看吧。
XLog相关结构图