从逻辑角度来看,这两个XLOG位置实际上对应的是同一个位置(同一个检查点)。所以在做数据恢复时,先找到检查点记录的XLOG物理位置,然后根据检查点结束时写入的这条XLOG记录中保存的信息(redo指针)找到逻辑位置,再从逻辑位置开始读取XLOG并进行xlog replay恢复;至少要恢复到XLOG物理位置,才能确保数据库的一致性和完整性。





checkpointer process 介绍,挑选了一些关键步骤进行讲解:



365         /* 366          * Process any requests or signals received recently. 367          */ 368         AbsorbFsyncRequests();......
388         if (checkpoint_requested) 389         { 390             checkpoint_requested = false; 391             do_checkpoint = true; 392             BgWriterStats.m_requested_checkpoints++; 393         }


407         /* 408          * Force a checkpoint if too much time has elapsed since the last one. 409          * Note that we count a timed checkpoint in stats only when this 410          * occurs without an external request, but we set the CAUSE_TIME flag 411          * bit even if there is also an external request. 412          */ 413         now = (pg_time_t) time(NULL); 414         elapsed_secs = now - last_checkpoint_time; 415         if (elapsed_secs >= CheckPointTimeout) 416         { 417             if (!do_checkpoint) 418                 BgWriterStats.m_timed_checkpoints++; 419             do_checkpoint = true; 420             flags |= CHECKPOINT_CAUSE_TIME; 421         }......

进入检查点处理:先记录检查点开始前的XLOG位置(ckpt_start_recptr,仅用于检查点调度判断,并不是逻辑位置),然后调用CreateCheckPoint创建检查点;检查点的逻辑位置(即redo指针,开始位置的XLOG OFFSET)是在CreateCheckPoint内部确定的。

423         /* 424          * Do a checkpoint if requested. 425          */ 426         if (do_checkpoint) 427         { 428             bool        ckpt_performed = false; 429             bool        do_restartpoint; 430  431             /* use volatile pointer to prevent code rearrangement */ 432             volatile CheckpointerShmemStruct *cps = CheckpointerShmem; 433  434             /* 435              * Check if we should perform a checkpoint or a restartpoint. As a 436              * side-effect, RecoveryInProgress() initializes TimeLineID if 437              * it's not set yet. 438              */ 439             do_restartpoint = RecoveryInProgress(); 440  441             /* 442              * Atomically fetch the request flags to figure out what kind of a 443              * checkpoint we should perform, and increase the started-counter 444              * to acknowledge that we've started a new checkpoint. 445              */ 446             SpinLockAcquire(&cps->ckpt_lck); 447             flags |= cps->ckpt_flags; 448             cps->ckpt_flags = 0; 449             cps->ckpt_started++; 450             SpinLockRelease(&cps->ckpt_lck); 451  452             /* 453              * The end-of-recovery checkpoint is a real checkpoint that's 454              * performed while we're still in recovery. 455              */ 456             if (flags & CHECKPOINT_END_OF_RECOVERY) 457                 do_restartpoint = false; 458  459             /* 460              * We will warn if (a) too soon since last checkpoint (whatever 461              * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag 462              * since the last checkpoint start.  Note in particular that this 463              * implementation will not generate warnings caused by 464              * CheckPointTimeout < CheckPointWarning. 
465              */ 466             if (!do_restartpoint && 467                 (flags & CHECKPOINT_CAUSE_XLOG) && 468                 elapsed_secs < CheckPointWarning) 469                 ereport(LOG, 470                         (errmsg_plural("checkpoints are occurring too frequently (%d second apart)", 471                 "checkpoints are occurring too frequently (%d seconds apart)", 472                                        elapsed_secs, 473                                        elapsed_secs), 474                          errhint("Consider increasing the configuration parameter \"max_wal_size\"."))); 475  476             /* 477              * Initialize checkpointer-private variables used during 478              * checkpoint 479              */ 480             ckpt_active = true; 481             if (!do_restartpoint) 482                 ckpt_start_recptr = GetInsertRecPtr();     // 记录检查点开始前的XLOG位置,用于检查点调度判断                                                                                             //  不要和逻辑位置混淆,这还不是。 483             ckpt_start_time = now; 484             ckpt_cached_elapsed = 0; 485  486             /* 487              * Do the checkpoint. 488              */ 489             if (!do_restartpoint) 490             { 491                 CreateCheckPoint(flags);    // 创建检查点 492                 ckpt_performed = true; 493             } 494             else 495                 ckpt_performed = CreateRestartPoint(flags); 496  497             /* 498              * After any checkpoint, close all smgr files.  This is so we 499              * won't hang onto smgr references to deleted files indefinitely. 500              */ 501             smgrcloseall(); 502  503             /* 504              * Indicate checkpoint completion to any waiting backends. 
505              */ 506             SpinLockAcquire(&cps->ckpt_lck); 507             cps->ckpt_done = cps->ckpt_started; 508             SpinLockRelease(&cps->ckpt_lck); 509  510             if (ckpt_performed) 511             { 512                 /* 513                  * Note we record the checkpoint start time not end time as 514                  * last_checkpoint_time.  This is so that time-driven 515                  * checkpoints happen at a predictable spacing. 516                  */ 517                 last_checkpoint_time = now; 518             } 519             else 520             { 521                 /* 522                  * We were not able to perform the restartpoint (checkpoints 523                  * throw an ERROR in case of error).  Most likely because we 524                  * have not received any new checkpoint WAL records since the 525                  * last restartpoint. Try again in 15 s. 526                  */ 527                 last_checkpoint_time = now - CheckPointTimeout + 15; 528             } 529  530             ckpt_active = false; 531         }

记录检查点开始前的XLOG位置, 用于检查点调度,和逻辑位置无关。


/*
 * GetInsertRecPtr -- Returns the current insert position.
 *
 * NOTE: The value *actually* returned is the position of the last full
 * xlog page. It lags behind the real insert position by at most 1 page.
 * For that, we don't need to scan through WAL insertion locks, and an
 * approximation is enough for the current usage of this function.
 *
 * In this file the checkpointer uses the result only for checkpoint
 * scheduling (ckpt_start_recptr); it is not the checkpoint's REDO pointer.
 */
XLogRecPtr
GetInsertRecPtr(void)
{
    /* use volatile pointer to prevent code rearrangement */
    volatile XLogCtlData *xlogctl = XLogCtl;
    XLogRecPtr  recptr;

    /*
     * Read (not write) the shared write-request pointer while holding
     * info_lck, then hand the copy back to the caller.
     */
    SpinLockAcquire(&xlogctl->info_lck);
    recptr = xlogctl->LogwrtRqst.Write;
    SpinLockRelease(&xlogctl->info_lck);

    return recptr;
}



/* * IsCheckpointOnSchedule -- are we on schedule to finish this checkpoint *               in time? * * Compares the current progress against the time/segments elapsed since last * checkpoint, and returns true if the progress we've made this far is greater * than the elapsed time/segments. */static boolIsCheckpointOnSchedule(double progress){        XLogRecPtr      recptr;        struct timeval now;        double          elapsed_xlogs,                                elapsed_time;
/* Scale progress according to checkpoint_completion_target. */        progress *= CheckPointCompletionTarget;   // checkpoint_completion_target 参数控制系数,所以系数越大,progress越大。
/*         * Check against the cached value first. Only do the more expensive         * calculations once we reach the target previously calculated. Since         * neither time or WAL insert pointer moves backwards, a freshly         * calculated value can only be greater than or equal to the cached value.         */        if (progress < ckpt_cached_elapsed)                return false;  // 返回false,checkpointer不休息
/*         * Check progress against WAL segments written and checkpoint_segments.         *         * We compare the current WAL insert location against the location         * computed before calling CreateCheckPoint. The code in XLogInsert that         * actually triggers a checkpoint when checkpoint_segments is exceeded         * compares against RedoRecptr, so this is not completely accurate.ca         * However, it's good enough for our purposes, we're only calculating an         * estimate anyway.         */        if (!RecoveryInProgress())        {                recptr = GetInsertRecPtr();                elapsed_xlogs = (((double) (recptr - ckpt_start_recptr)) / XLogSegSize) / CheckPointSegments;                //  CheckPointSegments由参数checkpoint_segments控制.                //  checkpoint_completion_target 是0-1的范围                //  checkpoint_segments是触发检查点的XLOG个数,                //  假设checkpoint_completion_target = 0.1, progress传入参数=1, 那么                //  checkpoint_segments=100, 那么每产生 0.1×100=10个XLOG文件后, checkpointer要休息一下,以免对性能造成太大影响                //   checkpointer休息多久由CheckpointWriteDelay函数来控制。
if (progress < elapsed_xlogs)  // 未达到休息点                {                        ckpt_cached_elapsed = elapsed_xlogs;                        return false;  // 返回false,checkpointer不休息                }        }
/*         * Check progress against time elapsed and checkpoint_timeout.         */        gettimeofday(&now, NULL);        elapsed_time = ((double) ((pg_time_t) now.tv_sec - ckpt_start_time) +                                        now.tv_usec / 1000000.0) / CheckPointTimeout;  //  另一个判断依据是检查点耗时和checkpoint_timeout参数。
if (progress < elapsed_time)        {                ckpt_cached_elapsed = elapsed_time;                return false;        }
/* It looks like we're on schedule. */        return true;}


/* * CheckpointWriteDelay -- control rate of checkpoint * * This function is called after each page write performed by BufferSync(). * It is responsible for throttling BufferSync()'s write rate to hit * checkpoint_completion_target. * * The checkpoint request flags should be passed in; currently the only one * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes. * * 'progress' is an estimate of how much of the work has been done, as a * fraction between 0.0 meaning none, and 1.0 meaning all done. */voidCheckpointWriteDelay(int flags, double progress){        static int      absorb_counter = WRITES_PER_ABSORB;
/* Do nothing if checkpoint is being executed by non-checkpointer process */        if (!AmCheckpointerProcess())                return;
/*         * Perform the usual duties and take a nap, unless we're behind schedule,         * in which case we just try to catch up as quickly as possible.         */        if (!(flags & CHECKPOINT_IMMEDIATE) &&                !shutdown_requested &&                !ImmediateCheckpointRequested() &&                IsCheckpointOnSchedule(progress))   //  IsCheckpointOnSchedule 即判断是否达到调度位置        {                if (got_SIGHUP)                {                        got_SIGHUP = false;                        ProcessConfigFile(PGC_SIGHUP);                        /* update shmem copies of config variables */                        UpdateSharedMemoryConfig();                }                AbsorbFsyncRequests();                absorb_counter = WRITES_PER_ABSORB;
/*                 * Report interim activity statistics to the stats collector.                 */                pgstat_send_bgwriter();
/*                 * This sleep used to be connected to bgwriter_delay, typically 200ms.                 * That resulted in more frequent wakeups if not much work to do.                 * Checkpointer and bgwriter are no longer related so take the Big                 * Sleep.                 */                pg_usleep(100000L);  // 休息100000微秒即100毫秒,虽然checkpointer休息了,但是bgwriter同样会在一定的时间后被唤醒,由bgwriter_delay控制。        }        else if (--absorb_counter <= 0)        {                /*                 * Absorb pending fsync requests after each WRITES_PER_ABSORB write                 * operations even when we don't sleep, to prevent overflow of the                 * fsync request queue.                 */                AbsorbFsyncRequests();                absorb_counter = WRITES_PER_ABSORB;        }}






0.05<0.1, 返回false, 不休息。什么情况能休息? 当xlog经历个数比值小于等于0.05时才能休息,也就是发生在XLOG 5个或以内时。

如果调大调度系数到1,那么progress=(100/1000)*1=0.1,当xlog经历个数比值小于等于0.1时才能休息,也就是发生在XLOG 10个或以内时。






/* * Perform a checkpoint --- either during shutdown, or on-the-fly * * flags is a bitwise OR of the following: *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery. *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, *              ignoring checkpoint_completion_target parameter. *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or *              CHECKPOINT_END_OF_RECOVERY). * * Note: flags contains other bits, of interest here only for logging purposes. * In particular note that this routine is synchronous and does not pay * attention to CHECKPOINT_WAIT. * * If !shutdown then we are writing an online checkpoint. This is a very special * kind of operation and WAL record because the checkpoint action occurs over * a period of time yet logically occurs at just a single LSN. The logical  逻辑位置是检查点开始时的位置。 * position of the WAL record (redo ptr) is the same or earlier than the * physical position. When we replay WAL we locate the checkpoint via its * physical position then read the redo ptr and actually start replay at the * earlier logical position. Note that we don't write *anything* to WAL at  逻辑位置处不写入任何内容;该位置(checkPoint.redo)是在下面的CreateCheckPoint中根据Insert->CurrBytePos计算得到的,而不是由GetInsertRecPtr得到的。 * the logical position, so that location could be any other kind of WAL record. * All of this mechanism allows us to continue working while we checkpoint.   * As a result, timing of actions is critical here and be careful to note that * this function will likely take minutes to execute on a busy system. 
*/voidCreateCheckPoint(int flags){        /* use volatile pointer to prevent code rearrangement */        volatile XLogCtlData *xlogctl = XLogCtl;        bool            shutdown;        CheckPoint      checkPoint;        XLogRecPtr      recptr;        XLogCtlInsert *Insert = &XLogCtl->Insert;        XLogRecData rdata;        uint32          freespace;        XLogSegNo       _logSegNo;        XLogRecPtr      curInsert;        VirtualTransactionId *vxids;        int                     nvxids;......


/*         * Acquire CheckpointLock to ensure only one checkpoint happens at a time.         * (This is just pro forma, since in the present system structure there is         * only one process that is allowed to issue checkpoints at any given         * time.)         */        LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);......


if (shutdown)        {                LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);                ControlFile->state = DB_SHUTDOWNING;                ControlFile->time = (pg_time_t) time(NULL);                UpdateControlFile();                LWLockRelease(ControlFileLock);        }......


/*         * We must block concurrent insertions while examining insert state to         * determine the checkpoint REDO pointer.         */        WALInsertLockAcquireExclusive();        curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);.....



/*         * Compute new REDO record ptr = location of next XLOG record.         *         * NB: this is NOT necessarily where the checkpoint record itself will be,         * since other backends may insert more XLOG records while we're off doing         * the buffer flush work.  Those XLOG records are logically after the         * checkpoint, even though physically before it.  Got that?         */        freespace = INSERT_FREESPACE(curInsert);        if (freespace == 0)        {                if (curInsert % XLogSegSize == 0)                        curInsert += SizeOfXLogLongPHD;                else                        curInsert += SizeOfXLogShortPHD;        }        checkPoint.redo = curInsert;        /*         * Here we update the shared RedoRecPtr for future XLogInsert calls; this         * must be done while holding all the insertion locks.         *         * Note: if we fail to complete the checkpoint, RedoRecPtr will be left         * pointing past where it really needs to point.  This is okay; the only         * consequence is that XLogInsert might back up whole buffers that it         * didn't really need to.  We can't postpone advancing RedoRecPtr because         * XLogInserts that happen while we are dumping buffers must assume that         * their buffer changes are not included in the checkpoint.         */        RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
/*         * Now we can release the WAL insertion locks, allowing other xacts to         * proceed while we are flushing disk buffers.         */



/*         * Get the other info we need for the checkpoint record.         */        LWLockAcquire(XidGenLock, LW_SHARED);        checkPoint.nextXid = ShmemVariableCache->nextXid;        checkPoint.oldestXid = ShmemVariableCache->oldestXid;        checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;        LWLockRelease(XidGenLock);
/* Increase XID epoch if we've wrapped around since last checkpoint */        checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;        if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)                checkPoint.nextXidEpoch++;
LWLockAcquire(OidGenLock, LW_SHARED);        checkPoint.nextOid = ShmemVariableCache->nextOid;        if (!shutdown)                checkPoint.nextOid += ShmemVariableCache->oidCount;        LWLockRelease(OidGenLock);
MultiXactGetCheckptMulti(shutdown,                                                         &checkPoint.nextMulti,                                                         &checkPoint.nextMultiOffset,                                                         &checkPoint.oldestMulti,                                                         &checkPoint.oldestMultiDB);

在checkpoint开始fsync数据之前,必须等待那些已经写入提交XLOG记录、但仍处于提交临界区的事务把clog更新完成,保证提交记录的XLOG和对应的clog更新落在检查点的同一侧。

/*         * In some cases there are groups of actions that must all occur on one         * side or the other of a checkpoint record. Before flushing the         * checkpoint record we must explicitly wait for any backend currently         * performing those groups of actions.         *         * One example is end of transaction, so we must wait for any transactions         * that are currently in commit critical sections.  If an xact inserted         * its commit record into XLOG just before the REDO point, then a crash         * restart from the REDO point would not replay that record, which means         * that our flushing had better include the xact's update of pg_clog.  So         * we wait till he's out of his commit critical section before proceeding.         * See notes in RecordTransactionCommit().         *         * Because we've already released the insertion locks, this test is a bit         * fuzzy: it is possible that we will wait for xacts we didn't really need         * to wait for.  But the delay should be short and it seems better to make         * checkpoint take a bit longer than to hold off insertions longer than         * necessary. (In fact, the whole reason we have this issue is that xact.c    //  根源在这里,因为提交写clog的XLOG和写CLOG分两部分完成,分别由2个锁来保护,但实际上这两部分信息应该在检查点的同一边,要么检查点前,要么检查点后。//  所以这里才需要等待,就是等它们到同一面,即那些在检查点前写XLOG的但是没有更新CLOG的,必须等它们的CLOG完成。// 为什么呢?因为RECOVERY时检查点之前的XLOG是不会去replay的,如果clog的xlog在这之前,但是CLOG未写成功,那么在恢复时又不会去replay这些xlog,将导致这些CLOG缺失。         * does commit record XLOG insertion and clog update as two separate steps         * protected by different locks, but again that seems best on grounds of         * minimizing lock contention.)         *         * A transaction that has not yet set delayChkpt when we look cannot be at         * risk, since he's not inserted his commit record yet; and one that's         * already cleared it is not at risk either, since he's done fixing clog         * and we will correctly flush the update below.  
So we cannot miss any         * xacts we need to wait for.         */        vxids = GetVirtualXIDsDelayingChkpt(&nvxids);        if (nvxids > 0)        {                do                {                        pg_usleep(10000L);      /* wait for 10 msec */                } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));        }        pfree(vxids);

CheckPointGuts(checkPoint.redo, flags);



/*         * Now insert the checkpoint record into XLOG.         */        rdata.data = (char *) (&checkPoint);        rdata.len = sizeof(checkPoint);        rdata.buffer = InvalidBuffer;        rdata.next = NULL;
recptr = XLogInsert(RM_XLOG_ID,                                                shutdown ? XLOG_CHECKPOINT_SHUTDOWN :                                                XLOG_CHECKPOINT_ONLINE,                                                &rdata);


/*         * Select point at which we can truncate the log, which we base on the         * prior checkpoint's earliest info.         */        XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
/*         * Update the control file.         */        LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);        if (shutdown)                ControlFile->state = DB_SHUTDOWNED;        ControlFile->prevCheckPoint = ControlFile->checkPoint;        ControlFile->checkPoint = ProcLastRecPtr;    //  包含检查点的 xlog 结束位置,  ProcLastRecPtr是XLogInsert中更新的一个全局变量,表示XLOG位置。        ControlFile->checkPointCopy = checkPoint;    // 包含检查点的 xlog 逻辑位置,在前面记录了,请看前面的代码        ControlFile->time = (pg_time_t) time(NULL);        /* crash recovery should always recover to the end of WAL */        ControlFile->minRecoveryPoint = InvalidXLogRecPtr;        ControlFile->minRecoveryPointTLI = 0;
/*         * Persist unloggedLSN value. It's reset on crash recovery, so this goes         * unused on non-shutdown checkpoints, but seems useful to store it always         * for debugging purposes.         */        SpinLockAcquire(&XLogCtl->ulsn_lck);        ControlFile->unloggedLSN = XLogCtl->unloggedLSN;        SpinLockRelease(&XLogCtl->ulsn_lck);
UpdateControlFile();        LWLockRelease(ControlFileLock);




/*
 * Flush all data in shared memory to disk, and fsync
 *
 * This is the common code shared between regular checkpoints and
 * recovery restartpoints.
 *
 * checkPointRedo is forwarded to CheckPointTwoPhase(); flags is forwarded
 * to CheckPointBuffers().  The calls run in the fixed order shown; the
 * trailing notes give the source file of each callee, as recorded in the
 * original annotations.
 */
static void
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
{
    CheckPointCLOG();               /* src/backend/access/transam/clog.c */
    CheckPointSUBTRANS();           /* src/backend/access/transam/subtrans.c */
    CheckPointMultiXact();          /* src/backend/access/transam/multixact.c */
    CheckPointPredicate();          /* src/backend/storage/lmgr/predicate.c */
    CheckPointRelationMap();        /* src/backend/utils/cache/relmapper.c */
    CheckPointReplicationSlots();   /* src/backend/replication/slot.c */
    CheckPointSnapBuild();          /* src/backend/replication/logical/snapbuild.c */
    CheckPointLogicalRewriteHeap(); /* src/backend/access/heap/rewriteheap.c */

    /* performs all required fsyncs -- src/backend/storage/buffer/bufmgr.c */
    CheckPointBuffers(flags);

    /* We deliberately delay 2PC checkpointing as long as possible */
    CheckPointTwoPhase(checkPointRedo); /* src/backend/access/transam/twophase.c */
}







/*
 * Check whether we've consumed enough xlog space that a checkpoint is needed.
 *
 * new_segno indicates a log file that has just been filled up (or read
 * during recovery). We measure the distance from RedoRecPtr to new_segno
 * and see if that exceeds CheckPointSegments.
 *
 * Returns true when new_segno is at least CheckPointSegments - 1 segments
 * past the segment derived from RedoRecPtr, false otherwise.
 *
 * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
 */
static bool
XLogCheckpointNeeded(XLogSegNo new_segno)
{
    XLogSegNo   old_segno;

    /* Convert the REDO pointer into its segment number. */
    XLByteToSeg(RedoRecPtr, old_segno);

    /*
     * CheckPointSegments is driven by the checkpoint_segments parameter
     * (per the original annotation).
     */
    if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
        return true;
    return false;
}


/* * Write and/or fsync the log at least as far as WriteRqst indicates. * * If flexible == TRUE, we don't have to write as far as WriteRqst, but * may stop at any convenient boundary (such as a cache or logfile boundary). * This option allows us to avoid uselessly issuing multiple writes when a * single one would do. * * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst) * must be called before grabbing the lock, to make sure the data is ready to * write. */static voidXLogWrite(XLogwrtRqst WriteRqst, bool flexible){......                                /*                                 * Request a checkpoint if we've consumed too much xlog since                                 * the last one.  For speed, we first check using the local                                 * copy of RedoRecPtr, which might be out of date; if it looks                                 * like a checkpoint is needed, forcibly update RedoRecPtr and                                 * recheck.                                 */                                if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))                                {                                        (void) GetRedoRecPtr();                                        if (XLogCheckpointNeeded(openLogSegNo))                                                RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);                                }......


