btree 相关注释

main
yezhengmao 2023-07-07 11:23:17 +08:00
parent c08c2c1620
commit d97ef01e50
5 changed files with 60 additions and 91 deletions

View File

@ -1960,11 +1960,7 @@ static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){
} }
/* /*
** Decode the flags byte (the first byte of the header) for a page **
** and initialize fields of the MemPage structure accordingly.
**
** Only the following combinations are supported. Anything different
** indicates a corrupt database files:
** **
** PTF_ZERODATA (0x02, 2) ** PTF_ZERODATA (0x02, 2)
** PTF_LEAFDATA | PTF_INTKEY (0x05, 5) ** PTF_LEAFDATA | PTF_INTKEY (0x05, 5)
@ -2244,8 +2240,7 @@ static void zeroPage(MemPage *pPage, int flags){
/* /*
** Convert a DbPage obtained from the pager into a MemPage used by ** 将从 pager 获取的 DbPage 转换为 Btree 层使用的 MemPage
** the btree layer.
*/ */
static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){ static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage); MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);

View File

@ -9,8 +9,7 @@
** May you share freely, never taking more than you give. ** May you share freely, never taking more than you give.
** **
************************************************************************* *************************************************************************
** This file implements an external (disk-based) database using BTrees. ** 本文件使用 BTrees 实现了一个外部(基于磁盘的)数据库
** For a detailed discussion of BTrees, refer to
** **
** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3: ** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
** "Sorting And Searching", pages 473-480. Addison-Wesley ** "Sorting And Searching", pages 473-480. Addison-Wesley
@ -32,27 +31,17 @@
** Finding a particular key requires reading O(log(M)) pages from the ** Finding a particular key requires reading O(log(M)) pages from the
** disk where M is the number of entries in the tree. ** disk where M is the number of entries in the tree.
** **
** In this implementation, a single file can hold one or more separate ** BTree BTree
** BTrees. Each BTree is identified by the index of its root page. The ** key data payload
** key and data for any entry are combined to form the "payload". A ** payload
** fixed amount of payload can be carried directly on the database ** payload
** page. If the payload is larger than the preset amount then surplus ** payload cell
** bytes are stored on overflow pages. The payload for an entry
** and the preceding pointer are combined to form a "Cell". Each
** page has a small header which contains the Ptr(N) pointer and other
** information such as the size of key and data.
** **
** FORMAT DETAILS ** 1 0 0
** 512 - 65536 2
** btree page / freelist page / overflow page / pointer-map page
** **
** The file is divided into pages. The first page is called page 1, ** 1 btree page 100
** the second is page 2, and so forth. A page number of zero indicates
** "no such page". The page size can be any power of 2 between 512 and 65536.
** Each page can be either a btree page, a freelist page, an overflow
** page, or a pointer-map page.
**
** The first page is always a btree page. The first 100 bytes of the first
** page contain a special header (the "file header") that describes the file.
** The format of the file header is as follows:
** **
** OFFSET SIZE DESCRIPTION ** OFFSET SIZE DESCRIPTION
** 0 16 Header string: "SQLite format 3\000" ** 0 16 Header string: "SQLite format 3\000"
@ -81,31 +70,21 @@
** 92 4 The version-valid-for number ** 92 4 The version-valid-for number
** 96 4 SQLITE_VERSION_NUMBER ** 96 4 SQLITE_VERSION_NUMBER
** **
** All of the integer values are big-endian (most significant byte first). ** file change counter : 数据库被修改时递增,其他进程据此判断是否需要刷新缓存
** **
** The file change counter is incremented when the database is changed ** max embedded payload : 单个 cell 可使用的页面可用空间上限
** This counter allows other processes to know when the file has changed ** 255 - 100%,默认 64 - 25%,即单个 cell 最多占页面的 1/4
** and thus when they need to flush their cache. ** cell 的 payload 超过上限时,超出部分存放到 overflow 页
**
** The max embedded payload fraction is the amount of the total usable
** space in a page that can be consumed by a single cell for standard
** B-tree (non-LEAFDATA) tables. A value of 255 means 100%. The default
** is to limit the maximum cell size so that at least 4 cells will fit
** on one page. Thus the default max embedded payload fraction is 64.
**
** If the payload for a cell is larger than the max payload, then extra
** payload is spilled to overflow pages. Once an overflow page is allocated,
** as many bytes as possible are moved into the overflow pages without letting
** the cell size drop below the min embedded payload fraction.
** **
** The min leaf payload fraction is like the min embedded payload fraction ** The min leaf payload fraction is like the min embedded payload fraction
** except that it applies to leaf nodes in a LEAFDATA tree. The maximum ** except that it applies to leaf nodes in a LEAFDATA tree. The maximum
** payload fraction for a LEAFDATA tree is always 100% (or 255) and it ** payload fraction for a LEAFDATA tree is always 100% (or 255) and it
** not specified in the header. ** not specified in the header.
** **
** Each btree pages is divided into three sections: The header, the ** BTree
** cell pointer array, and the cell content area. Page 1 also has a 100-byte ** The header
** file header that occurs before the page header. ** The cell pointer array
** The cell content area
** **
** |----------------| ** |----------------|
** | file header | 100 bytes. Page 1 only. ** | file header | 100 bytes. Page 1 only.
@ -133,12 +112,11 @@
** 7 1 number of fragmented free bytes ** 7 1 number of fragmented free bytes
** 8 4 Right child (the Ptr(N) value). Omitted on leaves. ** 8 4 Right child (the Ptr(N) value). Omitted on leaves.
** **
** The flags define the format of this btree page. The leaf flag means that ** intkey : key 为整数,存放在 cell header 的 key size 字段中
** this page has no children. The zerodata flag means that this page carries ** zerodata : 该页只有 key 没有 data
** only keys and no data. The intkey flag means that the key is an integer ** leaf : 该页没有子节点
** which is stored in the key size entry of the cell header rather than in
** the payload area.
** **
** cell
** The cell pointer array begins on the first byte after the page header. ** The cell pointer array begins on the first byte after the page header.
** The cell pointer array contains zero or more 2-byte numbers which are ** The cell pointer array contains zero or more 2-byte numbers which are
** offsets from the beginning of the page to the cell content in the cell ** offsets from the beginning of the page to the cell content in the cell
@ -149,23 +127,13 @@
** Cell content is stored at the very end of the page and grows toward the ** Cell content is stored at the very end of the page and grows toward the
** beginning of the page. ** beginning of the page.
** **
** Unused space within the cell content area is collected into a linked list of ** freeblock 4 4 block
** freeblocks. Each freeblock is at least 4 bytes in size. The byte offset ** freeblock
** to the first freeblock is given in the header. Freeblocks occur in
** increasing order. Because a freeblock must be at least 4 bytes in size,
** any group of 3 or fewer unused bytes in the cell content area cannot
** exist on the freeblock chain. A group of 3 or fewer free bytes is called
** a fragment. The total number of bytes in all fragments is recorded.
** in the page header at offset 7.
**
** SIZE DESCRIPTION ** SIZE DESCRIPTION
** 2 Byte offset of the next freeblock ** 2 Byte offset of the next freeblock
** 2 Bytes in this freeblock ** 2 Bytes in this freeblock
** **
** Cells are of variable length. Cells are stored in the cell content area at ** cell cell
** the end of the page. Pointers to the cells are in the cell pointer array
** that immediately follows the page header. Cells is not necessarily
** contiguous or in order, but cell pointers are contiguous and in order.
** **
** Cell content makes use of variable length integers. A variable ** Cell content makes use of variable length integers. A variable
** length integer is 1 to 9 bytes where the lower 7 bits of each ** length integer is 1 to 9 bytes where the lower 7 bits of each
@ -272,35 +240,35 @@ typedef struct CellInfo CellInfo;
*/ */
struct MemPage { struct MemPage {
u8 isInit; /* True if previously initialized. MUST BE FIRST! */ u8 isInit; /* True if previously initialized. MUST BE FIRST! */
u8 intKey; /* True if table b-trees. False for index b-trees */ u8 intKey; /* table - 1 / index - 0 */
u8 intKeyLeaf; /* True if the leaf of an intKey table */ u8 intKeyLeaf; /* True if the leaf of an intKey table */
Pgno pgno; /* Page number for this page */ Pgno pgno; /* 当前页的页号 */
/* Only the first 8 bytes (above) are zeroed by pager.c when a new page /* Only the first 8 bytes (above) are zeroed by pager.c when a new page
** is allocated. All fields that follow must be initialized before use */ ** is allocated. All fields that follow must be initialized before use */
u8 leaf; /* True if a leaf page */ u8 leaf; /* 是否为叶子节点页 */
u8 hdrOffset; /* 100 for page 1. 0 otherwise */ u8 hdrOffset; /* 第 1 页为 100(文件头长度),其余页为 0 */
u8 childPtrSize; /* 0 if leaf==1. 4 if leaf==0 */ u8 childPtrSize; /* 叶子节点(无子节点)为 0,否则为 4 */
u8 max1bytePayload; /* min(maxLocal,127) */ u8 max1bytePayload; /* min(maxLocal,127) */
u8 nOverflow; /* Number of overflow cell bodies in aCell[] */ u8 nOverflow; /* Number of overflow cell bodies in aCell[] */
u16 maxLocal; /* Copy of BtShared.maxLocal or BtShared.maxLeaf */ u16 maxLocal; /* Copy of BtShared.maxLocal or BtShared.maxLeaf */
u16 minLocal; /* Copy of BtShared.minLocal or BtShared.minLeaf */ u16 minLocal; /* Copy of BtShared.minLocal or BtShared.minLeaf */
u16 cellOffset; /* Index in aData of first cell pointer */ u16 cellOffset; /* 第一个 cell 指针的位置 */
int nFree; /* Number of free bytes on the page. -1 for unknown */ int nFree; /* Number of free bytes on the page. -1 for unknown */
u16 nCell; /* Number of cells on this page, local and ovfl */ u16 nCell; /* 当前页面 cell 数量 */
u16 maskPage; /* Mask for page offset */ u16 maskPage; /* Mask for page offset */
u16 aiOvfl[4]; /* Insert the i-th overflow cell before the aiOvfl-th u16 aiOvfl[4]; /* Insert the i-th overflow cell before the aiOvfl-th
** non-overflow cell */ ** non-overflow cell */
u8 *apOvfl[4]; /* Pointers to the body of overflow cells */ u8 *apOvfl[4]; /* Pointers to the body of overflow cells */
BtShared *pBt; /* Pointer to BtShared that this page is part of */ BtShared *pBt; /* Pointer to BtShared that this page is part of */
u8 *aData; /* Pointer to disk image of the page data */ u8 *aData; /* 指向页面数据的磁盘映像 */
u8 *aDataEnd; /* One byte past the end of the entire page - not just u8 *aDataEnd; /* One byte past the end of the entire page - not just
** the usable space, the entire page. Used to prevent ** the usable space, the entire page. Used to prevent
** corruption-induced buffer overflow. */ ** corruption-induced buffer overflow. */
u8 *aCellIdx; /* The cell index area */ u8 *aCellIdx; /* cell 指针的位置 */
u8 *aDataOfst; /* Same as aData for leaves. aData+4 for interior */ u8 *aDataOfst; /* Same as aData for leaves. aData+4 for interior */
DbPage *pDbPage; /* Pager page handle */ DbPage *pDbPage; /* Pager page handle */
u16 (*xCellSize)(MemPage*,u8*); /* cellSizePtr method */ u16 (*xCellSize)(MemPage*,u8*); /* cellSizePtr method */
void (*xParseCell)(MemPage*,u8*,CellInfo*); /* btreeParseCell method */ void (*xParseCell)(MemPage*,u8*,CellInfo*); /* cell解析函数 */
}; };
/* /*
@ -441,7 +409,7 @@ struct BtShared {
u16 minLocal; /* Minimum local payload in non-LEAFDATA tables */ u16 minLocal; /* Minimum local payload in non-LEAFDATA tables */
u16 maxLeaf; /* Maximum local payload in a LEAFDATA table */ u16 maxLeaf; /* Maximum local payload in a LEAFDATA table */
u16 minLeaf; /* Minimum local payload in a LEAFDATA table */ u16 minLeaf; /* Minimum local payload in a LEAFDATA table */
u32 pageSize; /* Total number of bytes on a page */ u32 pageSize; /* 页面大小 */
u32 usableSize; /* Number of usable bytes on each page */ u32 usableSize; /* Number of usable bytes on each page */
int nTransaction; /* Number of open transactions (read + write) */ int nTransaction; /* Number of open transactions (read + write) */
u32 nPage; /* Number of pages in the database */ u32 nPage; /* Number of pages in the database */
@ -479,10 +447,10 @@ struct BtShared {
*/ */
struct CellInfo { struct CellInfo {
i64 nKey; /* The key for INTKEY tables, or nPayload otherwise */ i64 nKey; /* The key for INTKEY tables, or nPayload otherwise */
u8 *pPayload; /* Pointer to the start of payload */ u8 *pPayload; /* payload 的指针位置 */
u32 nPayload; /* Bytes of payload */ u32 nPayload; /* payload 的字节数 */
u16 nLocal; /* Amount of payload held locally, not on overflow */ u16 nLocal; /* Amount of payload held locally, not on overflow */
u16 nSize; /* Size of the cell content on the main b-tree page */ u16 nSize; /* cell 大小 */
}; };
/* /*
@ -546,7 +514,7 @@ struct BtCursor {
u16 ix; /* Current index for apPage[iPage] */ u16 ix; /* Current index for apPage[iPage] */
u16 aiIdx[BTCURSOR_MAX_DEPTH-1]; /* Current index in apPage[i] */ u16 aiIdx[BTCURSOR_MAX_DEPTH-1]; /* Current index in apPage[i] */
struct KeyInfo *pKeyInfo; /* Arg passed to comparison function */ struct KeyInfo *pKeyInfo; /* Arg passed to comparison function */
MemPage *pPage; /* Current page */ MemPage *pPage; /* 游标所在内存页面 */
MemPage *apPage[BTCURSOR_MAX_DEPTH-1]; /* Stack of parents of current page */ MemPage *apPage[BTCURSOR_MAX_DEPTH-1]; /* Stack of parents of current page */
}; };

View File

@ -171,6 +171,8 @@ int sqlite3PagerTrace=1; /* True to enable tracing */
** **
** OPEN: ** OPEN:
** **
**
**
** The pager starts up in this state. Nothing is guaranteed in this ** The pager starts up in this state. Nothing is guaranteed in this
** state - the file may or may not be locked and the database size is ** state - the file may or may not be locked and the database size is
** unknown. The database may not be read or written. ** unknown. The database may not be read or written.
@ -619,7 +621,7 @@ struct PagerSavepoint {
struct Pager { struct Pager {
sqlite3_vfs *pVfs; /* OS functions to use for IO */ sqlite3_vfs *pVfs; /* OS functions to use for IO */
u8 exclusiveMode; /* Boolean. True if locking_mode==EXCLUSIVE */ u8 exclusiveMode; /* Boolean. True if locking_mode==EXCLUSIVE */
u8 journalMode; /* One of the PAGER_JOURNALMODE_* values */ u8 journalMode; /* 日志模式 */
u8 useJournal; /* Use a rollback journal on this file */ u8 useJournal; /* Use a rollback journal on this file */
u8 noSync; /* Do not sync the journal if true */ u8 noSync; /* Do not sync the journal if true */
u8 fullSync; /* Do extra syncs of the journal for robustness */ u8 fullSync; /* Do extra syncs of the journal for robustness */
@ -648,8 +650,8 @@ struct Pager {
u8 doNotSpill; /* Do not spill the cache when non-zero */ u8 doNotSpill; /* Do not spill the cache when non-zero */
u8 subjInMemory; /* True to use in-memory sub-journals */ u8 subjInMemory; /* True to use in-memory sub-journals */
u8 bUseFetch; /* True to use xFetch() */ u8 bUseFetch; /* True to use xFetch() */
u8 hasHeldSharedLock; /* True if a shared lock has ever been held */ u8 hasHeldSharedLock; /* 是否曾经持有过 shared lock */
Pgno dbSize; /* Number of pages in the database */ Pgno dbSize; /* 数据库中页面总数 */
Pgno dbOrigSize; /* dbSize before the current transaction */ Pgno dbOrigSize; /* dbSize before the current transaction */
Pgno dbFileSize; /* Number of pages in the database file */ Pgno dbFileSize; /* Number of pages in the database file */
Pgno dbHintSize; /* Value passed to FCNTL_SIZE_HINT call */ Pgno dbHintSize; /* Value passed to FCNTL_SIZE_HINT call */
@ -3185,8 +3187,7 @@ static int pagerWalFrames(
} }
/* /*
** Begin a read transaction on the WAL. ** 在 WAL 上开启一个读事务
**
** This routine used to be called "pagerOpenSnapshot()" because it essentially ** This routine used to be called "pagerOpenSnapshot()" because it essentially
** makes a snapshot of the database at the current point in time and preserves ** makes a snapshot of the database at the current point in time and preserves
** that snapshot for use by the reader in spite of concurrently changes by ** that snapshot for use by the reader in spite of concurrently changes by
@ -5192,6 +5193,9 @@ int sqlite3PagerSharedLock(Pager *pPager){
** outstanding pages. This implies that the pager state should either ** outstanding pages. This implies that the pager state should either
** be OPEN or READER. READER is only possible if the pager is or was in ** be OPEN or READER. READER is only possible if the pager is or was in
** exclusive access mode. */ ** exclusive access mode. */
/*
** 此时 pager 的状态应为 OPEN 或 READER
*/
assert( sqlite3PcacheRefCount(pPager->pPCache)==0 ); assert( sqlite3PcacheRefCount(pPager->pPCache)==0 );
assert( assert_pager_state(pPager) ); assert( assert_pager_state(pPager) );
assert( pPager->eState==PAGER_OPEN || pPager->eState==PAGER_READER ); assert( pPager->eState==PAGER_OPEN || pPager->eState==PAGER_READER );
@ -5551,6 +5555,7 @@ static int getPageNormal(
}else{ }else{
assert( pPg->pPager==pPager ); assert( pPg->pPager==pPager );
pPager->aStat[PAGER_STAT_MISS]++; pPager->aStat[PAGER_STAT_MISS]++;
// 读取页面数据
rc = readDbPage(pPg); rc = readDbPage(pPg);
if( rc!=SQLITE_OK ){ if( rc!=SQLITE_OK ){
goto pager_acquire_err; goto pager_acquire_err;

View File

@ -41,7 +41,7 @@
struct PCache { struct PCache {
PgHdr *pDirty, *pDirtyTail; /* LRU 顺序排列的脏页 */ PgHdr *pDirty, *pDirtyTail; /* LRU 顺序排列的脏页 */
PgHdr *pSynced; /* Last synced page in dirty page list */ PgHdr *pSynced; /* Last synced page in dirty page list */
i64 nRefSum; /* Sum of ref counts over all pages */ i64 nRefSum; /* 总页面引用计数 */
int szCache; /* Configured cache size */ int szCache; /* Configured cache size */
int szSpill; /* Size before spilling occurs */ int szSpill; /* Size before spilling occurs */
int szPage; /* Size of every page in this cache */ int szPage; /* Size of every page in this cache */
@ -368,6 +368,7 @@ int sqlite3PcacheSetPageSize(PCache *pCache, int szPage){
/* /*
** Try to obtain a page from the cache. ** Try to obtain a page from the cache.
** cache
** **
** This routine returns a pointer to an sqlite3_pcache_page object if ** This routine returns a pointer to an sqlite3_pcache_page object if
** such an object is already in cache, or if a new one is created. ** such an object is already in cache, or if a new one is created.

View File

@ -24,12 +24,12 @@ typedef struct PCache PCache;
*/ */
struct PgHdr { struct PgHdr {
sqlite3_pcache_page *pPage; /* Pcache object page handle */ sqlite3_pcache_page *pPage; /* Pcache object page handle */
void *pData; /* Page data */ void *pData; /* 页面数据 */
void *pExtra; /* Extra content */ void *pExtra; /* Extra content */
PCache *pCache; /* PRIVATE: Cache that owns this page */ PCache *pCache; /* PRIVATE: Cache that owns this page */
PgHdr *pDirty; /* Transient list of dirty sorted by pgno */ PgHdr *pDirty; /* Transient list of dirty sorted by pgno */
Pager *pPager; /* The pager this page is part of */ Pager *pPager; /* The pager this page is part of */
Pgno pgno; /* Page number for this page */ Pgno pgno; /* 页号 */
#ifdef SQLITE_CHECK_PAGES #ifdef SQLITE_CHECK_PAGES
u32 pageHash; /* Hash of page content */ u32 pageHash; /* Hash of page content */
#endif #endif
@ -40,7 +40,7 @@ struct PgHdr {
** private to pcache.c and should not be accessed by other modules. ** private to pcache.c and should not be accessed by other modules.
** pCache is grouped with the public elements for efficiency. ** pCache is grouped with the public elements for efficiency.
*/ */
i64 nRef; /* Number of users of this page */ i64 nRef; /* 页面引用计数 */
PgHdr *pDirtyNext; /* Next element in list of dirty pages */ PgHdr *pDirtyNext; /* Next element in list of dirty pages */
PgHdr *pDirtyPrev; /* Previous element in list of dirty pages */ PgHdr *pDirtyPrev; /* Previous element in list of dirty pages */
/* NB: pDirtyNext and pDirtyPrev are undefined if the /* NB: pDirtyNext and pDirtyPrev are undefined if the