btree 相关注释

main
yezhengmao 2023-07-07 11:23:17 +08:00
parent c08c2c1620
commit d97ef01e50
5 changed files with 60 additions and 91 deletions

View File

@ -1960,11 +1960,7 @@ static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){
}
/*
** Decode the flags byte (the first byte of the header) for a page
** and initialize fields of the MemPage structure accordingly.
**
** Only the following combinations are supported. Anything different
** indicates a corrupt database files:
**
**
** PTF_ZERODATA (0x02, 2)
** PTF_LEAFDATA | PTF_INTKEY (0x05, 5)
@ -2244,8 +2240,7 @@ static void zeroPage(MemPage *pPage, int flags){
/*
** Convert a DbPage obtained from the pager into a MemPage used by
** the btree layer.
** 将 pager 返回的 DbPage 转换为 Btree 层使用的 MemPage
*/
static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);

View File

@ -9,8 +9,7 @@
** May you share freely, never taking more than you give.
**
*************************************************************************
** This file implements an external (disk-based) database using BTrees.
** For a detailed discussion of BTrees, refer to
** 关于 BTrees 的详细讨论请参考:
**
** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
** "Sorting And Searching", pages 473-480. Addison-Wesley
@ -32,27 +31,17 @@
** Finding a particular key requires reading O(log(M)) pages from the
** disk where M is the number of entries in the tree.
**
** In this implementation, a single file can hold one or more separate
** BTrees. Each BTree is identified by the index of its root page. The
** key and data for any entry are combined to form the "payload". A
** fixed amount of payload can be carried directly on the database
** page. If the payload is larger than the preset amount then surplus
** bytes are stored on overflow pages. The payload for an entry
** and the preceding pointer are combined to form a "Cell". Each
** page has a small header which contains the Ptr(N) pointer and other
** information such as the size of key and data.
** 一个文件可以保存一个或多个 BTree,每个 BTree 由其根页面的页号标识
** 每个条目的 key 和 data 合并组成 payload
** 固定大小以内的 payload 直接存放在数据库页面中
** 超出该大小的 payload 存放到溢出页面
** payload 与其前面的指针一起组成 cell
**
** FORMAT DETAILS
** 文件被划分为页面,页号从 1 开始,页号为 0 表示不存在该页面
** 页面大小可以是 512 - 65536 之间任意 2 的幂
** 每个页面是以下类型之一: btree page / freelist page / overflow page / pointer-map page
**
** The file is divided into pages. The first page is called page 1,
** the second is page 2, and so forth. A page number of zero indicates
** "no such page". The page size can be any power of 2 between 512 and 65536.
** Each page can be either a btree page, a freelist page, an overflow
** page, or a pointer-map page.
**
** The first page is always a btree page. The first 100 bytes of the first
** page contain a special header (the "file header") that describes the file.
** The format of the file header is as follows:
** 第 1 页始终是 btree page,其前 100 字节是描述整个文件的文件头,格式如下:
**
** OFFSET SIZE DESCRIPTION
** 0 16 Header string: "SQLite format 3\000"
@ -81,31 +70,21 @@
** 92 4 The version-valid-for number
** 96 4 SQLITE_VERSION_NUMBER
**
** All of the integer values are big-endian (most significant byte first).
** file change counter : 数据库发生变更时递增,其他进程据此得知文件已变化并需要刷新缓存
**
** The file change counter is incremented when the database is changed
** This counter allows other processes to know when the file has changed
** and thus when they need to flush their cache.
**
** The max embedded payload fraction is the amount of the total usable
** space in a page that can be consumed by a single cell for standard
** B-tree (non-LEAFDATA) tables. A value of 255 means 100%. The default
** is to limit the maximum cell size so that at least 4 cells will fit
** on one page. Thus the default max embedded payload fraction is 64.
**
** If the payload for a cell is larger than the max payload, then extra
** payload is spilled to overflow pages. Once an overflow page is allocated,
** as many bytes as possible are moved into the overflow pages without letting
** the cell size drop below the min embedded payload fraction.
** max embedded payload : 单个 cell 最多可使用的页面可用空间比例
** 255 - 100%,默认 64 - 25%,即单个 cell 最多占页面的 1/4
** cell 的 payload 超过上限时,多余部分存放到 overflow 页面
**
** The min leaf payload fraction is like the min embedded payload fraction
** except that it applies to leaf nodes in a LEAFDATA tree. The maximum
** payload fraction for a LEAFDATA tree is always 100% (or 255) and it
** not specified in the header.
**
** Each btree pages is divided into three sections: The header, the
** cell pointer array, and the cell content area. Page 1 also has a 100-byte
** file header that occurs before the page header.
** 每个 BTree 页面分为三个部分:
** The header
** The cell pointer array
** The cell content area
**
** |----------------|
** | file header | 100 bytes. Page 1 only.
@ -133,12 +112,11 @@
** 7 1 number of fragmented free bytes
** 8 4 Right child (the Ptr(N) value). Omitted on leaves.
**
** The flags define the format of this btree page. The leaf flag means that
** this page has no children. The zerodata flag means that this page carries
** only keys and no data. The intkey flag means that the key is an integer
** which is stored in the key size entry of the cell header rather than in
** the payload area.
** intkey 标志表示 key 为整数,存放在 cell 头部的 key size 字段中,而不在 payload 区域
** zerodata 标志表示页面只有 key 没有 data
** leaf 标志表示页面没有子节点
**
** cell
** The cell pointer array begins on the first byte after the page header.
** The cell pointer array contains zero or more 2-byte numbers which are
** offsets from the beginning of the page to the cell content in the cell
@ -149,23 +127,13 @@
** Cell content is stored at the very end of the page and grows toward the
** beginning of the page.
**
** Unused space within the cell content area is collected into a linked list of
** freeblocks. Each freeblock is at least 4 bytes in size. The byte offset
** to the first freeblock is given in the header. Freeblocks occur in
** increasing order. Because a freeblock must be at least 4 bytes in size,
** any group of 3 or fewer unused bytes in the cell content area cannot
** exist on the freeblock chain. A group of 3 or fewer free bytes is called
** a fragment. The total number of bytes in all fragments is recorded.
** in the page header at offset 7.
**
** cell 内容区中未使用的空间由 freeblock 链表管理,每个 freeblock 至少 4 字节;不足 4 字节的空闲 block 称为碎片,不在链表中
** freeblock 的格式:
** SIZE DESCRIPTION
** 2 Byte offset of the next freeblock
** 2 Bytes in this freeblock
**
** Cells are of variable length. Cells are stored in the cell content area at
** the end of the page. Pointers to the cells are in the cell pointer array
** that immediately follows the page header. Cells is not necessarily
** contiguous or in order, but cell pointers are contiguous and in order.
** cell 长度可变,存储不一定连续或有序;cell 指针则连续且有序
**
** Cell content makes use of variable length integers. A variable
** length integer is 1 to 9 bytes where the lower 7 bits of each
@ -272,35 +240,35 @@ typedef struct CellInfo CellInfo;
*/
struct MemPage {
u8 isInit; /* True if previously initialized. MUST BE FIRST! */
u8 intKey; /* True if table b-trees. False for index b-trees */
u8 intKey; /* table - 1 / index - 0 */
u8 intKeyLeaf; /* True if the leaf of an intKey table */
Pgno pgno; /* Page number for this page */
Pgno pgno; /* 当前页的页号 */
/* Only the first 8 bytes (above) are zeroed by pager.c when a new page
** is allocated. All fields that follow must be initialized before use */
u8 leaf; /* True if a leaf page */
u8 hdrOffset; /* 100 for page 1. 0 otherwise */
u8 childPtrSize; /* 0 if leaf==1. 4 if leaf==0 */
u8 leaf; /* 是否为叶子节点页 */
u8 hdrOffset; /* 第 1 页额外 100 header offset 长度 */
u8 childPtrSize; /* 叶子节点无子节点 == 0否则 == 4 */
u8 max1bytePayload; /* min(maxLocal,127) */
u8 nOverflow; /* Number of overflow cell bodies in aCell[] */
u16 maxLocal; /* Copy of BtShared.maxLocal or BtShared.maxLeaf */
u16 minLocal; /* Copy of BtShared.minLocal or BtShared.minLeaf */
u16 cellOffset; /* Index in aData of first cell pointer */
u16 cellOffset; /* 第一个 cell 指针的位置 */
int nFree; /* Number of free bytes on the page. -1 for unknown */
u16 nCell; /* Number of cells on this page, local and ovfl */
u16 nCell; /* 当前页面 cell 数量 */
u16 maskPage; /* Mask for page offset */
u16 aiOvfl[4]; /* Insert the i-th overflow cell before the aiOvfl-th
** non-overflow cell */
u8 *apOvfl[4]; /* Pointers to the body of overflow cells */
BtShared *pBt; /* Pointer to BtShared that this page is part of */
u8 *aData; /* Pointer to disk image of the page data */
u8 *aData; /* 指向磁盘中数据格式 */
u8 *aDataEnd; /* One byte past the end of the entire page - not just
** the usable space, the entire page. Used to prevent
** corruption-induced buffer overflow. */
u8 *aCellIdx; /* The cell index area */
u8 *aCellIdx; /* cell 指针的位置 */
u8 *aDataOfst; /* Same as aData for leaves. aData+4 for interior */
DbPage *pDbPage; /* Pager page handle */
u16 (*xCellSize)(MemPage*,u8*); /* cellSizePtr method */
void (*xParseCell)(MemPage*,u8*,CellInfo*); /* btreeParseCell method */
void (*xParseCell)(MemPage*,u8*,CellInfo*); /* cell解析函数 */
};
/*
@ -441,7 +409,7 @@ struct BtShared {
u16 minLocal; /* Minimum local payload in non-LEAFDATA tables */
u16 maxLeaf; /* Maximum local payload in a LEAFDATA table */
u16 minLeaf; /* Minimum local payload in a LEAFDATA table */
u32 pageSize; /* Total number of bytes on a page */
u32 pageSize; /* 页面大小 */
u32 usableSize; /* Number of usable bytes on each page */
int nTransaction; /* Number of open transactions (read + write) */
u32 nPage; /* Number of pages in the database */
@ -479,10 +447,10 @@ struct BtShared {
*/
struct CellInfo {
i64 nKey; /* The key for INTKEY tables, or nPayload otherwise */
u8 *pPayload; /* Pointer to the start of payload */
u32 nPayload; /* Bytes of payload */
u8 *pPayload; /* payload 的指针位置 */
u32 nPayload; /* payload 的字节数 */
u16 nLocal; /* Amount of payload held locally, not on overflow */
u16 nSize; /* Size of the cell content on the main b-tree page */
u16 nSize; /* cell 大小 */
};
/*
@ -546,7 +514,7 @@ struct BtCursor {
u16 ix; /* Current index for apPage[iPage] */
u16 aiIdx[BTCURSOR_MAX_DEPTH-1]; /* Current index in apPage[i] */
struct KeyInfo *pKeyInfo; /* Arg passed to comparison function */
MemPage *pPage; /* Current page */
MemPage *pPage; /* 游标所在内存页面 */
MemPage *apPage[BTCURSOR_MAX_DEPTH-1]; /* Stack of parents of current page */
};

View File

@ -171,6 +171,8 @@ int sqlite3PagerTrace=1; /* True to enable tracing */
**
** OPEN:
**
**
**
** The pager starts up in this state. Nothing is guaranteed in this
** state - the file may or may not be locked and the database size is
** unknown. The database may not be read or written.
@ -619,7 +621,7 @@ struct PagerSavepoint {
struct Pager {
sqlite3_vfs *pVfs; /* OS functions to use for IO */
u8 exclusiveMode; /* Boolean. True if locking_mode==EXCLUSIVE */
u8 journalMode; /* One of the PAGER_JOURNALMODE_* values */
u8 journalMode; /* 日志模式 */
u8 useJournal; /* Use a rollback journal on this file */
u8 noSync; /* Do not sync the journal if true */
u8 fullSync; /* Do extra syncs of the journal for robustness */
@ -648,8 +650,8 @@ struct Pager {
u8 doNotSpill; /* Do not spill the cache when non-zero */
u8 subjInMemory; /* True to use in-memory sub-journals */
u8 bUseFetch; /* True to use xFetch() */
u8 hasHeldSharedLock; /* True if a shared lock has ever been held */
Pgno dbSize; /* Number of pages in the database */
u8 hasHeldSharedLock; /* 是否曾经持有过 shared lock */
Pgno dbSize; /* 数据库中页面总数 */
Pgno dbOrigSize; /* dbSize before the current transaction */
Pgno dbFileSize; /* Number of pages in the database file */
Pgno dbHintSize; /* Value passed to FCNTL_SIZE_HINT call */
@ -3185,8 +3187,7 @@ static int pagerWalFrames(
}
/*
** Begin a read transaction on the WAL.
**
** 在 WAL 上开始一个读事务
** This routine used to be called "pagerOpenSnapshot()" because it essentially
** makes a snapshot of the database at the current point in time and preserves
** that snapshot for use by the reader in spite of concurrently changes by
@ -5192,6 +5193,9 @@ int sqlite3PagerSharedLock(Pager *pPager){
** outstanding pages. This implies that the pager state should either
** be OPEN or READER. READER is only possible if the pager is or was in
** exclusive access mode. */
/*
** 此时 pager 的状态只能是 OPEN 或 READER
*/
assert( sqlite3PcacheRefCount(pPager->pPCache)==0 );
assert( assert_pager_state(pPager) );
assert( pPager->eState==PAGER_OPEN || pPager->eState==PAGER_READER );
@ -5551,6 +5555,7 @@ static int getPageNormal(
}else{
assert( pPg->pPager==pPager );
pPager->aStat[PAGER_STAT_MISS]++;
// 读取页面数据
rc = readDbPage(pPg);
if( rc!=SQLITE_OK ){
goto pager_acquire_err;

View File

@ -41,7 +41,7 @@
struct PCache {
PgHdr *pDirty, *pDirtyTail; /* LRU 顺序排列的脏页 */
PgHdr *pSynced; /* Last synced page in dirty page list */
i64 nRefSum; /* Sum of ref counts over all pages */
i64 nRefSum; /* 总页面引用计数 */
int szCache; /* Configured cache size */
int szSpill; /* Size before spilling occurs */
int szPage; /* Size of every page in this cache */
@ -368,6 +368,7 @@ int sqlite3PcacheSetPageSize(PCache *pCache, int szPage){
/*
** Try to obtain a page from the cache.
** 尝试从 cache 中获取一个页面
**
** This routine returns a pointer to an sqlite3_pcache_page object if
** such an object is already in cache, or if a new one is created.

View File

@ -24,12 +24,12 @@ typedef struct PCache PCache;
*/
struct PgHdr {
sqlite3_pcache_page *pPage; /* Pcache object page handle */
void *pData; /* Page data */
void *pData; /* 页面数据 */
void *pExtra; /* Extra content */
PCache *pCache; /* PRIVATE: Cache that owns this page */
PgHdr *pDirty; /* Transient list of dirty sorted by pgno */
Pager *pPager; /* The pager this page is part of */
Pgno pgno; /* Page number for this page */
Pgno pgno; /* 页号 */
#ifdef SQLITE_CHECK_PAGES
u32 pageHash; /* Hash of page content */
#endif
@ -40,7 +40,7 @@ struct PgHdr {
** private to pcache.c and should not be accessed by other modules.
** pCache is grouped with the public elements for efficiency.
*/
i64 nRef; /* Number of users of this page */
i64 nRef; /* 页面引用计数 */
PgHdr *pDirtyNext; /* Next element in list of dirty pages */
PgHdr *pDirtyPrev; /* Previous element in list of dirty pages */
/* NB: pDirtyNext and pDirtyPrev are undefined if the