From d97ef01e50b25b6dbaf694a73c60477bec0d8bad Mon Sep 17 00:00:00 2001 From: yezhengmao Date: Fri, 7 Jul 2023 11:23:17 +0800 Subject: [PATCH] =?UTF-8?q?btree=20=E7=9B=B8=E5=85=B3=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/btree.c | 9 +--- src/btreeInt.h | 118 ++++++++++++++++++------------------------------- src/pager.c | 15 ++++--- src/pcache.c | 3 +- src/pcache.h | 6 +-- 5 files changed, 60 insertions(+), 91 deletions(-) diff --git a/src/btree.c b/src/btree.c index 87bc005..1eac163 100644 --- a/src/btree.c +++ b/src/btree.c @@ -1960,11 +1960,7 @@ static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){ } /* -** Decode the flags byte (the first byte of the header) for a page -** and initialize fields of the MemPage structure accordingly. -** -** Only the following combinations are supported. Anything different -** indicates a corrupt database files: +** 解析页类型:仅支持如下参数组合 ** ** PTF_ZERODATA (0x02, 2) ** PTF_LEAFDATA | PTF_INTKEY (0x05, 5) @@ -2244,8 +2240,7 @@ static void zeroPage(MemPage *pPage, int flags){ /* -** Convert a DbPage obtained from the pager into a MemPage used by -** the btree layer. +** 转化 DbPage 到 Btree 使用的 MemPage */ static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){ MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage); diff --git a/src/btreeInt.h b/src/btreeInt.h index 6d35784..f44ae4e 100644 --- a/src/btreeInt.h +++ b/src/btreeInt.h @@ -9,8 +9,7 @@ ** May you share freely, never taking more than you give. ** ************************************************************************* -** This file implements an external (disk-based) database using BTrees. -** For a detailed discussion of BTrees, refer to +** BTrees 实现参考 ** ** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3: ** "Sorting And Searching", pages 473-480. 
Addison-Wesley @@ -32,27 +31,17 @@ ** Finding a particular key requires reading O(log(M)) pages from the ** disk where M is the number of entries in the tree. ** -** In this implementation, a single file can hold one or more separate -** BTrees. Each BTree is identified by the index of its root page. The -** key and data for any entry are combined to form the "payload". A -** fixed amount of payload can be carried directly on the database -** page. If the payload is larger than the preset amount then surplus -** bytes are stored on overflow pages. The payload for an entry -** and the preceding pointer are combined to form a "Cell". Each -** page has a small header which contains the Ptr(N) pointer and other -** information such as the size of key and data. +** 一个文件可能含有一个或多个不同的 BTree,每一个 BTree 由根页面表示 +** key 和 data 构成 payload +** 固定大小的 payload 能够直接存放在数据库页中 +** payload 太大会存放在溢出页中 +** payload 和前一个指针组成一个 cell +** +** 注意:页面编号从 1 开始,而非 0 ,0页代表页面不存在 +** 页面大小为 512 - 65536 ,且为 2 的幂次方 +** 页类型: btree page / freelist page / overflow page / pointer-map page ** -** FORMAT DETAILS -** -** The file is divided into pages. The first page is called page 1, -** the second is page 2, and so forth. A page number of zero indicates -** "no such page". The page size can be any power of 2 between 512 and 65536. -** Each page can be either a btree page, a freelist page, an overflow -** page, or a pointer-map page. -** -** The first page is always a btree page. The first 100 bytes of the first -** page contain a special header (the "file header") that describes the file. -** The format of the file header is as follows: +** 第 1 页为 btree page 含有特殊文件头,共 100 字节,格式如下(大端存放): ** ** OFFSET SIZE DESCRIPTION ** 0 16 Header string: "SQLite format 3\000" @@ -81,31 +70,21 @@ ** 92 4 The version-valid-for number ** 96 4 SQLITE_VERSION_NUMBER ** -** All of the integer values are big-endian (most significant byte first). 
-** -** The file change counter is incremented when the database is changed -** This counter allows other processes to know when the file has changed -** and thus when they need to flush their cache. -** -** The max embedded payload fraction is the amount of the total usable -** space in a page that can be consumed by a single cell for standard -** B-tree (non-LEAFDATA) tables. A value of 255 means 100%. The default -** is to limit the maximum cell size so that at least 4 cells will fit -** on one page. Thus the default max embedded payload fraction is 64. -** -** If the payload for a cell is larger than the max payload, then extra -** payload is spilled to overflow pages. Once an overflow page is allocated, -** as many bytes as possible are moved into the overflow pages without letting -** the cell size drop below the min embedded payload fraction. +** file change counter : 数据库文件被修改时递增,其他进程据此得知文件已被修改、需要刷新缓存 +** +** max embedded payload fraction : 单个 cell 最多可占用一页可用空间的比例 +** 255 表示 100%,默认值 64 表示 25%,即一个 cell 最多占用 1/4 页 +** 如果超过 cell 限定大小,额外的 payload 存放在 overflow 页中 ** ** The min leaf payload fraction is like the min embedded payload fraction ** except that it applies to leaf nodes in a LEAFDATA tree. The maximum ** payload fraction for a LEAFDATA tree is always 100% (or 255) and it ** not specified in the header. ** -** Each btree pages is divided into three sections: The header, the -** cell pointer array, and the cell content area. Page 1 also has a 100-byte -** file header that occurs before the page header. +** BTree 页面格式: +** The header +** The cell pointer array +** The cell content area ** ** |----------------| ** | file header | 100 bytes. Page 1 only. @@ -133,12 +112,11 @@ ** 7 1 number of fragmented free bytes ** 8 4 Right child (the Ptr(N) value). Omitted on leaves. ** -** The flags define the format of this btree page. The leaf flag means that -** this page has no children. The zerodata flag means that this page carries -** only keys and no data. 
The intkey flag means that the key is an integer -** which is stored in the key size entry of the cell header rather than in -** the payload area. +** intkey key 是整数 +** zerodata 只包含 key 不包含 data +** leaf 没有子页节点 ** +** cell 指针有序存放 ** The cell pointer array begins on the first byte after the page header. ** The cell pointer array contains zero or more 2-byte numbers which are ** offsets from the beginning of the page to the cell content in the cell @@ -149,23 +127,13 @@ ** Cell content is stored at the very end of the page and grows toward the ** beginning of the page. ** -** Unused space within the cell content area is collected into a linked list of -** freeblocks. Each freeblock is at least 4 bytes in size. The byte offset -** to the first freeblock is given in the header. Freeblocks occur in -** increasing order. Because a freeblock must be at least 4 bytes in size, -** any group of 3 or fewer unused bytes in the cell content area cannot -** exist on the freeblock chain. A group of 3 or fewer free bytes is called -** a fragment. The total number of bytes in all fragments is recorded. -** in the page header at offset 7. -** +** freeblock 大小至少为 4 字节;不超过 3 字节的空闲空间称为碎片 (fragment),碎片总字节数记录在页头偏移 7 处 +** freeblock 格式如下: ** SIZE DESCRIPTION ** 2 Byte offset of the next freeblock ** 2 Bytes in this freeblock -** -** Cells are of variable length. Cells are stored in the cell content area at -** the end of the page. Pointers to the cells are in the cell pointer array -** that immediately follows the page header. Cells is not necessarily -** contiguous or in order, but cell pointers are contiguous and in order. +** +** cell 本身可以不连续、也可以无序,但是 cell 指针必须连续且有序 ** ** Cell content makes use of variable length integers. A variable ** length integer is 1 to 9 bytes where the lower 7 bits of each @@ -272,35 +240,35 @@ typedef struct CellInfo CellInfo; */ struct MemPage { u8 isInit; /* True if previously initialized. MUST BE FIRST! */ - u8 intKey; /* True if table b-trees. 
False for index b-trees */ + u8 intKey; /* table - 1 / index - 0 */ u8 intKeyLeaf; /* True if the leaf of an intKey table */ - Pgno pgno; /* Page number for this page */ + Pgno pgno; /* 当前页的页号 */ /* Only the first 8 bytes (above) are zeroed by pager.c when a new page ** is allocated. All fields that follow must be initialized before use */ - u8 leaf; /* True if a leaf page */ - u8 hdrOffset; /* 100 for page 1. 0 otherwise */ - u8 childPtrSize; /* 0 if leaf==1. 4 if leaf==0 */ + u8 leaf; /* 是否为叶子节点页 */ + u8 hdrOffset; /* 第 1 页额外 100 header offset 长度 */ + u8 childPtrSize; /* 叶子节点无子节点 == 0,否则 == 4 */ u8 max1bytePayload; /* min(maxLocal,127) */ u8 nOverflow; /* Number of overflow cell bodies in aCell[] */ u16 maxLocal; /* Copy of BtShared.maxLocal or BtShared.maxLeaf */ u16 minLocal; /* Copy of BtShared.minLocal or BtShared.minLeaf */ - u16 cellOffset; /* Index in aData of first cell pointer */ + u16 cellOffset; /* 第一个 cell 指针的位置 */ int nFree; /* Number of free bytes on the page. -1 for unknown */ - u16 nCell; /* Number of cells on this page, local and ovfl */ + u16 nCell; /* 当前页面 cell 数量 */ u16 maskPage; /* Mask for page offset */ u16 aiOvfl[4]; /* Insert the i-th overflow cell before the aiOvfl-th ** non-overflow cell */ u8 *apOvfl[4]; /* Pointers to the body of overflow cells */ BtShared *pBt; /* Pointer to BtShared that this page is part of */ - u8 *aData; /* Pointer to disk image of the page data */ + u8 *aData; /* 指向磁盘中数据格式 */ u8 *aDataEnd; /* One byte past the end of the entire page - not just ** the usable space, the entire page. Used to prevent ** corruption-induced buffer overflow. */ - u8 *aCellIdx; /* The cell index area */ + u8 *aCellIdx; /* cell 指针的位置 */ u8 *aDataOfst; /* Same as aData for leaves. 
aData+4 for interior */ DbPage *pDbPage; /* Pager page handle */ u16 (*xCellSize)(MemPage*,u8*); /* cellSizePtr method */ - void (*xParseCell)(MemPage*,u8*,CellInfo*); /* btreeParseCell method */ + void (*xParseCell)(MemPage*,u8*,CellInfo*); /* cell解析函数 */ }; /* @@ -441,7 +409,7 @@ struct BtShared { u16 minLocal; /* Minimum local payload in non-LEAFDATA tables */ u16 maxLeaf; /* Maximum local payload in a LEAFDATA table */ u16 minLeaf; /* Minimum local payload in a LEAFDATA table */ - u32 pageSize; /* Total number of bytes on a page */ + u32 pageSize; /* 页面大小 */ u32 usableSize; /* Number of usable bytes on each page */ int nTransaction; /* Number of open transactions (read + write) */ u32 nPage; /* Number of pages in the database */ @@ -479,10 +447,10 @@ struct BtShared { */ struct CellInfo { i64 nKey; /* The key for INTKEY tables, or nPayload otherwise */ - u8 *pPayload; /* Pointer to the start of payload */ - u32 nPayload; /* Bytes of payload */ + u8 *pPayload; /* payload 的指针位置 */ + u32 nPayload; /* payload 的字节数 */ u16 nLocal; /* Amount of payload held locally, not on overflow */ - u16 nSize; /* Size of the cell content on the main b-tree page */ + u16 nSize; /* cell 大小 */ }; /* @@ -546,7 +514,7 @@ struct BtCursor { u16 ix; /* Current index for apPage[iPage] */ u16 aiIdx[BTCURSOR_MAX_DEPTH-1]; /* Current index in apPage[i] */ struct KeyInfo *pKeyInfo; /* Arg passed to comparison function */ - MemPage *pPage; /* Current page */ + MemPage *pPage; /* 游标所在内存页面 */ MemPage *apPage[BTCURSOR_MAX_DEPTH-1]; /* Stack of parents of current page */ }; diff --git a/src/pager.c b/src/pager.c index 44384de..e7ecae0 100644 --- a/src/pager.c +++ b/src/pager.c @@ -171,6 +171,8 @@ int sqlite3PagerTrace=1; /* True to enable tracing */ ** ** OPEN: ** +** 初始化状态,无读写事务 +** 所有数据都不一定可靠 ** The pager starts up in this state. Nothing is guaranteed in this ** state - the file may or may not be locked and the database size is ** unknown. The database may not be read or written. 
@@ -619,7 +621,7 @@ struct PagerSavepoint { struct Pager { sqlite3_vfs *pVfs; /* OS functions to use for IO */ u8 exclusiveMode; /* Boolean. True if locking_mode==EXCLUSIVE */ - u8 journalMode; /* One of the PAGER_JOURNALMODE_* values */ + u8 journalMode; /* 日志模式,取值为 PAGER_JOURNALMODE_* 之一 */ u8 useJournal; /* Use a rollback journal on this file */ u8 noSync; /* Do not sync the journal if true */ u8 fullSync; /* Do extra syncs of the journal for robustness */ @@ -648,8 +650,8 @@ struct Pager { u8 doNotSpill; /* Do not spill the cache when non-zero */ u8 subjInMemory; /* True to use in-memory sub-journals */ u8 bUseFetch; /* True to use xFetch() */ - u8 hasHeldSharedLock; /* True if a shared lock has ever been held */ - Pgno dbSize; /* Number of pages in the database */ + u8 hasHeldSharedLock; /* 是否曾经持有过 shared lock */ + Pgno dbSize; /* 数据库中页面总数 */ Pgno dbOrigSize; /* dbSize before the current transaction */ Pgno dbFileSize; /* Number of pages in the database file */ Pgno dbHintSize; /* Value passed to FCNTL_SIZE_HINT call */ @@ -3185,8 +3187,7 @@ static int pagerWalFrames( } /* -** Begin a read transaction on the WAL. -** +** WAL 模式下进入读事务 ** This routine used to be called "pagerOpenSnapshot()" because it essentially ** makes a snapshot of the database at the current point in time and preserves ** that snapshot for use by the reader in spite of concurrently changes by @@ -5192,6 +5193,9 @@ int sqlite3PagerSharedLock(Pager *pPager){ ** outstanding pages. This implies that the pager state should either ** be OPEN or READER. READER is only possible if the pager is or was in ** exclusive access mode. 
*/ + /* + ** 仅 OPEN 和 READER 状态能够进入 + */ assert( sqlite3PcacheRefCount(pPager->pPCache)==0 ); assert( assert_pager_state(pPager) ); assert( pPager->eState==PAGER_OPEN || pPager->eState==PAGER_READER ); @@ -5551,6 +5555,7 @@ static int getPageNormal( }else{ assert( pPg->pPager==pPager ); pPager->aStat[PAGER_STAT_MISS]++; + /* 读取页面数据 */ rc = readDbPage(pPg); if( rc!=SQLITE_OK ){ goto pager_acquire_err; diff --git a/src/pcache.c b/src/pcache.c index 2a0ff4d..c09fcd4 100644 --- a/src/pcache.c +++ b/src/pcache.c @@ -41,7 +41,7 @@ struct PCache { PgHdr *pDirty, *pDirtyTail; /* LRU 顺序排列的脏页 */ PgHdr *pSynced; /* Last synced page in dirty page list */ - i64 nRefSum; /* Sum of ref counts over all pages */ + i64 nRefSum; /* 所有页面引用计数之和 */ int szCache; /* Configured cache size */ int szSpill; /* Size before spilling occurs */ int szPage; /* Size of every page in this cache */ @@ -368,6 +368,7 @@ int sqlite3PcacheSetPageSize(PCache *pCache, int szPage){ /* ** Try to obtain a page from the cache. +** 尝试从 cache 中获取页面 ** ** This routine returns a pointer to an sqlite3_pcache_page object if ** such an object is already in cache, or if a new one is created. diff --git a/src/pcache.h b/src/pcache.h index f945dab..7dc32cf 100644 --- a/src/pcache.h +++ b/src/pcache.h @@ -24,12 +24,12 @@ typedef struct PCache PCache; */ struct PgHdr { sqlite3_pcache_page *pPage; /* Pcache object page handle */ - void *pData; /* Page data */ + void *pData; /* 页面数据 */ void *pExtra; /* Extra content */ PCache *pCache; /* PRIVATE: Cache that owns this page */ PgHdr *pDirty; /* Transient list of dirty sorted by pgno */ Pager *pPager; /* The pager this page is part of */ - Pgno pgno; /* Page number for this page */ + Pgno pgno; /* 页号 */ #ifdef SQLITE_CHECK_PAGES u32 pageHash; /* Hash of page content */ #endif @@ -40,7 +40,7 @@ struct PgHdr { ** private to pcache.c and should not be accessed by other modules. ** pCache is grouped with the public elements for efficiency. 
*/ - i64 nRef; /* Number of users of this page */ + i64 nRef; /* 页面引用计数 */ PgHdr *pDirtyNext; /* Next element in list of dirty pages */ PgHdr *pDirtyPrev; /* Previous element in list of dirty pages */ /* NB: pDirtyNext and pDirtyPrev are undefined if the