000001  /*
000002  ** 2004 April 6
000003  **
000004  ** The author disclaims copyright to this source code.  In place of
000005  ** a legal notice, here is a blessing:
000006  **
000007  **    May you do good and not evil.
000008  **    May you find forgiveness for yourself and forgive others.
000009  **    May you share freely, never taking more than you give.
000010  **
000011  *************************************************************************
000012  ** This file implements an external (disk-based) database using BTrees.
000013  ** See the header comment on "btreeInt.h" for additional information.
000014  ** Including a description of file format and an overview of operation.
000015  */
000016  #include "btreeInt.h"
000017  
000018  /*
000019  ** The header string that appears at the beginning of every
000020  ** SQLite database.
000021  */
000022  static const char zMagicHeader[] = SQLITE_FILE_HEADER;
000023  
000024  /*
000025  ** Set this global variable to 1 to enable tracing using the TRACE
000026  ** macro.
000027  */
#if 0  /* Change to "#if 1" to compile in the tracing code below */
int sqlite3BtreeTrace=1;  /* True to enable tracing */
/* X is pasted directly after "printf", so it must be a complete,
** parenthesized argument list: TRACE(("fmt %d\n", value)); */
# define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
#else
/* Tracing is compiled out: TRACE(X) expands to nothing. */
# define TRACE(X)
#endif
000034  
000035  /*
000036  ** Extract a 2-byte big-endian integer from an array of unsigned bytes.
000037  ** But if the value is zero, make it 65536.
000038  **
000039  ** This routine is used to extract the "offset to cell content area" value
000040  ** from the header of a btree page.  If the page size is 65536 and the page
000041  ** is empty, the offset should be 65536, but the 2-byte value stores zero.
000042  ** This routine makes the necessary adjustment to 65536.
000043  */
/* Adding 0xffff before masking to 16 bits is the same as subtracting one
** modulo 65536: a stored 0 wraps to 0xffff and the final +1 yields 65536,
** while every non-zero value comes back unchanged. */
#define get2byteNotZero(X)  (((((int)get2byte(X))+0xffff)&0xffff)+1)
000045  
000046  /*
000047  ** Values passed as the 5th argument to allocateBtreePage()
000048  */
000049  #define BTALLOC_ANY   0           /* Allocate any page */
000050  #define BTALLOC_EXACT 1           /* Allocate exact page if possible */
000051  #define BTALLOC_LE    2           /* Allocate any page <= the parameter */
000052  
000053  /*
000054  ** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not 
000055  ** defined, or 0 if it is. For example:
000056  **
000057  **   bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);
000058  */
#ifndef SQLITE_OMIT_AUTOVACUUM
/* Autovacuum is compiled in: pass the expression through unchanged. */
#define IfNotOmitAV(expr) (expr)
#else
/* Autovacuum is compiled out: the argument is discarded (and therefore
** never evaluated) and the macro is the constant 0. */
#define IfNotOmitAV(expr) 0
#endif
000064  
000065  #ifndef SQLITE_OMIT_SHARED_CACHE
000066  /*
000067  ** A list of BtShared objects that are eligible for participation
000068  ** in shared cache.  This variable has file scope during normal builds,
000069  ** but the test harness needs to access it so we make it global for 
000070  ** test builds.
000071  **
000072  ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
000073  */
#ifdef SQLITE_TEST
/* Test builds: external linkage so code outside this file can reach it. */
BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
#else
/* Normal builds: same variable, restricted to this file. */
static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
#endif
000079  #endif /* SQLITE_OMIT_SHARED_CACHE */
000080  
000081  #ifndef SQLITE_OMIT_SHARED_CACHE
000082  /*
000083  ** Enable or disable the shared pager and schema features.
000084  **
** This routine has no effect on existing database connections.
** The shared cache setting affects only future calls to
** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
000088  */
int sqlite3_enable_shared_cache(int enable){
  /* Only the process-wide configuration field is written here; the value
  ** of "enable" is stored as given, without validation. */
  sqlite3GlobalConfig.sharedCacheEnabled = enable;
  /* This routine has no failure path. */
  return SQLITE_OK;
}
000093  #endif
000094  
000095  
000096  
000097  #ifdef SQLITE_OMIT_SHARED_CACHE
000098    /*
000099    ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
000100    ** and clearAllSharedCacheTableLocks()
000101    ** manipulate entries in the BtShared.pLock linked list used to store
000102    ** shared-cache table level locks. If the library is compiled with the
000103    ** shared-cache feature disabled, then there is only ever one user
000104    ** of each BtShared structure and so this locking is not necessary. 
000105    ** So define the lock related functions as no-ops.
000106    */
  /* Lock queries and lock requests always succeed. */
  #define querySharedCacheTableLock(a,b,c) SQLITE_OK
  #define setSharedCacheTableLock(a,b,c) SQLITE_OK
  /* There are no locks to release or downgrade: expand to nothing. */
  #define clearAllSharedCacheTableLocks(a)
  #define downgradeAllSharedCacheTableLocks(a)
  /* Assert helpers: the required lock is always "held" (1) and there are
  ** never any conflicting readers (0). */
  #define hasSharedCacheTableLock(a,b,c,d) 1
  #define hasReadConflicts(a, b) 0
000113  #endif
000114  
000115  #ifndef SQLITE_OMIT_SHARED_CACHE
000116  
000117  #ifdef SQLITE_DEBUG
000118  /*
000119  **** This function is only used as part of an assert() statement. ***
000120  **
000121  ** Check to see if pBtree holds the required locks to read or write to the 
000122  ** table with root page iRoot.   Return 1 if it does and 0 if not.
000123  **
000124  ** For example, when writing to a table with root-page iRoot via 
000125  ** Btree connection pBtree:
000126  **
000127  **    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
000128  **
000129  ** When writing to an index that resides in a sharable database, the 
000130  ** caller should have first obtained a lock specifying the root page of
000131  ** the corresponding table. This makes things a bit more complicated,
000132  ** as this module treats each table as a separate structure. To determine
000133  ** the table corresponding to the index being written, this
000134  ** function has to search through the database schema.
000135  **
000136  ** Instead of a lock on the table/index rooted at page iRoot, the caller may
000137  ** hold a write-lock on the schema table (root page 1). This is also
000138  ** acceptable.
000139  */
000140  static int hasSharedCacheTableLock(
000141    Btree *pBtree,         /* Handle that must hold lock */
000142    Pgno iRoot,            /* Root page of b-tree */
000143    int isIndex,           /* True if iRoot is the root of an index b-tree */
000144    int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
000145  ){
000146    Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
000147    Pgno iTab = 0;
000148    BtLock *pLock;
000149  
000150    /* If this database is not shareable, or if the client is reading
000151    ** and has the read-uncommitted flag set, then no lock is required. 
000152    ** Return true immediately.
000153    */
000154    if( (pBtree->sharable==0)
000155     || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted))
000156    ){
000157      return 1;
000158    }
000159  
000160    /* If the client is reading  or writing an index and the schema is
000161    ** not loaded, then it is too difficult to actually check to see if
000162    ** the correct locks are held.  So do not bother - just return true.
000163    ** This case does not come up very often anyhow.
000164    */
000165    if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){
000166      return 1;
000167    }
000168  
000169    /* Figure out the root-page that the lock should be held on. For table
000170    ** b-trees, this is just the root page of the b-tree being read or
000171    ** written. For index b-trees, it is the root page of the associated
000172    ** table.  */
000173    if( isIndex ){
000174      HashElem *p;
000175      for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
000176        Index *pIdx = (Index *)sqliteHashData(p);
000177        if( pIdx->tnum==(int)iRoot ){
000178          if( iTab ){
000179            /* Two or more indexes share the same root page.  There must
000180            ** be imposter tables.  So just return true.  The assert is not
000181            ** useful in that case. */
000182            return 1;
000183          }
000184          iTab = pIdx->pTable->tnum;
000185        }
000186      }
000187    }else{
000188      iTab = iRoot;
000189    }
000190  
000191    /* Search for the required lock. Either a write-lock on root-page iTab, a 
000192    ** write-lock on the schema table, or (if the client is reading) a
000193    ** read-lock on iTab will suffice. Return 1 if any of these are found.  */
000194    for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
000195      if( pLock->pBtree==pBtree 
000196       && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
000197       && pLock->eLock>=eLockType 
000198      ){
000199        return 1;
000200      }
000201    }
000202  
000203    /* Failed to find the required lock. */
000204    return 0;
000205  }
000206  #endif /* SQLITE_DEBUG */
000207  
000208  #ifdef SQLITE_DEBUG
000209  /*
000210  **** This function may be used as part of assert() statements only. ****
000211  **
000212  ** Return true if it would be illegal for pBtree to write into the
000213  ** table or index rooted at iRoot because other shared connections are
000214  ** simultaneously reading that same table or index.
000215  **
000216  ** It is illegal for pBtree to write if some other Btree object that
000217  ** shares the same BtShared object is currently reading or writing
000218  ** the iRoot table.  Except, if the other Btree object has the
000219  ** read-uncommitted flag set, then it is OK for the other object to
000220  ** have a read cursor.
000221  **
000222  ** For example, before writing to any part of the table or index
000223  ** rooted at page iRoot, one should call:
000224  **
000225  **    assert( !hasReadConflicts(pBtree, iRoot) );
000226  */
000227  static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
000228    BtCursor *p;
000229    for(p=pBtree->pBt->pCursor; p; p=p->pNext){
000230      if( p->pgnoRoot==iRoot 
000231       && p->pBtree!=pBtree
000232       && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted)
000233      ){
000234        return 1;
000235      }
000236    }
000237    return 0;
000238  }
000239  #endif    /* #ifdef SQLITE_DEBUG */
000240  
000241  /*
000242  ** Query to see if Btree handle p may obtain a lock of type eLock 
000243  ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
000244  ** SQLITE_OK if the lock may be obtained (by calling
000245  ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
000246  */
000247  static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
000248    BtShared *pBt = p->pBt;
000249    BtLock *pIter;
000250  
000251    assert( sqlite3BtreeHoldsMutex(p) );
000252    assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
000253    assert( p->db!=0 );
000254    assert( !(p->db->flags&SQLITE_ReadUncommitted)||eLock==WRITE_LOCK||iTab==1 );
000255    
000256    /* If requesting a write-lock, then the Btree must have an open write
000257    ** transaction on this file. And, obviously, for this to be so there 
000258    ** must be an open write transaction on the file itself.
000259    */
000260    assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
000261    assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
000262    
000263    /* This routine is a no-op if the shared-cache is not enabled */
000264    if( !p->sharable ){
000265      return SQLITE_OK;
000266    }
000267  
000268    /* If some other connection is holding an exclusive lock, the
000269    ** requested lock may not be obtained.
000270    */
000271    if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){
000272      sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
000273      return SQLITE_LOCKED_SHAREDCACHE;
000274    }
000275  
000276    for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
000277      /* The condition (pIter->eLock!=eLock) in the following if(...) 
000278      ** statement is a simplification of:
000279      **
000280      **   (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
000281      **
000282      ** since we know that if eLock==WRITE_LOCK, then no other connection
000283      ** may hold a WRITE_LOCK on any table in this file (since there can
000284      ** only be a single writer).
000285      */
000286      assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
000287      assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
000288      if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
000289        sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
000290        if( eLock==WRITE_LOCK ){
000291          assert( p==pBt->pWriter );
000292          pBt->btsFlags |= BTS_PENDING;
000293        }
000294        return SQLITE_LOCKED_SHAREDCACHE;
000295      }
000296    }
000297    return SQLITE_OK;
000298  }
000299  #endif /* !SQLITE_OMIT_SHARED_CACHE */
000300  
000301  #ifndef SQLITE_OMIT_SHARED_CACHE
000302  /*
000303  ** Add a lock on the table with root-page iTable to the shared-btree used
000304  ** by Btree handle p. Parameter eLock must be either READ_LOCK or 
000305  ** WRITE_LOCK.
000306  **
000307  ** This function assumes the following:
000308  **
000309  **   (a) The specified Btree object p is connected to a sharable
000310  **       database (one with the BtShared.sharable flag set), and
000311  **
000312  **   (b) No other Btree objects hold a lock that conflicts
000313  **       with the requested lock (i.e. querySharedCacheTableLock() has
000314  **       already been called and returned SQLITE_OK).
000315  **
000316  ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM 
000317  ** is returned if a malloc attempt fails.
000318  */
000319  static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
000320    BtShared *pBt = p->pBt;
000321    BtLock *pLock = 0;
000322    BtLock *pIter;
000323  
000324    assert( sqlite3BtreeHoldsMutex(p) );
000325    assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
000326    assert( p->db!=0 );
000327  
000328    /* A connection with the read-uncommitted flag set will never try to
000329    ** obtain a read-lock using this function. The only read-lock obtained
000330    ** by a connection in read-uncommitted mode is on the sqlite_master 
000331    ** table, and that lock is obtained in BtreeBeginTrans().  */
000332    assert( 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK );
000333  
000334    /* This function should only be called on a sharable b-tree after it 
000335    ** has been determined that no other b-tree holds a conflicting lock.  */
000336    assert( p->sharable );
000337    assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
000338  
000339    /* First search the list for an existing lock on this table. */
000340    for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
000341      if( pIter->iTable==iTable && pIter->pBtree==p ){
000342        pLock = pIter;
000343        break;
000344      }
000345    }
000346  
000347    /* If the above search did not find a BtLock struct associating Btree p
000348    ** with table iTable, allocate one and link it into the list.
000349    */
000350    if( !pLock ){
000351      pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
000352      if( !pLock ){
000353        return SQLITE_NOMEM_BKPT;
000354      }
000355      pLock->iTable = iTable;
000356      pLock->pBtree = p;
000357      pLock->pNext = pBt->pLock;
000358      pBt->pLock = pLock;
000359    }
000360  
000361    /* Set the BtLock.eLock variable to the maximum of the current lock
000362    ** and the requested lock. This means if a write-lock was already held
000363    ** and a read-lock requested, we don't incorrectly downgrade the lock.
000364    */
000365    assert( WRITE_LOCK>READ_LOCK );
000366    if( eLock>pLock->eLock ){
000367      pLock->eLock = eLock;
000368    }
000369  
000370    return SQLITE_OK;
000371  }
000372  #endif /* !SQLITE_OMIT_SHARED_CACHE */
000373  
000374  #ifndef SQLITE_OMIT_SHARED_CACHE
000375  /*
000376  ** Release all the table locks (locks obtained via calls to
000377  ** the setSharedCacheTableLock() procedure) held by Btree object p.
000378  **
000379  ** This function assumes that Btree p has an open read or write 
000380  ** transaction. If it does not, then the BTS_PENDING flag
000381  ** may be incorrectly cleared.
000382  */
000383  static void clearAllSharedCacheTableLocks(Btree *p){
000384    BtShared *pBt = p->pBt;
000385    BtLock **ppIter = &pBt->pLock;
000386  
000387    assert( sqlite3BtreeHoldsMutex(p) );
000388    assert( p->sharable || 0==*ppIter );
000389    assert( p->inTrans>0 );
000390  
000391    while( *ppIter ){
000392      BtLock *pLock = *ppIter;
000393      assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree );
000394      assert( pLock->pBtree->inTrans>=pLock->eLock );
000395      if( pLock->pBtree==p ){
000396        *ppIter = pLock->pNext;
000397        assert( pLock->iTable!=1 || pLock==&p->lock );
000398        if( pLock->iTable!=1 ){
000399          sqlite3_free(pLock);
000400        }
000401      }else{
000402        ppIter = &pLock->pNext;
000403      }
000404    }
000405  
000406    assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter );
000407    if( pBt->pWriter==p ){
000408      pBt->pWriter = 0;
000409      pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
000410    }else if( pBt->nTransaction==2 ){
000411      /* This function is called when Btree p is concluding its 
000412      ** transaction. If there currently exists a writer, and p is not
000413      ** that writer, then the number of locks held by connections other
000414      ** than the writer must be about to drop to zero. In this case
000415      ** set the BTS_PENDING flag to 0.
000416      **
000417      ** If there is not currently a writer, then BTS_PENDING must
000418      ** be zero already. So this next line is harmless in that case.
000419      */
000420      pBt->btsFlags &= ~BTS_PENDING;
000421    }
000422  }
000423  
000424  /*
000425  ** This function changes all write-locks held by Btree p into read-locks.
000426  */
000427  static void downgradeAllSharedCacheTableLocks(Btree *p){
000428    BtShared *pBt = p->pBt;
000429    if( pBt->pWriter==p ){
000430      BtLock *pLock;
000431      pBt->pWriter = 0;
000432      pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
000433      for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
000434        assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
000435        pLock->eLock = READ_LOCK;
000436      }
000437    }
000438  }
000439  
000440  #endif /* SQLITE_OMIT_SHARED_CACHE */
000441  
000442  static void releasePage(MemPage *pPage);  /* Forward reference */
000443  
000444  /*
000445  ***** This routine is used inside of assert() only ****
000446  **
000447  ** Verify that the cursor holds the mutex on its BtShared
000448  */
000449  #ifdef SQLITE_DEBUG
static int cursorHoldsMutex(BtCursor *p){
  /* The mutex checked is the one stored on the BtShared the cursor points
  ** to; the result of sqlite3_mutex_held() is returned unchanged. */
  return sqlite3_mutex_held(p->pBt->mutex);
}
000453  
/* Verify that the cursor and the BtShared agree about what is the current
** database connection. This is important in shared-cache mode. If the
** database connection pointers get out-of-sync, it is possible for routines
** like btreeInitPage() to reference a stale connection pointer that
** references a connection that has already closed.  This routine is used
** inside assert() statements only and for the purpose of double-checking
** that the btree code does keep the database connection pointers up-to-date.
*/
000462  static int cursorOwnsBtShared(BtCursor *p){
000463    assert( cursorHoldsMutex(p) );
000464    return (p->pBtree->db==p->pBt->db);
000465  }
000466  #endif
000467  
/*
** Invalidate the overflow page-list cache of the cursor passed as the
** only argument by clearing its BTCF_ValidOvfl flag.
**
** The argument is parenthesized in the expansion so that any pointer
** expression (for example "pCur+1" or "&aCur[i]") may be passed, not just
** a plain identifier.
*/
#define invalidateOverflowCache(pCur) ((pCur)->curFlags &= ~BTCF_ValidOvfl)
000473  
000474  /*
000475  ** Invalidate the overflow page-list cache for all cursors opened
000476  ** on the shared btree structure pBt.
000477  */
000478  static void invalidateAllOverflowCache(BtShared *pBt){
000479    BtCursor *p;
000480    assert( sqlite3_mutex_held(pBt->mutex) );
000481    for(p=pBt->pCursor; p; p=p->pNext){
000482      invalidateOverflowCache(p);
000483    }
000484  }
000485  
000486  #ifndef SQLITE_OMIT_INCRBLOB
000487  /*
000488  ** This function is called before modifying the contents of a table
000489  ** to invalidate any incrblob cursors that are open on the
000490  ** row or one of the rows being modified.
000491  **
** If argument isClearTable is true, then the entire contents of a
** table are about to be deleted. In this case invalidate every incrblob
** cursor open on the shared btree.  This routine takes no root-page
** argument, so the cursors are not filtered by table.
000495  **
000496  ** Otherwise, if argument isClearTable is false, then the row with
000497  ** rowid iRow is being replaced or deleted. In this case invalidate
000498  ** only those incrblob cursors open on that specific row.
000499  */
000500  static void invalidateIncrblobCursors(
000501    Btree *pBtree,          /* The database file to check */
000502    i64 iRow,               /* The rowid that might be changing */
000503    int isClearTable        /* True if all rows are being deleted */
000504  ){
000505    BtCursor *p;
000506    if( pBtree->hasIncrblobCur==0 ) return;
000507    assert( sqlite3BtreeHoldsMutex(pBtree) );
000508    pBtree->hasIncrblobCur = 0;
000509    for(p=pBtree->pBt->pCursor; p; p=p->pNext){
000510      if( (p->curFlags & BTCF_Incrblob)!=0 ){
000511        pBtree->hasIncrblobCur = 1;
000512        if( isClearTable || p->info.nKey==iRow ){
000513          p->eState = CURSOR_INVALID;
000514        }
000515      }
000516    }
000517  }
000518  
000519  #else
000520    /* Stub function when INCRBLOB is omitted */
000521    #define invalidateIncrblobCursors(x,y,z)
000522  #endif /* SQLITE_OMIT_INCRBLOB */
000523  
000524  /*
000525  ** Set bit pgno of the BtShared.pHasContent bitvec. This is called 
000526  ** when a page that previously contained data becomes a free-list leaf 
000527  ** page.
000528  **
000529  ** The BtShared.pHasContent bitvec exists to work around an obscure
000530  ** bug caused by the interaction of two useful IO optimizations surrounding
000531  ** free-list leaf pages:
000532  **
000533  **   1) When all data is deleted from a page and the page becomes
000534  **      a free-list leaf page, the page is not written to the database
000535  **      (as free-list leaf pages contain no meaningful data). Sometimes
000536  **      such a page is not even journalled (as it will not be modified,
000537  **      why bother journalling it?).
000538  **
000539  **   2) When a free-list leaf page is reused, its content is not read
000540  **      from the database or written to the journal file (why should it
000541  **      be, if it is not at all meaningful?).
000542  **
000543  ** By themselves, these optimizations work fine and provide a handy
000544  ** performance boost to bulk delete or insert operations. However, if
000545  ** a page is moved to the free-list and then reused within the same
000546  ** transaction, a problem comes up. If the page is not journalled when
000547  ** it is moved to the free-list and it is also not journalled when it
000548  ** is extracted from the free-list and reused, then the original data
000549  ** may be lost. In the event of a rollback, it may not be possible
000550  ** to restore the database to its original configuration.
000551  **
000552  ** The solution is the BtShared.pHasContent bitvec. Whenever a page is 
000553  ** moved to become a free-list leaf page, the corresponding bit is
000554  ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
000555  ** optimization 2 above is omitted if the corresponding bit is already
000556  ** set in BtShared.pHasContent. The contents of the bitvec are cleared
000557  ** at the end of every transaction.
000558  */
000559  static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
000560    int rc = SQLITE_OK;
000561    if( !pBt->pHasContent ){
000562      assert( pgno<=pBt->nPage );
000563      pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
000564      if( !pBt->pHasContent ){
000565        rc = SQLITE_NOMEM_BKPT;
000566      }
000567    }
000568    if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
000569      rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
000570    }
000571    return rc;
000572  }
000573  
000574  /*
000575  ** Query the BtShared.pHasContent vector.
000576  **
000577  ** This function is called when a free-list leaf page is removed from the
000578  ** free-list for reuse. It returns false if it is safe to retrieve the
000579  ** page from the pager layer with the 'no-content' flag set. True otherwise.
000580  */
000581  static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
000582    Bitvec *p = pBt->pHasContent;
000583    return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
000584  }
000585  
000586  /*
000587  ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
000588  ** invoked at the conclusion of each write-transaction.
000589  */
000590  static void btreeClearHasContent(BtShared *pBt){
000591    sqlite3BitvecDestroy(pBt->pHasContent);
000592    pBt->pHasContent = 0;
000593  }
000594  
000595  /*
000596  ** Release all of the apPage[] pages for a cursor.
000597  */
000598  static void btreeReleaseAllCursorPages(BtCursor *pCur){
000599    int i;
000600    for(i=0; i<=pCur->iPage; i++){
000601      releasePage(pCur->apPage[i]);
000602      pCur->apPage[i] = 0;
000603    }
000604    pCur->iPage = -1;
000605  }
000606  
000607  /*
000608  ** The cursor passed as the only argument must point to a valid entry
000609  ** when this function is called (i.e. have eState==CURSOR_VALID). This
000610  ** function saves the current cursor key in variables pCur->nKey and
000611  ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error 
000612  ** code otherwise.
000613  **
000614  ** If the cursor is open on an intkey table, then the integer key
000615  ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to
000616  ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is 
000617  ** set to point to a malloced buffer pCur->nKey bytes in size containing 
000618  ** the key.
000619  */
000620  static int saveCursorKey(BtCursor *pCur){
000621    int rc = SQLITE_OK;
000622    assert( CURSOR_VALID==pCur->eState );
000623    assert( 0==pCur->pKey );
000624    assert( cursorHoldsMutex(pCur) );
000625  
000626    if( pCur->curIntKey ){
000627      /* Only the rowid is required for a table btree */
000628      pCur->nKey = sqlite3BtreeIntegerKey(pCur);
000629    }else{
000630      /* For an index btree, save the complete key content */
000631      void *pKey;
000632      pCur->nKey = sqlite3BtreePayloadSize(pCur);
000633      pKey = sqlite3Malloc( pCur->nKey );
000634      if( pKey ){
000635        rc = sqlite3BtreePayload(pCur, 0, (int)pCur->nKey, pKey);
000636        if( rc==SQLITE_OK ){
000637          pCur->pKey = pKey;
000638        }else{
000639          sqlite3_free(pKey);
000640        }
000641      }else{
000642        rc = SQLITE_NOMEM_BKPT;
000643      }
000644    }
000645    assert( !pCur->curIntKey || !pCur->pKey );
000646    return rc;
000647  }
000648  
000649  /*
000650  ** Save the current cursor position in the variables BtCursor.nKey 
000651  ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
000652  **
000653  ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
000654  ** prior to calling this routine.  
000655  */
000656  static int saveCursorPosition(BtCursor *pCur){
000657    int rc;
000658  
000659    assert( CURSOR_VALID==pCur->eState || CURSOR_SKIPNEXT==pCur->eState );
000660    assert( 0==pCur->pKey );
000661    assert( cursorHoldsMutex(pCur) );
000662  
000663    if( pCur->eState==CURSOR_SKIPNEXT ){
000664      pCur->eState = CURSOR_VALID;
000665    }else{
000666      pCur->skipNext = 0;
000667    }
000668  
000669    rc = saveCursorKey(pCur);
000670    if( rc==SQLITE_OK ){
000671      btreeReleaseAllCursorPages(pCur);
000672      pCur->eState = CURSOR_REQUIRESEEK;
000673    }
000674  
000675    pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl|BTCF_AtLast);
000676    return rc;
000677  }
000678  
000679  /* Forward reference */
000680  static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*);
000681  
000682  /*
000683  ** Save the positions of all cursors (except pExcept) that are open on
000684  ** the table with root-page iRoot.  "Saving the cursor position" means that
000685  ** the location in the btree is remembered in such a way that it can be
000686  ** moved back to the same spot after the btree has been modified.  This
000687  ** routine is called just before cursor pExcept is used to modify the
000688  ** table, for example in BtreeDelete() or BtreeInsert().
000689  **
** If there are two or more cursors on the same btree, then all such 
** cursors should have their BTCF_Multiple flag set.  The btreeCursor()
** routine enforces that rule.  This routine only needs to be called in
** the uncommon case when pExcept has the BTCF_Multiple flag set.
**
** If pExcept!=NULL and if no other cursors are found on the same root-page,
** then the BTCF_Multiple flag on pExcept is cleared, to avoid another
** pointless call to this routine.
**
** Implementation note:  This routine merely checks to see if any cursors
** need to be saved.  It calls out to saveCursorsOnList() in the (unusual)
** event that cursors are in need of being saved.
000702  */
000703  static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
000704    BtCursor *p;
000705    assert( sqlite3_mutex_held(pBt->mutex) );
000706    assert( pExcept==0 || pExcept->pBt==pBt );
000707    for(p=pBt->pCursor; p; p=p->pNext){
000708      if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break;
000709    }
000710    if( p ) return saveCursorsOnList(p, iRoot, pExcept);
000711    if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple;
000712    return SQLITE_OK;
000713  }
000714  
000715  /* This helper routine to saveAllCursors does the actual work of saving
000716  ** the cursors if and when a cursor is found that actually requires saving.
000717  ** The common case is that no cursors need to be saved, so this routine is
000718  ** broken out from its caller to avoid unnecessary stack pointer movement.
000719  */
000720  static int SQLITE_NOINLINE saveCursorsOnList(
000721    BtCursor *p,         /* The first cursor that needs saving */
000722    Pgno iRoot,          /* Only save cursor with this iRoot. Save all if zero */
000723    BtCursor *pExcept    /* Do not save this cursor */
000724  ){
000725    do{
000726      if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){
000727        if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
000728          int rc = saveCursorPosition(p);
000729          if( SQLITE_OK!=rc ){
000730            return rc;
000731          }
000732        }else{
000733          testcase( p->iPage>0 );
000734          btreeReleaseAllCursorPages(p);
000735        }
000736      }
000737      p = p->pNext;
000738    }while( p );
000739    return SQLITE_OK;
000740  }
000741  
000742  /*
000743  ** Clear the current cursor position.
000744  */
000745  void sqlite3BtreeClearCursor(BtCursor *pCur){
000746    assert( cursorHoldsMutex(pCur) );
000747    sqlite3_free(pCur->pKey);
000748    pCur->pKey = 0;
000749    pCur->eState = CURSOR_INVALID;
000750  }
000751  
000752  /*
000753  ** In this version of BtreeMoveto, pKey is a packed index record
000754  ** such as is generated by the OP_MakeRecord opcode.  Unpack the
000755  ** record and then call BtreeMovetoUnpacked() to do the work.
000756  */
000757  static int btreeMoveto(
000758    BtCursor *pCur,     /* Cursor open on the btree to be searched */
000759    const void *pKey,   /* Packed key if the btree is an index */
000760    i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
000761    int bias,           /* Bias search to the high end */
000762    int *pRes           /* Write search results here */
000763  ){
000764    int rc;                    /* Status code */
000765    UnpackedRecord *pIdxKey;   /* Unpacked index key */
000766  
000767    if( pKey ){
000768      assert( nKey==(i64)(int)nKey );
000769      pIdxKey = sqlite3VdbeAllocUnpackedRecord(pCur->pKeyInfo);
000770      if( pIdxKey==0 ) return SQLITE_NOMEM_BKPT;
000771      sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, pIdxKey);
000772      if( pIdxKey->nField==0 ){
000773        rc = SQLITE_CORRUPT_BKPT;
000774        goto moveto_done;
000775      }
000776    }else{
000777      pIdxKey = 0;
000778    }
000779    rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
000780  moveto_done:
000781    if( pIdxKey ){
000782      sqlite3DbFree(pCur->pKeyInfo->db, pIdxKey);
000783    }
000784    return rc;
000785  }
000786  
000787  /*
000788  ** Restore the cursor to the position it was in (or as close to as possible)
000789  ** when saveCursorPosition() was called. Note that this call deletes the 
000790  ** saved position info stored by saveCursorPosition(), so there can be
000791  ** at most one effective restoreCursorPosition() call after each 
000792  ** saveCursorPosition().
000793  */
000794  static int btreeRestoreCursorPosition(BtCursor *pCur){
000795    int rc;
000796    int skipNext;
000797    assert( cursorOwnsBtShared(pCur) );
000798    assert( pCur->eState>=CURSOR_REQUIRESEEK );
000799    if( pCur->eState==CURSOR_FAULT ){
000800      return pCur->skipNext;
000801    }
000802    pCur->eState = CURSOR_INVALID;
000803    rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext);
000804    if( rc==SQLITE_OK ){
000805      sqlite3_free(pCur->pKey);
000806      pCur->pKey = 0;
000807      assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
000808      pCur->skipNext |= skipNext;
000809      if( pCur->skipNext && pCur->eState==CURSOR_VALID ){
000810        pCur->eState = CURSOR_SKIPNEXT;
000811      }
000812    }
000813    return rc;
000814  }
000815  
/*
** Restore the position of cursor p, but only if that is required.  If
** the cursor state is CURSOR_REQUIRESEEK or greater the real work is done
** by btreeRestoreCursorPosition() and its result code is the value of
** the expression.  Otherwise the expression evaluates to SQLITE_OK.
**
** The argument is parenthesized so that any pointer-valued expression may
** be passed.  Note that the argument is evaluated twice, so it must not
** have side effects.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)
000820  
000821  /*
000822  ** Determine whether or not a cursor has moved from the position where
000823  ** it was last placed, or has been invalidated for any other reason.
000824  ** Cursors can move when the row they are pointing at is deleted out
000825  ** from under them, for example.  Cursor might also move if a btree
000826  ** is rebalanced.
000827  **
000828  ** Calling this routine with a NULL cursor pointer returns false.
000829  **
000830  ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor
000831  ** back to where it ought to be if this routine returns true.
000832  */
000833  int sqlite3BtreeCursorHasMoved(BtCursor *pCur){
000834    return pCur->eState!=CURSOR_VALID;
000835  }
000836  
000837  /*
000838  ** This routine restores a cursor back to its original position after it
000839  ** has been moved by some outside activity (such as a btree rebalance or
000840  ** a row having been deleted out from under the cursor).  
000841  **
000842  ** On success, the *pDifferentRow parameter is false if the cursor is left
000843  ** pointing at exactly the same row.  *pDifferntRow is the row the cursor
000844  ** was pointing to has been deleted, forcing the cursor to point to some
000845  ** nearby row.
000846  **
000847  ** This routine should only be called for a cursor that just returned
000848  ** TRUE from sqlite3BtreeCursorHasMoved().
000849  */
000850  int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){
000851    int rc;
000852  
000853    assert( pCur!=0 );
000854    assert( pCur->eState!=CURSOR_VALID );
000855    rc = restoreCursorPosition(pCur);
000856    if( rc ){
000857      *pDifferentRow = 1;
000858      return rc;
000859    }
000860    if( pCur->eState!=CURSOR_VALID ){
000861      *pDifferentRow = 1;
000862    }else{
000863      assert( pCur->skipNext==0 );
000864      *pDifferentRow = 0;
000865    }
000866    return SQLITE_OK;
000867  }
000868  
000869  #ifdef SQLITE_ENABLE_CURSOR_HINTS
/*
** Provide hints to the cursor.  The particular hint given (and the type
** and number of the varargs parameters) is determined by the eHintType
** parameter.  See the definitions of the BTREE_HINT_* macros for details.
**
** This implementation ignores all of its arguments and does nothing.
** The routine is compiled only when SQLITE_ENABLE_CURSOR_HINTS is defined.
*/
void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){
  /* Intentionally empty.  Hints are used only by systems that substitute
  ** their own storage engine. */
}
000878  #endif
000879  
000880  /*
000881  ** Provide flag hints to the cursor.
000882  */
000883  void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){
000884    assert( x==BTREE_SEEK_EQ || x==BTREE_BULKLOAD || x==0 );
000885    pCur->hints = x;
000886  }
000887  
000888  
000889  #ifndef SQLITE_OMIT_AUTOVACUUM
000890  /*
000891  ** Given a page number of a regular database page, return the page
000892  ** number for the pointer-map page that contains the entry for the
000893  ** input page number.
000894  **
000895  ** Return 0 (not a valid page) for pgno==1 since there is
000896  ** no pointer map associated with page 1.  The integrity_check logic
000897  ** requires that ptrmapPageno(*,1)!=1.
000898  */
000899  static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
000900    int nPagesPerMapPage;
000901    Pgno iPtrMap, ret;
000902    assert( sqlite3_mutex_held(pBt->mutex) );
000903    if( pgno<2 ) return 0;
000904    nPagesPerMapPage = (pBt->usableSize/5)+1;
000905    iPtrMap = (pgno-2)/nPagesPerMapPage;
000906    ret = (iPtrMap*nPagesPerMapPage) + 2; 
000907    if( ret==PENDING_BYTE_PAGE(pBt) ){
000908      ret++;
000909    }
000910    return ret;
000911  }
000912  
000913  /*
000914  ** Write an entry into the pointer map.
000915  **
000916  ** This routine updates the pointer map entry for page number 'key'
000917  ** so that it maps to type 'eType' and parent page number 'pgno'.
000918  **
000919  ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
000920  ** a no-op.  If an error occurs, the appropriate error code is written
000921  ** into *pRC.
000922  */
000923  static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
000924    DbPage *pDbPage;  /* The pointer map page */
000925    u8 *pPtrmap;      /* The pointer map data */
000926    Pgno iPtrmap;     /* The pointer map page number */
000927    int offset;       /* Offset in pointer map page */
000928    int rc;           /* Return code from subfunctions */
000929  
000930    if( *pRC ) return;
000931  
000932    assert( sqlite3_mutex_held(pBt->mutex) );
000933    /* The master-journal page number must never be used as a pointer map page */
000934    assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
000935  
000936    assert( pBt->autoVacuum );
000937    if( key==0 ){
000938      *pRC = SQLITE_CORRUPT_BKPT;
000939      return;
000940    }
000941    iPtrmap = PTRMAP_PAGENO(pBt, key);
000942    rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
000943    if( rc!=SQLITE_OK ){
000944      *pRC = rc;
000945      return;
000946    }
000947    offset = PTRMAP_PTROFFSET(iPtrmap, key);
000948    if( offset<0 ){
000949      *pRC = SQLITE_CORRUPT_BKPT;
000950      goto ptrmap_exit;
000951    }
000952    assert( offset <= (int)pBt->usableSize-5 );
000953    pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
000954  
000955    if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
000956      TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
000957      *pRC= rc = sqlite3PagerWrite(pDbPage);
000958      if( rc==SQLITE_OK ){
000959        pPtrmap[offset] = eType;
000960        put4byte(&pPtrmap[offset+1], parent);
000961      }
000962    }
000963  
000964  ptrmap_exit:
000965    sqlite3PagerUnref(pDbPage);
000966  }
000967  
000968  /*
000969  ** Read an entry from the pointer map.
000970  **
000971  ** This routine retrieves the pointer map entry for page 'key', writing
000972  ** the type and parent page number to *pEType and *pPgno respectively.
000973  ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
000974  */
000975  static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
000976    DbPage *pDbPage;   /* The pointer map page */
000977    int iPtrmap;       /* Pointer map page index */
000978    u8 *pPtrmap;       /* Pointer map page data */
000979    int offset;        /* Offset of entry in pointer map */
000980    int rc;
000981  
000982    assert( sqlite3_mutex_held(pBt->mutex) );
000983  
000984    iPtrmap = PTRMAP_PAGENO(pBt, key);
000985    rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
000986    if( rc!=0 ){
000987      return rc;
000988    }
000989    pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
000990  
000991    offset = PTRMAP_PTROFFSET(iPtrmap, key);
000992    if( offset<0 ){
000993      sqlite3PagerUnref(pDbPage);
000994      return SQLITE_CORRUPT_BKPT;
000995    }
000996    assert( offset <= (int)pBt->usableSize-5 );
000997    assert( pEType!=0 );
000998    *pEType = pPtrmap[offset];
000999    if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
001000  
001001    sqlite3PagerUnref(pDbPage);
001002    if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
001003    return SQLITE_OK;
001004  }
001005  
001006  #else /* if defined SQLITE_OMIT_AUTOVACUUM */
  /* Stand-ins used when SQLITE_OMIT_AUTOVACUUM is defined.  ptrmapPut()
  ** and ptrmapPutOvflPtr() expand to nothing, and ptrmapGet() expands to
  ** the constant SQLITE_OK.  None of the macro arguments are evaluated. */
  #define ptrmapPut(w,x,y,z,rc)
  #define ptrmapGet(w,x,y,z) SQLITE_OK
  #define ptrmapPutOvflPtr(x, y, rc)
001010  #endif
001011  
/*
** findCell(P,I) evaluates to a pointer to the content of cell number I
** on btree page P (0 means the first cell on the page, 1 means the second
** cell, and so forth).  The offset of the cell is read from the I-th
** 2-byte entry of the cell pointer array P->aCellIdx, masked with
** P->maskPage, and used as an index into the page buffer P->aData.
**
** findCellPastPtr(P,I) does the same except that it indexes P->aDataOfst
** instead of P->aData, which skips past the initial 4-byte child pointer
** found on interior pages, if there is one.
**
** These macros work only for pages that do not contain overflow cells.
** P is evaluated more than once, so it must not have side effects.
*/
#define findCell(P,I) \
  (&(P)->aData[(P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])])
#define findCellPastPtr(P,I) \
  (&(P)->aDataOfst[(P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])])
001026  
001027  
001028  /*
001029  ** This is common tail processing for btreeParseCellPtr() and
001030  ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely
001031  ** on a single B-tree page.  Make necessary adjustments to the CellInfo
001032  ** structure.
001033  */
001034  static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow(
001035    MemPage *pPage,         /* Page containing the cell */
001036    u8 *pCell,              /* Pointer to the cell text. */
001037    CellInfo *pInfo         /* Fill in this structure */
001038  ){
001039    /* If the payload will not fit completely on the local page, we have
001040    ** to decide how much to store locally and how much to spill onto
001041    ** overflow pages.  The strategy is to minimize the amount of unused
001042    ** space on overflow pages while keeping the amount of local storage
001043    ** in between minLocal and maxLocal.
001044    **
001045    ** Warning:  changing the way overflow payload is distributed in any
001046    ** way will result in an incompatible file format.
001047    */
001048    int minLocal;  /* Minimum amount of payload held locally */
001049    int maxLocal;  /* Maximum amount of payload held locally */
001050    int surplus;   /* Overflow payload available for local storage */
001051  
001052    minLocal = pPage->minLocal;
001053    maxLocal = pPage->maxLocal;
001054    surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4);
001055    testcase( surplus==maxLocal );
001056    testcase( surplus==maxLocal+1 );
001057    if( surplus <= maxLocal ){
001058      pInfo->nLocal = (u16)surplus;
001059    }else{
001060      pInfo->nLocal = (u16)minLocal;
001061    }
001062    pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4;
001063  }
001064  
001065  /*
001066  ** The following routines are implementations of the MemPage.xParseCell()
001067  ** method.
001068  **
001069  ** Parse a cell content block and fill in the CellInfo structure.
001070  **
001071  ** btreeParseCellPtr()        =>   table btree leaf nodes
001072  ** btreeParseCellNoPayload()  =>   table btree internal nodes
001073  ** btreeParseCellPtrIndex()   =>   index btree nodes
001074  **
001075  ** There is also a wrapper function btreeParseCell() that works for
001076  ** all MemPage types and that references the cell by index rather than
001077  ** by pointer.
001078  */
001079  static void btreeParseCellPtrNoPayload(
001080    MemPage *pPage,         /* Page containing the cell */
001081    u8 *pCell,              /* Pointer to the cell text. */
001082    CellInfo *pInfo         /* Fill in this structure */
001083  ){
001084    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001085    assert( pPage->leaf==0 );
001086    assert( pPage->childPtrSize==4 );
001087  #ifndef SQLITE_DEBUG
001088    UNUSED_PARAMETER(pPage);
001089  #endif
001090    pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey);
001091    pInfo->nPayload = 0;
001092    pInfo->nLocal = 0;
001093    pInfo->pPayload = 0;
001094    return;
001095  }
/*
** xParseCell() implementation for pages on which pPage->intKeyLeaf is set
** and childPtrSize is zero.  A cell on such a page begins with a varint
** payload size, followed by a varint integer key, followed by the payload.
**
** On return pInfo->nKey holds the key, pInfo->nPayload the total payload
** size, and pInfo->pPayload points at the first payload byte.  If the
** payload is no larger than pPage->maxLocal then it is entirely local:
** nLocal equals the payload size and nSize is the header length plus the
** payload, but never less than 4.  Otherwise nLocal and nSize are computed
** by btreeParseCellAdjustSizeForOverflow().
*/
static void btreeParseCellPtr(
  MemPage *pPage,         /* Page containing the cell */
  u8 *pCell,              /* Pointer to the cell text. */
  CellInfo *pInfo         /* Fill in this structure */
){
  u8 *pIter;              /* For scanning through pCell */
  u32 nPayload;           /* Number of bytes of cell payload */
  u64 iKey;               /* Extracted Key value */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( pPage->leaf==0 || pPage->leaf==1 );
  assert( pPage->intKeyLeaf );
  assert( pPage->childPtrSize==0 );
  pIter = pCell;

  /* The next block of code is equivalent to:
  **
  **     pIter += getVarint32(pIter, nPayload);
  **
  ** The code is inlined to avoid a function call.  Each continuation byte
  ** contributes its low 7 bits.  Scanning stops at the first byte whose
  ** high bit is clear, or once pIter reaches pEnd (8 bytes past the first
  ** byte), so a malformed varint cannot run on indefinitely.
  */
  nPayload = *pIter;
  if( nPayload>=0x80 ){
    u8 *pEnd = &pIter[8];
    nPayload &= 0x7f;
    do{
      nPayload = (nPayload<<7) | (*++pIter & 0x7f);
    }while( (*pIter)>=0x80 && pIter<pEnd );
  }
  pIter++;

  /* The next block of code is equivalent to:
  **
  **     pIter += getVarint(pIter, (u64*)&pInfo->nKey);
  **
  ** The code is inlined to avoid a function call.  Each of the first eight
  ** bytes contributes 7 bits.  If the eighth byte still has its high bit
  ** set, then the ninth byte contributes all 8 of its bits and ends
  ** the varint.
  */
  iKey = *pIter;
  if( iKey>=0x80 ){
    u8 *pEnd = &pIter[7];
    iKey &= 0x7f;
    while(1){
      iKey = (iKey<<7) | (*++pIter & 0x7f);
      if( (*pIter)<0x80 ) break;
      if( pIter>=pEnd ){
        iKey = (iKey<<8) | *++pIter;
        break;
      }
    }
  }
  pIter++;

  /* Reinterpret the 64 decoded bits as a signed key */
  pInfo->nKey = *(i64*)&iKey;
  pInfo->nPayload = nPayload;
  pInfo->pPayload = pIter;
  testcase( nPayload==pPage->maxLocal );
  testcase( nPayload==pPage->maxLocal+1 );
  if( nPayload<=pPage->maxLocal ){
    /* This is the (easy) common case where the entire payload fits
    ** on the local page.  No overflow is required.
    */
    pInfo->nSize = nPayload + (u16)(pIter - pCell);
    if( pInfo->nSize<4 ) pInfo->nSize = 4;
    pInfo->nLocal = (u16)nPayload;
  }else{
    btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
  }
}
001164  static void btreeParseCellPtrIndex(
001165    MemPage *pPage,         /* Page containing the cell */
001166    u8 *pCell,              /* Pointer to the cell text. */
001167    CellInfo *pInfo         /* Fill in this structure */
001168  ){
001169    u8 *pIter;              /* For scanning through pCell */
001170    u32 nPayload;           /* Number of bytes of cell payload */
001171  
001172    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001173    assert( pPage->leaf==0 || pPage->leaf==1 );
001174    assert( pPage->intKeyLeaf==0 );
001175    pIter = pCell + pPage->childPtrSize;
001176    nPayload = *pIter;
001177    if( nPayload>=0x80 ){
001178      u8 *pEnd = &pIter[8];
001179      nPayload &= 0x7f;
001180      do{
001181        nPayload = (nPayload<<7) | (*++pIter & 0x7f);
001182      }while( *(pIter)>=0x80 && pIter<pEnd );
001183    }
001184    pIter++;
001185    pInfo->nKey = nPayload;
001186    pInfo->nPayload = nPayload;
001187    pInfo->pPayload = pIter;
001188    testcase( nPayload==pPage->maxLocal );
001189    testcase( nPayload==pPage->maxLocal+1 );
001190    if( nPayload<=pPage->maxLocal ){
001191      /* This is the (easy) common case where the entire payload fits
001192      ** on the local page.  No overflow is required.
001193      */
001194      pInfo->nSize = nPayload + (u16)(pIter - pCell);
001195      if( pInfo->nSize<4 ) pInfo->nSize = 4;
001196      pInfo->nLocal = (u16)nPayload;
001197    }else{
001198      btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
001199    }
001200  }
001201  static void btreeParseCell(
001202    MemPage *pPage,         /* Page containing the cell */
001203    int iCell,              /* The cell index.  First cell is 0 */
001204    CellInfo *pInfo         /* Fill in this structure */
001205  ){
001206    pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo);
001207  }
001208  
/*
** The following routines are implementations of the MemPage.xCellSize
** method.
**
** Compute the total number of bytes that a Cell needs in the cell
** data area of the btree-page.  The return number includes the cell
** data header and the local payload, but not any overflow page or
** the space used by the cell pointer.
**
** cellSizePtrNoPayload()    =>   table internal nodes
** cellSizePtr()             =>   all index nodes & table leaf nodes
**
** cellSizePtr() skips pPage->childPtrSize bytes of child pointer, decodes
** the payload-size varint, and, if pPage->intKey is set, steps over the
** integer-key varint.  The size is then computed with the same local
** versus overflow rule that the xParseCell routines use.
*/
static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
  u8 *pIter = pCell + pPage->childPtrSize; /* For looping over bytes of pCell */
  u8 *pEnd;                                /* End mark for a varint */
  u32 nSize;                               /* Size value to return */

#ifdef SQLITE_DEBUG
  /* The value returned by this function should always be the same as
  ** the (CellInfo.nSize) value found by doing a full parse of the
  ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
  ** this function verifies that this invariant is not violated. */
  CellInfo debuginfo;
  pPage->xParseCell(pPage, pCell, &debuginfo);
#endif

  /* Decode the payload-size varint into nSize.  Scanning stops at the
  ** first byte whose high bit is clear, or once pIter reaches pEnd. */
  nSize = *pIter;
  if( nSize>=0x80 ){
    pEnd = &pIter[8];
    nSize &= 0x7f;
    do{
      nSize = (nSize<<7) | (*++pIter & 0x7f);
    }while( *(pIter)>=0x80 && pIter<pEnd );
  }
  pIter++;
  if( pPage->intKey ){
    /* pIter now points at the 64-bit integer key value, a variable length 
    ** integer. The following block moves pIter to point at the first byte
    ** past the end of the key value.  At most 9 bytes are stepped over. */
    pEnd = &pIter[9];
    while( (*pIter++)&0x80 && pIter<pEnd );
  }
  testcase( nSize==pPage->maxLocal );
  testcase( nSize==pPage->maxLocal+1 );
  if( nSize<=pPage->maxLocal ){
    /* Entire payload is local: header bytes plus payload, minimum of 4 */
    nSize += (u32)(pIter - pCell);
    if( nSize<4 ) nSize = 4;
  }else{
    /* Payload overflows: work out how much of it is stored locally, then
    ** add the header bytes and the 4-byte overflow page number. */
    int minLocal = pPage->minLocal;
    nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
    testcase( nSize==pPage->maxLocal );
    testcase( nSize==pPage->maxLocal+1 );
    if( nSize>pPage->maxLocal ){
      nSize = minLocal;
    }
    nSize += 4 + (u16)(pIter - pCell);
  }
  assert( nSize==debuginfo.nSize || CORRUPT_DB );
  return (u16)nSize;
}
001269  static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){
001270    u8 *pIter = pCell + 4; /* For looping over bytes of pCell */
001271    u8 *pEnd;              /* End mark for a varint */
001272  
001273  #ifdef SQLITE_DEBUG
001274    /* The value returned by this function should always be the same as
001275    ** the (CellInfo.nSize) value found by doing a full parse of the
001276    ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
001277    ** this function verifies that this invariant is not violated. */
001278    CellInfo debuginfo;
001279    pPage->xParseCell(pPage, pCell, &debuginfo);
001280  #else
001281    UNUSED_PARAMETER(pPage);
001282  #endif
001283  
001284    assert( pPage->childPtrSize==4 );
001285    pEnd = pIter + 9;
001286    while( (*pIter++)&0x80 && pIter<pEnd );
001287    assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB );
001288    return (u16)(pIter - pCell);
001289  }
001290  
001291  
001292  #ifdef SQLITE_DEBUG
001293  /* This variation on cellSizePtr() is used inside of assert() statements
001294  ** only. */
001295  static u16 cellSize(MemPage *pPage, int iCell){
001296    return pPage->xCellSize(pPage, findCell(pPage, iCell));
001297  }
001298  #endif
001299  
001300  #ifndef SQLITE_OMIT_AUTOVACUUM
001301  /*
001302  ** If the cell pCell, part of page pPage contains a pointer
001303  ** to an overflow page, insert an entry into the pointer-map
001304  ** for the overflow page.
001305  */
001306  static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){
001307    CellInfo info;
001308    if( *pRC ) return;
001309    assert( pCell!=0 );
001310    pPage->xParseCell(pPage, pCell, &info);
001311    if( info.nLocal<info.nPayload ){
001312      Pgno ovfl = get4byte(&pCell[info.nSize-4]);
001313      ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
001314    }
001315  }
001316  #endif
001317  
001318  
001319  /*
001320  ** Defragment the page given.  All Cells are moved to the
001321  ** end of the page and all free space is collected into one
001322  ** big FreeBlk that occurs in between the header and cell
001323  ** pointer array and the cell content area.
001324  **
001325  ** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a
001326  ** b-tree page so that there are no freeblocks or fragment bytes, all
001327  ** unused bytes are contained in the unallocated space region, and all
001328  ** cells are packed tightly at the end of the page.
001329  */
001330  static int defragmentPage(MemPage *pPage){
001331    int i;                     /* Loop counter */
001332    int pc;                    /* Address of the i-th cell */
001333    int hdr;                   /* Offset to the page header */
001334    int size;                  /* Size of a cell */
001335    int usableSize;            /* Number of usable bytes on a page */
001336    int cellOffset;            /* Offset to the cell pointer array */
001337    int cbrk;                  /* Offset to the cell content area */
001338    int nCell;                 /* Number of cells on the page */
001339    unsigned char *data;       /* The page data */
001340    unsigned char *temp;       /* Temp area for cell content */
001341    unsigned char *src;        /* Source of content */
001342    int iCellFirst;            /* First allowable cell index */
001343    int iCellLast;             /* Last possible cell index */
001344  
001345  
001346    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
001347    assert( pPage->pBt!=0 );
001348    assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
001349    assert( pPage->nOverflow==0 );
001350    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001351    temp = 0;
001352    src = data = pPage->aData;
001353    hdr = pPage->hdrOffset;
001354    cellOffset = pPage->cellOffset;
001355    nCell = pPage->nCell;
001356    assert( nCell==get2byte(&data[hdr+3]) );
001357    usableSize = pPage->pBt->usableSize;
001358    cbrk = usableSize;
001359    iCellFirst = cellOffset + 2*nCell;
001360    iCellLast = usableSize - 4;
001361    for(i=0; i<nCell; i++){
001362      u8 *pAddr;     /* The i-th cell pointer */
001363      pAddr = &data[cellOffset + i*2];
001364      pc = get2byte(pAddr);
001365      testcase( pc==iCellFirst );
001366      testcase( pc==iCellLast );
001367      /* These conditions have already been verified in btreeInitPage()
001368      ** if PRAGMA cell_size_check=ON.
001369      */
001370      if( pc<iCellFirst || pc>iCellLast ){
001371        return SQLITE_CORRUPT_BKPT;
001372      }
001373      assert( pc>=iCellFirst && pc<=iCellLast );
001374      size = pPage->xCellSize(pPage, &src[pc]);
001375      cbrk -= size;
001376      if( cbrk<iCellFirst || pc+size>usableSize ){
001377        return SQLITE_CORRUPT_BKPT;
001378      }
001379      assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
001380      testcase( cbrk+size==usableSize );
001381      testcase( pc+size==usableSize );
001382      put2byte(pAddr, cbrk);
001383      if( temp==0 ){
001384        int x;
001385        if( cbrk==pc ) continue;
001386        temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
001387        x = get2byte(&data[hdr+5]);
001388        memcpy(&temp[x], &data[x], (cbrk+size) - x);
001389        src = temp;
001390      }
001391      memcpy(&data[cbrk], &src[pc], size);
001392    }
001393    assert( cbrk>=iCellFirst );
001394    put2byte(&data[hdr+5], cbrk);
001395    data[hdr+1] = 0;
001396    data[hdr+2] = 0;
001397    data[hdr+7] = 0;
001398    memset(&data[iCellFirst], 0, cbrk-iCellFirst);
001399    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
001400    if( cbrk-iCellFirst!=pPage->nFree ){
001401      return SQLITE_CORRUPT_BKPT;
001402    }
001403    return SQLITE_OK;
001404  }
001405  
/*
** Search the free-list on page pPg for space to store a cell nByte bytes in
** size. If one can be found, return a pointer to the space and remove it
** from the free-list.
**
** If no suitable space can be found on the free-list, return NULL.
**
** This function may detect corruption within pPg.  If corruption is
** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned.
** *pRc is not written in any other case.
**
** The first slot that is at least nByte bytes in size is the one that is
** used.  If that slot is 4 or more bytes larger than nByte, the slot stays
** on the free-list with its size reduced and the allocation is taken from
** its tail end.  If it is less than 4 bytes larger, the whole slot is
** unlinked and the leftover bytes are added to the fragment count held in
** the page header.  However, if the fragment count is already greater
** than 57 in that case, the search is abandoned and NULL is returned.
**
** The caller must ensure that the free-list is not empty.
*/
static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){
  const int hdr = pPg->hdrOffset;
  u8 * const aData = pPg->aData;
  int iAddr = hdr + 1;               /* Offset of the pointer to slot pc */
  int pc = get2byte(&aData[iAddr]);  /* Offset of the slot being examined */
  int x;                             /* Excess of slot size over nByte */
  int usableSize = pPg->pBt->usableSize;

  assert( pc>0 );
  do{
    int size;            /* Size of the free slot */
    /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
    ** increasing offset. */
    if( pc>usableSize-4 || pc<iAddr+4 ){
      *pRc = SQLITE_CORRUPT_BKPT;
      return 0;
    }
    /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each
    ** freeblock form a big-endian integer which is the size of the freeblock
    ** in bytes, including the 4-byte header. */
    size = get2byte(&aData[pc+2]);
    if( (x = size - nByte)>=0 ){
      testcase( x==4 );
      testcase( x==3 );
      /* The slot must not begin inside the cell pointer array, and must
      ** not extend past the end of the usable area. */
      if( pc < pPg->cellOffset+2*pPg->nCell || size+pc > usableSize ){
        *pRc = SQLITE_CORRUPT_BKPT;
        return 0;
      }else if( x<4 ){
        /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total
        ** number of bytes in fragments may not exceed 60. */
        if( aData[hdr+7]>57 ) return 0;

        /* Remove the slot from the free-list. Update the number of
        ** fragmented bytes within the page. */
        memcpy(&aData[iAddr], &aData[pc], 2);
        aData[hdr+7] += (u8)x;
      }else{
        /* The slot remains on the free-list. Reduce its size to account
         ** for the portion used by the new allocation. */
        put2byte(&aData[pc+2], x);
      }
      /* The allocation is the last nByte bytes of the slot */
      return &aData[pc + x];
    }
    /* Slot is too small.  Advance to the next slot on the list. */
    iAddr = pc;
    pc = get2byte(&aData[pc]);
  }while( pc );

  return 0;
}
001469  
/*
** Allocate nByte bytes of space from within the B-Tree page passed
** as the first argument. Write into *pIdx the index into pPage->aData[]
** of the first byte of allocated space. Return either SQLITE_OK or
** an error code (usually SQLITE_CORRUPT).
**
** The caller guarantees that there is sufficient space to make the
** allocation.  This routine might need to defragment in order to bring
** all the space together, however.  This routine will avoid using
** the first two bytes past the cell pointer area since presumably this
** allocation is being made in order to insert a new cell, so we will
** also end up needing a new cell pointer.
**
** The space is obtained in one of two ways.  If the page has a free-list
** and there is room for one more cell pointer, a slot from the free-list
** is used if pageFindSlot() can find one.  Otherwise the space is carved
** off the bottom of the cell content area, after first defragmenting the
** page if the gap between the cell pointer array and the cell content is
** too small.
**
** *pIdx is written only when SQLITE_OK is returned.
*/
static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
  const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */
  u8 * const data = pPage->aData;      /* Local cache of pPage->aData */
  int top;                             /* First byte of cell content area */
  int rc = SQLITE_OK;                  /* Integer return code */
  int gap;        /* First byte of gap between cell pointers and cell content */
  
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( pPage->pBt );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( nByte>=0 );  /* Minimum cell size is 4 */
  assert( pPage->nFree>=nByte );
  assert( pPage->nOverflow==0 );
  assert( nByte < (int)(pPage->pBt->usableSize-8) );

  assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
  gap = pPage->cellOffset + 2*pPage->nCell;
  assert( gap<=65536 );
  /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size
  ** and the reserved space is zero (the usual value for reserved space)
  ** then the cell content offset of an empty page wants to be 65536.
  ** However, that integer is too large to be stored in a 2-byte unsigned
  ** integer, so a value of 0 is used in its place. */
  top = get2byte(&data[hdr+5]);
  assert( top<=(int)pPage->pBt->usableSize ); /* Prevent by getAndInitPage() */
  if( gap>top ){
    /* The cell pointer array appears to overlap the cell content.  That
    ** is legitimate only for the zero-means-65536 encoding. */
    if( top==0 && pPage->pBt->usableSize==65536 ){
      top = 65536;
    }else{
      return SQLITE_CORRUPT_BKPT;
    }
  }

  /* If there is enough space between gap and top for one more cell pointer
  ** array entry offset, and if the freelist is not empty, then search the
  ** freelist looking for a free slot big enough to satisfy the request.
  */
  testcase( gap+2==top );
  testcase( gap+1==top );
  testcase( gap==top );
  if( (data[hdr+2] || data[hdr+1]) && gap+2<=top ){
    u8 *pSpace = pageFindSlot(pPage, nByte, &rc);
    if( pSpace ){
      assert( pSpace>=data && (pSpace - data)<65536 );
      *pIdx = (int)(pSpace - data);
      return SQLITE_OK;
    }else if( rc ){
      /* pageFindSlot() detected corruption */
      return rc;
    }
  }

  /* The request could not be fulfilled using a freelist slot.  Check
  ** to see if defragmentation is necessary.
  */
  testcase( gap+2+nByte==top );
  if( gap+2+nByte>top ){
    assert( pPage->nCell>0 || CORRUPT_DB );
    rc = defragmentPage(pPage);
    if( rc ) return rc;
    /* Re-read the start of the cell content area, which has moved */
    top = get2byteNotZero(&data[hdr+5]);
    assert( gap+nByte<=top );
  }


  /* Allocate memory from the gap in between the cell pointer array
  ** and the cell content area.  The btreeInitPage() call has already
  ** validated the freelist.  Given that the freelist is valid, there
  ** is no way that the allocation can extend off the end of the page.
  ** The assert() below verifies the previous sentence.
  */
  top -= nByte;
  put2byte(&data[hdr+5], top);
  assert( top+nByte <= (int)pPage->pBt->usableSize );
  *pIdx = top;
  return SQLITE_OK;
}
001559  
/*
** Return a section of the pPage->aData to the freelist.
** The first byte of the new free block is pPage->aData[iStart]
** and the size of the block is iSize bytes.
**
** Adjacent freeblocks are coalesced.  A neighbouring freeblock that is
** separated from the released range by no more than 3 bytes is also
** merged, and the bytes in between are subtracted from the fragment
** counter stored at data[hdr+7].
**
** Note that even though the freeblock list was checked by btreeInitPage(),
** that routine will not detect overlap between cells or freeblocks.  Nor
** does it detect cells or freeblocks that encroach into the reserved bytes
** at the end of the page.  So do additional corruption checks inside this
** routine and return SQLITE_CORRUPT if any problems are found.
**
** On success, pPage->nFree is increased by the original value of iSize
** and SQLITE_OK is returned.  On a corruption error pPage->nFree is not
** changed.
*/
static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){
  u16 iPtr;                             /* Address of ptr to next freeblock */
  u16 iFreeBlk;                         /* Address of the next freeblock */
  u8 hdr;                               /* Page header offset.  0 or 100 */
  u8 nFrag = 0;                         /* Reduction in fragmentation */
  u16 iOrigSize = iSize;                /* Original value of iSize */
  u32 iLast = pPage->pBt->usableSize-4; /* Largest possible freeblock offset */
  u32 iEnd = iStart + iSize;            /* First byte past the iStart buffer */
  unsigned char *data = pPage->aData;   /* Page content */

  assert( pPage->pBt!=0 );
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( CORRUPT_DB || iStart>=pPage->hdrOffset+6+pPage->childPtrSize );
  assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( iSize>=4 );   /* Minimum cell size is 4 */
  assert( iStart<=iLast );

  /* Overwrite deleted information with zeros when the secure_delete
  ** option is enabled.  Only the range supplied by the caller is cleared;
  ** this happens before any coalescing changes iStart or iSize. */
  if( pPage->pBt->btsFlags & BTS_SECURE_DELETE ){
    memset(&data[iStart], 0, iSize);
  }

  /* The list of freeblocks must be in ascending order.  Find the 
  ** spot on the list where iStart should be inserted.
  */
  hdr = pPage->hdrOffset;
  iPtr = hdr + 1;
  if( data[iPtr+1]==0 && data[iPtr]==0 ){
    iFreeBlk = 0;  /* Shortcut for the case when the freelist is empty */
  }else{
    /* Walk the list until reaching the first freeblock at or after iStart.
    ** Each link must advance by at least 4 bytes (the size of a freeblock
    ** header) or else be the zero terminator; anything else is corrupt. */
    while( (iFreeBlk = get2byte(&data[iPtr]))<iStart ){
      if( iFreeBlk<iPtr+4 ){
        if( iFreeBlk==0 ) break;
        return SQLITE_CORRUPT_BKPT;
      }
      iPtr = iFreeBlk;
    }
    /* A freeblock needs 4 bytes for its header, so it cannot begin
    ** later than usableSize-4. */
    if( iFreeBlk>iLast ) return SQLITE_CORRUPT_BKPT;
    assert( iFreeBlk>iPtr || iFreeBlk==0 );
  
    /* At this point:
    **    iFreeBlk:   First freeblock after iStart, or zero if none
    **    iPtr:       The address of a pointer to iFreeBlk
    **
    ** Check to see if iFreeBlk should be coalesced onto the end of iStart.
    */
    if( iFreeBlk && iEnd+3>=iFreeBlk ){
      /* nFrag is only meaningful when iEnd<=iFreeBlk; the overlap case
      ** is rejected on the very next line before nFrag is ever used. */
      nFrag = iFreeBlk - iEnd;
      if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_BKPT;
      iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);
      if( iEnd > pPage->pBt->usableSize ) return SQLITE_CORRUPT_BKPT;
      iSize = iEnd - iStart;
      /* Skip over the absorbed freeblock: link to its successor */
      iFreeBlk = get2byte(&data[iFreeBlk]);
    }
  
    /* If iPtr is another freeblock (that is, if iPtr is not the freelist
    ** pointer in the page header) then check to see if iStart should be
    ** coalesced onto the end of iPtr.
    */
    if( iPtr>hdr+1 ){
      int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);
      if( iPtrEnd+3>=iStart ){
        if( iPtrEnd>iStart ) return SQLITE_CORRUPT_BKPT;
        nFrag += iStart - iPtrEnd;
        /* The merged block now begins where the preceding freeblock did */
        iSize = iEnd - iPtr;
        iStart = iPtr;
      }
    }
    /* The fragment counter in the header cannot go below zero */
    if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_BKPT;
    data[hdr+7] -= nFrag;
  }
  if( iStart==get2byte(&data[hdr+5]) ){
    /* The new freeblock is at the beginning of the cell content area,
    ** so just extend the cell content area rather than create another
    ** freelist entry */
    if( iPtr!=hdr+1 ) return SQLITE_CORRUPT_BKPT;
    put2byte(&data[hdr+1], iFreeBlk);
    put2byte(&data[hdr+5], iEnd);
  }else{
    /* Insert the new freeblock into the freelist */
    put2byte(&data[iPtr], iStart);
    put2byte(&data[iStart], iFreeBlk);
    put2byte(&data[iStart+2], iSize);
  }
  /* Only the bytes released by this call are new free space; any absorbed
  ** neighbouring freeblocks are not added to nFree here. */
  pPage->nFree += iOrigSize;
  return SQLITE_OK;
}
001662  
001663  /*
001664  ** Decode the flags byte (the first byte of the header) for a page
001665  ** and initialize fields of the MemPage structure accordingly.
001666  **
001667  ** Only the following combinations are supported.  Anything different
001668  ** indicates a corrupt database files:
001669  **
001670  **         PTF_ZERODATA
001671  **         PTF_ZERODATA | PTF_LEAF
001672  **         PTF_LEAFDATA | PTF_INTKEY
001673  **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
001674  */
001675  static int decodeFlags(MemPage *pPage, int flagByte){
001676    BtShared *pBt;     /* A copy of pPage->pBt */
001677  
001678    assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
001679    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001680    pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 );
001681    flagByte &= ~PTF_LEAF;
001682    pPage->childPtrSize = 4-4*pPage->leaf;
001683    pPage->xCellSize = cellSizePtr;
001684    pBt = pPage->pBt;
001685    if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
001686      /* EVIDENCE-OF: R-07291-35328 A value of 5 (0x05) means the page is an
001687      ** interior table b-tree page. */
001688      assert( (PTF_LEAFDATA|PTF_INTKEY)==5 );
001689      /* EVIDENCE-OF: R-26900-09176 A value of 13 (0x0d) means the page is a
001690      ** leaf table b-tree page. */
001691      assert( (PTF_LEAFDATA|PTF_INTKEY|PTF_LEAF)==13 );
001692      pPage->intKey = 1;
001693      if( pPage->leaf ){
001694        pPage->intKeyLeaf = 1;
001695        pPage->xParseCell = btreeParseCellPtr;
001696      }else{
001697        pPage->intKeyLeaf = 0;
001698        pPage->xCellSize = cellSizePtrNoPayload;
001699        pPage->xParseCell = btreeParseCellPtrNoPayload;
001700      }
001701      pPage->maxLocal = pBt->maxLeaf;
001702      pPage->minLocal = pBt->minLeaf;
001703    }else if( flagByte==PTF_ZERODATA ){
001704      /* EVIDENCE-OF: R-43316-37308 A value of 2 (0x02) means the page is an
001705      ** interior index b-tree page. */
001706      assert( (PTF_ZERODATA)==2 );
001707      /* EVIDENCE-OF: R-59615-42828 A value of 10 (0x0a) means the page is a
001708      ** leaf index b-tree page. */
001709      assert( (PTF_ZERODATA|PTF_LEAF)==10 );
001710      pPage->intKey = 0;
001711      pPage->intKeyLeaf = 0;
001712      pPage->xParseCell = btreeParseCellPtrIndex;
001713      pPage->maxLocal = pBt->maxLocal;
001714      pPage->minLocal = pBt->minLocal;
001715    }else{
001716      /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is
001717      ** an error. */
001718      return SQLITE_CORRUPT_BKPT;
001719    }
001720    pPage->max1bytePayload = pBt->max1bytePayload;
001721    return SQLITE_OK;
001722  }
001723  
001724  /*
001725  ** Initialize the auxiliary information for a disk block.
001726  **
001727  ** Return SQLITE_OK on success.  If we see that the page does
001728  ** not contain a well-formed database page, then return 
001729  ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
001730  ** guarantee that the page is well-formed.  It only shows that
001731  ** we failed to detect any corruption.
001732  */
001733  static int btreeInitPage(MemPage *pPage){
001734  
001735    assert( pPage->pBt!=0 );
001736    assert( pPage->pBt->db!=0 );
001737    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001738    assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
001739    assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
001740    assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
001741  
001742    if( !pPage->isInit ){
001743      int pc;            /* Address of a freeblock within pPage->aData[] */
001744      u8 hdr;            /* Offset to beginning of page header */
001745      u8 *data;          /* Equal to pPage->aData */
001746      BtShared *pBt;        /* The main btree structure */
001747      int usableSize;    /* Amount of usable space on each page */
001748      u16 cellOffset;    /* Offset from start of page to first cell pointer */
001749      int nFree;         /* Number of unused bytes on the page */
001750      int top;           /* First byte of the cell content area */
001751      int iCellFirst;    /* First allowable cell or freeblock offset */
001752      int iCellLast;     /* Last possible cell or freeblock offset */
001753  
001754      pBt = pPage->pBt;
001755  
001756      hdr = pPage->hdrOffset;
001757      data = pPage->aData;
001758      /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating
001759      ** the b-tree page type. */
001760      if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
001761      assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
001762      pPage->maskPage = (u16)(pBt->pageSize - 1);
001763      pPage->nOverflow = 0;
001764      usableSize = pBt->usableSize;
001765      pPage->cellOffset = cellOffset = hdr + 8 + pPage->childPtrSize;
001766      pPage->aDataEnd = &data[usableSize];
001767      pPage->aCellIdx = &data[cellOffset];
001768      pPage->aDataOfst = &data[pPage->childPtrSize];
001769      /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates
001770      ** the start of the cell content area. A zero value for this integer is
001771      ** interpreted as 65536. */
001772      top = get2byteNotZero(&data[hdr+5]);
001773      /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
001774      ** number of cells on the page. */
001775      pPage->nCell = get2byte(&data[hdr+3]);
001776      if( pPage->nCell>MX_CELL(pBt) ){
001777        /* To many cells for a single page.  The page must be corrupt */
001778        return SQLITE_CORRUPT_BKPT;
001779      }
001780      testcase( pPage->nCell==MX_CELL(pBt) );
001781      /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only
001782      ** possible for a root page of a table that contains no rows) then the
001783      ** offset to the cell content area will equal the page size minus the
001784      ** bytes of reserved space. */
001785      assert( pPage->nCell>0 || top==usableSize || CORRUPT_DB );
001786  
001787      /* A malformed database page might cause us to read past the end
001788      ** of page when parsing a cell.  
001789      **
001790      ** The following block of code checks early to see if a cell extends
001791      ** past the end of a page boundary and causes SQLITE_CORRUPT to be 
001792      ** returned if it does.
001793      */
001794      iCellFirst = cellOffset + 2*pPage->nCell;
001795      iCellLast = usableSize - 4;
001796      if( pBt->db->flags & SQLITE_CellSizeCk ){
001797        int i;            /* Index into the cell pointer array */
001798        int sz;           /* Size of a cell */
001799  
001800        if( !pPage->leaf ) iCellLast--;
001801        for(i=0; i<pPage->nCell; i++){
001802          pc = get2byteAligned(&data[cellOffset+i*2]);
001803          testcase( pc==iCellFirst );
001804          testcase( pc==iCellLast );
001805          if( pc<iCellFirst || pc>iCellLast ){
001806            return SQLITE_CORRUPT_BKPT;
001807          }
001808          sz = pPage->xCellSize(pPage, &data[pc]);
001809          testcase( pc+sz==usableSize );
001810          if( pc+sz>usableSize ){
001811            return SQLITE_CORRUPT_BKPT;
001812          }
001813        }
001814        if( !pPage->leaf ) iCellLast++;
001815      }  
001816  
001817      /* Compute the total free space on the page
001818      ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the
001819      ** start of the first freeblock on the page, or is zero if there are no
001820      ** freeblocks. */
001821      pc = get2byte(&data[hdr+1]);
001822      nFree = data[hdr+7] + top;  /* Init nFree to non-freeblock free space */
001823      if( pc>0 ){
001824        u32 next, size;
001825        if( pc<iCellFirst ){
001826          /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will
001827          ** always be at least one cell before the first freeblock.
001828          */
001829          return SQLITE_CORRUPT_BKPT; 
001830        }
001831        while( 1 ){
001832          if( pc>iCellLast ){
001833            return SQLITE_CORRUPT_BKPT; /* Freeblock off the end of the page */
001834          }
001835          next = get2byte(&data[pc]);
001836          size = get2byte(&data[pc+2]);
001837          nFree = nFree + size;
001838          if( next<=pc+size+3 ) break;
001839          pc = next;
001840        }
001841        if( next>0 ){
001842          return SQLITE_CORRUPT_BKPT;  /* Freeblock not in ascending order */
001843        }
001844        if( pc+size>(unsigned int)usableSize ){
001845          return SQLITE_CORRUPT_BKPT;  /* Last freeblock extends past page end */
001846        }
001847      }
001848  
001849      /* At this point, nFree contains the sum of the offset to the start
001850      ** of the cell-content area plus the number of free bytes within
001851      ** the cell-content area. If this is greater than the usable-size
001852      ** of the page, then the page must be corrupted. This check also
001853      ** serves to verify that the offset to the start of the cell-content
001854      ** area, according to the page header, lies within the page.
001855      */
001856      if( nFree>usableSize ){
001857        return SQLITE_CORRUPT_BKPT; 
001858      }
001859      pPage->nFree = (u16)(nFree - iCellFirst);
001860      pPage->isInit = 1;
001861    }
001862    return SQLITE_OK;
001863  }
001864  
001865  /*
001866  ** Set up a raw page so that it looks like a database page holding
001867  ** no entries.
001868  */
001869  static void zeroPage(MemPage *pPage, int flags){
001870    unsigned char *data = pPage->aData;
001871    BtShared *pBt = pPage->pBt;
001872    u8 hdr = pPage->hdrOffset;
001873    u16 first;
001874  
001875    assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
001876    assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
001877    assert( sqlite3PagerGetData(pPage->pDbPage) == data );
001878    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
001879    assert( sqlite3_mutex_held(pBt->mutex) );
001880    if( pBt->btsFlags & BTS_SECURE_DELETE ){
001881      memset(&data[hdr], 0, pBt->usableSize - hdr);
001882    }
001883    data[hdr] = (char)flags;
001884    first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8);
001885    memset(&data[hdr+1], 0, 4);
001886    data[hdr+7] = 0;
001887    put2byte(&data[hdr+5], pBt->usableSize);
001888    pPage->nFree = (u16)(pBt->usableSize - first);
001889    decodeFlags(pPage, flags);
001890    pPage->cellOffset = first;
001891    pPage->aDataEnd = &data[pBt->usableSize];
001892    pPage->aCellIdx = &data[first];
001893    pPage->aDataOfst = &data[pPage->childPtrSize];
001894    pPage->nOverflow = 0;
001895    assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
001896    pPage->maskPage = (u16)(pBt->pageSize - 1);
001897    pPage->nCell = 0;
001898    pPage->isInit = 1;
001899  }
001900  
001901  
001902  /*
001903  ** Convert a DbPage obtained from the pager into a MemPage used by
001904  ** the btree layer.
001905  */
001906  static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
001907    MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
001908    if( pgno!=pPage->pgno ){
001909      pPage->aData = sqlite3PagerGetData(pDbPage);
001910      pPage->pDbPage = pDbPage;
001911      pPage->pBt = pBt;
001912      pPage->pgno = pgno;
001913      pPage->hdrOffset = pgno==1 ? 100 : 0;
001914    }
001915    assert( pPage->aData==sqlite3PagerGetData(pDbPage) );
001916    return pPage; 
001917  }
001918  
001919  /*
001920  ** Get a page from the pager.  Initialize the MemPage.pBt and
001921  ** MemPage.aData elements if needed.  See also: btreeGetUnusedPage().
001922  **
001923  ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care
001924  ** about the content of the page at this time.  So do not go to the disk
001925  ** to fetch the content.  Just fill in the content with zeros for now.
001926  ** If in the future we call sqlite3PagerWrite() on this page, that
001927  ** means we have started to be concerned about content and the disk
001928  ** read should occur at that point.
001929  */
001930  static int btreeGetPage(
001931    BtShared *pBt,       /* The btree */
001932    Pgno pgno,           /* Number of the page to fetch */
001933    MemPage **ppPage,    /* Return the page in this parameter */
001934    int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
001935  ){
001936    int rc;
001937    DbPage *pDbPage;
001938  
001939    assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY );
001940    assert( sqlite3_mutex_held(pBt->mutex) );
001941    rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);
001942    if( rc ) return rc;
001943    *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
001944    return SQLITE_OK;
001945  }
001946  
001947  /*
001948  ** Retrieve a page from the pager cache. If the requested page is not
001949  ** already in the pager cache return NULL. Initialize the MemPage.pBt and
001950  ** MemPage.aData elements if needed.
001951  */
001952  static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
001953    DbPage *pDbPage;
001954    assert( sqlite3_mutex_held(pBt->mutex) );
001955    pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
001956    if( pDbPage ){
001957      return btreePageFromDbPage(pDbPage, pgno, pBt);
001958    }
001959    return 0;
001960  }
001961  
001962  /*
001963  ** Return the size of the database file in pages. If there is any kind of
001964  ** error, return ((unsigned int)-1).
001965  */
001966  static Pgno btreePagecount(BtShared *pBt){
001967    return pBt->nPage;
001968  }
001969  u32 sqlite3BtreeLastPage(Btree *p){
001970    assert( sqlite3BtreeHoldsMutex(p) );
001971    assert( ((p->pBt->nPage)&0x8000000)==0 );
001972    return btreePagecount(p->pBt);
001973  }
001974  
001975  /*
001976  ** Get a page from the pager and initialize it.
001977  **
001978  ** If pCur!=0 then the page is being fetched as part of a moveToChild()
001979  ** call.  Do additional sanity checking on the page in this case.
001980  ** And if the fetch fails, this routine must decrement pCur->iPage.
001981  **
001982  ** The page is fetched as read-write unless pCur is not NULL and is
001983  ** a read-only cursor.
001984  **
001985  ** If an error occurs, then *ppPage is undefined. It
001986  ** may remain unchanged, or it may be set to an invalid value.
001987  */
001988  static int getAndInitPage(
001989    BtShared *pBt,                  /* The database file */
001990    Pgno pgno,                      /* Number of the page to get */
001991    MemPage **ppPage,               /* Write the page pointer here */
001992    BtCursor *pCur,                 /* Cursor to receive the page, or NULL */
001993    int bReadOnly                   /* True for a read-only page */
001994  ){
001995    int rc;
001996    DbPage *pDbPage;
001997    assert( sqlite3_mutex_held(pBt->mutex) );
001998    assert( pCur==0 || ppPage==&pCur->apPage[pCur->iPage] );
001999    assert( pCur==0 || bReadOnly==pCur->curPagerFlags );
002000    assert( pCur==0 || pCur->iPage>0 );
002001  
002002    if( pgno>btreePagecount(pBt) ){
002003      rc = SQLITE_CORRUPT_BKPT;
002004      goto getAndInitPage_error;
002005    }
002006    rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly);
002007    if( rc ){
002008      goto getAndInitPage_error;
002009    }
002010    *ppPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
002011    if( (*ppPage)->isInit==0 ){
002012      btreePageFromDbPage(pDbPage, pgno, pBt);
002013      rc = btreeInitPage(*ppPage);
002014      if( rc!=SQLITE_OK ){
002015        releasePage(*ppPage);
002016        goto getAndInitPage_error;
002017      }
002018    }
002019    assert( (*ppPage)->pgno==pgno );
002020    assert( (*ppPage)->aData==sqlite3PagerGetData(pDbPage) );
002021  
002022    /* If obtaining a child page for a cursor, we must verify that the page is
002023    ** compatible with the root page. */
002024    if( pCur && ((*ppPage)->nCell<1 || (*ppPage)->intKey!=pCur->curIntKey) ){
002025      rc = SQLITE_CORRUPT_BKPT;
002026      releasePage(*ppPage);
002027      goto getAndInitPage_error;
002028    }
002029    return SQLITE_OK;
002030  
002031  getAndInitPage_error:
002032    if( pCur ) pCur->iPage--;
002033    testcase( pgno==0 );
002034    assert( pgno!=0 || rc==SQLITE_CORRUPT );
002035    return rc;
002036  }
002037  
002038  /*
002039  ** Release a MemPage.  This should be called once for each prior
002040  ** call to btreeGetPage.
002041  */
002042  static void releasePageNotNull(MemPage *pPage){
002043    assert( pPage->aData );
002044    assert( pPage->pBt );
002045    assert( pPage->pDbPage!=0 );
002046    assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
002047    assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
002048    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
002049    sqlite3PagerUnrefNotNull(pPage->pDbPage);
002050  }
002051  static void releasePage(MemPage *pPage){
002052    if( pPage ) releasePageNotNull(pPage);
002053  }
002054  
002055  /*
002056  ** Get an unused page.
002057  **
002058  ** This works just like btreeGetPage() with the addition:
002059  **
002060  **   *  If the page is already in use for some other purpose, immediately
002061  **      release it and return an SQLITE_CURRUPT error.
002062  **   *  Make sure the isInit flag is clear
002063  */
002064  static int btreeGetUnusedPage(
002065    BtShared *pBt,       /* The btree */
002066    Pgno pgno,           /* Number of the page to fetch */
002067    MemPage **ppPage,    /* Return the page in this parameter */
002068    int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
002069  ){
002070    int rc = btreeGetPage(pBt, pgno, ppPage, flags);
002071    if( rc==SQLITE_OK ){
002072      if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
002073        releasePage(*ppPage);
002074        *ppPage = 0;
002075        return SQLITE_CORRUPT_BKPT;
002076      }
002077      (*ppPage)->isInit = 0;
002078    }else{
002079      *ppPage = 0;
002080    }
002081    return rc;
002082  }
002083  
002084  
002085  /*
002086  ** During a rollback, when the pager reloads information into the cache
002087  ** so that the cache is restored to its original state at the start of
002088  ** the transaction, for each page restored this routine is called.
002089  **
002090  ** This routine needs to reset the extra data section at the end of the
002091  ** page to agree with the restored data.
002092  */
002093  static void pageReinit(DbPage *pData){
002094    MemPage *pPage;
002095    pPage = (MemPage *)sqlite3PagerGetExtra(pData);
002096    assert( sqlite3PagerPageRefcount(pData)>0 );
002097    if( pPage->isInit ){
002098      assert( sqlite3_mutex_held(pPage->pBt->mutex) );
002099      pPage->isInit = 0;
002100      if( sqlite3PagerPageRefcount(pData)>1 ){
002101        /* pPage might not be a btree page;  it might be an overflow page
002102        ** or ptrmap page or a free page.  In those cases, the following
002103        ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
002104        ** But no harm is done by this.  And it is very important that
002105        ** btreeInitPage() be called on every btree page so we make
002106        ** the call for every page that comes in for re-initing. */
002107        btreeInitPage(pPage);
002108      }
002109    }
002110  }
002111  
002112  /*
002113  ** Invoke the busy handler for a btree.
002114  */
002115  static int btreeInvokeBusyHandler(void *pArg){
002116    BtShared *pBt = (BtShared*)pArg;
002117    assert( pBt->db );
002118    assert( sqlite3_mutex_held(pBt->db->mutex) );
002119    return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
002120  }
002121  
002122  /*
002123  ** Open a database file.
002124  ** 
002125  ** zFilename is the name of the database file.  If zFilename is NULL
002126  ** then an ephemeral database is created.  The ephemeral database might
002127  ** be exclusively in memory, or it might use a disk-based memory cache.
002128  ** Either way, the ephemeral database will be automatically deleted 
002129  ** when sqlite3BtreeClose() is called.
002130  **
002131  ** If zFilename is ":memory:" then an in-memory database is created
002132  ** that is automatically destroyed when it is closed.
002133  **
002134  ** The "flags" parameter is a bitmask that might contain bits like
002135  ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.
002136  **
002137  ** If the database is already opened in the same database connection
002138  ** and we are in shared cache mode, then the open will fail with an
002139  ** SQLITE_CONSTRAINT error.  We cannot allow two or more BtShared
002140  ** objects in the same database connection since doing so will lead
002141  ** to problems with locking.
002142  */
002143  int sqlite3BtreeOpen(
002144    sqlite3_vfs *pVfs,      /* VFS to use for this b-tree */
002145    const char *zFilename,  /* Name of the file containing the BTree database */
002146    sqlite3 *db,            /* Associated database handle */
002147    Btree **ppBtree,        /* Pointer to new Btree object written here */
002148    int flags,              /* Options */
002149    int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
002150  ){
002151    BtShared *pBt = 0;             /* Shared part of btree structure */
002152    Btree *p;                      /* Handle to return */
002153    sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */
002154    int rc = SQLITE_OK;            /* Result code from this function */
002155    u8 nReserve;                   /* Byte of unused space on each page */
002156    unsigned char zDbHeader[100];  /* Database header content */
002157  
002158    /* True if opening an ephemeral, temporary database */
002159    const int isTempDb = zFilename==0 || zFilename[0]==0;
002160  
002161    /* Set the variable isMemdb to true for an in-memory database, or 
002162    ** false for a file-based database.
002163    */
002164  #ifdef SQLITE_OMIT_MEMORYDB
002165    const int isMemdb = 0;
002166  #else
002167    const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
002168                         || (isTempDb && sqlite3TempInMemory(db))
002169                         || (vfsFlags & SQLITE_OPEN_MEMORY)!=0;
002170  #endif
002171  
002172    assert( db!=0 );
002173    assert( pVfs!=0 );
002174    assert( sqlite3_mutex_held(db->mutex) );
002175    assert( (flags&0xff)==flags );   /* flags fit in 8 bits */
002176  
002177    /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
002178    assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );
002179  
002180    /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
002181    assert( (flags & BTREE_SINGLE)==0 || isTempDb );
002182  
002183    if( isMemdb ){
002184      flags |= BTREE_MEMORY;
002185    }
002186    if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
002187      vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
002188    }
002189    p = sqlite3MallocZero(sizeof(Btree));
002190    if( !p ){
002191      return SQLITE_NOMEM_BKPT;
002192    }
002193    p->inTrans = TRANS_NONE;
002194    p->db = db;
002195  #ifndef SQLITE_OMIT_SHARED_CACHE
002196    p->lock.pBtree = p;
002197    p->lock.iTable = 1;
002198  #endif
002199  
002200  #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
002201    /*
002202    ** If this Btree is a candidate for shared cache, try to find an
002203    ** existing BtShared object that we can share with
002204    */
002205    if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){
002206      if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
002207        int nFilename = sqlite3Strlen30(zFilename)+1;
002208        int nFullPathname = pVfs->mxPathname+1;
002209        char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename));
002210        MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
002211  
002212        p->sharable = 1;
002213        if( !zFullPathname ){
002214          sqlite3_free(p);
002215          return SQLITE_NOMEM_BKPT;
002216        }
002217        if( isMemdb ){
002218          memcpy(zFullPathname, zFilename, nFilename);
002219        }else{
002220          rc = sqlite3OsFullPathname(pVfs, zFilename,
002221                                     nFullPathname, zFullPathname);
002222          if( rc ){
002223            sqlite3_free(zFullPathname);
002224            sqlite3_free(p);
002225            return rc;
002226          }
002227        }
002228  #if SQLITE_THREADSAFE
002229        mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
002230        sqlite3_mutex_enter(mutexOpen);
002231        mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
002232        sqlite3_mutex_enter(mutexShared);
002233  #endif
002234        for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
002235          assert( pBt->nRef>0 );
002236          if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))
002237                   && sqlite3PagerVfs(pBt->pPager)==pVfs ){
002238            int iDb;
002239            for(iDb=db->nDb-1; iDb>=0; iDb--){
002240              Btree *pExisting = db->aDb[iDb].pBt;
002241              if( pExisting && pExisting->pBt==pBt ){
002242                sqlite3_mutex_leave(mutexShared);
002243                sqlite3_mutex_leave(mutexOpen);
002244                sqlite3_free(zFullPathname);
002245                sqlite3_free(p);
002246                return SQLITE_CONSTRAINT;
002247              }
002248            }
002249            p->pBt = pBt;
002250            pBt->nRef++;
002251            break;
002252          }
002253        }
002254        sqlite3_mutex_leave(mutexShared);
002255        sqlite3_free(zFullPathname);
002256      }
002257  #ifdef SQLITE_DEBUG
002258      else{
002259        /* In debug mode, we mark all persistent databases as sharable
002260        ** even when they are not.  This exercises the locking code and
002261        ** gives more opportunity for asserts(sqlite3_mutex_held())
002262        ** statements to find locking problems.
002263        */
002264        p->sharable = 1;
002265      }
002266  #endif
002267    }
002268  #endif
002269    if( pBt==0 ){
002270      /*
002271      ** The following asserts make sure that structures used by the btree are
002272      ** the right size.  This is to guard against size changes that result
002273      ** when compiling on a different architecture.
002274      */
002275      assert( sizeof(i64)==8 );
002276      assert( sizeof(u64)==8 );
002277      assert( sizeof(u32)==4 );
002278      assert( sizeof(u16)==2 );
002279      assert( sizeof(Pgno)==4 );
002280    
002281      pBt = sqlite3MallocZero( sizeof(*pBt) );
002282      if( pBt==0 ){
002283        rc = SQLITE_NOMEM_BKPT;
002284        goto btree_open_out;
002285      }
002286      rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
002287                            sizeof(MemPage), flags, vfsFlags, pageReinit);
002288      if( rc==SQLITE_OK ){
002289        sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);
002290        rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
002291      }
002292      if( rc!=SQLITE_OK ){
002293        goto btree_open_out;
002294      }
002295      pBt->openFlags = (u8)flags;
002296      pBt->db = db;
002297      sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
002298      p->pBt = pBt;
002299    
002300      pBt->pCursor = 0;
002301      pBt->pPage1 = 0;
002302      if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;
002303  #ifdef SQLITE_SECURE_DELETE
002304      pBt->btsFlags |= BTS_SECURE_DELETE;
002305  #endif
002306      /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
002307      ** determined by the 2-byte integer located at an offset of 16 bytes from
002308      ** the beginning of the database file. */
002309      pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
002310      if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
002311           || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
002312        pBt->pageSize = 0;
002313  #ifndef SQLITE_OMIT_AUTOVACUUM
002314        /* If the magic name ":memory:" will create an in-memory database, then
002315        ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
002316        ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
002317        ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
002318        ** regular file-name. In this case the auto-vacuum applies as per normal.
002319        */
002320        if( zFilename && !isMemdb ){
002321          pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
002322          pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
002323        }
002324  #endif
002325        nReserve = 0;
002326      }else{
002327        /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is
002328        ** determined by the one-byte unsigned integer found at an offset of 20
002329        ** into the database file header. */
002330        nReserve = zDbHeader[20];
002331        pBt->btsFlags |= BTS_PAGESIZE_FIXED;
002332  #ifndef SQLITE_OMIT_AUTOVACUUM
002333        pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
002334        pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
002335  #endif
002336      }
002337      rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
002338      if( rc ) goto btree_open_out;
002339      pBt->usableSize = pBt->pageSize - nReserve;
002340      assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
002341     
002342  #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
002343      /* Add the new BtShared object to the linked list sharable BtShareds.
002344      */
002345      pBt->nRef = 1;
002346      if( p->sharable ){
002347        MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
002348        MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);)
002349        if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
002350          pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
002351          if( pBt->mutex==0 ){
002352            rc = SQLITE_NOMEM_BKPT;
002353            goto btree_open_out;
002354          }
002355        }
002356        sqlite3_mutex_enter(mutexShared);
002357        pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
002358        GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
002359        sqlite3_mutex_leave(mutexShared);
002360      }
002361  #endif
002362    }
002363  
002364  #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
002365    /* If the new Btree uses a sharable pBtShared, then link the new
002366    ** Btree into the list of all sharable Btrees for the same connection.
002367    ** The list is kept in ascending order by pBt address.
002368    */
002369    if( p->sharable ){
002370      int i;
002371      Btree *pSib;
002372      for(i=0; i<db->nDb; i++){
002373        if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
002374          while( pSib->pPrev ){ pSib = pSib->pPrev; }
002375          if( (uptr)p->pBt<(uptr)pSib->pBt ){
002376            p->pNext = pSib;
002377            p->pPrev = 0;
002378            pSib->pPrev = p;
002379          }else{
002380            while( pSib->pNext && (uptr)pSib->pNext->pBt<(uptr)p->pBt ){
002381              pSib = pSib->pNext;
002382            }
002383            p->pNext = pSib->pNext;
002384            p->pPrev = pSib;
002385            if( p->pNext ){
002386              p->pNext->pPrev = p;
002387            }
002388            pSib->pNext = p;
002389          }
002390          break;
002391        }
002392      }
002393    }
002394  #endif
002395    *ppBtree = p;
002396  
002397  btree_open_out:
002398    if( rc!=SQLITE_OK ){
002399      if( pBt && pBt->pPager ){
002400        sqlite3PagerClose(pBt->pPager, 0);
002401      }
002402      sqlite3_free(pBt);
002403      sqlite3_free(p);
002404      *ppBtree = 0;
002405    }else{
002406      sqlite3_file *pFile;
002407  
002408      /* If the B-Tree was successfully opened, set the pager-cache size to the
002409      ** default value. Except, when opening on an existing shared pager-cache,
002410      ** do not change the pager-cache size.
002411      */
002412      if( sqlite3BtreeSchema(p, 0, 0)==0 ){
002413        sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);
002414      }
002415  
002416      pFile = sqlite3PagerFile(pBt->pPager);
002417      if( pFile->pMethods ){
002418        sqlite3OsFileControlHint(pFile, SQLITE_FCNTL_PDB, (void*)&pBt->db);
002419      }
002420    }
002421    if( mutexOpen ){
002422      assert( sqlite3_mutex_held(mutexOpen) );
002423      sqlite3_mutex_leave(mutexOpen);
002424    }
002425    assert( rc!=SQLITE_OK || sqlite3BtreeConnectionCount(*ppBtree)>0 );
002426    return rc;
002427  }
002428  
002429  /*
002430  ** Decrement the BtShared.nRef counter.  When it reaches zero,
002431  ** remove the BtShared structure from the sharing list.  Return
002432  ** true if the BtShared.nRef counter reaches zero and return
002433  ** false if it is still positive.
002434  */
002435  static int removeFromSharingList(BtShared *pBt){
002436  #ifndef SQLITE_OMIT_SHARED_CACHE
002437    MUTEX_LOGIC( sqlite3_mutex *pMaster; )
002438    BtShared *pList;
002439    int removed = 0;
002440  
002441    assert( sqlite3_mutex_notheld(pBt->mutex) );
002442    MUTEX_LOGIC( pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); )
002443    sqlite3_mutex_enter(pMaster);
002444    pBt->nRef--;
002445    if( pBt->nRef<=0 ){
002446      if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
002447        GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
002448      }else{
002449        pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
002450        while( ALWAYS(pList) && pList->pNext!=pBt ){
002451          pList=pList->pNext;
002452        }
002453        if( ALWAYS(pList) ){
002454          pList->pNext = pBt->pNext;
002455        }
002456      }
002457      if( SQLITE_THREADSAFE ){
002458        sqlite3_mutex_free(pBt->mutex);
002459      }
002460      removed = 1;
002461    }
002462    sqlite3_mutex_leave(pMaster);
002463    return removed;
002464  #else
002465    return 1;
002466  #endif
002467  }
002468  
002469  /*
002470  ** Make sure pBt->pTmpSpace points to an allocation of 
002471  ** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child
002472  ** pointer.
002473  */
002474  static void allocateTempSpace(BtShared *pBt){
002475    if( !pBt->pTmpSpace ){
002476      pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
002477  
002478      /* One of the uses of pBt->pTmpSpace is to format cells before
002479      ** inserting them into a leaf page (function fillInCell()). If
002480      ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes
002481      ** by the various routines that manipulate binary cells. Which
002482      ** can mean that fillInCell() only initializes the first 2 or 3
002483      ** bytes of pTmpSpace, but that the first 4 bytes are copied from
002484      ** it into a database page. This is not actually a problem, but it
002485      ** does cause a valgrind error when the 1 or 2 bytes of unitialized 
002486      ** data is passed to system call write(). So to avoid this error,
002487      ** zero the first 4 bytes of temp space here.
002488      **
002489      ** Also:  Provide four bytes of initialized space before the
002490      ** beginning of pTmpSpace as an area available to prepend the
002491      ** left-child pointer to the beginning of a cell.
002492      */
002493      if( pBt->pTmpSpace ){
002494        memset(pBt->pTmpSpace, 0, 8);
002495        pBt->pTmpSpace += 4;
002496      }
002497    }
002498  }
002499  
002500  /*
002501  ** Free the pBt->pTmpSpace allocation
002502  */
002503  static void freeTempSpace(BtShared *pBt){
002504    if( pBt->pTmpSpace ){
002505      pBt->pTmpSpace -= 4;
002506      sqlite3PageFree(pBt->pTmpSpace);
002507      pBt->pTmpSpace = 0;
002508    }
002509  }
002510  
002511  /*
002512  ** Close an open database and invalidate all cursors.
002513  */
002514  int sqlite3BtreeClose(Btree *p){
002515    BtShared *pBt = p->pBt;
002516    BtCursor *pCur;
002517  
002518    /* Close all cursors opened via this handle.  */
002519    assert( sqlite3_mutex_held(p->db->mutex) );
002520    sqlite3BtreeEnter(p);
002521    pCur = pBt->pCursor;
002522    while( pCur ){
002523      BtCursor *pTmp = pCur;
002524      pCur = pCur->pNext;
002525      if( pTmp->pBtree==p ){
002526        sqlite3BtreeCloseCursor(pTmp);
002527      }
002528    }
002529  
002530    /* Rollback any active transaction and free the handle structure.
002531    ** The call to sqlite3BtreeRollback() drops any table-locks held by
002532    ** this handle.
002533    */
002534    sqlite3BtreeRollback(p, SQLITE_OK, 0);
002535    sqlite3BtreeLeave(p);
002536  
002537    /* If there are still other outstanding references to the shared-btree
002538    ** structure, return now. The remainder of this procedure cleans 
002539    ** up the shared-btree.
002540    */
002541    assert( p->wantToLock==0 && p->locked==0 );
002542    if( !p->sharable || removeFromSharingList(pBt) ){
002543      /* The pBt is no longer on the sharing list, so we can access
002544      ** it without having to hold the mutex.
002545      **
002546      ** Clean out and delete the BtShared object.
002547      */
002548      assert( !pBt->pCursor );
002549      sqlite3PagerClose(pBt->pPager, p->db);
002550      if( pBt->xFreeSchema && pBt->pSchema ){
002551        pBt->xFreeSchema(pBt->pSchema);
002552      }
002553      sqlite3DbFree(0, pBt->pSchema);
002554      freeTempSpace(pBt);
002555      sqlite3_free(pBt);
002556    }
002557  
002558  #ifndef SQLITE_OMIT_SHARED_CACHE
002559    assert( p->wantToLock==0 );
002560    assert( p->locked==0 );
002561    if( p->pPrev ) p->pPrev->pNext = p->pNext;
002562    if( p->pNext ) p->pNext->pPrev = p->pPrev;
002563  #endif
002564  
002565    sqlite3_free(p);
002566    return SQLITE_OK;
002567  }
002568  
002569  /*
002570  ** Change the "soft" limit on the number of pages in the cache.
002571  ** Unused and unmodified pages will be recycled when the number of
002572  ** pages in the cache exceeds this soft limit.  But the size of the
002573  ** cache is allowed to grow larger than this limit if it contains
002574  ** dirty pages or pages still in active use.
002575  */
002576  int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
002577    BtShared *pBt = p->pBt;
002578    assert( sqlite3_mutex_held(p->db->mutex) );
002579    sqlite3BtreeEnter(p);
002580    sqlite3PagerSetCachesize(pBt->pPager, mxPage);
002581    sqlite3BtreeLeave(p);
002582    return SQLITE_OK;
002583  }
002584  
002585  /*
002586  ** Change the "spill" limit on the number of pages in the cache.
002587  ** If the number of pages exceeds this limit during a write transaction,
002588  ** the pager might attempt to "spill" pages to the journal early in
002589  ** order to free up memory.
002590  **
002591  ** The value returned is the current spill size.  If zero is passed
002592  ** as an argument, no changes are made to the spill size setting, so
002593  ** using mxPage of 0 is a way to query the current spill size.
002594  */
002595  int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){
002596    BtShared *pBt = p->pBt;
002597    int res;
002598    assert( sqlite3_mutex_held(p->db->mutex) );
002599    sqlite3BtreeEnter(p);
002600    res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage);
002601    sqlite3BtreeLeave(p);
002602    return res;
002603  }
002604  
002605  #if SQLITE_MAX_MMAP_SIZE>0
002606  /*
002607  ** Change the limit on the amount of the database file that may be
002608  ** memory mapped.
002609  */
002610  int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){
002611    BtShared *pBt = p->pBt;
002612    assert( sqlite3_mutex_held(p->db->mutex) );
002613    sqlite3BtreeEnter(p);
002614    sqlite3PagerSetMmapLimit(pBt->pPager, szMmap);
002615    sqlite3BtreeLeave(p);
002616    return SQLITE_OK;
002617  }
002618  #endif /* SQLITE_MAX_MMAP_SIZE>0 */
002619  
002620  /*
002621  ** Change the way data is synced to disk in order to increase or decrease
002622  ** how well the database resists damage due to OS crashes and power
002623  ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
** there is a high probability of damage).  Level 2 is the default.  There
002625  ** is a very low but non-zero probability of damage.  Level 3 reduces the
002626  ** probability of damage to near zero but with a write performance reduction.
002627  */
002628  #ifndef SQLITE_OMIT_PAGER_PRAGMAS
002629  int sqlite3BtreeSetPagerFlags(
002630    Btree *p,              /* The btree to set the safety level on */
002631    unsigned pgFlags       /* Various PAGER_* flags */
002632  ){
002633    BtShared *pBt = p->pBt;
002634    assert( sqlite3_mutex_held(p->db->mutex) );
002635    sqlite3BtreeEnter(p);
002636    sqlite3PagerSetFlags(pBt->pPager, pgFlags);
002637    sqlite3BtreeLeave(p);
002638    return SQLITE_OK;
002639  }
002640  #endif
002641  
002642  /*
** Change the default page size and the number of reserved bytes per page.
002644  ** Or, if the page size has already been fixed, return SQLITE_READONLY 
002645  ** without changing anything.
002646  **
002647  ** The page size must be a power of 2 between 512 and 65536.  If the page
002648  ** size supplied does not meet this constraint then the page size is not
002649  ** changed.
002650  **
002651  ** Page sizes are constrained to be a power of two so that the region
002652  ** of the database file used for locking (beginning at PENDING_BYTE,
002653  ** the first byte past the 1GB boundary, 0x40000000) needs to occur
002654  ** at the beginning of a page.
002655  **
002656  ** If parameter nReserve is less than zero, then the number of reserved
002657  ** bytes per page is left unchanged.
002658  **
** If iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size
** and autovacuum mode can no longer be changed.
002661  */
002662  int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
002663    int rc = SQLITE_OK;
002664    BtShared *pBt = p->pBt;
002665    assert( nReserve>=-1 && nReserve<=255 );
002666    sqlite3BtreeEnter(p);
002667  #if SQLITE_HAS_CODEC
002668    if( nReserve>pBt->optimalReserve ) pBt->optimalReserve = (u8)nReserve;
002669  #endif
002670    if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){
002671      sqlite3BtreeLeave(p);
002672      return SQLITE_READONLY;
002673    }
002674    if( nReserve<0 ){
002675      nReserve = pBt->pageSize - pBt->usableSize;
002676    }
002677    assert( nReserve>=0 && nReserve<=255 );
002678    if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
002679          ((pageSize-1)&pageSize)==0 ){
002680      assert( (pageSize & 7)==0 );
002681      assert( !pBt->pCursor );
002682      pBt->pageSize = (u32)pageSize;
002683      freeTempSpace(pBt);
002684    }
002685    rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
002686    pBt->usableSize = pBt->pageSize - (u16)nReserve;
002687    if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED;
002688    sqlite3BtreeLeave(p);
002689    return rc;
002690  }
002691  
002692  /*
002693  ** Return the currently defined page size
002694  */
002695  int sqlite3BtreeGetPageSize(Btree *p){
002696    return p->pBt->pageSize;
002697  }
002698  
002699  /*
002700  ** This function is similar to sqlite3BtreeGetReserve(), except that it
002701  ** may only be called if it is guaranteed that the b-tree mutex is already
002702  ** held.
002703  **
002704  ** This is useful in one special case in the backup API code where it is
002705  ** known that the shared b-tree mutex is held, but the mutex on the 
002706  ** database handle that owns *p is not. In this case if sqlite3BtreeEnter()
002707  ** were to be called, it might collide with some other operation on the
002708  ** database handle that owns *p, causing undefined behavior.
002709  */
002710  int sqlite3BtreeGetReserveNoMutex(Btree *p){
002711    int n;
002712    assert( sqlite3_mutex_held(p->pBt->mutex) );
002713    n = p->pBt->pageSize - p->pBt->usableSize;
002714    return n;
002715  }
002716  
002717  /*
** Return the number of bytes of space at the end of every page that
** are intentionally left unused.  This is the "reserved" space that is
** sometimes used by extensions.
**
** If SQLITE_HAS_CODEC is defined then the number returned is the
** greater of the current reserved space and the maximum requested
** reserve space.
002725  */
002726  int sqlite3BtreeGetOptimalReserve(Btree *p){
002727    int n;
002728    sqlite3BtreeEnter(p);
002729    n = sqlite3BtreeGetReserveNoMutex(p);
002730  #ifdef SQLITE_HAS_CODEC
002731    if( n<p->pBt->optimalReserve ) n = p->pBt->optimalReserve;
002732  #endif
002733    sqlite3BtreeLeave(p);
002734    return n;
002735  }
002736  
002737  
002738  /*
002739  ** Set the maximum page count for a database if mxPage is positive.
002740  ** No changes are made if mxPage is 0 or negative.
002741  ** Regardless of the value of mxPage, return the maximum page count.
002742  */
002743  int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
002744    int n;
002745    sqlite3BtreeEnter(p);
002746    n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
002747    sqlite3BtreeLeave(p);
002748    return n;
002749  }
002750  
002751  /*
002752  ** Set the BTS_SECURE_DELETE flag if newFlag is 0 or 1.  If newFlag is -1,
002753  ** then make no changes.  Always return the value of the BTS_SECURE_DELETE
002754  ** setting after the change.
002755  */
002756  int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
002757    int b;
002758    if( p==0 ) return 0;
002759    sqlite3BtreeEnter(p);
002760    if( newFlag>=0 ){
002761      p->pBt->btsFlags &= ~BTS_SECURE_DELETE;
002762      if( newFlag ) p->pBt->btsFlags |= BTS_SECURE_DELETE;
002763    } 
002764    b = (p->pBt->btsFlags & BTS_SECURE_DELETE)!=0;
002765    sqlite3BtreeLeave(p);
002766    return b;
002767  }
002768  
002769  /*
002770  ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
002771  ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
002772  ** is disabled. The default value for the auto-vacuum property is 
002773  ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
002774  */
002775  int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
002776  #ifdef SQLITE_OMIT_AUTOVACUUM
002777    return SQLITE_READONLY;
002778  #else
002779    BtShared *pBt = p->pBt;
002780    int rc = SQLITE_OK;
002781    u8 av = (u8)autoVacuum;
002782  
002783    sqlite3BtreeEnter(p);
002784    if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){
002785      rc = SQLITE_READONLY;
002786    }else{
002787      pBt->autoVacuum = av ?1:0;
002788      pBt->incrVacuum = av==2 ?1:0;
002789    }
002790    sqlite3BtreeLeave(p);
002791    return rc;
002792  #endif
002793  }
002794  
002795  /*
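** Return the value of the 'auto-vacuum' property: BTREE_AUTOVACUUM_NONE
** if auto-vacuum is disabled, BTREE_AUTOVACUUM_FULL if it is enabled
** without the incremental option, or BTREE_AUTOVACUUM_INCR if incremental
** vacuum is enabled.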
002798  */
002799  int sqlite3BtreeGetAutoVacuum(Btree *p){
002800  #ifdef SQLITE_OMIT_AUTOVACUUM
002801    return BTREE_AUTOVACUUM_NONE;
002802  #else
002803    int rc;
002804    sqlite3BtreeEnter(p);
002805    rc = (
002806      (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
002807      (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
002808      BTREE_AUTOVACUUM_INCR
002809    );
002810    sqlite3BtreeLeave(p);
002811    return rc;
002812  #endif
002813  }
002814  
002815  
002816  /*
002817  ** Get a reference to pPage1 of the database file.  This will
002818  ** also acquire a readlock on that file.
002819  **
** SQLITE_OK is returned on success.  If the file is not a
** well-formed database file, then SQLITE_NOTADB is returned, or
** SQLITE_CORRUPT if the header claims more pages than the file
** contains.  SQLITE_BUSY is returned if the database is locked.
** SQLITE_NOMEM is returned if we run out of memory.
002824  */
static int lockBtree(BtShared *pBt){
  int rc;              /* Result code from subfunctions */
  MemPage *pPage1;     /* Page 1 of the database file */
  int nPage;           /* Number of pages in the database */
  int nPageFile = 0;   /* Number of pages in the database file */
  int nPageHeader;     /* Number of pages in the database according to hdr */

  /* The caller must hold the BtShared mutex and must not already have
  ** a reference to page 1.
  */
  assert( sqlite3_mutex_held(pBt->mutex) );
  assert( pBt->pPage1==0 );
  rc = sqlite3PagerSharedLock(pBt->pPager);
  if( rc!=SQLITE_OK ) return rc;
  rc = btreeGetPage(pBt, 1, &pPage1, 0);
  if( rc!=SQLITE_OK ) return rc;

  /* Do some checking to help ensure the file we opened really is
  ** a valid database file. 
  **
  ** The page count is first taken from the 4-byte value at offset 28 of
  ** page 1.  If that value is zero, or if the 4-byte fields at offsets 24
  ** and 92 of page 1 differ, the page count reported by the pager is used
  ** instead.
  **
  ** NOTE(review): nPageHeader is assigned here but never read afterwards
  ** in this function.
  */
  nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);
  sqlite3PagerPagecount(pBt->pPager, &nPageFile);
  if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
    nPage = nPageFile;
  }
  /* The header is validated only for a non-empty database.  When nPage
  ** is zero, control skips directly to the maxLocal/minLocal setup below.
  */
  if( nPage>0 ){
    u32 pageSize;
    u32 usableSize;
    u8 *page1 = pPage1->aData;
    /* SQLITE_NOTADB is the result of every "goto page1_init_failed" in
    ** this block unless rc is explicitly reassigned first.
    */
    rc = SQLITE_NOTADB;
    /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins
    ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d
    ** 61 74 20 33 00. */
    if( memcmp(page1, zMagicHeader, 16)!=0 ){
      goto page1_init_failed;
    }

    /* Header bytes 18 and 19 are checked against the highest value this
    ** build accepts (1 when WAL is omitted, otherwise 2).  Too large a
    ** value in byte 18 makes the database read-only; too large a value in
    ** byte 19 rejects the file.
    */
#ifdef SQLITE_OMIT_WAL
    if( page1[18]>1 ){
      pBt->btsFlags |= BTS_READ_ONLY;
    }
    if( page1[19]>1 ){
      goto page1_init_failed;
    }
#else
    if( page1[18]>2 ){
      pBt->btsFlags |= BTS_READ_ONLY;
    }
    if( page1[19]>2 ){
      goto page1_init_failed;
    }

    /* If the write version is set to 2, this database should be accessed
    ** in WAL mode. If the log is not already open, open it now. Then 
    ** return SQLITE_OK and return without populating BtShared.pPage1.
    ** The caller detects this and calls this function again. This is
    ** required as the version of page 1 currently in the page1 buffer
    ** may not be the latest version - there may be a newer one in the log
    ** file.
    */
    if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){
      int isOpen = 0;
      rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
      if( rc!=SQLITE_OK ){
        goto page1_init_failed;
      }else{
#if SQLITE_DEFAULT_SYNCHRONOUS!=SQLITE_DEFAULT_WAL_SYNCHRONOUS
        /* When the compile-time default synchronous levels for rollback
        ** and WAL modes differ, switch this database to the WAL default,
        ** but only if the level has not been set explicitly (bSyncSet==0)
        ** and still equals the non-WAL default.
        **
        ** The while() loop scans db->aDb[] for the entry whose Btree uses
        ** this BtShared.  It has no bound check, so it relies on such an
        ** entry existing.
        */
        sqlite3 *db;
        Db *pDb;
        if( (db=pBt->db)!=0 && (pDb=db->aDb)!=0 ){
          while( pDb->pBt==0 || pDb->pBt->pBt!=pBt ){ pDb++; }
          if( pDb->bSyncSet==0
           && pDb->safety_level==SQLITE_DEFAULT_SYNCHRONOUS+1
          ){
            pDb->safety_level = SQLITE_DEFAULT_WAL_SYNCHRONOUS+1;
            sqlite3PagerSetFlags(pBt->pPager,
               pDb->safety_level | (db->flags & PAGER_FLAGS_MASK));
          }
        }
#endif
        if( isOpen==0 ){
          releasePage(pPage1);
          return SQLITE_OK;
        }
      }
      /* rc was overwritten by sqlite3PagerOpenWal(); restore the default
      ** error code for the checks that follow. */
      rc = SQLITE_NOTADB;
    }
#endif

    /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload
    ** fractions and the leaf payload fraction values must be 64, 32, and 32.
    **
    ** The original design allowed these amounts to vary, but as of
    ** version 3.6.0, we require them to be fixed.
    */
    if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
      goto page1_init_failed;
    }
    /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
    ** determined by the 2-byte integer located at an offset of 16 bytes from
    ** the beginning of the database file.
    **
    ** Byte 16 supplies bits 8..15.  Byte 17 is shifted up to bit 16 rather
    ** than occupying bits 0..7, so header bytes 00 01 decode to 65536. */
    pageSize = (page1[16]<<8) | (page1[17]<<16);
    /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two
    ** between 512 and 65536 inclusive.
    **
    ** The first test also lets zero through, so the "<=256" test rejects
    ** both zero and the powers of two smaller than 512. */
    if( ((pageSize-1)&pageSize)!=0
     || pageSize>SQLITE_MAX_PAGE_SIZE 
     || pageSize<=256 
    ){
      goto page1_init_failed;
    }
    assert( (pageSize & 7)==0 );
    /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte
    ** integer at offset 20 is the number of bytes of space at the end of
    ** each page to reserve for extensions. 
    **
    ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is
    ** determined by the one-byte unsigned integer found at an offset of 20
    ** into the database file header. */
    usableSize = pageSize - page1[20];
    if( (u32)pageSize!=pBt->pageSize ){
      /* After reading the first page of the database assuming a page size
      ** of BtShared.pageSize, we have discovered that the page-size is
      ** actually pageSize. Unlock the database, leave pBt->pPage1 at
      ** zero and return SQLITE_OK. The caller will call this function
      ** again with the correct page-size.
      **
      ** Note that the value actually returned is whatever
      ** sqlite3PagerSetPagesize() reports, and that this path returns
      ** before the usableSize<480 check below is reached.
      */
      releasePage(pPage1);
      pBt->usableSize = usableSize;
      pBt->pageSize = pageSize;
      freeTempSpace(pBt);
      rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
                                   pageSize-usableSize);
      return rc;
    }
    /* Unless the connection is in recovery mode, a page count larger than
    ** the number of pages actually in the file is treated as corruption.
    */
    if( (pBt->db->flags & SQLITE_RecoveryMode)==0 && nPage>nPageFile ){
      rc = SQLITE_CORRUPT_BKPT;
      goto page1_init_failed;
    }
    /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to
    ** be less than 480. In other words, if the page size is 512, then the
    ** reserved space size cannot exceed 32. */
    if( usableSize<480 ){
      goto page1_init_failed;
    }
    pBt->pageSize = pageSize;
    pBt->usableSize = usableSize;
#ifndef SQLITE_OMIT_AUTOVACUUM
    /* The auto-vacuum and incremental-vacuum settings come from the
    ** 4-byte header fields at offsets 36+4*4 and 36+7*4; any non-zero
    ** value enables the corresponding mode.
    */
    pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
    pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
#endif
  }

  /* maxLocal is the maximum amount of payload to store locally for
  ** a cell.  Make sure it is small enough so that at least minFanout
  ** cells will fit on one page.  We assume a 10-byte page header.
  ** Besides the payload, the cell must store:
  **     2-byte pointer to the cell
  **     4-byte child pointer
  **     9-byte nKey value
  **     4-byte nData value
  **     4-byte overflow page pointer
  ** So a cell consists of a 2-byte pointer, a header which is as much as
  ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
  ** page pointer.
  */
  pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
  pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
  pBt->maxLeaf = (u16)(pBt->usableSize - 35);
  pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
  /* max1bytePayload is maxLocal capped at 127. */
  if( pBt->maxLocal>127 ){
    pBt->max1bytePayload = 127;
  }else{
    pBt->max1bytePayload = (u8)pBt->maxLocal;
  }
  assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
  /* Success: retain the reference to page 1 and record the page count. */
  pBt->pPage1 = pPage1;
  pBt->nPage = nPage;
  return SQLITE_OK;

  /* Common failure exit: drop the page 1 reference and return the error
  ** code currently held in rc.
  */
page1_init_failed:
  releasePage(pPage1);
  pBt->pPage1 = 0;
  return rc;
}
003006  
003007  #ifndef NDEBUG
003008  /*
003009  ** Return the number of cursors open on pBt. This is for use
003010  ** in assert() expressions, so it is only compiled if NDEBUG is not
003011  ** defined.
003012  **
003013  ** Only write cursors are counted if wrOnly is true.  If wrOnly is
003014  ** false then all cursors are counted.
003015  **
003016  ** For the purposes of this routine, a cursor is any cursor that
003017  ** is capable of reading or writing to the database.  Cursors that
003018  ** have been tripped into the CURSOR_FAULT state are not counted.
003019  */
003020  static int countValidCursors(BtShared *pBt, int wrOnly){
003021    BtCursor *pCur;
003022    int r = 0;
003023    for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
003024      if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0)
003025       && pCur->eState!=CURSOR_FAULT ) r++; 
003026    }
003027    return r;
003028  }
003029  #endif
003030  
003031  /*
003032  ** If there are no outstanding cursors and we are not in the middle
003033  ** of a transaction but there is a read lock on the database, then
003034  ** this routine unrefs the first page of the database file which 
003035  ** has the effect of releasing the read lock.
003036  **
003037  ** If there is a transaction in progress, this routine is a no-op.
003038  */
003039  static void unlockBtreeIfUnused(BtShared *pBt){
003040    assert( sqlite3_mutex_held(pBt->mutex) );
003041    assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE );
003042    if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
003043      MemPage *pPage1 = pBt->pPage1;
003044      assert( pPage1->aData );
003045      assert( sqlite3PagerRefcount(pBt->pPager)==1 );
003046      pBt->pPage1 = 0;
003047      releasePageNotNull(pPage1);
003048    }
003049  }
003050  
003051  /*
003052  ** If pBt points to an empty file then convert that empty file
003053  ** into a new empty database by initializing the first page of
003054  ** the database.
003055  */
003056  static int newDatabase(BtShared *pBt){
003057    MemPage *pP1;
003058    unsigned char *data;
003059    int rc;
003060  
003061    assert( sqlite3_mutex_held(pBt->mutex) );
003062    if( pBt->nPage>0 ){
003063      return SQLITE_OK;
003064    }
003065    pP1 = pBt->pPage1;
003066    assert( pP1!=0 );
003067    data = pP1->aData;
003068    rc = sqlite3PagerWrite(pP1->pDbPage);
003069    if( rc ) return rc;
003070    memcpy(data, zMagicHeader, sizeof(zMagicHeader));
003071    assert( sizeof(zMagicHeader)==16 );
003072    data[16] = (u8)((pBt->pageSize>>8)&0xff);
003073    data[17] = (u8)((pBt->pageSize>>16)&0xff);
003074    data[18] = 1;
003075    data[19] = 1;
003076    assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
003077    data[20] = (u8)(pBt->pageSize - pBt->usableSize);
003078    data[21] = 64;
003079    data[22] = 32;
003080    data[23] = 32;
003081    memset(&data[24], 0, 100-24);
003082    zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
003083    pBt->btsFlags |= BTS_PAGESIZE_FIXED;
003084  #ifndef SQLITE_OMIT_AUTOVACUUM
003085    assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
003086    assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
003087    put4byte(&data[36 + 4*4], pBt->autoVacuum);
003088    put4byte(&data[36 + 7*4], pBt->incrVacuum);
003089  #endif
003090    pBt->nPage = 1;
003091    data[31] = 1;
003092    return SQLITE_OK;
003093  }
003094  
003095  /*
003096  ** Initialize the first page of the database file (creating a database
003097  ** consisting of a single page and no schema objects). Return SQLITE_OK
003098  ** if successful, or an SQLite error code otherwise.
003099  */
003100  int sqlite3BtreeNewDb(Btree *p){
003101    int rc;
003102    sqlite3BtreeEnter(p);
003103    p->pBt->nPage = 0;
003104    rc = newDatabase(p->pBt);
003105    sqlite3BtreeLeave(p);
003106    return rc;
003107  }
003108  
003109  /*
003110  ** Attempt to start a new transaction. A write-transaction
003111  ** is started if the second argument is nonzero, otherwise a read-
** transaction.  If the second argument is 2 or more an exclusive
003113  ** transaction is started, meaning that no other process is allowed
003114  ** to access the database.  A preexisting transaction may not be
003115  ** upgraded to exclusive by calling this routine a second time - the
003116  ** exclusivity flag only works for a new transaction.
003117  **
003118  ** A write-transaction must be started before attempting any 
003119  ** changes to the database.  None of the following routines 
003120  ** will work unless a transaction is started first:
003121  **
003122  **      sqlite3BtreeCreateTable()
003123  **      sqlite3BtreeCreateIndex()
003124  **      sqlite3BtreeClearTable()
003125  **      sqlite3BtreeDropTable()
003126  **      sqlite3BtreeInsert()
003127  **      sqlite3BtreeDelete()
003128  **      sqlite3BtreeUpdateMeta()
003129  **
003130  ** If an initial attempt to acquire the lock fails because of lock contention
003131  ** and the database was previously unlocked, then invoke the busy handler
003132  ** if there is one.  But if there was previously a read-lock, do not
003133  ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is 
003134  ** returned when there is already a read-lock in order to avoid a deadlock.
003135  **
003136  ** Suppose there are two processes A and B.  A has a read lock and B has
003137  ** a reserved lock.  B tries to promote to exclusive but is blocked because
003138  ** of A's read lock.  A tries to promote to reserved but is blocked by B.
003139  ** One or the other of the two processes must give way or there can be
003140  ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
003141  ** when A already has a read lock, we encourage A to give up and let B
003142  ** proceed.
003143  */
int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
  BtShared *pBt = p->pBt;       /* Shared b-tree state behind handle p */
  int rc = SQLITE_OK;           /* Result code returned to the caller */

  sqlite3BtreeEnter(p);
  btreeIntegrity(p);

  /* If the btree is already in a write-transaction, or it
  ** is already in a read-transaction and a read-transaction
  ** is requested, no new transaction needs to be opened.  Control
  ** still passes through trans_begun, so when wrflag is set the
  ** savepoint call at that label is still made.
  */
  if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
    goto trans_begun;
  }
  assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 );

  /* Write transactions are not possible on a read-only database */
  if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){
    rc = SQLITE_READONLY;
    goto trans_begun;
  }

#ifndef SQLITE_OMIT_SHARED_CACHE
  {
    sqlite3 *pBlock = 0;        /* Connection blocking this request, if any */
    /* If another database handle has already opened a write transaction
    ** on this shared-btree structure and a second write transaction is
    ** requested, or if the BTS_PENDING flag is set, the connection of the
    ** current writer is the blocker.  Otherwise, when an exclusive
    ** transaction is requested (wrflag>1), any lock held through a
    ** different Btree handle blocks the request.  In every blocked case
    ** SQLITE_LOCKED_SHAREDCACHE is returned.
    */
    if( (wrflag && pBt->inTransaction==TRANS_WRITE)
     || (pBt->btsFlags & BTS_PENDING)!=0
    ){
      pBlock = pBt->pWriter->db;
    }else if( wrflag>1 ){
      BtLock *pIter;
      for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
        if( pIter->pBtree!=p ){
          pBlock = pIter->pBtree->db;
          break;
        }
      }
    }
    if( pBlock ){
      /* Record which connection is blocking p->db before failing. */
      sqlite3ConnectionBlocked(p->db, pBlock);
      rc = SQLITE_LOCKED_SHAREDCACHE;
      goto trans_begun;
    }
  }
#endif

  /* Any read-only or read-write transaction implies a read-lock on
  ** page 1. So if some other shared-cache client already has a write-lock
  ** on page 1, the transaction cannot be opened. */
  rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
  if( SQLITE_OK!=rc ) goto trans_begun;

  /* Note whether the database held zero pages when this attempt began. */
  pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;
  if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY;
  do {
    /* Call lockBtree() until either pBt->pPage1 is populated or
    ** lockBtree() returns something other than SQLITE_OK. lockBtree()
    ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
    ** reading page 1 it discovers that the page-size of the database
    ** file is not pBt->pageSize. In this case lockBtree() will update
    ** pBt->pageSize to the page-size of the file on disk.
    */
    while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );

    if( rc==SQLITE_OK && wrflag ){
      /* The read-only flag is tested again here, after lockBtree() ran. */
      if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){
        rc = SQLITE_READONLY;
      }else{
        /* Begin the pager-level write transaction (exclusive if wrflag>1),
        ** then turn a zero-page file into an empty database. */
        rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
        if( rc==SQLITE_OK ){
          rc = newDatabase(pBt);
        }
      }
    }
  
    if( rc!=SQLITE_OK ){
      /* On failure, drop the page-1 reference if nothing else needs it. */
      unlockBtreeIfUnused(pBt);
    }
  /* Retry only while the low 8 bits of rc are SQLITE_BUSY, no transaction
  ** is open on the shared btree, and btreeInvokeBusyHandler() returns
  ** non-zero. */
  }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
          btreeInvokeBusyHandler(pBt) );

  if( rc==SQLITE_OK ){
    if( p->inTrans==TRANS_NONE ){
      /* First transaction on this handle: count it and, for a sharable
      ** handle, push its lock object onto the head of pBt->pLock as a
      ** read lock. */
      pBt->nTransaction++;
#ifndef SQLITE_OMIT_SHARED_CACHE
      if( p->sharable ){
        assert( p->lock.pBtree==p && p->lock.iTable==1 );
        p->lock.eLock = READ_LOCK;
        p->lock.pNext = pBt->pLock;
        pBt->pLock = &p->lock;
      }
#endif
    }
    /* Raise the handle's state, and the shared state if it is lower. */
    p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
    if( p->inTrans>pBt->inTransaction ){
      pBt->inTransaction = p->inTrans;
    }
    if( wrflag ){
      MemPage *pPage1 = pBt->pPage1;
#ifndef SQLITE_OMIT_SHARED_CACHE
      /* This handle becomes the single writer on the shared b-tree. */
      assert( !pBt->pWriter );
      pBt->pWriter = p;
      pBt->btsFlags &= ~BTS_EXCLUSIVE;
      if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE;
#endif

      /* If the db-size header field is incorrect (as it may be if an old
      ** client has been writing the database file), update it now. Doing
      ** this sooner rather than later means the database size can safely
      ** be re-read from page 1 if a savepoint or transaction rollback
      ** occurs within the transaction.  The field is written only if
      ** page 1 was successfully made writable.
      */
      if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
        rc = sqlite3PagerWrite(pPage1->pDbPage);
        if( rc==SQLITE_OK ){
          put4byte(&pPage1->aData[28], pBt->nPage);
        }
      }
    }
  }


trans_begun:
  if( rc==SQLITE_OK && wrflag ){
    /* This call makes sure that the pager has the correct number of
    ** open savepoints. If the second parameter is greater than 0 and
    ** the sub-journal is not already open, then it will be opened here.
    */
    rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
  }

  btreeIntegrity(p);
  sqlite3BtreeLeave(p);
  return rc;
}
003283  
003284  #ifndef SQLITE_OMIT_AUTOVACUUM
003285  
003286  /*
003287  ** Set the pointer-map entries for all children of page pPage. Also, if
003288  ** pPage contains cells that point to overflow pages, set the pointer
003289  ** map entries for the overflow pages as well.
003290  */
003291  static int setChildPtrmaps(MemPage *pPage){
003292    int i;                             /* Counter variable */
003293    int nCell;                         /* Number of cells in page pPage */
003294    int rc;                            /* Return code */
003295    BtShared *pBt = pPage->pBt;
003296    Pgno pgno = pPage->pgno;
003297  
003298    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
003299    rc = btreeInitPage(pPage);
003300    if( rc!=SQLITE_OK ) return rc;
003301    nCell = pPage->nCell;
003302  
003303    for(i=0; i<nCell; i++){
003304      u8 *pCell = findCell(pPage, i);
003305  
003306      ptrmapPutOvflPtr(pPage, pCell, &rc);
003307  
003308      if( !pPage->leaf ){
003309        Pgno childPgno = get4byte(pCell);
003310        ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
003311      }
003312    }
003313  
003314    if( !pPage->leaf ){
003315      Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
003316      ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
003317    }
003318  
003319    return rc;
003320  }
003321  
003322  /*
003323  ** Somewhere on pPage is a pointer to page iFrom.  Modify this pointer so
003324  ** that it points to iTo. Parameter eType describes the type of pointer to
003325  ** be modified, as  follows:
003326  **
003327  ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child 
003328  **                   page of pPage.
003329  **
003330  ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
003331  **                   page pointed to by one of the cells on pPage.
003332  **
003333  ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
003334  **                   overflow page in the list.
003335  */
003336  static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
003337    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
003338    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
003339    if( eType==PTRMAP_OVERFLOW2 ){
003340      /* The pointer is always the first 4 bytes of the page in this case.  */
003341      if( get4byte(pPage->aData)!=iFrom ){
003342        return SQLITE_CORRUPT_BKPT;
003343      }
003344      put4byte(pPage->aData, iTo);
003345    }else{
003346      int i;
003347      int nCell;
003348      int rc;
003349  
003350      rc = btreeInitPage(pPage);
003351      if( rc ) return rc;
003352      nCell = pPage->nCell;
003353  
003354      for(i=0; i<nCell; i++){
003355        u8 *pCell = findCell(pPage, i);
003356        if( eType==PTRMAP_OVERFLOW1 ){
003357          CellInfo info;
003358          pPage->xParseCell(pPage, pCell, &info);
003359          if( info.nLocal<info.nPayload
003360           && pCell+info.nSize-1<=pPage->aData+pPage->maskPage
003361           && iFrom==get4byte(pCell+info.nSize-4)
003362          ){
003363            put4byte(pCell+info.nSize-4, iTo);
003364            break;
003365          }
003366        }else{
003367          if( get4byte(pCell)==iFrom ){
003368            put4byte(pCell, iTo);
003369            break;
003370          }
003371        }
003372      }
003373    
003374      if( i==nCell ){
003375        if( eType!=PTRMAP_BTREE || 
003376            get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
003377          return SQLITE_CORRUPT_BKPT;
003378        }
003379        put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
003380      }
003381    }
003382    return SQLITE_OK;
003383  }
003384  
003385  
003386  /*
003387  ** Move the open database page pDbPage to location iFreePage in the 
003388  ** database. The pDbPage reference remains valid.
003389  **
003390  ** The isCommit flag indicates that there is no need to remember that
003391  ** the journal needs to be sync()ed before database page pDbPage->pgno 
003392  ** can be written to. The caller has already promised not to write to that
003393  ** page.
003394  */
003395  static int relocatePage(
003396    BtShared *pBt,           /* Btree */
003397    MemPage *pDbPage,        /* Open page to move */
003398    u8 eType,                /* Pointer map 'type' entry for pDbPage */
003399    Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
003400    Pgno iFreePage,          /* The location to move pDbPage to */
003401    int isCommit             /* isCommit flag passed to sqlite3PagerMovepage */
003402  ){
003403    MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
003404    Pgno iDbPage = pDbPage->pgno;
003405    Pager *pPager = pBt->pPager;
003406    int rc;
003407  
003408    assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || 
003409        eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
003410    assert( sqlite3_mutex_held(pBt->mutex) );
003411    assert( pDbPage->pBt==pBt );
003412  
003413    /* Move page iDbPage from its current location to page number iFreePage */
003414    TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n", 
003415        iDbPage, iFreePage, iPtrPage, eType));
003416    rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
003417    if( rc!=SQLITE_OK ){
003418      return rc;
003419    }
003420    pDbPage->pgno = iFreePage;
003421  
003422    /* If pDbPage was a btree-page, then it may have child pages and/or cells
003423    ** that point to overflow pages. The pointer map entries for all these
003424    ** pages need to be changed.
003425    **
003426    ** If pDbPage is an overflow page, then the first 4 bytes may store a
003427    ** pointer to a subsequent overflow page. If this is the case, then
003428    ** the pointer map needs to be updated for the subsequent overflow page.
003429    */
003430    if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
003431      rc = setChildPtrmaps(pDbPage);
003432      if( rc!=SQLITE_OK ){
003433        return rc;
003434      }
003435    }else{
003436      Pgno nextOvfl = get4byte(pDbPage->aData);
003437      if( nextOvfl!=0 ){
003438        ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
003439        if( rc!=SQLITE_OK ){
003440          return rc;
003441        }
003442      }
003443    }
003444  
003445    /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
003446    ** that it points at iFreePage. Also fix the pointer map entry for
003447    ** iPtrPage.
003448    */
003449    if( eType!=PTRMAP_ROOTPAGE ){
003450      rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
003451      if( rc!=SQLITE_OK ){
003452        return rc;
003453      }
003454      rc = sqlite3PagerWrite(pPtrPage->pDbPage);
003455      if( rc!=SQLITE_OK ){
003456        releasePage(pPtrPage);
003457        return rc;
003458      }
003459      rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
003460      releasePage(pPtrPage);
003461      if( rc==SQLITE_OK ){
003462        ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
003463      }
003464    }
003465    return rc;
003466  }
003467  
/* Forward declaration required by incrVacuumStep().  The 5th argument is
** one of the BTALLOC_* mode values; incrVacuumStep() passes BTALLOC_EXACT,
** BTALLOC_ANY or BTALLOC_LE. */
static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
003470  
003471  /*
003472  ** Perform a single step of an incremental-vacuum. If successful, return
003473  ** SQLITE_OK. If there is no work to do (and therefore no point in 
003474  ** calling this function again), return SQLITE_DONE. Or, if an error 
003475  ** occurs, return some other error code.
003476  **
003477  ** More specifically, this function attempts to re-organize the database so 
003478  ** that the last page of the file currently in use is no longer in use.
003479  **
003480  ** Parameter nFin is the number of pages that this database would contain
003481  ** were this function called until it returns SQLITE_DONE.
003482  **
003483  ** If the bCommit parameter is non-zero, this function assumes that the 
003484  ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE 
003485  ** or an error. bCommit is passed true for an auto-vacuum-on-commit 
003486  ** operation, or false for an incremental vacuum.
003487  */
static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){
  Pgno nFreeList;           /* Number of pages still on the free-list */
  int rc;                   /* Result code from helper routines */

  assert( sqlite3_mutex_held(pBt->mutex) );
  assert( iLastPg>nFin );

  /* Pointer-map pages and the pending-byte page are never relocated.  For
  ** those, skip straight to the size bookkeeping at the end. */
  if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
    u8 eType;               /* Pointer-map type of page iLastPg */
    Pgno iPtrPage;          /* Pointer-map page-no entry of page iLastPg */

    /* With an empty free-list there is no free slot to move anything
    ** into, so the vacuum is finished. */
    nFreeList = get4byte(&pBt->pPage1->aData[36]);
    if( nFreeList==0 ){
      return SQLITE_DONE;
    }

    rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
    if( rc!=SQLITE_OK ){
      return rc;
    }
    /* A pointer-map entry of type PTRMAP_ROOTPAGE for the last page is
    ** reported as corruption. */
    if( eType==PTRMAP_ROOTPAGE ){
      return SQLITE_CORRUPT_BKPT;
    }

    if( eType==PTRMAP_FREEPAGE ){
      if( bCommit==0 ){
        /* Remove the page from the file's free-list. This is not required
        ** if bCommit is non-zero. In that case, the free-list will be
        ** truncated to zero after this function returns, so it doesn't
        ** matter if it still contains some garbage entries.
        */
        Pgno iFreePg;
        MemPage *pFreePg;
        rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);
        if( rc!=SQLITE_OK ){
          return rc;
        }
        assert( iFreePg==iLastPg );
        releasePage(pFreePg);
      }
    } else {
      Pgno iFreePg;             /* Index of free page to move pLastPg to */
      MemPage *pLastPg;         /* The in-use page at position iLastPg */
      u8 eMode = BTALLOC_ANY;   /* Mode parameter for allocateBtreePage() */
      Pgno iNear = 0;           /* nearby parameter for allocateBtreePage() */

      rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
      if( rc!=SQLITE_OK ){
        return rc;
      }

      /* If bCommit is zero, this loop runs exactly once and page pLastPg
      ** is swapped with the first free page pulled off the free list.
      **
      ** On the other hand, if bCommit is greater than zero, then keep
      ** looping until a free-page located within the first nFin pages
      ** of the file is found.
      */
      if( bCommit==0 ){
        eMode = BTALLOC_LE;
        iNear = nFin;
      }
      do {
        MemPage *pFreePg;
        rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);
        if( rc!=SQLITE_OK ){
          releasePage(pLastPg);
          return rc;
        }
        /* Only the page number iFreePg is used below, so the reference to
        ** the allocated page is dropped immediately. */
        releasePage(pFreePg);
      }while( bCommit && iFreePg>nFin );
      assert( iFreePg<iLastPg );
      
      /* Move the content of the last page into the free slot.  The
      ** reference to pLastPg is released whether or not the move worked. */
      rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);
      releasePage(pLastPg);
      if( rc!=SQLITE_OK ){
        return rc;
      }
    }
  }

  /* For an incremental step (bCommit==0), shrink the recorded database
  ** size: step back over iLastPg and over any pending-byte or pointer-map
  ** pages now at the end, then flag that a truncation is wanted.  When
  ** bCommit is non-zero this bookkeeping is left to the caller. */
  if( bCommit==0 ){
    do {
      iLastPg--;
    }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) );
    pBt->bDoTruncate = 1;
    pBt->nPage = iLastPg;
  }
  return SQLITE_OK;
}
003578  
003579  /*
003580  ** The database opened by the first argument is an auto-vacuum database
003581  ** nOrig pages in size containing nFree free pages. Return the expected 
003582  ** size of the database in pages following an auto-vacuum operation.
003583  */
003584  static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){
003585    int nEntry;                     /* Number of entries on one ptrmap page */
003586    Pgno nPtrmap;                   /* Number of PtrMap pages to be freed */
003587    Pgno nFin;                      /* Return value */
003588  
003589    nEntry = pBt->usableSize/5;
003590    nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
003591    nFin = nOrig - nFree - nPtrmap;
003592    if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
003593      nFin--;
003594    }
003595    while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
003596      nFin--;
003597    }
003598  
003599    return nFin;
003600  }
003601  
003602  /*
003603  ** A write-transaction must be opened before calling this function.
003604  ** It performs a single unit of work towards an incremental vacuum.
003605  **
003606  ** If the incremental vacuum is finished after this function has run,
003607  ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
003608  ** SQLITE_OK is returned. Otherwise an SQLite error code. 
003609  */
003610  int sqlite3BtreeIncrVacuum(Btree *p){
003611    int rc;
003612    BtShared *pBt = p->pBt;
003613  
003614    sqlite3BtreeEnter(p);
003615    assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
003616    if( !pBt->autoVacuum ){
003617      rc = SQLITE_DONE;
003618    }else{
003619      Pgno nOrig = btreePagecount(pBt);
003620      Pgno nFree = get4byte(&pBt->pPage1->aData[36]);
003621      Pgno nFin = finalDbSize(pBt, nOrig, nFree);
003622  
003623      if( nOrig<nFin ){
003624        rc = SQLITE_CORRUPT_BKPT;
003625      }else if( nFree>0 ){
003626        rc = saveAllCursors(pBt, 0, 0);
003627        if( rc==SQLITE_OK ){
003628          invalidateAllOverflowCache(pBt);
003629          rc = incrVacuumStep(pBt, nFin, nOrig, 0);
003630        }
003631        if( rc==SQLITE_OK ){
003632          rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
003633          put4byte(&pBt->pPage1->aData[28], pBt->nPage);
003634        }
003635      }else{
003636        rc = SQLITE_DONE;
003637      }
003638    }
003639    sqlite3BtreeLeave(p);
003640    return rc;
003641  }
003642  
003643  /*
003644  ** This routine is called prior to sqlite3PagerCommit when a transaction
003645  ** is committed for an auto-vacuum database.
003646  **
** In full auto-vacuum mode (pBt->incrVacuum clear), if SQLITE_OK is
** returned and the free-list was not empty, then pBt->nPage is set to the
** number of pages the database file should be truncated to during the
** commit process and pBt->bDoTruncate is set. i.e. the database has been
** reorganized so that only the first pBt->nPage pages are in use.
003651  */
003652  static int autoVacuumCommit(BtShared *pBt){
003653    int rc = SQLITE_OK;
003654    Pager *pPager = pBt->pPager;
003655    VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager); )
003656  
003657    assert( sqlite3_mutex_held(pBt->mutex) );
003658    invalidateAllOverflowCache(pBt);
003659    assert(pBt->autoVacuum);
003660    if( !pBt->incrVacuum ){
003661      Pgno nFin;         /* Number of pages in database after autovacuuming */
003662      Pgno nFree;        /* Number of pages on the freelist initially */
003663      Pgno iFree;        /* The next page to be freed */
003664      Pgno nOrig;        /* Database size before freeing */
003665  
003666      nOrig = btreePagecount(pBt);
003667      if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
003668        /* It is not possible to create a database for which the final page
003669        ** is either a pointer-map page or the pending-byte page. If one
003670        ** is encountered, this indicates corruption.
003671        */
003672        return SQLITE_CORRUPT_BKPT;
003673      }
003674  
003675      nFree = get4byte(&pBt->pPage1->aData[36]);
003676      nFin = finalDbSize(pBt, nOrig, nFree);
003677      if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
003678      if( nFin<nOrig ){
003679        rc = saveAllCursors(pBt, 0, 0);
003680      }
003681      for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
003682        rc = incrVacuumStep(pBt, nFin, iFree, 1);
003683      }
003684      if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
003685        rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
003686        put4byte(&pBt->pPage1->aData[32], 0);
003687        put4byte(&pBt->pPage1->aData[36], 0);
003688        put4byte(&pBt->pPage1->aData[28], nFin);
003689        pBt->bDoTruncate = 1;
003690        pBt->nPage = nFin;
003691      }
003692      if( rc!=SQLITE_OK ){
003693        sqlite3PagerRollback(pPager);
003694      }
003695    }
003696  
003697    assert( nRef>=sqlite3PagerRefcount(pPager) );
003698    return rc;
003699  }
003700  
003701  #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
003702  # define setChildPtrmaps(x) SQLITE_OK
003703  #endif
003704  
003705  /*
003706  ** This routine does the first phase of a two-phase commit.  This routine
003707  ** causes a rollback journal to be created (if it does not already exist)
003708  ** and populated with enough information so that if a power loss occurs
003709  ** the database can be restored to its original state by playing back
003710  ** the journal.  Then the contents of the journal are flushed out to
003711  ** the disk.  After the journal is safely on oxide, the changes to the
003712  ** database are written into the database file and flushed to oxide.
003713  ** At the end of this call, the rollback journal still exists on the
003714  ** disk and we are still holding all locks, so the transaction has not
003715  ** committed.  See sqlite3BtreeCommitPhaseTwo() for the second phase of the
003716  ** commit process.
003717  **
003718  ** This call is a no-op if no write-transaction is currently active on pBt.
003719  **
003720  ** Otherwise, sync the database file for the btree pBt. zMaster points to
003721  ** the name of a master journal file that should be written into the
003722  ** individual journal file, or is NULL, indicating no master journal file 
003723  ** (single database transaction).
003724  **
003725  ** When this is called, the master journal should already have been
003726  ** created, populated with this journal pointer and synced to disk.
003727  **
** Once this routine has returned, the only thing required to commit
003729  ** the write-transaction for this database file is to delete the journal.
003730  */
003731  int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
003732    int rc = SQLITE_OK;
003733    if( p->inTrans==TRANS_WRITE ){
003734      BtShared *pBt = p->pBt;
003735      sqlite3BtreeEnter(p);
003736  #ifndef SQLITE_OMIT_AUTOVACUUM
003737      if( pBt->autoVacuum ){
003738        rc = autoVacuumCommit(pBt);
003739        if( rc!=SQLITE_OK ){
003740          sqlite3BtreeLeave(p);
003741          return rc;
003742        }
003743      }
003744      if( pBt->bDoTruncate ){
003745        sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage);
003746      }
003747  #endif
003748      rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
003749      sqlite3BtreeLeave(p);
003750    }
003751    return rc;
003752  }
003753  
003754  /*
003755  ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
003756  ** at the conclusion of a transaction.
003757  */
003758  static void btreeEndTransaction(Btree *p){
003759    BtShared *pBt = p->pBt;
003760    sqlite3 *db = p->db;
003761    assert( sqlite3BtreeHoldsMutex(p) );
003762  
003763  #ifndef SQLITE_OMIT_AUTOVACUUM
003764    pBt->bDoTruncate = 0;
003765  #endif
003766    if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){
003767      /* If there are other active statements that belong to this database
003768      ** handle, downgrade to a read-only transaction. The other statements
003769      ** may still be reading from the database.  */
003770      downgradeAllSharedCacheTableLocks(p);
003771      p->inTrans = TRANS_READ;
003772    }else{
003773      /* If the handle had any kind of transaction open, decrement the 
003774      ** transaction count of the shared btree. If the transaction count 
003775      ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
003776      ** call below will unlock the pager.  */
003777      if( p->inTrans!=TRANS_NONE ){
003778        clearAllSharedCacheTableLocks(p);
003779        pBt->nTransaction--;
003780        if( 0==pBt->nTransaction ){
003781          pBt->inTransaction = TRANS_NONE;
003782        }
003783      }
003784  
003785      /* Set the current transaction state to TRANS_NONE and unlock the 
003786      ** pager if this call closed the only read or write transaction.  */
003787      p->inTrans = TRANS_NONE;
003788      unlockBtreeIfUnused(pBt);
003789    }
003790  
003791    btreeIntegrity(p);
003792  }
003793  
003794  /*
003795  ** Commit the transaction currently in progress.
003796  **
003797  ** This routine implements the second phase of a 2-phase commit.  The
003798  ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
003799  ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne()
003800  ** routine did all the work of writing information out to disk and flushing the
003801  ** contents so that they are written onto the disk platter.  All this
** routine has to do is delete or truncate or zero the header in
** the rollback journal (which causes the transaction to commit) and
003804  ** drop locks.
003805  **
003806  ** Normally, if an error occurs while the pager layer is attempting to 
003807  ** finalize the underlying journal file, this function returns an error and
003808  ** the upper layer will attempt a rollback. However, if the second argument
003809  ** is non-zero then this b-tree transaction is part of a multi-file 
003810  ** transaction. In this case, the transaction has already been committed 
003811  ** (by deleting a master journal file) and the caller will ignore this 
003812  ** functions return code. So, even if an error occurs in the pager layer,
003813  ** reset the b-tree objects internal state to indicate that the write
003814  ** transaction has been closed. This is quite safe, as the pager will have
003815  ** transitioned to the error state.
003816  **
003817  ** This will release the write lock on the database file.  If there
003818  ** are no active cursors, it also releases the read lock.
003819  */
003820  int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
003821  
003822    if( p->inTrans==TRANS_NONE ) return SQLITE_OK;
003823    sqlite3BtreeEnter(p);
003824    btreeIntegrity(p);
003825  
003826    /* If the handle has a write-transaction open, commit the shared-btrees 
003827    ** transaction and set the shared state to TRANS_READ.
003828    */
003829    if( p->inTrans==TRANS_WRITE ){
003830      int rc;
003831      BtShared *pBt = p->pBt;
003832      assert( pBt->inTransaction==TRANS_WRITE );
003833      assert( pBt->nTransaction>0 );
003834      rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
003835      if( rc!=SQLITE_OK && bCleanup==0 ){
003836        sqlite3BtreeLeave(p);
003837        return rc;
003838      }
003839      p->iDataVersion--;  /* Compensate for pPager->iDataVersion++; */
003840      pBt->inTransaction = TRANS_READ;
003841      btreeClearHasContent(pBt);
003842    }
003843  
003844    btreeEndTransaction(p);
003845    sqlite3BtreeLeave(p);
003846    return SQLITE_OK;
003847  }
003848  
003849  /*
003850  ** Do both phases of a commit.
003851  */
003852  int sqlite3BtreeCommit(Btree *p){
003853    int rc;
003854    sqlite3BtreeEnter(p);
003855    rc = sqlite3BtreeCommitPhaseOne(p, 0);
003856    if( rc==SQLITE_OK ){
003857      rc = sqlite3BtreeCommitPhaseTwo(p, 0);
003858    }
003859    sqlite3BtreeLeave(p);
003860    return rc;
003861  }
003862  
003863  /*
003864  ** This routine sets the state to CURSOR_FAULT and the error
003865  ** code to errCode for every cursor on any BtShared that pBtree
003866  ** references.  Or if the writeOnly flag is set to 1, then only
003867  ** trip write cursors and leave read cursors unchanged.
003868  **
003869  ** Every cursor is a candidate to be tripped, including cursors
003870  ** that belong to other database connections that happen to be
003871  ** sharing the cache with pBtree.
003872  **
003873  ** This routine gets called when a rollback occurs. If the writeOnly
003874  ** flag is true, then only write-cursors need be tripped - read-only
003875  ** cursors save their current positions so that they may continue 
003876  ** following the rollback. Or, if writeOnly is false, all cursors are 
003877  ** tripped. In general, writeOnly is false if the transaction being
003878  ** rolled back modified the database schema. In this case b-tree root
003879  ** pages may be moved or deleted from the database altogether, making
003880  ** it unsafe for read cursors to continue.
003881  **
003882  ** If the writeOnly flag is true and an error is encountered while 
003883  ** saving the current position of a read-only cursor, all cursors, 
003884  ** including all read-cursors are tripped.
003885  **
003886  ** SQLITE_OK is returned if successful, or if an error occurs while
003887  ** saving a cursor position, an SQLite error code.
003888  */
003889  int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){
003890    BtCursor *p;
003891    int rc = SQLITE_OK;
003892  
003893    assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 );
003894    if( pBtree ){
003895      sqlite3BtreeEnter(pBtree);
003896      for(p=pBtree->pBt->pCursor; p; p=p->pNext){
003897        int i;
003898        if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){
003899          if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
003900            rc = saveCursorPosition(p);
003901            if( rc!=SQLITE_OK ){
003902              (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0);
003903              break;
003904            }
003905          }
003906        }else{
003907          sqlite3BtreeClearCursor(p);
003908          p->eState = CURSOR_FAULT;
003909          p->skipNext = errCode;
003910        }
003911        for(i=0; i<=p->iPage; i++){
003912          releasePage(p->apPage[i]);
003913          p->apPage[i] = 0;
003914        }
003915      }
003916      sqlite3BtreeLeave(pBtree);
003917    }
003918    return rc;
003919  }
003920  
003921  /*
003922  ** Rollback the transaction in progress.
003923  **
003924  ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped).
003925  ** Only write cursors are tripped if writeOnly is true but all cursors are
003926  ** tripped if writeOnly is false.  Any attempt to use
003927  ** a tripped cursor will result in an error.
003928  **
003929  ** This will release the write lock on the database file.  If there
003930  ** are no active cursors, it also releases the read lock.
003931  */
003932  int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){
003933    int rc;
003934    BtShared *pBt = p->pBt;
003935    MemPage *pPage1;
003936  
003937    assert( writeOnly==1 || writeOnly==0 );
003938    assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK );
003939    sqlite3BtreeEnter(p);
003940    if( tripCode==SQLITE_OK ){
003941      rc = tripCode = saveAllCursors(pBt, 0, 0);
003942      if( rc ) writeOnly = 0;
003943    }else{
003944      rc = SQLITE_OK;
003945    }
003946    if( tripCode ){
003947      int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly);
003948      assert( rc==SQLITE_OK || (writeOnly==0 && rc2==SQLITE_OK) );
003949      if( rc2!=SQLITE_OK ) rc = rc2;
003950    }
003951    btreeIntegrity(p);
003952  
003953    if( p->inTrans==TRANS_WRITE ){
003954      int rc2;
003955  
003956      assert( TRANS_WRITE==pBt->inTransaction );
003957      rc2 = sqlite3PagerRollback(pBt->pPager);
003958      if( rc2!=SQLITE_OK ){
003959        rc = rc2;
003960      }
003961  
003962      /* The rollback may have destroyed the pPage1->aData value.  So
003963      ** call btreeGetPage() on page 1 again to make
003964      ** sure pPage1->aData is set correctly. */
003965      if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
003966        int nPage = get4byte(28+(u8*)pPage1->aData);
003967        testcase( nPage==0 );
003968        if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
003969        testcase( pBt->nPage!=nPage );
003970        pBt->nPage = nPage;
003971        releasePage(pPage1);
003972      }
003973      assert( countValidCursors(pBt, 1)==0 );
003974      pBt->inTransaction = TRANS_READ;
003975      btreeClearHasContent(pBt);
003976    }
003977  
003978    btreeEndTransaction(p);
003979    sqlite3BtreeLeave(p);
003980    return rc;
003981  }
003982  
003983  /*
003984  ** Start a statement subtransaction. The subtransaction can be rolled
003985  ** back independently of the main transaction. You must start a transaction 
003986  ** before starting a subtransaction. The subtransaction is ended automatically 
003987  ** if the main transaction commits or rolls back.
003988  **
003989  ** Statement subtransactions are used around individual SQL statements
003990  ** that are contained within a BEGIN...COMMIT block.  If a constraint
003991  ** error occurs within the statement, the effect of that one statement
003992  ** can be rolled back without having to rollback the entire transaction.
003993  **
003994  ** A statement sub-transaction is implemented as an anonymous savepoint. The
003995  ** value passed as the second parameter is the total number of savepoints,
003996  ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
003997  ** are no active savepoints and no other statement-transactions open,
003998  ** iStatement is 1. This anonymous savepoint can be released or rolled back
003999  ** using the sqlite3BtreeSavepoint() function.
004000  */
004001  int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
004002    int rc;
004003    BtShared *pBt = p->pBt;
004004    sqlite3BtreeEnter(p);
004005    assert( p->inTrans==TRANS_WRITE );
004006    assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
004007    assert( iStatement>0 );
004008    assert( iStatement>p->db->nSavepoint );
004009    assert( pBt->inTransaction==TRANS_WRITE );
004010    /* At the pager level, a statement transaction is a savepoint with
004011    ** an index greater than all savepoints created explicitly using
004012    ** SQL statements. It is illegal to open, release or rollback any
004013    ** such savepoints while the statement transaction savepoint is active.
004014    */
004015    rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
004016    sqlite3BtreeLeave(p);
004017    return rc;
004018  }
004019  
004020  /*
004021  ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
004022  ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
004023  ** savepoint identified by parameter iSavepoint, depending on the value 
004024  ** of op.
004025  **
004026  ** Normally, iSavepoint is greater than or equal to zero. However, if op is
004027  ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the 
004028  ** contents of the entire transaction are rolled back. This is different
004029  ** from a normal transaction rollback, as no locks are released and the
004030  ** transaction remains open.
004031  */
004032  int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
004033    int rc = SQLITE_OK;
004034    if( p && p->inTrans==TRANS_WRITE ){
004035      BtShared *pBt = p->pBt;
004036      assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
004037      assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
004038      sqlite3BtreeEnter(p);
004039      rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
004040      if( rc==SQLITE_OK ){
004041        if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){
004042          pBt->nPage = 0;
004043        }
004044        rc = newDatabase(pBt);
004045        pBt->nPage = get4byte(28 + pBt->pPage1->aData);
004046  
004047        /* The database size was written into the offset 28 of the header
004048        ** when the transaction started, so we know that the value at offset
004049        ** 28 is nonzero. */
004050        assert( pBt->nPage>0 );
004051      }
004052      sqlite3BtreeLeave(p);
004053    }
004054    return rc;
004055  }
004056  
004057  /*
004058  ** Create a new cursor for the BTree whose root is on the page
004059  ** iTable. If a read-only cursor is requested, it is assumed that
004060  ** the caller already has at least a read-only transaction open
004061  ** on the database already. If a write-cursor is requested, then
004062  ** the caller is assumed to have an open write transaction.
004063  **
004064  ** If the BTREE_WRCSR bit of wrFlag is clear, then the cursor can only
004065  ** be used for reading.  If the BTREE_WRCSR bit is set, then the cursor
004066  ** can be used for reading or for writing if other conditions for writing
004067  ** are also met.  These are the conditions that must be met in order
004068  ** for writing to be allowed:
004069  **
004070  ** 1:  The cursor must have been opened with wrFlag containing BTREE_WRCSR
004071  **
004072  ** 2:  Other database connections that share the same pager cache
004073  **     but which are not in the READ_UNCOMMITTED state may not have
004074  **     cursors open with wrFlag==0 on the same table.  Otherwise
004075  **     the changes made by this write cursor would be visible to
004076  **     the read cursors in the other database connection.
004077  **
004078  ** 3:  The database must be writable (not on read-only media)
004079  **
004080  ** 4:  There must be an active transaction.
004081  **
004082  ** The BTREE_FORDELETE bit of wrFlag may optionally be set if BTREE_WRCSR
004083  ** is set.  If FORDELETE is set, that is a hint to the implementation that
004084  ** this cursor will only be used to seek to and delete entries of an index
004085  ** as part of a larger DELETE statement.  The FORDELETE hint is not used by
004086  ** this implementation.  But in a hypothetical alternative storage engine 
004087  ** in which index entries are automatically deleted when corresponding table
004088  ** rows are deleted, the FORDELETE flag is a hint that all SEEK and DELETE
004089  ** operations on this cursor can be no-ops and all READ operations can 
004090  ** return a null row (2-bytes: 0x01 0x00).
004091  **
004092  ** No checking is done to make sure that page iTable really is the
004093  ** root page of a b-tree.  If it is not, then the cursor acquired
004094  ** will not work correctly.
004095  **
004096  ** It is assumed that the sqlite3BtreeCursorZero() has been called
004097  ** on pCur to initialize the memory space prior to invoking this routine.
004098  */
004099  static int btreeCursor(
004100    Btree *p,                              /* The btree */
004101    int iTable,                            /* Root page of table to open */
004102    int wrFlag,                            /* 1 to write. 0 read-only */
004103    struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
004104    BtCursor *pCur                         /* Space for new cursor */
004105  ){
004106    BtShared *pBt = p->pBt;                /* Shared b-tree handle */
004107    BtCursor *pX;                          /* Looping over other all cursors */
004108  
004109    assert( sqlite3BtreeHoldsMutex(p) );
004110    assert( wrFlag==0 
004111         || wrFlag==BTREE_WRCSR 
004112         || wrFlag==(BTREE_WRCSR|BTREE_FORDELETE) 
004113    );
004114  
004115    /* The following assert statements verify that if this is a sharable 
004116    ** b-tree database, the connection is holding the required table locks, 
004117    ** and that no other connection has any open cursor that conflicts with 
004118    ** this lock.  */
004119    assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1)) );
004120    assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
004121  
004122    /* Assert that the caller has opened the required transaction. */
004123    assert( p->inTrans>TRANS_NONE );
004124    assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
004125    assert( pBt->pPage1 && pBt->pPage1->aData );
004126    assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 );
004127  
004128    if( wrFlag ){
004129      allocateTempSpace(pBt);
004130      if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM_BKPT;
004131    }
004132    if( iTable==1 && btreePagecount(pBt)==0 ){
004133      assert( wrFlag==0 );
004134      iTable = 0;
004135    }
004136  
004137    /* Now that no other errors can occur, finish filling in the BtCursor
004138    ** variables and link the cursor into the BtShared list.  */
004139    pCur->pgnoRoot = (Pgno)iTable;
004140    pCur->iPage = -1;
004141    pCur->pKeyInfo = pKeyInfo;
004142    pCur->pBtree = p;
004143    pCur->pBt = pBt;
004144    pCur->curFlags = wrFlag ? BTCF_WriteFlag : 0;
004145    pCur->curPagerFlags = wrFlag ? 0 : PAGER_GET_READONLY;
004146    /* If there are two or more cursors on the same btree, then all such
004147    ** cursors *must* have the BTCF_Multiple flag set. */
004148    for(pX=pBt->pCursor; pX; pX=pX->pNext){
004149      if( pX->pgnoRoot==(Pgno)iTable ){
004150        pX->curFlags |= BTCF_Multiple;
004151        pCur->curFlags |= BTCF_Multiple;
004152      }
004153    }
004154    pCur->pNext = pBt->pCursor;
004155    pBt->pCursor = pCur;
004156    pCur->eState = CURSOR_INVALID;
004157    return SQLITE_OK;
004158  }
004159  int sqlite3BtreeCursor(
004160    Btree *p,                                   /* The btree */
004161    int iTable,                                 /* Root page of table to open */
004162    int wrFlag,                                 /* 1 to write. 0 read-only */
004163    struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
004164    BtCursor *pCur                              /* Write new cursor here */
004165  ){
004166    int rc;
004167    if( iTable<1 ){
004168      rc = SQLITE_CORRUPT_BKPT;
004169    }else{
004170      sqlite3BtreeEnter(p);
004171      rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
004172      sqlite3BtreeLeave(p);
004173    }
004174    return rc;
004175  }
004176  
004177  /*
004178  ** Return the size of a BtCursor object in bytes.
004179  **
004180  ** This interfaces is needed so that users of cursors can preallocate
004181  ** sufficient storage to hold a cursor.  The BtCursor object is opaque
004182  ** to users so they cannot do the sizeof() themselves - they must call
004183  ** this routine.
004184  */
004185  int sqlite3BtreeCursorSize(void){
004186    return ROUND8(sizeof(BtCursor));
004187  }
004188  
004189  /*
004190  ** Initialize memory that will be converted into a BtCursor object.
004191  **
004192  ** The simple approach here would be to memset() the entire object
004193  ** to zero.  But it turns out that the apPage[] and aiIdx[] arrays
004194  ** do not need to be zeroed and they are large, so we can save a lot
004195  ** of run-time by skipping the initialization of those elements.
004196  */
004197  void sqlite3BtreeCursorZero(BtCursor *p){
004198    memset(p, 0, offsetof(BtCursor, iPage));
004199  }
004200  
004201  /*
004202  ** Close a cursor.  The read lock on the database file is released
004203  ** when the last cursor is closed.
004204  */
004205  int sqlite3BtreeCloseCursor(BtCursor *pCur){
004206    Btree *pBtree = pCur->pBtree;
004207    if( pBtree ){
004208      int i;
004209      BtShared *pBt = pCur->pBt;
004210      sqlite3BtreeEnter(pBtree);
004211      sqlite3BtreeClearCursor(pCur);
004212      assert( pBt->pCursor!=0 );
004213      if( pBt->pCursor==pCur ){
004214        pBt->pCursor = pCur->pNext;
004215      }else{
004216        BtCursor *pPrev = pBt->pCursor;
004217        do{
004218          if( pPrev->pNext==pCur ){
004219            pPrev->pNext = pCur->pNext;
004220            break;
004221          }
004222          pPrev = pPrev->pNext;
004223        }while( ALWAYS(pPrev) );
004224      }
004225      for(i=0; i<=pCur->iPage; i++){
004226        releasePage(pCur->apPage[i]);
004227      }
004228      unlockBtreeIfUnused(pBt);
004229      sqlite3_free(pCur->aOverflow);
004230      /* sqlite3_free(pCur); */
004231      sqlite3BtreeLeave(pBtree);
004232    }
004233    return SQLITE_OK;
004234  }
004235  
004236  /*
004237  ** Make sure the BtCursor* given in the argument has a valid
004238  ** BtCursor.info structure.  If it is not already valid, call
004239  ** btreeParseCell() to fill it in.
004240  **
004241  ** BtCursor.info is a cache of the information in the current cell.
004242  ** Using this cache reduces the number of calls to btreeParseCell().
004243  */
004244  #ifndef NDEBUG
004245    static void assertCellInfo(BtCursor *pCur){
004246      CellInfo info;
004247      int iPage = pCur->iPage;
004248      memset(&info, 0, sizeof(info));
004249      btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
004250      assert( CORRUPT_DB || memcmp(&info, &pCur->info, sizeof(info))==0 );
004251    }
004252  #else
004253    #define assertCellInfo(x)
004254  #endif
004255  static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){
004256    if( pCur->info.nSize==0 ){
004257      int iPage = pCur->iPage;
004258      pCur->curFlags |= BTCF_ValidNKey;
004259      btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
004260    }else{
004261      assertCellInfo(pCur);
004262    }
004263  }
004264  
004265  #ifndef NDEBUG  /* The next routine used only within assert() statements */
004266  /*
004267  ** Return true if the given BtCursor is valid.  A valid cursor is one
004268  ** that is currently pointing to a row in a (non-empty) table.
004269  ** This is a verification routine is used only within assert() statements.
004270  */
004271  int sqlite3BtreeCursorIsValid(BtCursor *pCur){
004272    return pCur && pCur->eState==CURSOR_VALID;
004273  }
004274  #endif /* NDEBUG */
004275  int sqlite3BtreeCursorIsValidNN(BtCursor *pCur){
004276    assert( pCur!=0 );
004277    return pCur->eState==CURSOR_VALID;
004278  }
004279  
004280  /*
004281  ** Return the value of the integer key or "rowid" for a table btree.
004282  ** This routine is only valid for a cursor that is pointing into a
004283  ** ordinary table btree.  If the cursor points to an index btree or
004284  ** is invalid, the result of this routine is undefined.
004285  */
004286  i64 sqlite3BtreeIntegerKey(BtCursor *pCur){
004287    assert( cursorHoldsMutex(pCur) );
004288    assert( pCur->eState==CURSOR_VALID );
004289    assert( pCur->curIntKey );
004290    getCellInfo(pCur);
004291    return pCur->info.nKey;
004292  }
004293  
004294  /*
004295  ** Return the number of bytes of payload for the entry that pCur is
004296  ** currently pointing to.  For table btrees, this will be the amount
004297  ** of data.  For index btrees, this will be the size of the key.
004298  **
004299  ** The caller must guarantee that the cursor is pointing to a non-NULL
004300  ** valid entry.  In other words, the calling procedure must guarantee
004301  ** that the cursor has Cursor.eState==CURSOR_VALID.
004302  */
004303  u32 sqlite3BtreePayloadSize(BtCursor *pCur){
004304    assert( cursorHoldsMutex(pCur) );
004305    assert( pCur->eState==CURSOR_VALID );
004306    getCellInfo(pCur);
004307    return pCur->info.nPayload;
004308  }
004309  
004310  /*
004311  ** Given the page number of an overflow page in the database (parameter
004312  ** ovfl), this function finds the page number of the next page in the 
004313  ** linked list of overflow pages. If possible, it uses the auto-vacuum
004314  ** pointer-map data instead of reading the content of page ovfl to do so. 
004315  **
004316  ** If an error occurs an SQLite error code is returned. Otherwise:
004317  **
004318  ** The page number of the next overflow page in the linked list is 
004319  ** written to *pPgnoNext. If page ovfl is the last page in its linked 
004320  ** list, *pPgnoNext is set to zero. 
004321  **
004322  ** If ppPage is not NULL, and a reference to the MemPage object corresponding
004323  ** to page number pOvfl was obtained, then *ppPage is set to point to that
004324  ** reference. It is the responsibility of the caller to call releasePage()
004325  ** on *ppPage to free the reference. In no reference was obtained (because
004326  ** the pointer-map was used to obtain the value for *pPgnoNext), then
004327  ** *ppPage is set to zero.
004328  */
004329  static int getOverflowPage(
004330    BtShared *pBt,               /* The database file */
004331    Pgno ovfl,                   /* Current overflow page number */
004332    MemPage **ppPage,            /* OUT: MemPage handle (may be NULL) */
004333    Pgno *pPgnoNext              /* OUT: Next overflow page number */
004334  ){
004335    Pgno next = 0;
004336    MemPage *pPage = 0;
004337    int rc = SQLITE_OK;
004338  
004339    assert( sqlite3_mutex_held(pBt->mutex) );
004340    assert(pPgnoNext);
004341  
004342  #ifndef SQLITE_OMIT_AUTOVACUUM
004343    /* Try to find the next page in the overflow list using the
004344    ** autovacuum pointer-map pages. Guess that the next page in 
004345    ** the overflow list is page number (ovfl+1). If that guess turns 
004346    ** out to be wrong, fall back to loading the data of page 
004347    ** number ovfl to determine the next page number.
004348    */
004349    if( pBt->autoVacuum ){
004350      Pgno pgno;
004351      Pgno iGuess = ovfl+1;
004352      u8 eType;
004353  
004354      while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
004355        iGuess++;
004356      }
004357  
004358      if( iGuess<=btreePagecount(pBt) ){
004359        rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
004360        if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
004361          next = iGuess;
004362          rc = SQLITE_DONE;
004363        }
004364      }
004365    }
004366  #endif
004367  
004368    assert( next==0 || rc==SQLITE_DONE );
004369    if( rc==SQLITE_OK ){
004370      rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0);
004371      assert( rc==SQLITE_OK || pPage==0 );
004372      if( rc==SQLITE_OK ){
004373        next = get4byte(pPage->aData);
004374      }
004375    }
004376  
004377    *pPgnoNext = next;
004378    if( ppPage ){
004379      *ppPage = pPage;
004380    }else{
004381      releasePage(pPage);
004382    }
004383    return (rc==SQLITE_DONE ? SQLITE_OK : rc);
004384  }
004385  
004386  /*
004387  ** Copy data from a buffer to a page, or from a page to a buffer.
004388  **
004389  ** pPayload is a pointer to data stored on database page pDbPage.
004390  ** If argument eOp is false, then nByte bytes of data are copied
004391  ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
004392  ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
004393  ** of data are copied from the buffer pBuf to pPayload.
004394  **
004395  ** SQLITE_OK is returned on success, otherwise an error code.
004396  */
004397  static int copyPayload(
004398    void *pPayload,           /* Pointer to page data */
004399    void *pBuf,               /* Pointer to buffer */
004400    int nByte,                /* Number of bytes to copy */
004401    int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
004402    DbPage *pDbPage           /* Page containing pPayload */
004403  ){
004404    if( eOp ){
004405      /* Copy data from buffer to page (a write operation) */
004406      int rc = sqlite3PagerWrite(pDbPage);
004407      if( rc!=SQLITE_OK ){
004408        return rc;
004409      }
004410      memcpy(pPayload, pBuf, nByte);
004411    }else{
004412      /* Copy data from page to buffer (a read operation) */
004413      memcpy(pBuf, pPayload, nByte);
004414    }
004415    return SQLITE_OK;
004416  }
004417  
004418  /*
004419  ** This function is used to read or overwrite payload information
004420  ** for the entry that the pCur cursor is pointing to. The eOp
004421  ** argument is interpreted as follows:
004422  **
004423  **   0: The operation is a read. Populate the overflow cache.
004424  **   1: The operation is a write. Populate the overflow cache.
004425  **   2: The operation is a read. Do not populate the overflow cache.
004426  **
004427  ** A total of "amt" bytes are read or written beginning at "offset".
004428  ** Data is read to or from the buffer pBuf.
004429  **
004430  ** The content being read or written might appear on the main page
004431  ** or be scattered out on multiple overflow pages.
004432  **
** If the current cursor entry uses one or more overflow pages and the
** eOp argument is not 2, this function may allocate space for and lazily
** populate the overflow page-list cache array (BtCursor.aOverflow).
** Subsequent calls use this cache to make seeking to the supplied offset
** more efficient.
004438  **
004439  ** Once an overflow page-list cache has been allocated, it may be
004440  ** invalidated if some other cursor writes to the same table, or if
004441  ** the cursor is moved to a different row. Additionally, in auto-vacuum
004442  ** mode, the following events may invalidate an overflow page-list cache.
004443  **
004444  **   * An incremental vacuum,
004445  **   * A commit in auto_vacuum="full" mode,
004446  **   * Creating a table (may require moving an overflow page).
004447  */
004448  static int accessPayload(
004449    BtCursor *pCur,      /* Cursor pointing to entry to read from */
004450    u32 offset,          /* Begin reading this far into payload */
004451    u32 amt,             /* Read this many bytes */
004452    unsigned char *pBuf, /* Write the bytes into this buffer */ 
004453    int eOp              /* zero to read. non-zero to write. */
004454  ){
004455    unsigned char *aPayload;
004456    int rc = SQLITE_OK;
004457    int iIdx = 0;
004458    MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
004459    BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
004460  #ifdef SQLITE_DIRECT_OVERFLOW_READ
004461    unsigned char * const pBufStart = pBuf;
004462    int bEnd;                                 /* True if reading to end of data */
004463  #endif
004464  
004465    assert( pPage );
004466    assert( pCur->eState==CURSOR_VALID );
004467    assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
004468    assert( cursorHoldsMutex(pCur) );
004469    assert( eOp!=2 || offset==0 );    /* Always start from beginning for eOp==2 */
004470  
004471    getCellInfo(pCur);
004472    aPayload = pCur->info.pPayload;
004473  #ifdef SQLITE_DIRECT_OVERFLOW_READ
004474    bEnd = offset+amt==pCur->info.nPayload;
004475  #endif
004476    assert( offset+amt <= pCur->info.nPayload );
004477  
004478    assert( aPayload > pPage->aData );
004479    if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){
004480      /* Trying to read or write past the end of the data is an error.  The
004481      ** conditional above is really:
004482      **    &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
004483      ** but is recast into its current form to avoid integer overflow problems
004484      */
004485      return SQLITE_CORRUPT_BKPT;
004486    }
004487  
004488    /* Check if data must be read/written to/from the btree page itself. */
004489    if( offset<pCur->info.nLocal ){
004490      int a = amt;
004491      if( a+offset>pCur->info.nLocal ){
004492        a = pCur->info.nLocal - offset;
004493      }
004494      rc = copyPayload(&aPayload[offset], pBuf, a, (eOp & 0x01), pPage->pDbPage);
004495      offset = 0;
004496      pBuf += a;
004497      amt -= a;
004498    }else{
004499      offset -= pCur->info.nLocal;
004500    }
004501  
004502  
004503    if( rc==SQLITE_OK && amt>0 ){
004504      const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
004505      Pgno nextPage;
004506  
004507      nextPage = get4byte(&aPayload[pCur->info.nLocal]);
004508  
004509      /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.
004510      ** Except, do not allocate aOverflow[] for eOp==2.
004511      **
004512      ** The aOverflow[] array is sized at one entry for each overflow page
004513      ** in the overflow chain. The page number of the first overflow page is
004514      ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array
004515      ** means "not yet known" (the cache is lazily populated).
004516      */
004517      if( eOp!=2 && (pCur->curFlags & BTCF_ValidOvfl)==0 ){
004518        int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
004519        if( nOvfl>pCur->nOvflAlloc ){
004520          Pgno *aNew = (Pgno*)sqlite3Realloc(
004521              pCur->aOverflow, nOvfl*2*sizeof(Pgno)
004522          );
004523          if( aNew==0 ){
004524            rc = SQLITE_NOMEM_BKPT;
004525          }else{
004526            pCur->nOvflAlloc = nOvfl*2;
004527            pCur->aOverflow = aNew;
004528          }
004529        }
004530        if( rc==SQLITE_OK ){
004531          memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));
004532          pCur->curFlags |= BTCF_ValidOvfl;
004533        }
004534      }
004535  
004536      /* If the overflow page-list cache has been allocated and the
004537      ** entry for the first required overflow page is valid, skip
004538      ** directly to it.
004539      */
004540      if( (pCur->curFlags & BTCF_ValidOvfl)!=0
004541       && pCur->aOverflow[offset/ovflSize]
004542      ){
004543        iIdx = (offset/ovflSize);
004544        nextPage = pCur->aOverflow[iIdx];
004545        offset = (offset%ovflSize);
004546      }
004547  
004548      for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
004549  
004550        /* If required, populate the overflow page-list cache. */
004551        if( (pCur->curFlags & BTCF_ValidOvfl)!=0 ){
004552          assert( pCur->aOverflow[iIdx]==0
004553                  || pCur->aOverflow[iIdx]==nextPage
004554                  || CORRUPT_DB );
004555          pCur->aOverflow[iIdx] = nextPage;
004556        }
004557  
004558        if( offset>=ovflSize ){
004559          /* The only reason to read this page is to obtain the page
004560          ** number for the next page in the overflow chain. The page
004561          ** data is not required. So first try to lookup the overflow
004562          ** page-list cache, if any, then fall back to the getOverflowPage()
004563          ** function.
004564          **
004565          ** Note that the aOverflow[] array must be allocated because eOp!=2
004566          ** here.  If eOp==2, then offset==0 and this branch is never taken.
004567          */
004568          assert( eOp!=2 );
004569          assert( pCur->curFlags & BTCF_ValidOvfl );
004570          assert( pCur->pBtree->db==pBt->db );
004571          if( pCur->aOverflow[iIdx+1] ){
004572            nextPage = pCur->aOverflow[iIdx+1];
004573          }else{
004574            rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
004575          }
004576          offset -= ovflSize;
004577        }else{
004578          /* Need to read this page properly. It contains some of the
004579          ** range of data that is being read (eOp==0) or written (eOp!=0).
004580          */
004581  #ifdef SQLITE_DIRECT_OVERFLOW_READ
004582          sqlite3_file *fd;
004583  #endif
004584          int a = amt;
004585          if( a + offset > ovflSize ){
004586            a = ovflSize - offset;
004587          }
004588  
004589  #ifdef SQLITE_DIRECT_OVERFLOW_READ
004590          /* If all the following are true:
004591          **
004592          **   1) this is a read operation, and 
004593          **   2) data is required from the start of this overflow page, and
004594          **   3) the database is file-backed, and
004595          **   4) there is no open write-transaction, and
004596          **   5) the database is not a WAL database,
004597          **   6) all data from the page is being read.
004598          **   7) at least 4 bytes have already been read into the output buffer 
004599          **
004600          ** then data can be read directly from the database file into the
004601          ** output buffer, bypassing the page-cache altogether. This speeds
004602          ** up loading large records that span many overflow pages.
004603          */
004604          if( (eOp&0x01)==0                                      /* (1) */
004605           && offset==0                                          /* (2) */
004606           && (bEnd || a==ovflSize)                              /* (6) */
004607           && pBt->inTransaction==TRANS_READ                     /* (4) */
004608           && (fd = sqlite3PagerFile(pBt->pPager))->pMethods     /* (3) */
004609           && 0==sqlite3PagerUseWal(pBt->pPager)                 /* (5) */
004610           && &pBuf[-4]>=pBufStart                               /* (7) */
004611          ){
004612            u8 aSave[4];
004613            u8 *aWrite = &pBuf[-4];
004614            assert( aWrite>=pBufStart );                         /* hence (7) */
004615            memcpy(aSave, aWrite, 4);
004616            rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
004617            nextPage = get4byte(aWrite);
004618            memcpy(aWrite, aSave, 4);
004619          }else
004620  #endif
004621  
004622          {
004623            DbPage *pDbPage;
004624            rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage,
004625                ((eOp&0x01)==0 ? PAGER_GET_READONLY : 0)
004626            );
004627            if( rc==SQLITE_OK ){
004628              aPayload = sqlite3PagerGetData(pDbPage);
004629              nextPage = get4byte(aPayload);
004630              rc = copyPayload(&aPayload[offset+4], pBuf, a, (eOp&0x01), pDbPage);
004631              sqlite3PagerUnref(pDbPage);
004632              offset = 0;
004633            }
004634          }
004635          amt -= a;
004636          pBuf += a;
004637        }
004638      }
004639    }
004640  
004641    if( rc==SQLITE_OK && amt>0 ){
004642      return SQLITE_CORRUPT_BKPT;
004643    }
004644    return rc;
004645  }
004646  
004647  /*
004648  ** Read part of the payload for the row at which that cursor pCur is currently
004649  ** pointing.  "amt" bytes will be transferred into pBuf[].  The transfer
004650  ** begins at "offset".
004651  **
004652  ** pCur can be pointing to either a table or an index b-tree.
004653  ** If pointing to a table btree, then the content section is read.  If
004654  ** pCur is pointing to an index b-tree then the key section is read.
004655  **
004656  ** For sqlite3BtreePayload(), the caller must ensure that pCur is pointing
004657  ** to a valid row in the table.  For sqlite3BtreePayloadChecked(), the
004658  ** cursor might be invalid or might need to be restored before being read.
004659  **
004660  ** Return SQLITE_OK on success or an error code if anything goes
004661  ** wrong.  An error is returned if "offset+amt" is larger than
004662  ** the available payload.
004663  */
004664  int sqlite3BtreePayload(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
004665    assert( cursorHoldsMutex(pCur) );
004666    assert( pCur->eState==CURSOR_VALID );
004667    assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
004668    assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
004669    return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
004670  }
004671  #ifndef SQLITE_OMIT_INCRBLOB
004672  int sqlite3BtreePayloadChecked(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
004673    int rc;
004674    if ( pCur->eState==CURSOR_INVALID ){
004675      return SQLITE_ABORT;
004676    }
004677    assert( cursorOwnsBtShared(pCur) );
004678    rc = restoreCursorPosition(pCur);
004679    if( rc==SQLITE_OK ){
004680      assert( pCur->eState==CURSOR_VALID );
004681      assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
004682      assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
004683      rc = accessPayload(pCur, offset, amt, pBuf, 0);
004684    }
004685    return rc;
004686  }
004687  #endif /* SQLITE_OMIT_INCRBLOB */
004688  
004689  /*
004690  ** Return a pointer to payload information from the entry that the 
004691  ** pCur cursor is pointing to.  The pointer is to the beginning of
004692  ** the key if index btrees (pPage->intKey==0) and is the data for
004693  ** table btrees (pPage->intKey==1). The number of bytes of available
004694  ** key/data is written into *pAmt.  If *pAmt==0, then the value
004695  ** returned will not be a valid pointer.
004696  **
004697  ** This routine is an optimization.  It is common for the entire key
004698  ** and data to fit on the local page and for there to be no overflow
004699  ** pages.  When that is so, this routine can be used to access the
004700  ** key and data without making a copy.  If the key and/or data spills
004701  ** onto overflow pages, then accessPayload() must be used to reassemble
004702  ** the key/data and copy it into a preallocated buffer.
004703  **
004704  ** The pointer returned by this routine looks directly into the cached
004705  ** page of the database.  The data might change or move the next time
004706  ** any btree routine is called.
004707  */
004708  static const void *fetchPayload(
004709    BtCursor *pCur,      /* Cursor pointing to entry to read from */
004710    u32 *pAmt            /* Write the number of available bytes here */
004711  ){
004712    u32 amt;
004713    assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
004714    assert( pCur->eState==CURSOR_VALID );
004715    assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
004716    assert( cursorOwnsBtShared(pCur) );
004717    assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
004718    assert( pCur->info.nSize>0 );
004719    assert( pCur->info.pPayload>pCur->apPage[pCur->iPage]->aData || CORRUPT_DB );
004720    assert( pCur->info.pPayload<pCur->apPage[pCur->iPage]->aDataEnd ||CORRUPT_DB);
004721    amt = (int)(pCur->apPage[pCur->iPage]->aDataEnd - pCur->info.pPayload);
004722    if( pCur->info.nLocal<amt ) amt = pCur->info.nLocal;
004723    *pAmt = amt;
004724    return (void*)pCur->info.pPayload;
004725  }
004726  
004727  
004728  /*
004729  ** For the entry that cursor pCur is point to, return as
004730  ** many bytes of the key or data as are available on the local
004731  ** b-tree page.  Write the number of available bytes into *pAmt.
004732  **
004733  ** The pointer returned is ephemeral.  The key/data may move
004734  ** or be destroyed on the next call to any Btree routine,
004735  ** including calls from other threads against the same cache.
004736  ** Hence, a mutex on the BtShared should be held prior to calling
004737  ** this routine.
004738  **
004739  ** These routines is used to get quick access to key and data
004740  ** in the common case where no overflow pages are used.
004741  */
004742  const void *sqlite3BtreePayloadFetch(BtCursor *pCur, u32 *pAmt){
004743    return fetchPayload(pCur, pAmt);
004744  }
004745  
004746  
004747  /*
004748  ** Move the cursor down to a new child page.  The newPgno argument is the
004749  ** page number of the child page to move to.
004750  **
004751  ** This function returns SQLITE_CORRUPT if the page-header flags field of
004752  ** the new child page does not match the flags field of the parent (i.e.
004753  ** if an intkey page appears to be the parent of a non-intkey page, or
004754  ** vice-versa).
004755  */
004756  static int moveToChild(BtCursor *pCur, u32 newPgno){
004757    BtShared *pBt = pCur->pBt;
004758  
004759    assert( cursorOwnsBtShared(pCur) );
004760    assert( pCur->eState==CURSOR_VALID );
004761    assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
004762    assert( pCur->iPage>=0 );
004763    if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
004764      return SQLITE_CORRUPT_BKPT;
004765    }
004766    pCur->info.nSize = 0;
004767    pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
004768    pCur->iPage++;
004769    pCur->aiIdx[pCur->iPage] = 0;
004770    return getAndInitPage(pBt, newPgno, &pCur->apPage[pCur->iPage],
004771                          pCur, pCur->curPagerFlags);
004772  }
004773  
#if SQLITE_DEBUG
/*
** Page pParent is an internal (non-leaf) tree page.  This function
** asserts that page number iChild is the left-child of the iIdx'th
** cell in page pParent.  Or, if iIdx is equal to the total number of
** cells in pParent, that page number iChild is the right-child of
** the page.
**
** Nothing is checked when CORRUPT_DB is true, since the conditions
** tested here might not hold in a corrupt database.
*/
static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
  if( CORRUPT_DB ) return;
  assert( iIdx<=pParent->nCell );
  if( iIdx<pParent->nCell ){
    /* iChild must be the left-child pointer of cell iIdx */
    assert( get4byte(findCell(pParent, iIdx))==iChild );
  }else{
    /* iChild must be the right-child pointer in the page header */
    assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
  }
}
#else
#  define assertParentIndex(x,y,z) 
#endif
004795  
004796  /*
004797  ** Move the cursor up to the parent page.
004798  **
004799  ** pCur->idx is set to the cell index that contains the pointer
004800  ** to the page we are coming from.  If we are coming from the
004801  ** right-most child page then pCur->idx is set to one more than
004802  ** the largest cell index.
004803  */
004804  static void moveToParent(BtCursor *pCur){
004805    assert( cursorOwnsBtShared(pCur) );
004806    assert( pCur->eState==CURSOR_VALID );
004807    assert( pCur->iPage>0 );
004808    assert( pCur->apPage[pCur->iPage] );
004809    assertParentIndex(
004810      pCur->apPage[pCur->iPage-1], 
004811      pCur->aiIdx[pCur->iPage-1], 
004812      pCur->apPage[pCur->iPage]->pgno
004813    );
004814    testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );
004815    pCur->info.nSize = 0;
004816    pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
004817    releasePageNotNull(pCur->apPage[pCur->iPage--]);
004818  }
004819  
004820  /*
004821  ** Move the cursor to point to the root page of its b-tree structure.
004822  **
004823  ** If the table has a virtual root page, then the cursor is moved to point
004824  ** to the virtual root page instead of the actual root page. A table has a
004825  ** virtual root page when the actual root page contains no cells and a 
004826  ** single child page. This can only happen with the table rooted at page 1.
004827  **
004828  ** If the b-tree structure is empty, the cursor state is set to 
004829  ** CURSOR_INVALID. Otherwise, the cursor is set to point to the first
004830  ** cell located on the root (or virtual root) page and the cursor state
004831  ** is set to CURSOR_VALID.
004832  **
004833  ** If this function returns successfully, it may be assumed that the
004834  ** page-header flags indicate that the [virtual] root-page is the expected 
004835  ** kind of b-tree page (i.e. if when opening the cursor the caller did not
004836  ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
004837  ** indicating a table b-tree, or if the caller did specify a KeyInfo 
004838  ** structure the flags byte is set to 0x02 or 0x0A, indicating an index
004839  ** b-tree).
004840  */
004841  static int moveToRoot(BtCursor *pCur){
004842    MemPage *pRoot;
004843    int rc = SQLITE_OK;
004844  
004845    assert( cursorOwnsBtShared(pCur) );
004846    assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
004847    assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
004848    assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
004849    if( pCur->eState>=CURSOR_REQUIRESEEK ){
004850      if( pCur->eState==CURSOR_FAULT ){
004851        assert( pCur->skipNext!=SQLITE_OK );
004852        return pCur->skipNext;
004853      }
004854      sqlite3BtreeClearCursor(pCur);
004855    }
004856  
004857    if( pCur->iPage>=0 ){
004858      if( pCur->iPage ){
004859        do{
004860          assert( pCur->apPage[pCur->iPage]!=0 );
004861          releasePageNotNull(pCur->apPage[pCur->iPage--]);
004862        }while( pCur->iPage);
004863        goto skip_init;
004864      }
004865    }else if( pCur->pgnoRoot==0 ){
004866      pCur->eState = CURSOR_INVALID;
004867      return SQLITE_OK;
004868    }else{
004869      assert( pCur->iPage==(-1) );
004870      rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->apPage[0],
004871                          0, pCur->curPagerFlags);
004872      if( rc!=SQLITE_OK ){
004873        pCur->eState = CURSOR_INVALID;
004874         return rc;
004875      }
004876      pCur->iPage = 0;
004877      pCur->curIntKey = pCur->apPage[0]->intKey;
004878    }
004879    pRoot = pCur->apPage[0];
004880    assert( pRoot->pgno==pCur->pgnoRoot );
004881  
004882    /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
004883    ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
004884    ** NULL, the caller expects a table b-tree. If this is not the case,
004885    ** return an SQLITE_CORRUPT error. 
004886    **
004887    ** Earlier versions of SQLite assumed that this test could not fail
004888    ** if the root page was already loaded when this function was called (i.e.
004889    ** if pCur->iPage>=0). But this is not so if the database is corrupted 
004890    ** in such a way that page pRoot is linked into a second b-tree table 
004891    ** (or the freelist).  */
004892    assert( pRoot->intKey==1 || pRoot->intKey==0 );
004893    if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){
004894      return SQLITE_CORRUPT_BKPT;
004895    }
004896  
004897  skip_init:  
004898    pCur->aiIdx[0] = 0;
004899    pCur->info.nSize = 0;
004900    pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl);
004901  
004902    pRoot = pCur->apPage[0];
004903    if( pRoot->nCell>0 ){
004904      pCur->eState = CURSOR_VALID;
004905    }else if( !pRoot->leaf ){
004906      Pgno subpage;
004907      if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
004908      subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
004909      pCur->eState = CURSOR_VALID;
004910      rc = moveToChild(pCur, subpage);
004911    }else{
004912      pCur->eState = CURSOR_INVALID;
004913    }
004914    return rc;
004915  }
004916  
004917  /*
004918  ** Move the cursor down to the left-most leaf entry beneath the
004919  ** entry to which it is currently pointing.
004920  **
004921  ** The left-most leaf is the one with the smallest key - the first
004922  ** in ascending order.
004923  */
004924  static int moveToLeftmost(BtCursor *pCur){
004925    Pgno pgno;
004926    int rc = SQLITE_OK;
004927    MemPage *pPage;
004928  
004929    assert( cursorOwnsBtShared(pCur) );
004930    assert( pCur->eState==CURSOR_VALID );
004931    while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
004932      assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
004933      pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
004934      rc = moveToChild(pCur, pgno);
004935    }
004936    return rc;
004937  }
004938  
004939  /*
004940  ** Move the cursor down to the right-most leaf entry beneath the
004941  ** page to which it is currently pointing.  Notice the difference
004942  ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
004943  ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
004944  ** finds the right-most entry beneath the *page*.
004945  **
004946  ** The right-most entry is the one with the largest key - the last
004947  ** key in ascending order.
004948  */
004949  static int moveToRightmost(BtCursor *pCur){
004950    Pgno pgno;
004951    int rc = SQLITE_OK;
004952    MemPage *pPage = 0;
004953  
004954    assert( cursorOwnsBtShared(pCur) );
004955    assert( pCur->eState==CURSOR_VALID );
004956    while( !(pPage = pCur->apPage[pCur->iPage])->leaf ){
004957      pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
004958      pCur->aiIdx[pCur->iPage] = pPage->nCell;
004959      rc = moveToChild(pCur, pgno);
004960      if( rc ) return rc;
004961    }
004962    pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
004963    assert( pCur->info.nSize==0 );
004964    assert( (pCur->curFlags & BTCF_ValidNKey)==0 );
004965    return SQLITE_OK;
004966  }
004967  
004968  /* Move the cursor to the first entry in the table.  Return SQLITE_OK
004969  ** on success.  Set *pRes to 0 if the cursor actually points to something
004970  ** or set *pRes to 1 if the table is empty.
004971  */
004972  int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
004973    int rc;
004974  
004975    assert( cursorOwnsBtShared(pCur) );
004976    assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
004977    rc = moveToRoot(pCur);
004978    if( rc==SQLITE_OK ){
004979      if( pCur->eState==CURSOR_INVALID ){
004980        assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
004981        *pRes = 1;
004982      }else{
004983        assert( pCur->apPage[pCur->iPage]->nCell>0 );
004984        *pRes = 0;
004985        rc = moveToLeftmost(pCur);
004986      }
004987    }
004988    return rc;
004989  }
004990  
004991  /* Move the cursor to the last entry in the table.  Return SQLITE_OK
004992  ** on success.  Set *pRes to 0 if the cursor actually points to something
004993  ** or set *pRes to 1 if the table is empty.
004994  */
004995  int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
004996    int rc;
004997   
004998    assert( cursorOwnsBtShared(pCur) );
004999    assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
005000  
005001    /* If the cursor already points to the last entry, this is a no-op. */
005002    if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){
005003  #ifdef SQLITE_DEBUG
005004      /* This block serves to assert() that the cursor really does point 
005005      ** to the last entry in the b-tree. */
005006      int ii;
005007      for(ii=0; ii<pCur->iPage; ii++){
005008        assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
005009      }
005010      assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 );
005011      assert( pCur->apPage[pCur->iPage]->leaf );
005012  #endif
005013      return SQLITE_OK;
005014    }
005015  
005016    rc = moveToRoot(pCur);
005017    if( rc==SQLITE_OK ){
005018      if( CURSOR_INVALID==pCur->eState ){
005019        assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
005020        *pRes = 1;
005021      }else{
005022        assert( pCur->eState==CURSOR_VALID );
005023        *pRes = 0;
005024        rc = moveToRightmost(pCur);
005025        if( rc==SQLITE_OK ){
005026          pCur->curFlags |= BTCF_AtLast;
005027        }else{
005028          pCur->curFlags &= ~BTCF_AtLast;
005029        }
005030     
005031      }
005032    }
005033    return rc;
005034  }
005035  
005036  /* Move the cursor so that it points to an entry near the key 
005037  ** specified by pIdxKey or intKey.   Return a success code.
005038  **
005039  ** For INTKEY tables, the intKey parameter is used.  pIdxKey 
005040  ** must be NULL.  For index tables, pIdxKey is used and intKey
005041  ** is ignored.
005042  **
005043  ** If an exact match is not found, then the cursor is always
005044  ** left pointing at a leaf page which would hold the entry if it
005045  ** were present.  The cursor might point to an entry that comes
005046  ** before or after the key.
005047  **
005048  ** An integer is written into *pRes which is the result of
005049  ** comparing the key with the entry to which the cursor is 
005050  ** pointing.  The meaning of the integer written into
005051  ** *pRes is as follows:
005052  **
005053  **     *pRes<0      The cursor is left pointing at an entry that
005054  **                  is smaller than intKey/pIdxKey or if the table is empty
005055  **                  and the cursor is therefore left point to nothing.
005056  **
005057  **     *pRes==0     The cursor is left pointing at an entry that
005058  **                  exactly matches intKey/pIdxKey.
005059  **
005060  **     *pRes>0      The cursor is left pointing at an entry that
005061  **                  is larger than intKey/pIdxKey.
005062  **
005063  ** For index tables, the pIdxKey->eqSeen field is set to 1 if there
005064  ** exists an entry in the table that exactly matches pIdxKey.  
005065  */
005066  int sqlite3BtreeMovetoUnpacked(
005067    BtCursor *pCur,          /* The cursor to be moved */
005068    UnpackedRecord *pIdxKey, /* Unpacked index key */
005069    i64 intKey,              /* The table key */
005070    int biasRight,           /* If true, bias the search to the high end */
005071    int *pRes                /* Write search results here */
005072  ){
005073    int rc;
005074    RecordCompare xRecordCompare;
005075  
005076    assert( cursorOwnsBtShared(pCur) );
005077    assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
005078    assert( pRes );
005079    assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );
005080    assert( pCur->eState!=CURSOR_VALID || (pIdxKey==0)==(pCur->curIntKey!=0) );
005081  
005082    /* If the cursor is already positioned at the point we are trying
005083    ** to move to, then just return without doing any work */
005084    if( pIdxKey==0
005085     && pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0
005086    ){
005087      if( pCur->info.nKey==intKey ){
005088        *pRes = 0;
005089        return SQLITE_OK;
005090      }
005091      if( (pCur->curFlags & BTCF_AtLast)!=0 && pCur->info.nKey<intKey ){
005092        *pRes = -1;
005093        return SQLITE_OK;
005094      }
005095    }
005096  
005097    if( pIdxKey ){
005098      xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);
005099      pIdxKey->errCode = 0;
005100      assert( pIdxKey->default_rc==1 
005101           || pIdxKey->default_rc==0 
005102           || pIdxKey->default_rc==-1
005103      );
005104    }else{
005105      xRecordCompare = 0; /* All keys are integers */
005106    }
005107  
005108    rc = moveToRoot(pCur);
005109    if( rc ){
005110      return rc;
005111    }
005112    assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage] );
005113    assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->isInit );
005114    assert( pCur->eState==CURSOR_INVALID || pCur->apPage[pCur->iPage]->nCell>0 );
005115    if( pCur->eState==CURSOR_INVALID ){
005116      *pRes = -1;
005117      assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
005118      return SQLITE_OK;
005119    }
005120    assert( pCur->apPage[0]->intKey==pCur->curIntKey );
005121    assert( pCur->curIntKey || pIdxKey );
005122    for(;;){
005123      int lwr, upr, idx, c;
005124      Pgno chldPg;
005125      MemPage *pPage = pCur->apPage[pCur->iPage];
005126      u8 *pCell;                          /* Pointer to current cell in pPage */
005127  
005128      /* pPage->nCell must be greater than zero. If this is the root-page
005129      ** the cursor would have been INVALID above and this for(;;) loop
005130      ** not run. If this is not the root-page, then the moveToChild() routine
005131      ** would have already detected db corruption. Similarly, pPage must
005132      ** be the right kind (index or table) of b-tree page. Otherwise
005133      ** a moveToChild() or moveToRoot() call would have detected corruption.  */
005134      assert( pPage->nCell>0 );
005135      assert( pPage->intKey==(pIdxKey==0) );
005136      lwr = 0;
005137      upr = pPage->nCell-1;
005138      assert( biasRight==0 || biasRight==1 );
005139      idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */
005140      pCur->aiIdx[pCur->iPage] = (u16)idx;
005141      if( xRecordCompare==0 ){
005142        for(;;){
005143          i64 nCellKey;
005144          pCell = findCellPastPtr(pPage, idx);
005145          if( pPage->intKeyLeaf ){
005146            while( 0x80 <= *(pCell++) ){
005147              if( pCell>=pPage->aDataEnd ) return SQLITE_CORRUPT_BKPT;
005148            }
005149          }
005150          getVarint(pCell, (u64*)&nCellKey);
005151          if( nCellKey<intKey ){
005152            lwr = idx+1;
005153            if( lwr>upr ){ c = -1; break; }
005154          }else if( nCellKey>intKey ){
005155            upr = idx-1;
005156            if( lwr>upr ){ c = +1; break; }
005157          }else{
005158            assert( nCellKey==intKey );
005159            pCur->aiIdx[pCur->iPage] = (u16)idx;
005160            if( !pPage->leaf ){
005161              lwr = idx;
005162              goto moveto_next_layer;
005163            }else{
005164              pCur->curFlags |= BTCF_ValidNKey;
005165              pCur->info.nKey = nCellKey;
005166              pCur->info.nSize = 0;
005167              *pRes = 0;
005168              return SQLITE_OK;
005169            }
005170          }
005171          assert( lwr+upr>=0 );
005172          idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2; */
005173        }
005174      }else{
005175        for(;;){
005176          int nCell;  /* Size of the pCell cell in bytes */
005177          pCell = findCellPastPtr(pPage, idx);
005178  
005179          /* The maximum supported page-size is 65536 bytes. This means that
005180          ** the maximum number of record bytes stored on an index B-Tree
005181          ** page is less than 16384 bytes and may be stored as a 2-byte
005182          ** varint. This information is used to attempt to avoid parsing 
005183          ** the entire cell by checking for the cases where the record is 
005184          ** stored entirely within the b-tree page by inspecting the first 
005185          ** 2 bytes of the cell.
005186          */
005187          nCell = pCell[0];
005188          if( nCell<=pPage->max1bytePayload ){
005189            /* This branch runs if the record-size field of the cell is a
005190            ** single byte varint and the record fits entirely on the main
005191            ** b-tree page.  */
005192            testcase( pCell+nCell+1==pPage->aDataEnd );
005193            c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
005194          }else if( !(pCell[1] & 0x80) 
005195            && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
005196          ){
005197            /* The record-size field is a 2 byte varint and the record 
005198            ** fits entirely on the main b-tree page.  */
005199            testcase( pCell+nCell+2==pPage->aDataEnd );
005200            c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
005201          }else{
005202            /* The record flows over onto one or more overflow pages. In
005203            ** this case the whole cell needs to be parsed, a buffer allocated
005204            ** and accessPayload() used to retrieve the record into the
005205            ** buffer before VdbeRecordCompare() can be called. 
005206            **
005207            ** If the record is corrupt, the xRecordCompare routine may read
005208            ** up to two varints past the end of the buffer. An extra 18 
005209            ** bytes of padding is allocated at the end of the buffer in
005210            ** case this happens.  */
005211            void *pCellKey;
005212            u8 * const pCellBody = pCell - pPage->childPtrSize;
005213            pPage->xParseCell(pPage, pCellBody, &pCur->info);
005214            nCell = (int)pCur->info.nKey;
005215            testcase( nCell<0 );   /* True if key size is 2^32 or more */
005216            testcase( nCell==0 );  /* Invalid key size:  0x80 0x80 0x00 */
005217            testcase( nCell==1 );  /* Invalid key size:  0x80 0x80 0x01 */
005218            testcase( nCell==2 );  /* Minimum legal index key size */
005219            if( nCell<2 ){
005220              rc = SQLITE_CORRUPT_BKPT;
005221              goto moveto_finish;
005222            }
005223            pCellKey = sqlite3Malloc( nCell+18 );
005224            if( pCellKey==0 ){
005225              rc = SQLITE_NOMEM_BKPT;
005226              goto moveto_finish;
005227            }
005228            pCur->aiIdx[pCur->iPage] = (u16)idx;
005229            rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 2);
005230            if( rc ){
005231              sqlite3_free(pCellKey);
005232              goto moveto_finish;
005233            }
005234            c = xRecordCompare(nCell, pCellKey, pIdxKey);
005235            sqlite3_free(pCellKey);
005236          }
005237          assert( 
005238              (pIdxKey->errCode!=SQLITE_CORRUPT || c==0)
005239           && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed)
005240          );
005241          if( c<0 ){
005242            lwr = idx+1;
005243          }else if( c>0 ){
005244            upr = idx-1;
005245          }else{
005246            assert( c==0 );
005247            *pRes = 0;
005248            rc = SQLITE_OK;
005249            pCur->aiIdx[pCur->iPage] = (u16)idx;
005250            if( pIdxKey->errCode ) rc = SQLITE_CORRUPT;
005251            goto moveto_finish;
005252          }
005253          if( lwr>upr ) break;
005254          assert( lwr+upr>=0 );
005255          idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2 */
005256        }
005257      }
005258      assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) );
005259      assert( pPage->isInit );
005260      if( pPage->leaf ){
005261        assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
005262        pCur->aiIdx[pCur->iPage] = (u16)idx;
005263        *pRes = c;
005264        rc = SQLITE_OK;
005265        goto moveto_finish;
005266      }
005267  moveto_next_layer:
005268      if( lwr>=pPage->nCell ){
005269        chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
005270      }else{
005271        chldPg = get4byte(findCell(pPage, lwr));
005272      }
005273      pCur->aiIdx[pCur->iPage] = (u16)lwr;
005274      rc = moveToChild(pCur, chldPg);
005275      if( rc ) break;
005276    }
005277  moveto_finish:
005278    pCur->info.nSize = 0;
005279    assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
005280    return rc;
005281  }
005282  
005283  
005284  /*
005285  ** Return TRUE if the cursor is not pointing at an entry of the table.
005286  **
005287  ** TRUE will be returned after a call to sqlite3BtreeNext() moves
005288  ** past the last entry in the table or sqlite3BtreePrev() moves past
005289  ** the first entry.  TRUE is also returned if the table is empty.
005290  */
005291  int sqlite3BtreeEof(BtCursor *pCur){
005292    /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
005293    ** have been deleted? This API will need to change to return an error code
005294    ** as well as the boolean result value.
005295    */
005296    return (CURSOR_VALID!=pCur->eState);
005297  }
005298  
005299  /*
005300  ** Advance the cursor to the next entry in the database.  If
005301  ** successful then set *pRes=0.  If the cursor
005302  ** was already pointing to the last entry in the database before
005303  ** this routine was called, then set *pRes=1.
005304  **
005305  ** The main entry point is sqlite3BtreeNext().  That routine is optimized
005306  ** for the common case of merely incrementing the cell counter BtCursor.aiIdx
005307  ** to the next cell on the current page.  The (slower) btreeNext() helper
005308  ** routine is called when it is necessary to move to a different page or
005309  ** to restore the cursor.
005310  **
005311  ** The calling function will set *pRes to 0 or 1.  The initial *pRes value
005312  ** will be 1 if the cursor being stepped corresponds to an SQL index and
005313  ** if this routine could have been skipped if that SQL index had been
005314  ** a unique index.  Otherwise the caller will have set *pRes to zero.
005315  ** Zero is the common case. The btree implementation is free to use the
005316  ** initial *pRes value as a hint to improve performance, but the current
005317  ** SQLite btree implementation does not. (Note that the comdb2 btree
005318  ** implementation does use this hint, however.)
005319  */
005320  static SQLITE_NOINLINE int btreeNext(BtCursor *pCur, int *pRes){
005321    int rc;
005322    int idx;
005323    MemPage *pPage;
005324  
005325    assert( cursorOwnsBtShared(pCur) );
005326    assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
005327    assert( *pRes==0 );
005328    if( pCur->eState!=CURSOR_VALID ){
005329      assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
005330      rc = restoreCursorPosition(pCur);
005331      if( rc!=SQLITE_OK ){
005332        return rc;
005333      }
005334      if( CURSOR_INVALID==pCur->eState ){
005335        *pRes = 1;
005336        return SQLITE_OK;
005337      }
005338      if( pCur->skipNext ){
005339        assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
005340        pCur->eState = CURSOR_VALID;
005341        if( pCur->skipNext>0 ){
005342          pCur->skipNext = 0;
005343          return SQLITE_OK;
005344        }
005345        pCur->skipNext = 0;
005346      }
005347    }
005348  
005349    pPage = pCur->apPage[pCur->iPage];
005350    idx = ++pCur->aiIdx[pCur->iPage];
005351    assert( pPage->isInit );
005352  
005353    /* If the database file is corrupt, it is possible for the value of idx 
005354    ** to be invalid here. This can only occur if a second cursor modifies
005355    ** the page while cursor pCur is holding a reference to it. Which can
005356    ** only happen if the database is corrupt in such a way as to link the
005357    ** page into more than one b-tree structure. */
005358    testcase( idx>pPage->nCell );
005359  
005360    if( idx>=pPage->nCell ){
005361      if( !pPage->leaf ){
005362        rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
005363        if( rc ) return rc;
005364        return moveToLeftmost(pCur);
005365      }
005366      do{
005367        if( pCur->iPage==0 ){
005368          *pRes = 1;
005369          pCur->eState = CURSOR_INVALID;
005370          return SQLITE_OK;
005371        }
005372        moveToParent(pCur);
005373        pPage = pCur->apPage[pCur->iPage];
005374      }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
005375      if( pPage->intKey ){
005376        return sqlite3BtreeNext(pCur, pRes);
005377      }else{
005378        return SQLITE_OK;
005379      }
005380    }
005381    if( pPage->leaf ){
005382      return SQLITE_OK;
005383    }else{
005384      return moveToLeftmost(pCur);
005385    }
005386  }
005387  int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
005388    MemPage *pPage;
005389    assert( cursorOwnsBtShared(pCur) );
005390    assert( pRes!=0 );
005391    assert( *pRes==0 || *pRes==1 );
005392    assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
005393    pCur->info.nSize = 0;
005394    pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
005395    *pRes = 0;
005396    if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur, pRes);
005397    pPage = pCur->apPage[pCur->iPage];
005398    if( (++pCur->aiIdx[pCur->iPage])>=pPage->nCell ){
005399      pCur->aiIdx[pCur->iPage]--;
005400      return btreeNext(pCur, pRes);
005401    }
005402    if( pPage->leaf ){
005403      return SQLITE_OK;
005404    }else{
005405      return moveToLeftmost(pCur);
005406    }
005407  }
005408  
005409  /*
** Step the cursor back to the previous entry in the database.  If
005411  ** successful then set *pRes=0.  If the cursor
005412  ** was already pointing to the first entry in the database before
005413  ** this routine was called, then set *pRes=1.
005414  **
005415  ** The main entry point is sqlite3BtreePrevious().  That routine is optimized
005416  ** for the common case of merely decrementing the cell counter BtCursor.aiIdx
005417  ** to the previous cell on the current page.  The (slower) btreePrevious()
005418  ** helper routine is called when it is necessary to move to a different page
005419  ** or to restore the cursor.
005420  **
005421  ** The calling function will set *pRes to 0 or 1.  The initial *pRes value
005422  ** will be 1 if the cursor being stepped corresponds to an SQL index and
005423  ** if this routine could have been skipped if that SQL index had been
005424  ** a unique index.  Otherwise the caller will have set *pRes to zero.
005425  ** Zero is the common case. The btree implementation is free to use the
005426  ** initial *pRes value as a hint to improve performance, but the current
005427  ** SQLite btree implementation does not. (Note that the comdb2 btree
005428  ** implementation does use this hint, however.)
005429  */
005430  static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur, int *pRes){
005431    int rc;
005432    MemPage *pPage;
005433  
005434    assert( cursorOwnsBtShared(pCur) );
005435    assert( pRes!=0 );
005436    assert( *pRes==0 );
005437    assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
005438    assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 );
005439    assert( pCur->info.nSize==0 );
005440    if( pCur->eState!=CURSOR_VALID ){
005441      rc = restoreCursorPosition(pCur);
005442      if( rc!=SQLITE_OK ){
005443        return rc;
005444      }
005445      if( CURSOR_INVALID==pCur->eState ){
005446        *pRes = 1;
005447        return SQLITE_OK;
005448      }
005449      if( pCur->skipNext ){
005450        assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
005451        pCur->eState = CURSOR_VALID;
005452        if( pCur->skipNext<0 ){
005453          pCur->skipNext = 0;
005454          return SQLITE_OK;
005455        }
005456        pCur->skipNext = 0;
005457      }
005458    }
005459  
005460    pPage = pCur->apPage[pCur->iPage];
005461    assert( pPage->isInit );
005462    if( !pPage->leaf ){
005463      int idx = pCur->aiIdx[pCur->iPage];
005464      rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
005465      if( rc ) return rc;
005466      rc = moveToRightmost(pCur);
005467    }else{
005468      while( pCur->aiIdx[pCur->iPage]==0 ){
005469        if( pCur->iPage==0 ){
005470          pCur->eState = CURSOR_INVALID;
005471          *pRes = 1;
005472          return SQLITE_OK;
005473        }
005474        moveToParent(pCur);
005475      }
005476      assert( pCur->info.nSize==0 );
005477      assert( (pCur->curFlags & (BTCF_ValidOvfl))==0 );
005478  
005479      pCur->aiIdx[pCur->iPage]--;
005480      pPage = pCur->apPage[pCur->iPage];
005481      if( pPage->intKey && !pPage->leaf ){
005482        rc = sqlite3BtreePrevious(pCur, pRes);
005483      }else{
005484        rc = SQLITE_OK;
005485      }
005486    }
005487    return rc;
005488  }
005489  int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
005490    assert( cursorOwnsBtShared(pCur) );
005491    assert( pRes!=0 );
005492    assert( *pRes==0 || *pRes==1 );
005493    assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
005494    *pRes = 0;
005495    pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey);
005496    pCur->info.nSize = 0;
005497    if( pCur->eState!=CURSOR_VALID
005498     || pCur->aiIdx[pCur->iPage]==0
005499     || pCur->apPage[pCur->iPage]->leaf==0
005500    ){
005501      return btreePrevious(pCur, pRes);
005502    }
005503    pCur->aiIdx[pCur->iPage]--;
005504    return SQLITE_OK;
005505  }
005506  
005507  /*
005508  ** Allocate a new page from the database file.
005509  **
005510  ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
005511  ** has already been called on the new page.)  The new page has also
005512  ** been referenced and the calling routine is responsible for calling
005513  ** sqlite3PagerUnref() on the new page when it is done.
005514  **
005515  ** SQLITE_OK is returned on success.  Any other return value indicates
005516  ** an error.  *ppPage is set to NULL in the event of an error.
005517  **
005518  ** If the "nearby" parameter is not 0, then an effort is made to 
005519  ** locate a page close to the page number "nearby".  This can be used in an
005520  ** attempt to keep related pages close to each other in the database file,
005521  ** which in turn can make database access faster.
005522  **
005523  ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists
005524  ** anywhere on the free-list, then it is guaranteed to be returned.  If
** eMode is BTALLOC_LE then the page returned will be less than or equal
005526  ** to nearby if any such page exists.  If eMode is BTALLOC_ANY then there
005527  ** are no restrictions on which page is returned.
005528  */
static int allocateBtreePage(
  BtShared *pBt,         /* The btree */
  MemPage **ppPage,      /* Store pointer to the allocated page here */
  Pgno *pPgno,           /* Store the page number here */
  Pgno nearby,           /* Search for a page near this one */
  u8 eMode               /* BTALLOC_EXACT, BTALLOC_LE, or BTALLOC_ANY */
){
  MemPage *pPage1;
  int rc;
  u32 n;     /* Number of pages on the freelist */
  u32 k;     /* Number of leaves on the trunk of the freelist */
  MemPage *pTrunk = 0;
  MemPage *pPrevTrunk = 0;
  Pgno mxPage;     /* Total size of the database file */

  /* Overview: if the header reports free pages (n>0) one of them is reused,
  ** either a trunk page itself or a leaf listed on a trunk page.  Otherwise
  ** the database image is extended by one page (two when the first new page
  ** number falls on a pointer-map page in auto-vacuum mode).
  */
  assert( sqlite3_mutex_held(pBt->mutex) );
  assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );
  pPage1 = pBt->pPage1;
  mxPage = btreePagecount(pBt);
  /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36
  ** stores the total number of pages on the freelist. */
  n = get4byte(&pPage1->aData[36]);
  testcase( n==mxPage-1 );
  /* A freelist at least as large as the whole file cannot be valid. */
  if( n>=mxPage ){
    return SQLITE_CORRUPT_BKPT;
  }
  if( n>0 ){
    /* There are pages on the freelist.  Reuse one of those pages. */
    Pgno iTrunk;
    u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
    u32 nSearch = 0;   /* Count of the number of search attempts */
    
    /* If eMode==BTALLOC_EXACT and a query of the pointer-map
    ** shows that the page 'nearby' is somewhere on the free-list, then
    ** the entire-list will be searched for that page.
    */
#ifndef SQLITE_OMIT_AUTOVACUUM
    if( eMode==BTALLOC_EXACT ){
      if( nearby<=mxPage ){
        u8 eType;
        assert( nearby>0 );
        assert( pBt->autoVacuum );
        rc = ptrmapGet(pBt, nearby, &eType, 0);
        if( rc ) return rc;
        if( eType==PTRMAP_FREEPAGE ){
          searchList = 1;
        }
      }
    }else if( eMode==BTALLOC_LE ){
      searchList = 1;
    }
#endif

    /* Decrement the free-list count by 1.  iTrunk is set inside the loop
    ** below: on the first pass (pPrevTrunk==0) it is read from the header
    ** of page 1, on later passes from the previous trunk page.
    */
    rc = sqlite3PagerWrite(pPage1->pDbPage);
    if( rc ) return rc;
    put4byte(&pPage1->aData[36], n-1);

    /* The code within this loop is run only once if the 'searchList' variable
    ** is not true. Otherwise, it runs once for each trunk-page on the
    ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT)
    ** or until a page less than 'nearby' is located (eMode==BTALLOC_LE)
    */
    do {
      pPrevTrunk = pTrunk;
      if( pPrevTrunk ){
        /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page
        ** is the page number of the next freelist trunk page in the list or
        ** zero if this is the last freelist trunk page. */
        iTrunk = get4byte(&pPrevTrunk->aData[0]);
      }else{
        /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32
        ** stores the page number of the first page of the freelist, or zero if
        ** the freelist is empty. */
        iTrunk = get4byte(&pPage1->aData[32]);
      }
      testcase( iTrunk==mxPage );
      /* A trunk number past the end of the file, or visiting more trunk
      ** pages than the freelist count n, is treated as corruption. */
      if( iTrunk>mxPage || nSearch++ > n ){
        rc = SQLITE_CORRUPT_BKPT;
      }else{
        rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0);
      }
      if( rc ){
        /* pTrunk is cleared so that the cleanup code at end_allocate_page
        ** releases only pPrevTrunk. */
        pTrunk = 0;
        goto end_allocate_page;
      }
      assert( pTrunk!=0 );
      assert( pTrunk->aData!=0 );
      /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page
      ** is the number of leaf page pointers to follow. */
      k = get4byte(&pTrunk->aData[4]);
      if( k==0 && !searchList ){
        /* The trunk has no leaves and the list is not being searched. 
        ** So extract the trunk page itself and use it as the newly 
        ** allocated page */
        assert( pPrevTrunk==0 );
        rc = sqlite3PagerWrite(pTrunk->pDbPage);
        if( rc ){
          goto end_allocate_page;
        }
        *pPgno = iTrunk;
        /* The next-trunk pointer of the removed trunk becomes the new
        ** head of the freelist. */
        memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
        *ppPage = pTrunk;
        pTrunk = 0;
        TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
      }else if( k>(u32)(pBt->usableSize/4 - 2) ){
        /* Value of k is out of range.  Database corruption */
        rc = SQLITE_CORRUPT_BKPT;
        goto end_allocate_page;
#ifndef SQLITE_OMIT_AUTOVACUUM
      }else if( searchList 
            && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE)) 
      ){
        /* The list is being searched and this trunk page is the page
        ** to allocate, regardless of whether it has leaves.
        */
        *pPgno = iTrunk;
        *ppPage = pTrunk;
        searchList = 0;
        rc = sqlite3PagerWrite(pTrunk->pDbPage);
        if( rc ){
          goto end_allocate_page;
        }
        if( k==0 ){
          /* No leaves: unlink the trunk by pointing its predecessor (page 1
          ** or the previous trunk) at the trunk that follows it. */
          if( !pPrevTrunk ){
            memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
          }else{
            rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
            if( rc!=SQLITE_OK ){
              goto end_allocate_page;
            }
            memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
          }
        }else{
          /* The trunk page is required by the caller but it contains 
          ** pointers to free-list leaves. The first leaf becomes a trunk
          ** page in this case.
          */
          MemPage *pNewTrunk;
          Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
          if( iNewTrunk>mxPage ){ 
            rc = SQLITE_CORRUPT_BKPT;
            goto end_allocate_page;
          }
          testcase( iNewTrunk==mxPage );
          rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0);
          if( rc!=SQLITE_OK ){
            goto end_allocate_page;
          }
          rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
          if( rc!=SQLITE_OK ){
            releasePage(pNewTrunk);
            goto end_allocate_page;
          }
          /* The new trunk inherits the next-trunk pointer and the remaining
          ** k-1 leaf pointers of the trunk being handed to the caller. */
          memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
          put4byte(&pNewTrunk->aData[4], k-1);
          memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
          releasePage(pNewTrunk);
          if( !pPrevTrunk ){
            assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
            put4byte(&pPage1->aData[32], iNewTrunk);
          }else{
            rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
            if( rc ){
              goto end_allocate_page;
            }
            put4byte(&pPrevTrunk->aData[0], iNewTrunk);
          }
        }
        pTrunk = 0;
        TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
#endif
      }else if( k>0 ){
        /* Extract a leaf from the trunk */
        u32 closest;
        Pgno iPage;
        unsigned char *aData = pTrunk->aData;
        if( nearby>0 ){
          u32 i;
          closest = 0;
          if( eMode==BTALLOC_LE ){
            /* Take the first leaf whose page number is <= nearby; if none
            ** qualifies, closest stays at 0. */
            for(i=0; i<k; i++){
              iPage = get4byte(&aData[8+i*4]);
              if( iPage<=nearby ){
                closest = i;
                break;
              }
            }
          }else{
            /* Pick the leaf whose page number has the smallest absolute
            ** distance from nearby. */
            int dist;
            dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);
            for(i=1; i<k; i++){
              int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);
              if( d2<dist ){
                closest = i;
                dist = d2;
              }
            }
          }
        }else{
          closest = 0;
        }

        iPage = get4byte(&aData[8+closest*4]);
        testcase( iPage==mxPage );
        if( iPage>mxPage ){
          rc = SQLITE_CORRUPT_BKPT;
          goto end_allocate_page;
        }
        testcase( iPage==mxPage );
        if( !searchList 
         || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE)) 
        ){
          int noContent;
          *pPgno = iPage;
          TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
                 ": %d more free pages\n",
                 *pPgno, closest+1, k, pTrunk->pgno, n-1));
          rc = sqlite3PagerWrite(pTrunk->pDbPage);
          if( rc ) goto end_allocate_page;
          /* Fill the slot of the removed leaf with the last leaf pointer on
          ** the trunk (unless the removed leaf was itself the last one),
          ** then shrink the leaf count by one. */
          if( closest<k-1 ){
            memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
          }
          put4byte(&aData[4], k-1);
          /* NOTE(review): the no-content flag is requested only when
          ** btreeGetHasContent() reports false for the page; presumably
          ** that means its old content need not be read - confirm against
          ** btreeGetHasContent()/btreeSetHasContent(). */
          noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0;
          rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent);
          if( rc==SQLITE_OK ){
            rc = sqlite3PagerWrite((*ppPage)->pDbPage);
            if( rc!=SQLITE_OK ){
              releasePage(*ppPage);
              *ppPage = 0;
            }
          }
          searchList = 0;
        }
      }
      releasePage(pPrevTrunk);
      pPrevTrunk = 0;
    }while( searchList );
  }else{
    /* There are no pages on the freelist, so append a new page to the
    ** database image.
    **
    ** Normally, new pages allocated by this block can be requested from the
    ** pager layer with the 'no-content' flag set. This prevents the pager
    ** from trying to read the pages content from disk. However, if the
    ** current transaction has already run one or more incremental-vacuum
    ** steps, then the page we are about to allocate may contain content
    ** that is required in the event of a rollback. In this case, do
    ** not set the no-content flag. This causes the pager to load and journal
    ** the current page content before overwriting it.
    **
    ** Note that the pager will not actually attempt to load or journal 
    ** content for any page that really does lie past the end of the database
    ** file on disk. So the effects of disabling the no-content optimization
    ** here are confined to those pages that lie between the end of the
    ** database image and the end of the database file.
    */
    int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0;

    rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
    if( rc ) return rc;
    /* Extend the image by one page, stepping over the pending-byte page. */
    pBt->nPage++;
    if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;

#ifndef SQLITE_OMIT_AUTOVACUUM
    if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
      /* If *pPgno refers to a pointer-map page, allocate two new pages
      ** at the end of the file instead of one. The first allocated page
      ** becomes a new pointer-map page, the second is used by the caller.
      */
      MemPage *pPg = 0;
      TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));
      assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
      rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent);
      if( rc==SQLITE_OK ){
        rc = sqlite3PagerWrite(pPg->pDbPage);
        releasePage(pPg);
      }
      if( rc ) return rc;
      pBt->nPage++;
      if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
    }
#endif
    /* Record the new page count at offset 28 of page 1. */
    put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);
    *pPgno = pBt->nPage;

    assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
    rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent);
    if( rc ) return rc;
    rc = sqlite3PagerWrite((*ppPage)->pDbPage);
    if( rc!=SQLITE_OK ){
      releasePage(*ppPage);
      *ppPage = 0;
    }
    TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
  }

  assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );

end_allocate_page:
  /* Common exit: drop whatever trunk references are still held. */
  releasePage(pTrunk);
  releasePage(pPrevTrunk);
  assert( rc!=SQLITE_OK || sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 );
  assert( rc!=SQLITE_OK || (*ppPage)->isInit==0 );
  return rc;
}
005838  
005839  /*
005840  ** This function is used to add page iPage to the database file free-list. 
005841  ** It is assumed that the page is not already a part of the free-list.
005842  **
005843  ** The value passed as the second argument to this function is optional.
005844  ** If the caller happens to have a pointer to the MemPage object 
005845  ** corresponding to page iPage handy, it may pass it as the second value. 
005846  ** Otherwise, it may pass NULL.
005847  **
005848  ** If a pointer to a MemPage object is passed as the second argument,
005849  ** its reference count is not altered by this function.
005850  */
005851  static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
005852    MemPage *pTrunk = 0;                /* Free-list trunk page */
005853    Pgno iTrunk = 0;                    /* Page number of free-list trunk page */ 
005854    MemPage *pPage1 = pBt->pPage1;      /* Local reference to page 1 */
005855    MemPage *pPage;                     /* Page being freed. May be NULL. */
005856    int rc;                             /* Return Code */
005857    int nFree;                          /* Initial number of pages on free-list */
005858  
005859    assert( sqlite3_mutex_held(pBt->mutex) );
005860    assert( CORRUPT_DB || iPage>1 );
005861    assert( !pMemPage || pMemPage->pgno==iPage );
005862  
005863    if( iPage<2 ) return SQLITE_CORRUPT_BKPT;
005864    if( pMemPage ){
005865      pPage = pMemPage;
005866      sqlite3PagerRef(pPage->pDbPage);
005867    }else{
005868      pPage = btreePageLookup(pBt, iPage);
005869    }
005870  
005871    /* Increment the free page count on pPage1 */
005872    rc = sqlite3PagerWrite(pPage1->pDbPage);
005873    if( rc ) goto freepage_out;
005874    nFree = get4byte(&pPage1->aData[36]);
005875    put4byte(&pPage1->aData[36], nFree+1);
005876  
005877    if( pBt->btsFlags & BTS_SECURE_DELETE ){
005878      /* If the secure_delete option is enabled, then
005879      ** always fully overwrite deleted information with zeros.
005880      */
005881      if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
005882       ||            ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
005883      ){
005884        goto freepage_out;
005885      }
005886      memset(pPage->aData, 0, pPage->pBt->pageSize);
005887    }
005888  
005889    /* If the database supports auto-vacuum, write an entry in the pointer-map
005890    ** to indicate that the page is free.
005891    */
005892    if( ISAUTOVACUUM ){
005893      ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
005894      if( rc ) goto freepage_out;
005895    }
005896  
005897    /* Now manipulate the actual database free-list structure. There are two
005898    ** possibilities. If the free-list is currently empty, or if the first
005899    ** trunk page in the free-list is full, then this page will become a
005900    ** new free-list trunk page. Otherwise, it will become a leaf of the
005901    ** first trunk page in the current free-list. This block tests if it
005902    ** is possible to add the page as a new free-list leaf.
005903    */
005904    if( nFree!=0 ){
005905      u32 nLeaf;                /* Initial number of leaf cells on trunk page */
005906  
005907      iTrunk = get4byte(&pPage1->aData[32]);
005908      rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
005909      if( rc!=SQLITE_OK ){
005910        goto freepage_out;
005911      }
005912  
005913      nLeaf = get4byte(&pTrunk->aData[4]);
005914      assert( pBt->usableSize>32 );
005915      if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
005916        rc = SQLITE_CORRUPT_BKPT;
005917        goto freepage_out;
005918      }
005919      if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
005920        /* In this case there is room on the trunk page to insert the page
005921        ** being freed as a new leaf.
005922        **
005923        ** Note that the trunk page is not really full until it contains
005924        ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
005925        ** coded.  But due to a coding error in versions of SQLite prior to
005926        ** 3.6.0, databases with freelist trunk pages holding more than
005927        ** usableSize/4 - 8 entries will be reported as corrupt.  In order
005928        ** to maintain backwards compatibility with older versions of SQLite,
005929        ** we will continue to restrict the number of entries to usableSize/4 - 8
005930        ** for now.  At some point in the future (once everyone has upgraded
005931        ** to 3.6.0 or later) we should consider fixing the conditional above
005932        ** to read "usableSize/4-2" instead of "usableSize/4-8".
005933        **
005934        ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still
005935        ** avoid using the last six entries in the freelist trunk page array in
005936        ** order that database files created by newer versions of SQLite can be
005937        ** read by older versions of SQLite.
005938        */
005939        rc = sqlite3PagerWrite(pTrunk->pDbPage);
005940        if( rc==SQLITE_OK ){
005941          put4byte(&pTrunk->aData[4], nLeaf+1);
005942          put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
005943          if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){
005944            sqlite3PagerDontWrite(pPage->pDbPage);
005945          }
005946          rc = btreeSetHasContent(pBt, iPage);
005947        }
005948        TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
005949        goto freepage_out;
005950      }
005951    }
005952  
005953    /* If control flows to this point, then it was not possible to add the
005954    ** the page being freed as a leaf page of the first trunk in the free-list.
005955    ** Possibly because the free-list is empty, or possibly because the 
005956    ** first trunk in the free-list is full. Either way, the page being freed
005957    ** will become the new first trunk page in the free-list.
005958    */
005959    if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
005960      goto freepage_out;
005961    }
005962    rc = sqlite3PagerWrite(pPage->pDbPage);
005963    if( rc!=SQLITE_OK ){
005964      goto freepage_out;
005965    }
005966    put4byte(pPage->aData, iTrunk);
005967    put4byte(&pPage->aData[4], 0);
005968    put4byte(&pPage1->aData[32], iPage);
005969    TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
005970  
005971  freepage_out:
005972    if( pPage ){
005973      pPage->isInit = 0;
005974    }
005975    releasePage(pPage);
005976    releasePage(pTrunk);
005977    return rc;
005978  }
005979  static void freePage(MemPage *pPage, int *pRC){
005980    if( (*pRC)==SQLITE_OK ){
005981      *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
005982    }
005983  }
005984  
005985  /*
** Free any overflow pages associated with the given Cell.  Write the
** size information for the Cell (including its local size: the number of
** bytes on the original page, omitting overflow) into *pInfo.
005989  */
005990  static int clearCell(
005991    MemPage *pPage,          /* The page that contains the Cell */
005992    unsigned char *pCell,    /* First byte of the Cell */
005993    CellInfo *pInfo          /* Size information about the cell */
005994  ){
005995    BtShared *pBt = pPage->pBt;
005996    Pgno ovflPgno;
005997    int rc;
005998    int nOvfl;
005999    u32 ovflPageSize;
006000  
006001    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
006002    pPage->xParseCell(pPage, pCell, pInfo);
006003    if( pInfo->nLocal==pInfo->nPayload ){
006004      return SQLITE_OK;  /* No overflow pages. Return without doing anything */
006005    }
006006    if( pCell+pInfo->nSize-1 > pPage->aData+pPage->maskPage ){
006007      return SQLITE_CORRUPT_BKPT;  /* Cell extends past end of page */
006008    }
006009    ovflPgno = get4byte(pCell + pInfo->nSize - 4);
006010    assert( pBt->usableSize > 4 );
006011    ovflPageSize = pBt->usableSize - 4;
006012    nOvfl = (pInfo->nPayload - pInfo->nLocal + ovflPageSize - 1)/ovflPageSize;
006013    assert( nOvfl>0 || 
006014      (CORRUPT_DB && (pInfo->nPayload + ovflPageSize)<ovflPageSize)
006015    );
006016    while( nOvfl-- ){
006017      Pgno iNext = 0;
006018      MemPage *pOvfl = 0;
006019      if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
006020        /* 0 is not a legal page number and page 1 cannot be an 
006021        ** overflow page. Therefore if ovflPgno<2 or past the end of the 
006022        ** file the database must be corrupt. */
006023        return SQLITE_CORRUPT_BKPT;
006024      }
006025      if( nOvfl ){
006026        rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
006027        if( rc ) return rc;
006028      }
006029  
006030      if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
006031       && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
006032      ){
006033        /* There is no reason any cursor should have an outstanding reference 
006034        ** to an overflow page belonging to a cell that is being deleted/updated.
006035        ** So if there exists more than one reference to this page, then it 
006036        ** must not really be an overflow page and the database must be corrupt. 
006037        ** It is helpful to detect this before calling freePage2(), as 
006038        ** freePage2() may zero the page contents if secure-delete mode is
006039        ** enabled. If this 'overflow' page happens to be a page that the
006040        ** caller is iterating through or using in some other way, this
006041        ** can be problematic.
006042        */
006043        rc = SQLITE_CORRUPT_BKPT;
006044      }else{
006045        rc = freePage2(pBt, pOvfl, ovflPgno);
006046      }
006047  
006048      if( pOvfl ){
006049        sqlite3PagerUnref(pOvfl->pDbPage);
006050      }
006051      if( rc ) return rc;
006052      ovflPgno = iNext;
006053    }
006054    return SQLITE_OK;
006055  }
006056  
/*
** Create the byte sequence used to represent a cell on page pPage
** and write that byte sequence into pCell[].  Overflow pages are
** allocated and filled in as necessary.  The calling procedure
** is responsible for making sure sufficient space has been allocated
** for pCell[].
**
** Note that pCell does not necessarily need to point to the pPage->aData
** area.  pCell might point to some temporary storage.  The cell will
** be constructed in this temporary area then copied into pPage->aData
** later.
**
** The payload is taken from pX.  For an intkey page the payload is
** pX->nData bytes from pX->pData followed by pX->nZero bytes of zeros,
** and pX->nKey is stored in the cell header as the key.  For any other
** page the payload is the pX->nKey bytes at pX->pKey.
**
** The number of bytes written into pCell[] (the local cell size, not
** counting overflow pages) is stored in *pnSize.
**
** SQLITE_OK is returned on success.  If allocating an overflow page or
** updating the pointer-map fails, the error code from that operation is
** returned instead.
*/
static int fillInCell(
  MemPage *pPage,                /* The page that contains the cell */
  unsigned char *pCell,          /* Complete text of the cell */
  const BtreePayload *pX,        /* Payload with which to construct the cell */
  int *pnSize                    /* Write cell size here */
){
  int nPayload;                  /* Payload bytes not yet written */
  const u8 *pSrc;                /* Next byte of payload source to copy */
  int nSrc, n, rc;               /* nSrc: bytes remaining at pSrc */
  int spaceLeft;                 /* Bytes of space remaining at pPayload */
  MemPage *pOvfl = 0;            /* Most recently allocated overflow page */
  MemPage *pToRelease = 0;       /* Overflow page currently being filled */
  unsigned char *pPrior;         /* Where to write the next overflow pgno */
  unsigned char *pPayload;       /* Where to write the next payload byte */
  BtShared *pBt = pPage->pBt;
  Pgno pgnoOvfl = 0;             /* Page number of pOvfl.  0 if none yet */
  int nHeader;                   /* Size of the cell header in bytes */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );

  /* pPage is not necessarily writeable since pCell might be auxiliary
  ** buffer space that is separate from the pPage buffer area */
  assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

  /* Fill in the header.  The first pPage->childPtrSize bytes of the cell
  ** are skipped over here and are not written by this routine. */
  nHeader = pPage->childPtrSize;
  if( pPage->intKey ){
    nPayload = pX->nData + pX->nZero;
    pSrc = pX->pData;
    nSrc = pX->nData;
    assert( pPage->intKeyLeaf ); /* fillInCell() only called for leaves */
    nHeader += putVarint32(&pCell[nHeader], nPayload);
    nHeader += putVarint(&pCell[nHeader], *(u64*)&pX->nKey);
  }else{
    assert( pX->nKey<=0x7fffffff && pX->pKey!=0 );
    nSrc = nPayload = (int)pX->nKey;
    pSrc = pX->pKey;
    nHeader += putVarint32(&pCell[nHeader], nPayload);
  }
  
  /* Fill in the payload */
  if( nPayload<=pPage->maxLocal ){
    /* The entire payload fits locally.  No overflow pages are needed. */
    n = nHeader + nPayload;
    testcase( n==3 );
    testcase( n==4 );
    if( n<4 ) n = 4;     /* The reported cell size is never less than 4 */
    *pnSize = n;
    spaceLeft = nPayload;
    pPrior = pCell;
  }else{
    /* The payload spills onto overflow pages.  The local part is minLocal
    ** bytes plus whatever is left over after filling whole overflow pages,
    ** unless that would exceed maxLocal, in which case only minLocal bytes
    ** are stored locally.  Four extra bytes are reserved at the end of the
    ** local cell for the page number of the first overflow page. */
    int mn = pPage->minLocal;
    n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);
    testcase( n==pPage->maxLocal );
    testcase( n==pPage->maxLocal+1 );
    if( n > pPage->maxLocal ) n = mn;
    spaceLeft = n;
    *pnSize = n + nHeader + 4;
    pPrior = &pCell[nHeader+n];
  }
  pPayload = &pCell[nHeader];

  /* At this point variables should be set as follows:
  **
  **   nPayload           Total payload size in bytes
  **   pPayload           Begin writing payload here
  **   spaceLeft          Space available at pPayload.  If nPayload>spaceLeft,
  **                      that means content must spill into overflow pages.
  **   *pnSize            Size of the local cell (not counting overflow pages)
  **   pPrior             Where to write the pgno of the first overflow page
  **
  ** Use a call to pPage->xParseCell() to verify that the values above
  ** were computed correctly.
  */
#if SQLITE_DEBUG
  {
    CellInfo info;
    pPage->xParseCell(pPage, pCell, &info);
    assert( nHeader==(int)(info.pPayload - pCell) );
    assert( info.nKey==pX->nKey );
    assert( *pnSize == info.nSize );
    assert( spaceLeft == info.nLocal );
  }
#endif

  /* Write the payload into the local Cell and any extra into overflow pages */
  while( nPayload>0 ){
    if( spaceLeft==0 ){
      /* The current buffer is full.  Allocate another overflow page and
      ** link it to the end of the chain. */
#ifndef SQLITE_OMIT_AUTOVACUUM
      Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
      if( pBt->autoVacuum ){
        /* Advance pgnoOvfl past any pointer-map pages and past the
        ** pending-byte page before it is passed to allocateBtreePage(). */
        do{
          pgnoOvfl++;
        } while( 
          PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) 
        );
      }
#endif
      rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
#ifndef SQLITE_OMIT_AUTOVACUUM
      /* If the database supports auto-vacuum, and the second or subsequent
      ** overflow page is being allocated, add an entry to the pointer-map
      ** for that page now. 
      **
      ** If this is the first overflow page, then write a partial entry 
      ** to the pointer-map. If we write nothing to this pointer-map slot,
      ** then the optimistic overflow chain processing in clearCell()
      ** may misinterpret the uninitialized values and delete the
      ** wrong pages from the database.
      */
      if( pBt->autoVacuum && rc==SQLITE_OK ){
        u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
        ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
        if( rc ){
          releasePage(pOvfl);
        }
      }
#endif
      if( rc ){
        releasePage(pToRelease);
        return rc;
      }

      /* If pToRelease is not zero then pPrior points into the data area
      ** of pToRelease.  Make sure pToRelease is still writeable. */
      assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );

      /* If pPrior is part of the data area of pPage, then make sure pPage
      ** is still writeable */
      assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

      /* Link the new page into the chain, then make it the current output
      ** buffer.  Its first 4 bytes are the next-page pointer, which is
      ** zero for now.  Payload starts at byte 4. */
      put4byte(pPrior, pgnoOvfl);
      releasePage(pToRelease);
      pToRelease = pOvfl;
      pPrior = pOvfl->aData;
      put4byte(pPrior, 0);
      pPayload = &pOvfl->aData[4];
      spaceLeft = pBt->usableSize - 4;
    }
    n = nPayload;
    if( n>spaceLeft ) n = spaceLeft;

    /* If pToRelease is not zero then pPayload points into the data area
    ** of pToRelease.  Make sure pToRelease is still writeable. */
    assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );

    /* If pPayload is part of the data area of pPage, then make sure pPage
    ** is still writeable */
    assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

    /* Copy from the source buffer while it still has content.  Once the
    ** source is exhausted the rest of the payload is written as zeros. */
    if( nSrc>0 ){
      if( n>nSrc ) n = nSrc;
      assert( pSrc );
      memcpy(pPayload, pSrc, n);
    }else{
      memset(pPayload, 0, n);
    }
    nPayload -= n;
    pPayload += n;
    pSrc += n;
    nSrc -= n;
    spaceLeft -= n;
  }
  releasePage(pToRelease);
  return SQLITE_OK;
}
006237  
006238  /*
006239  ** Remove the i-th cell from pPage.  This routine effects pPage only.
006240  ** The cell content is not freed or deallocated.  It is assumed that
006241  ** the cell content has been copied someplace else.  This routine just
006242  ** removes the reference to the cell from pPage.
006243  **
006244  ** "sz" must be the number of bytes in the cell.
006245  */
006246  static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
006247    u32 pc;         /* Offset to cell content of cell being deleted */
006248    u8 *data;       /* pPage->aData */
006249    u8 *ptr;        /* Used to move bytes around within data[] */
006250    int rc;         /* The return code */
006251    int hdr;        /* Beginning of the header.  0 most pages.  100 page 1 */
006252  
006253    if( *pRC ) return;
006254    assert( idx>=0 && idx<pPage->nCell );
006255    assert( CORRUPT_DB || sz==cellSize(pPage, idx) );
006256    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
006257    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
006258    data = pPage->aData;
006259    ptr = &pPage->aCellIdx[2*idx];
006260    pc = get2byte(ptr);
006261    hdr = pPage->hdrOffset;
006262    testcase( pc==get2byte(&data[hdr+5]) );
006263    testcase( pc+sz==pPage->pBt->usableSize );
006264    if( pc < (u32)get2byte(&data[hdr+5]) || pc+sz > pPage->pBt->usableSize ){
006265      *pRC = SQLITE_CORRUPT_BKPT;
006266      return;
006267    }
006268    rc = freeSpace(pPage, pc, sz);
006269    if( rc ){
006270      *pRC = rc;
006271      return;
006272    }
006273    pPage->nCell--;
006274    if( pPage->nCell==0 ){
006275      memset(&data[hdr+1], 0, 4);
006276      data[hdr+7] = 0;
006277      put2byte(&data[hdr+5], pPage->pBt->usableSize);
006278      pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset
006279                         - pPage->childPtrSize - 8;
006280    }else{
006281      memmove(ptr, ptr+2, 2*(pPage->nCell - idx));
006282      put2byte(&data[hdr+3], pPage->nCell);
006283      pPage->nFree += 2;
006284    }
006285  }
006286  
006287  /*
006288  ** Insert a new cell on pPage at cell index "i".  pCell points to the
006289  ** content of the cell.
006290  **
006291  ** If the cell content will fit on the page, then put it there.  If it
006292  ** will not fit, then make a copy of the cell content into pTemp if
006293  ** pTemp is not null.  Regardless of pTemp, allocate a new entry
006294  ** in pPage->apOvfl[] and make it point to the cell content (either
006295  ** in pTemp or the original pCell) and also record its index. 
006296  ** Allocating a new entry in pPage->aCell[] implies that 
006297  ** pPage->nOverflow is incremented.
006298  **
006299  ** *pRC must be SQLITE_OK when this routine is called.
006300  */
006301  static void insertCell(
006302    MemPage *pPage,   /* Page into which we are copying */
006303    int i,            /* New cell becomes the i-th cell of the page */
006304    u8 *pCell,        /* Content of the new cell */
006305    int sz,           /* Bytes of content in pCell */
006306    u8 *pTemp,        /* Temp storage space for pCell, if needed */
006307    Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */
006308    int *pRC          /* Read and write return code from here */
006309  ){
006310    int idx = 0;      /* Where to write new cell content in data[] */
006311    int j;            /* Loop counter */
006312    u8 *data;         /* The content of the whole page */
006313    u8 *pIns;         /* The point in pPage->aCellIdx[] where no cell inserted */
006314  
006315    assert( *pRC==SQLITE_OK );
006316    assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
006317    assert( MX_CELL(pPage->pBt)<=10921 );
006318    assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB );
006319    assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
006320    assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
006321    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
006322    /* The cell should normally be sized correctly.  However, when moving a
006323    ** malformed cell from a leaf page to an interior page, if the cell size
006324    ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size
006325    ** might be less than 8 (leaf-size + pointer) on the interior node.  Hence
006326    ** the term after the || in the following assert(). */
006327    assert( sz==pPage->xCellSize(pPage, pCell) || (sz==8 && iChild>0) );
006328    if( pPage->nOverflow || sz+2>pPage->nFree ){
006329      if( pTemp ){
006330        memcpy(pTemp, pCell, sz);
006331        pCell = pTemp;
006332      }
006333      if( iChild ){
006334        put4byte(pCell, iChild);
006335      }
006336      j = pPage->nOverflow++;
006337      /* Comparison against ArraySize-1 since we hold back one extra slot
006338      ** as a contingency.  In other words, never need more than 3 overflow
006339      ** slots but 4 are allocated, just to be safe. */
006340      assert( j < ArraySize(pPage->apOvfl)-1 );
006341      pPage->apOvfl[j] = pCell;
006342      pPage->aiOvfl[j] = (u16)i;
006343  
006344      /* When multiple overflows occur, they are always sequential and in
006345      ** sorted order.  This invariants arise because multiple overflows can
006346      ** only occur when inserting divider cells into the parent page during
006347      ** balancing, and the dividers are adjacent and sorted.
006348      */
006349      assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */
006350      assert( j==0 || i==pPage->aiOvfl[j-1]+1 );   /* Overflows are sequential */
006351    }else{
006352      int rc = sqlite3PagerWrite(pPage->pDbPage);
006353      if( rc!=SQLITE_OK ){
006354        *pRC = rc;
006355        return;
006356      }
006357      assert( sqlite3PagerIswriteable(pPage->pDbPage) );
006358      data = pPage->aData;
006359      assert( &data[pPage->cellOffset]==pPage->aCellIdx );
006360      rc = allocateSpace(pPage, sz, &idx);
006361      if( rc ){ *pRC = rc; return; }
006362      /* The allocateSpace() routine guarantees the following properties
006363      ** if it returns successfully */
006364      assert( idx >= 0 );
006365      assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB );
006366      assert( idx+sz <= (int)pPage->pBt->usableSize );
006367      pPage->nFree -= (u16)(2 + sz);
006368      memcpy(&data[idx], pCell, sz);
006369      if( iChild ){
006370        put4byte(&data[idx], iChild);
006371      }
006372      pIns = pPage->aCellIdx + i*2;
006373      memmove(pIns+2, pIns, 2*(pPage->nCell - i));
006374      put2byte(pIns, idx);
006375      pPage->nCell++;
006376      /* increment the cell count */
006377      if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;
006378      assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell );
006379  #ifndef SQLITE_OMIT_AUTOVACUUM
006380      if( pPage->pBt->autoVacuum ){
006381        /* The cell may contain a pointer to an overflow page. If so, write
006382        ** the entry for the overflow page into the pointer map.
006383        */
006384        ptrmapPutOvflPtr(pPage, pCell, pRC);
006385      }
006386  #endif
006387    }
006388  }
006389  
/*
** A CellArray object contains a cache of pointers and sizes for a
** consecutive sequence of cells that might be held on multiple pages.
**
** An entry in szCell[] that is zero means that the size of the
** corresponding cell has not yet been computed.  Routines such as
** populateCellCache() and cachedCellSize() fill in such entries on
** demand using pRef->xCellSize().
*/
typedef struct CellArray CellArray;
struct CellArray {
  int nCell;              /* Number of cells in apCell[] */
  MemPage *pRef;          /* Reference page.  Its xCellSize() sizes cells */
  u8 **apCell;            /* All cells being balanced */
  u16 *szCell;            /* Local size of all cells in apCell[].  0==unknown */
};
006401  
006402  /*
006403  ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been
006404  ** computed.
006405  */
006406  static void populateCellCache(CellArray *p, int idx, int N){
006407    assert( idx>=0 && idx+N<=p->nCell );
006408    while( N>0 ){
006409      assert( p->apCell[idx]!=0 );
006410      if( p->szCell[idx]==0 ){
006411        p->szCell[idx] = p->pRef->xCellSize(p->pRef, p->apCell[idx]);
006412      }else{
006413        assert( CORRUPT_DB ||
006414                p->szCell[idx]==p->pRef->xCellSize(p->pRef, p->apCell[idx]) );
006415      }
006416      idx++;
006417      N--;
006418    }
006419  }
006420  
006421  /*
006422  ** Return the size of the Nth element of the cell array
006423  */
006424  static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){
006425    assert( N>=0 && N<p->nCell );
006426    assert( p->szCell[N]==0 );
006427    p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]);
006428    return p->szCell[N];
006429  }
006430  static u16 cachedCellSize(CellArray *p, int N){
006431    assert( N>=0 && N<p->nCell );
006432    if( p->szCell[N] ) return p->szCell[N];
006433    return computeCellSize(p, N);
006434  }
006435  
006436  /*
006437  ** Array apCell[] contains pointers to nCell b-tree page cells. The 
006438  ** szCell[] array contains the size in bytes of each cell. This function
006439  ** replaces the current contents of page pPg with the contents of the cell
006440  ** array.
006441  **
006442  ** Some of the cells in apCell[] may currently be stored in pPg. This
006443  ** function works around problems caused by this by making a copy of any 
006444  ** such cells before overwriting the page data.
006445  **
006446  ** The MemPage.nFree field is invalidated by this function. It is the 
006447  ** responsibility of the caller to set it correctly.
006448  */
006449  static int rebuildPage(
006450    MemPage *pPg,                   /* Edit this page */
006451    int nCell,                      /* Final number of cells on page */
006452    u8 **apCell,                    /* Array of cells */
006453    u16 *szCell                     /* Array of cell sizes */
006454  ){
006455    const int hdr = pPg->hdrOffset;          /* Offset of header on pPg */
006456    u8 * const aData = pPg->aData;           /* Pointer to data for pPg */
006457    const int usableSize = pPg->pBt->usableSize;
006458    u8 * const pEnd = &aData[usableSize];
006459    int i;
006460    u8 *pCellptr = pPg->aCellIdx;
006461    u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
006462    u8 *pData;
006463  
006464    i = get2byte(&aData[hdr+5]);
006465    memcpy(&pTmp[i], &aData[i], usableSize - i);
006466  
006467    pData = pEnd;
006468    for(i=0; i<nCell; i++){
006469      u8 *pCell = apCell[i];
006470      if( SQLITE_WITHIN(pCell,aData,pEnd) ){
006471        pCell = &pTmp[pCell - aData];
006472      }
006473      pData -= szCell[i];
006474      put2byte(pCellptr, (pData - aData));
006475      pCellptr += 2;
006476      if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT;
006477      memcpy(pData, pCell, szCell[i]);
006478      assert( szCell[i]==pPg->xCellSize(pPg, pCell) || CORRUPT_DB );
006479      testcase( szCell[i]!=pPg->xCellSize(pPg,pCell) );
006480    }
006481  
006482    /* The pPg->nFree field is now set incorrectly. The caller will fix it. */
006483    pPg->nCell = nCell;
006484    pPg->nOverflow = 0;
006485  
006486    put2byte(&aData[hdr+1], 0);
006487    put2byte(&aData[hdr+3], pPg->nCell);
006488    put2byte(&aData[hdr+5], pData - aData);
006489    aData[hdr+7] = 0x00;
006490    return SQLITE_OK;
006491  }
006492  
006493  /*
006494  ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
006495  ** contains the size in bytes of each such cell. This function attempts to 
006496  ** add the cells stored in the array to page pPg. If it cannot (because 
006497  ** the page needs to be defragmented before the cells will fit), non-zero
006498  ** is returned. Otherwise, if the cells are added successfully, zero is
006499  ** returned.
006500  **
006501  ** Argument pCellptr points to the first entry in the cell-pointer array
006502  ** (part of page pPg) to populate. After cell apCell[0] is written to the
006503  ** page body, a 16-bit offset is written to pCellptr. And so on, for each
006504  ** cell in the array. It is the responsibility of the caller to ensure
006505  ** that it is safe to overwrite this part of the cell-pointer array.
006506  **
006507  ** When this function is called, *ppData points to the start of the 
006508  ** content area on page pPg. If the size of the content area is extended,
006509  ** *ppData is updated to point to the new start of the content area
006510  ** before returning.
006511  **
006512  ** Finally, argument pBegin points to the byte immediately following the
006513  ** end of the space required by this page for the cell-pointer area (for
006514  ** all cells - not just those inserted by the current call). If the content
006515  ** area must be extended to before this point in order to accomodate all
006516  ** cells in apCell[], then the cells do not fit and non-zero is returned.
006517  */
006518  static int pageInsertArray(
006519    MemPage *pPg,                   /* Page to add cells to */
006520    u8 *pBegin,                     /* End of cell-pointer array */
006521    u8 **ppData,                    /* IN/OUT: Page content -area pointer */
006522    u8 *pCellptr,                   /* Pointer to cell-pointer area */
006523    int iFirst,                     /* Index of first cell to add */
006524    int nCell,                      /* Number of cells to add to pPg */
006525    CellArray *pCArray              /* Array of cells */
006526  ){
006527    int i;
006528    u8 *aData = pPg->aData;
006529    u8 *pData = *ppData;
006530    int iEnd = iFirst + nCell;
006531    assert( CORRUPT_DB || pPg->hdrOffset==0 );    /* Never called on page 1 */
006532    for(i=iFirst; i<iEnd; i++){
006533      int sz, rc;
006534      u8 *pSlot;
006535      sz = cachedCellSize(pCArray, i);
006536      if( (aData[1]==0 && aData[2]==0) || (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){
006537        if( (pData - pBegin)<sz ) return 1;
006538        pData -= sz;
006539        pSlot = pData;
006540      }
006541      /* pSlot and pCArray->apCell[i] will never overlap on a well-formed
006542      ** database.  But they might for a corrupt database.  Hence use memmove()
006543      ** since memcpy() sends SIGABORT with overlapping buffers on OpenBSD */
006544      assert( (pSlot+sz)<=pCArray->apCell[i]
006545           || pSlot>=(pCArray->apCell[i]+sz)
006546           || CORRUPT_DB );
006547      memmove(pSlot, pCArray->apCell[i], sz);
006548      put2byte(pCellptr, (pSlot - aData));
006549      pCellptr += 2;
006550    }
006551    *ppData = pData;
006552    return 0;
006553  }
006554  
006555  /*
006556  ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell 
006557  ** contains the size in bytes of each such cell. This function adds the
006558  ** space associated with each cell in the array that is currently stored 
006559  ** within the body of pPg to the pPg free-list. The cell-pointers and other
006560  ** fields of the page are not updated.
006561  **
006562  ** This function returns the total number of cells added to the free-list.
006563  */
006564  static int pageFreeArray(
006565    MemPage *pPg,                   /* Page to edit */
006566    int iFirst,                     /* First cell to delete */
006567    int nCell,                      /* Cells to delete */
006568    CellArray *pCArray              /* Array of cells */
006569  ){
006570    u8 * const aData = pPg->aData;
006571    u8 * const pEnd = &aData[pPg->pBt->usableSize];
006572    u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];
006573    int nRet = 0;
006574    int i;
006575    int iEnd = iFirst + nCell;
006576    u8 *pFree = 0;
006577    int szFree = 0;
006578  
006579    for(i=iFirst; i<iEnd; i++){
006580      u8 *pCell = pCArray->apCell[i];
006581      if( SQLITE_WITHIN(pCell, pStart, pEnd) ){
006582        int sz;
006583        /* No need to use cachedCellSize() here.  The sizes of all cells that
006584        ** are to be freed have already been computing while deciding which
006585        ** cells need freeing */
006586        sz = pCArray->szCell[i];  assert( sz>0 );
006587        if( pFree!=(pCell + sz) ){
006588          if( pFree ){
006589            assert( pFree>aData && (pFree - aData)<65536 );
006590            freeSpace(pPg, (u16)(pFree - aData), szFree);
006591          }
006592          pFree = pCell;
006593          szFree = sz;
006594          if( pFree+sz>pEnd ) return 0;
006595        }else{
006596          pFree = pCell;
006597          szFree += sz;
006598        }
006599        nRet++;
006600      }
006601    }
006602    if( pFree ){
006603      assert( pFree>aData && (pFree - aData)<65536 );
006604      freeSpace(pPg, (u16)(pFree - aData), szFree);
006605    }
006606    return nRet;
006607  }
006608  
006609  /*
006610  ** apCell[] and szCell[] contains pointers to and sizes of all cells in the
006611  ** pages being balanced.  The current page, pPg, has pPg->nCell cells starting
006612  ** with apCell[iOld].  After balancing, this page should hold nNew cells
006613  ** starting at apCell[iNew].
006614  **
006615  ** This routine makes the necessary adjustments to pPg so that it contains
006616  ** the correct cells after being balanced.
006617  **
006618  ** The pPg->nFree field is invalid when this function returns. It is the
006619  ** responsibility of the caller to set it correctly.
006620  */
006621  static int editPage(
006622    MemPage *pPg,                   /* Edit this page */
006623    int iOld,                       /* Index of first cell currently on page */
006624    int iNew,                       /* Index of new first cell on page */
006625    int nNew,                       /* Final number of cells on page */
006626    CellArray *pCArray              /* Array of cells and sizes */
006627  ){
006628    u8 * const aData = pPg->aData;
006629    const int hdr = pPg->hdrOffset;
006630    u8 *pBegin = &pPg->aCellIdx[nNew * 2];
006631    int nCell = pPg->nCell;       /* Cells stored on pPg */
006632    u8 *pData;
006633    u8 *pCellptr;
006634    int i;
006635    int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;
006636    int iNewEnd = iNew + nNew;
006637  
006638  #ifdef SQLITE_DEBUG
006639    u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
006640    memcpy(pTmp, aData, pPg->pBt->usableSize);
006641  #endif
006642  
006643    /* Remove cells from the start and end of the page */
006644    if( iOld<iNew ){
006645      int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray);
006646      memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2);
006647      nCell -= nShift;
006648    }
006649    if( iNewEnd < iOldEnd ){
006650      nCell -= pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray);
006651    }
006652  
006653    pData = &aData[get2byteNotZero(&aData[hdr+5])];
006654    if( pData<pBegin ) goto editpage_fail;
006655  
006656    /* Add cells to the start of the page */
006657    if( iNew<iOld ){
006658      int nAdd = MIN(nNew,iOld-iNew);
006659      assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB );
006660      pCellptr = pPg->aCellIdx;
006661      memmove(&pCellptr[nAdd*2], pCellptr, nCell*2);
006662      if( pageInsertArray(
006663            pPg, pBegin, &pData, pCellptr,
006664            iNew, nAdd, pCArray
006665      ) ) goto editpage_fail;
006666      nCell += nAdd;
006667    }
006668  
006669    /* Add any overflow cells */
006670    for(i=0; i<pPg->nOverflow; i++){
006671      int iCell = (iOld + pPg->aiOvfl[i]) - iNew;
006672      if( iCell>=0 && iCell<nNew ){
006673        pCellptr = &pPg->aCellIdx[iCell * 2];
006674        memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);
006675        nCell++;
006676        if( pageInsertArray(
006677              pPg, pBegin, &pData, pCellptr,
006678              iCell+iNew, 1, pCArray
006679        ) ) goto editpage_fail;
006680      }
006681    }
006682  
006683    /* Append cells to the end of the page */
006684    pCellptr = &pPg->aCellIdx[nCell*2];
006685    if( pageInsertArray(
006686          pPg, pBegin, &pData, pCellptr,
006687          iNew+nCell, nNew-nCell, pCArray
006688    ) ) goto editpage_fail;
006689  
006690    pPg->nCell = nNew;
006691    pPg->nOverflow = 0;
006692  
006693    put2byte(&aData[hdr+3], pPg->nCell);
006694    put2byte(&aData[hdr+5], pData - aData);
006695  
006696  #ifdef SQLITE_DEBUG
006697    for(i=0; i<nNew && !CORRUPT_DB; i++){
006698      u8 *pCell = pCArray->apCell[i+iNew];
006699      int iOff = get2byteAligned(&pPg->aCellIdx[i*2]);
006700      if( SQLITE_WITHIN(pCell, aData, &aData[pPg->pBt->usableSize]) ){
006701        pCell = &pTmp[pCell - aData];
006702      }
006703      assert( 0==memcmp(pCell, &aData[iOff],
006704              pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) );
006705    }
006706  #endif
006707  
006708    return SQLITE_OK;
006709   editpage_fail:
006710    /* Unable to edit this page. Rebuild it from scratch instead. */
006711    populateCellCache(pCArray, iNew, nNew);
006712    return rebuildPage(pPg, nNew, &pCArray->apCell[iNew], &pCArray->szCell[iNew]);
006713  }
006714  
/*
** The following parameters determine how many adjacent pages get involved
** in a balancing operation.  NN is the number of neighbors on either side
** of the page that participate in the balancing operation.  NB is the
** total number of pages that participate, including the target page and
** NN neighbors on either side.
**
** The minimum value of NN is 1 (of course).  Increasing NN above 1
** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
** in exchange for a larger degradation in INSERT and UPDATE performance.
** The current value of NN (1) appears to give the best results overall.
*/
#define NN 1             /* Number of neighbors on either side of pPage */
#define NB (NN*2+1)      /* Total pages involved in the balance */
006729  
006730  
006731  #ifndef SQLITE_OMIT_QUICKBALANCE
006732  /*
006733  ** This version of balance() handles the common special case where
006734  ** a new entry is being inserted on the extreme right-end of the
006735  ** tree, in other words, when the new entry will become the largest
006736  ** entry in the tree.
006737  **
006738  ** Instead of trying to balance the 3 right-most leaf pages, just add
006739  ** a new page to the right-hand side and put the one new entry in
006740  ** that page.  This leaves the right side of the tree somewhat
006741  ** unbalanced.  But odds are that we will be inserting new entries
006742  ** at the end soon afterwards so the nearly empty page will quickly
006743  ** fill up.  On average.
006744  **
006745  ** pPage is the leaf page which is the right-most page in the tree.
006746  ** pParent is its parent.  pPage must have a single overflow entry
006747  ** which is also the right-most entry on the page.
006748  **
006749  ** The pSpace buffer is used to store a temporary copy of the divider
006750  ** cell that will be inserted into pParent. Such a cell consists of a 4
006751  ** byte page number followed by a variable length integer. In other
006752  ** words, at most 13 bytes. Hence the pSpace buffer must be at
006753  ** least 13 bytes in size.
006754  */
006755  static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
006756    BtShared *const pBt = pPage->pBt;    /* B-Tree Database */
006757    MemPage *pNew;                       /* Newly allocated page */
006758    int rc;                              /* Return Code */
006759    Pgno pgnoNew;                        /* Page number of pNew */
006760  
006761    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
006762    assert( sqlite3PagerIswriteable(pParent->pDbPage) );
006763    assert( pPage->nOverflow==1 );
006764  
006765    /* This error condition is now caught prior to reaching this function */
006766    if( NEVER(pPage->nCell==0) ) return SQLITE_CORRUPT_BKPT;
006767  
006768    /* Allocate a new page. This page will become the right-sibling of 
006769    ** pPage. Make the parent page writable, so that the new divider cell
006770    ** may be inserted. If both these operations are successful, proceed.
006771    */
006772    rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
006773  
006774    if( rc==SQLITE_OK ){
006775  
006776      u8 *pOut = &pSpace[4];
006777      u8 *pCell = pPage->apOvfl[0];
006778      u16 szCell = pPage->xCellSize(pPage, pCell);
006779      u8 *pStop;
006780  
006781      assert( sqlite3PagerIswriteable(pNew->pDbPage) );
006782      assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
006783      zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
006784      rc = rebuildPage(pNew, 1, &pCell, &szCell);
006785      if( NEVER(rc) ) return rc;
006786      pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;
006787  
006788      /* If this is an auto-vacuum database, update the pointer map
006789      ** with entries for the new page, and any pointer from the 
006790      ** cell on the page to an overflow page. If either of these
006791      ** operations fails, the return code is set, but the contents
006792      ** of the parent page are still manipulated by thh code below.
006793      ** That is Ok, at this point the parent page is guaranteed to
006794      ** be marked as dirty. Returning an error code will cause a
006795      ** rollback, undoing any changes made to the parent page.
006796      */
006797      if( ISAUTOVACUUM ){
006798        ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
006799        if( szCell>pNew->minLocal ){
006800          ptrmapPutOvflPtr(pNew, pCell, &rc);
006801        }
006802      }
006803    
006804      /* Create a divider cell to insert into pParent. The divider cell
006805      ** consists of a 4-byte page number (the page number of pPage) and
006806      ** a variable length key value (which must be the same value as the
006807      ** largest key on pPage).
006808      **
006809      ** To find the largest key value on pPage, first find the right-most 
006810      ** cell on pPage. The first two fields of this cell are the 
006811      ** record-length (a variable length integer at most 32-bits in size)
006812      ** and the key value (a variable length integer, may have any value).
006813      ** The first of the while(...) loops below skips over the record-length
006814      ** field. The second while(...) loop copies the key value from the
006815      ** cell on pPage into the pSpace buffer.
006816      */
006817      pCell = findCell(pPage, pPage->nCell-1);
006818      pStop = &pCell[9];
006819      while( (*(pCell++)&0x80) && pCell<pStop );
006820      pStop = &pCell[9];
006821      while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
006822  
006823      /* Insert the new divider cell into pParent. */
006824      if( rc==SQLITE_OK ){
006825        insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
006826                     0, pPage->pgno, &rc);
006827      }
006828  
006829      /* Set the right-child pointer of pParent to point to the new page. */
006830      put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
006831    
006832      /* Release the reference to the new page. */
006833      releasePage(pNew);
006834    }
006835  
006836    return rc;
006837  }
006838  #endif /* SQLITE_OMIT_QUICKBALANCE */
006839  
006840  #if 0
006841  /*
006842  ** This function does not contribute anything to the operation of SQLite.
006843  ** it is sometimes activated temporarily while debugging code responsible 
006844  ** for setting pointer-map entries.
006845  */
006846  static int ptrmapCheckPages(MemPage **apPage, int nPage){
006847    int i, j;
006848    for(i=0; i<nPage; i++){
006849      Pgno n;
006850      u8 e;
006851      MemPage *pPage = apPage[i];
006852      BtShared *pBt = pPage->pBt;
006853      assert( pPage->isInit );
006854  
006855      for(j=0; j<pPage->nCell; j++){
006856        CellInfo info;
006857        u8 *z;
006858       
006859        z = findCell(pPage, j);
006860        pPage->xParseCell(pPage, z, &info);
006861        if( info.nLocal<info.nPayload ){
006862          Pgno ovfl = get4byte(&z[info.nSize-4]);
006863          ptrmapGet(pBt, ovfl, &e, &n);
006864          assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 );
006865        }
006866        if( !pPage->leaf ){
006867          Pgno child = get4byte(z);
006868          ptrmapGet(pBt, child, &e, &n);
006869          assert( n==pPage->pgno && e==PTRMAP_BTREE );
006870        }
006871      }
006872      if( !pPage->leaf ){
006873        Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);
006874        ptrmapGet(pBt, child, &e, &n);
006875        assert( n==pPage->pgno && e==PTRMAP_BTREE );
006876      }
006877    }
006878    return 1;
006879  }
006880  #endif
006881  
006882  /*
006883  ** This function is used to copy the contents of the b-tree node stored 
006884  ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
006885  ** the pointer-map entries for each child page are updated so that the
006886  ** parent page stored in the pointer map is page pTo. If pFrom contained
006887  ** any cells with overflow page pointers, then the corresponding pointer
006888  ** map entries are also updated so that the parent page is page pTo.
006889  **
006890  ** If pFrom is currently carrying any overflow cells (entries in the
006891  ** MemPage.apOvfl[] array), they are not copied to pTo. 
006892  **
006893  ** Before returning, page pTo is reinitialized using btreeInitPage().
006894  **
006895  ** The performance of this function is not critical. It is only used by 
006896  ** the balance_shallower() and balance_deeper() procedures, neither of
006897  ** which are called often under normal circumstances.
006898  */
006899  static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
006900    if( (*pRC)==SQLITE_OK ){
006901      BtShared * const pBt = pFrom->pBt;
006902      u8 * const aFrom = pFrom->aData;
006903      u8 * const aTo = pTo->aData;
006904      int const iFromHdr = pFrom->hdrOffset;
006905      int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
006906      int rc;
006907      int iData;
006908    
006909    
006910      assert( pFrom->isInit );
006911      assert( pFrom->nFree>=iToHdr );
006912      assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );
006913    
006914      /* Copy the b-tree node content from page pFrom to page pTo. */
006915      iData = get2byte(&aFrom[iFromHdr+5]);
006916      memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
006917      memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
006918    
006919      /* Reinitialize page pTo so that the contents of the MemPage structure
006920      ** match the new data. The initialization of pTo can actually fail under
006921      ** fairly obscure circumstances, even though it is a copy of initialized 
006922      ** page pFrom.
006923      */
006924      pTo->isInit = 0;
006925      rc = btreeInitPage(pTo);
006926      if( rc!=SQLITE_OK ){
006927        *pRC = rc;
006928        return;
006929      }
006930    
006931      /* If this is an auto-vacuum database, update the pointer-map entries
006932      ** for any b-tree or overflow pages that pTo now contains the pointers to.
006933      */
006934      if( ISAUTOVACUUM ){
006935        *pRC = setChildPtrmaps(pTo);
006936      }
006937    }
006938  }
006939  
006940  /*
006941  ** This routine redistributes cells on the iParentIdx'th child of pParent
006942  ** (hereafter "the page") and up to 2 siblings so that all pages have about the
006943  ** same amount of free space. Usually a single sibling on either side of the
006944  ** page is used in the balancing, though both siblings might come from one
006945  ** side if the page is the first or last child of its parent. If the page 
006946  ** has fewer than 2 siblings (something which can only happen if the page
006947  ** is a root page or a child of a root page) then all available siblings
006948  ** participate in the balancing.
006949  **
006950  ** The number of siblings of the page might be increased or decreased by 
006951  ** one or two in an effort to keep pages nearly full but not over full. 
006952  **
006953  ** Note that when this routine is called, some of the cells on the page
006954  ** might not actually be stored in MemPage.aData[]. This can happen
006955  ** if the page is overfull. This routine ensures that all cells allocated
006956  ** to the page and its siblings fit into MemPage.aData[] before returning.
006957  **
006958  ** In the course of balancing the page and its siblings, cells may be
006959  ** inserted into or removed from the parent page (pParent). Doing so
006960  ** may cause the parent page to become overfull or underfull. If this
006961  ** happens, it is the responsibility of the caller to invoke the correct
006962  ** balancing routine to fix this problem (see the balance() routine). 
006963  **
006964  ** If this routine fails for any reason, it might leave the database
006965  ** in a corrupted state. So if this routine fails, the database should
006966  ** be rolled back.
006967  **
006968  ** The third argument to this function, aOvflSpace, is a pointer to a
006969  ** buffer big enough to hold one page. If while inserting cells into the parent
006970  ** page (pParent) the parent page becomes overfull, this buffer is
006971  ** used to store the parent's overflow cells. Because this function inserts
006972  ** a maximum of four divider cells into the parent page, and the maximum
006973  ** size of a cell stored within an internal node is always less than 1/4
006974  ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
006975  ** enough for all overflow cells.
006976  **
006977  ** If aOvflSpace is set to a null pointer, this function returns 
006978  ** SQLITE_NOMEM.
006979  */
006980  static int balance_nonroot(
006981    MemPage *pParent,               /* Parent page of siblings being balanced */
006982    int iParentIdx,                 /* Index of "the page" in pParent */
006983    u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
006984    int isRoot,                     /* True if pParent is a root-page */
006985    int bBulk                       /* True if this call is part of a bulk load */
006986  ){
006987    BtShared *pBt;               /* The whole database */
006988    int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
006989    int nNew = 0;                /* Number of pages in apNew[] */
006990    int nOld;                    /* Number of pages in apOld[] */
006991    int i, j, k;                 /* Loop counters */
006992    int nxDiv;                   /* Next divider slot in pParent->aCell[] */
006993    int rc = SQLITE_OK;          /* The return code */
006994    u16 leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
006995    int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
006996    int usableSpace;             /* Bytes in pPage beyond the header */
006997    int pageFlags;               /* Value of pPage->aData[0] */
006998    int iSpace1 = 0;             /* First unused byte of aSpace1[] */
006999    int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
007000    int szScratch;               /* Size of scratch memory requested */
007001    MemPage *apOld[NB];          /* pPage and up to two siblings */
007002    MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
007003    u8 *pRight;                  /* Location in parent of right-sibling pointer */
007004    u8 *apDiv[NB-1];             /* Divider cells in pParent */
007005    int cntNew[NB+2];            /* Index in b.paCell[] of cell after i-th page */
007006    int cntOld[NB+2];            /* Old index in b.apCell[] */
007007    int szNew[NB+2];             /* Combined size of cells placed on i-th page */
007008    u8 *aSpace1;                 /* Space for copies of dividers cells */
007009    Pgno pgno;                   /* Temp var to store a page number in */
007010    u8 abDone[NB+2];             /* True after i'th new page is populated */
007011    Pgno aPgno[NB+2];            /* Page numbers of new pages before shuffling */
007012    Pgno aPgOrder[NB+2];         /* Copy of aPgno[] used for sorting pages */
007013    u16 aPgFlags[NB+2];          /* flags field of new pages before shuffling */
007014    CellArray b;                  /* Parsed information on cells being balanced */
007015  
007016    memset(abDone, 0, sizeof(abDone));
007017    b.nCell = 0;
007018    b.apCell = 0;
007019    pBt = pParent->pBt;
007020    assert( sqlite3_mutex_held(pBt->mutex) );
007021    assert( sqlite3PagerIswriteable(pParent->pDbPage) );
007022  
007023  #if 0
007024    TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
007025  #endif
007026  
007027    /* At this point pParent may have at most one overflow cell. And if
007028    ** this overflow cell is present, it must be the cell with 
007029    ** index iParentIdx. This scenario comes about when this function
007030    ** is called (indirectly) from sqlite3BtreeDelete().
007031    */
007032    assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
007033    assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );
007034  
007035    if( !aOvflSpace ){
007036      return SQLITE_NOMEM_BKPT;
007037    }
007038  
007039    /* Find the sibling pages to balance. Also locate the cells in pParent 
007040    ** that divide the siblings. An attempt is made to find NN siblings on 
007041    ** either side of pPage. More siblings are taken from one side, however, 
007042    ** if there are fewer than NN siblings on the other side. If pParent
007043    ** has NB or fewer children then all children of pParent are taken.  
007044    **
007045    ** This loop also drops the divider cells from the parent page. This
007046    ** way, the remainder of the function does not have to deal with any
007047    ** overflow cells in the parent page, since if any existed they will
007048    ** have already been removed.
007049    */
007050    i = pParent->nOverflow + pParent->nCell;
007051    if( i<2 ){
007052      nxDiv = 0;
007053    }else{
007054      assert( bBulk==0 || bBulk==1 );
007055      if( iParentIdx==0 ){                 
007056        nxDiv = 0;
007057      }else if( iParentIdx==i ){
007058        nxDiv = i-2+bBulk;
007059      }else{
007060        nxDiv = iParentIdx-1;
007061      }
007062      i = 2-bBulk;
007063    }
007064    nOld = i+1;
007065    if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
007066      pRight = &pParent->aData[pParent->hdrOffset+8];
007067    }else{
007068      pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
007069    }
007070    pgno = get4byte(pRight);
007071    while( 1 ){
007072      rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0);
007073      if( rc ){
007074        memset(apOld, 0, (i+1)*sizeof(MemPage*));
007075        goto balance_cleanup;
007076      }
007077      nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
007078      if( (i--)==0 ) break;
007079  
007080      if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){
007081        apDiv[i] = pParent->apOvfl[0];
007082        pgno = get4byte(apDiv[i]);
007083        szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
007084        pParent->nOverflow = 0;
007085      }else{
007086        apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
007087        pgno = get4byte(apDiv[i]);
007088        szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
007089  
007090        /* Drop the cell from the parent page. apDiv[i] still points to
007091        ** the cell within the parent, even though it has been dropped.
007092        ** This is safe because dropping a cell only overwrites the first
007093        ** four bytes of it, and this function does not need the first
007094        ** four bytes of the divider cell. So the pointer is safe to use
007095        ** later on.  
007096        **
007097        ** But not if we are in secure-delete mode. In secure-delete mode,
007098        ** the dropCell() routine will overwrite the entire cell with zeroes.
007099        ** In this case, temporarily copy the cell into the aOvflSpace[]
007100        ** buffer. It will be copied out again as soon as the aSpace[] buffer
007101        ** is allocated.  */
007102        if( pBt->btsFlags & BTS_SECURE_DELETE ){
007103          int iOff;
007104  
007105          iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
007106          if( (iOff+szNew[i])>(int)pBt->usableSize ){
007107            rc = SQLITE_CORRUPT_BKPT;
007108            memset(apOld, 0, (i+1)*sizeof(MemPage*));
007109            goto balance_cleanup;
007110          }else{
007111            memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
007112            apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
007113          }
007114        }
007115        dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
007116      }
007117    }
007118  
007119    /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
007120    ** alignment */
007121    nMaxCells = (nMaxCells + 3)&~3;
007122  
007123    /*
007124    ** Allocate space for memory structures
007125    */
007126    szScratch =
007127         nMaxCells*sizeof(u8*)                       /* b.apCell */
007128       + nMaxCells*sizeof(u16)                       /* b.szCell */
007129       + pBt->pageSize;                              /* aSpace1 */
007130  
007131    /* EVIDENCE-OF: R-28375-38319 SQLite will never request a scratch buffer
007132    ** that is more than 6 times the database page size. */
007133    assert( szScratch<=6*(int)pBt->pageSize );
007134    b.apCell = sqlite3ScratchMalloc( szScratch ); 
007135    if( b.apCell==0 ){
007136      rc = SQLITE_NOMEM_BKPT;
007137      goto balance_cleanup;
007138    }
007139    b.szCell = (u16*)&b.apCell[nMaxCells];
007140    aSpace1 = (u8*)&b.szCell[nMaxCells];
007141    assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
007142  
007143    /*
007144    ** Load pointers to all cells on sibling pages and the divider cells
007145    ** into the local b.apCell[] array.  Make copies of the divider cells
007146    ** into space obtained from aSpace1[]. The divider cells have already
007147    ** been removed from pParent.
007148    **
007149    ** If the siblings are on leaf pages, then the child pointers of the
007150    ** divider cells are stripped from the cells before they are copied
007151    ** into aSpace1[].  In this way, all cells in b.apCell[] are without
007152    ** child pointers.  If siblings are not leaves, then all cell in
007153    ** b.apCell[] include child pointers.  Either way, all cells in b.apCell[]
007154    ** are alike.
007155    **
007156    ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
007157    **       leafData:  1 if pPage holds key+data and pParent holds only keys.
007158    */
007159    b.pRef = apOld[0];
007160    leafCorrection = b.pRef->leaf*4;
007161    leafData = b.pRef->intKeyLeaf;
007162    for(i=0; i<nOld; i++){
007163      MemPage *pOld = apOld[i];
007164      int limit = pOld->nCell;
007165      u8 *aData = pOld->aData;
007166      u16 maskPage = pOld->maskPage;
007167      u8 *piCell = aData + pOld->cellOffset;
007168      u8 *piEnd;
007169  
007170      /* Verify that all sibling pages are of the same "type" (table-leaf,
007171      ** table-interior, index-leaf, or index-interior).
007172      */
007173      if( pOld->aData[0]!=apOld[0]->aData[0] ){
007174        rc = SQLITE_CORRUPT_BKPT;
007175        goto balance_cleanup;
007176      }
007177  
007178      /* Load b.apCell[] with pointers to all cells in pOld.  If pOld
007179      ** constains overflow cells, include them in the b.apCell[] array
007180      ** in the correct spot.
007181      **
007182      ** Note that when there are multiple overflow cells, it is always the
007183      ** case that they are sequential and adjacent.  This invariant arises
007184      ** because multiple overflows can only occurs when inserting divider
007185      ** cells into a parent on a prior balance, and divider cells are always
007186      ** adjacent and are inserted in order.  There is an assert() tagged
007187      ** with "NOTE 1" in the overflow cell insertion loop to prove this
007188      ** invariant.
007189      **
007190      ** This must be done in advance.  Once the balance starts, the cell
007191      ** offset section of the btree page will be overwritten and we will no
007192      ** long be able to find the cells if a pointer to each cell is not saved
007193      ** first.
007194      */
007195      memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*(limit+pOld->nOverflow));
007196      if( pOld->nOverflow>0 ){
007197        limit = pOld->aiOvfl[0];
007198        for(j=0; j<limit; j++){
007199          b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
007200          piCell += 2;
007201          b.nCell++;
007202        }
007203        for(k=0; k<pOld->nOverflow; k++){
007204          assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */
007205          b.apCell[b.nCell] = pOld->apOvfl[k];
007206          b.nCell++;
007207        }
007208      }
007209      piEnd = aData + pOld->cellOffset + 2*pOld->nCell;
007210      while( piCell<piEnd ){
007211        assert( b.nCell<nMaxCells );
007212        b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
007213        piCell += 2;
007214        b.nCell++;
007215      }
007216  
007217      cntOld[i] = b.nCell;
007218      if( i<nOld-1 && !leafData){
007219        u16 sz = (u16)szNew[i];
007220        u8 *pTemp;
007221        assert( b.nCell<nMaxCells );
007222        b.szCell[b.nCell] = sz;
007223        pTemp = &aSpace1[iSpace1];
007224        iSpace1 += sz;
007225        assert( sz<=pBt->maxLocal+23 );
007226        assert( iSpace1 <= (int)pBt->pageSize );
007227        memcpy(pTemp, apDiv[i], sz);
007228        b.apCell[b.nCell] = pTemp+leafCorrection;
007229        assert( leafCorrection==0 || leafCorrection==4 );
007230        b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection;
007231        if( !pOld->leaf ){
007232          assert( leafCorrection==0 );
007233          assert( pOld->hdrOffset==0 );
007234          /* The right pointer of the child page pOld becomes the left
007235          ** pointer of the divider cell */
007236          memcpy(b.apCell[b.nCell], &pOld->aData[8], 4);
007237        }else{
007238          assert( leafCorrection==4 );
007239          while( b.szCell[b.nCell]<4 ){
007240            /* Do not allow any cells smaller than 4 bytes. If a smaller cell
007241            ** does exist, pad it with 0x00 bytes. */
007242            assert( b.szCell[b.nCell]==3 || CORRUPT_DB );
007243            assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB );
007244            aSpace1[iSpace1++] = 0x00;
007245            b.szCell[b.nCell]++;
007246          }
007247        }
007248        b.nCell++;
007249      }
007250    }
007251  
007252    /*
007253    ** Figure out the number of pages needed to hold all b.nCell cells.
007254    ** Store this number in "k".  Also compute szNew[] which is the total
007255    ** size of all cells on the i-th page and cntNew[] which is the index
007256    ** in b.apCell[] of the cell that divides page i from page i+1.  
007257    ** cntNew[k] should equal b.nCell.
007258    **
007259    ** Values computed by this block:
007260    **
007261    **           k: The total number of sibling pages
007262    **    szNew[i]: Spaced used on the i-th sibling page.
007263    **   cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to
007264    **              the right of the i-th sibling page.
007265    ** usableSpace: Number of bytes of space available on each sibling.
007266    ** 
007267    */
007268    usableSpace = pBt->usableSize - 12 + leafCorrection;
007269    for(i=0; i<nOld; i++){
007270      MemPage *p = apOld[i];
007271      szNew[i] = usableSpace - p->nFree;
007272      if( szNew[i]<0 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
007273      for(j=0; j<p->nOverflow; j++){
007274        szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);
007275      }
007276      cntNew[i] = cntOld[i];
007277    }
007278    k = nOld;
007279    for(i=0; i<k; i++){
007280      int sz;
007281      while( szNew[i]>usableSpace ){
007282        if( i+1>=k ){
007283          k = i+2;
007284          if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
007285          szNew[k-1] = 0;
007286          cntNew[k-1] = b.nCell;
007287        }
007288        sz = 2 + cachedCellSize(&b, cntNew[i]-1);
007289        szNew[i] -= sz;
007290        if( !leafData ){
007291          if( cntNew[i]<b.nCell ){
007292            sz = 2 + cachedCellSize(&b, cntNew[i]);
007293          }else{
007294            sz = 0;
007295          }
007296        }
007297        szNew[i+1] += sz;
007298        cntNew[i]--;
007299      }
007300      while( cntNew[i]<b.nCell ){
007301        sz = 2 + cachedCellSize(&b, cntNew[i]);
007302        if( szNew[i]+sz>usableSpace ) break;
007303        szNew[i] += sz;
007304        cntNew[i]++;
007305        if( !leafData ){
007306          if( cntNew[i]<b.nCell ){
007307            sz = 2 + cachedCellSize(&b, cntNew[i]);
007308          }else{
007309            sz = 0;
007310          }
007311        }
007312        szNew[i+1] -= sz;
007313      }
007314      if( cntNew[i]>=b.nCell ){
007315        k = i+1;
007316      }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){
007317        rc = SQLITE_CORRUPT_BKPT;
007318        goto balance_cleanup;
007319      }
007320    }
007321  
007322    /*
007323    ** The packing computed by the previous block is biased toward the siblings
007324    ** on the left side (siblings with smaller keys). The left siblings are
007325    ** always nearly full, while the right-most sibling might be nearly empty.
007326    ** The next block of code attempts to adjust the packing of siblings to
007327    ** get a better balance.
007328    **
007329    ** This adjustment is more than an optimization.  The packing above might
007330    ** be so out of balance as to be illegal.  For example, the right-most
007331    ** sibling might be completely empty.  This adjustment is not optional.
007332    */
007333    for(i=k-1; i>0; i--){
007334      int szRight = szNew[i];  /* Size of sibling on the right */
007335      int szLeft = szNew[i-1]; /* Size of sibling on the left */
007336      int r;              /* Index of right-most cell in left sibling */
007337      int d;              /* Index of first cell to the left of right sibling */
007338  
007339      r = cntNew[i-1] - 1;
007340      d = r + 1 - leafData;
007341      (void)cachedCellSize(&b, d);
007342      do{
007343        assert( d<nMaxCells );
007344        assert( r<nMaxCells );
007345        (void)cachedCellSize(&b, r);
007346        if( szRight!=0
007347         && (bBulk || szRight+b.szCell[d]+2 > szLeft-(b.szCell[r]+(i==k-1?0:2)))){
007348          break;
007349        }
007350        szRight += b.szCell[d] + 2;
007351        szLeft -= b.szCell[r] + 2;
007352        cntNew[i-1] = r;
007353        r--;
007354        d--;
007355      }while( r>=0 );
007356      szNew[i] = szRight;
007357      szNew[i-1] = szLeft;
007358      if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){
007359        rc = SQLITE_CORRUPT_BKPT;
007360        goto balance_cleanup;
007361      }
007362    }
007363  
007364    /* Sanity check:  For a non-corrupt database file one of the follwing
007365    ** must be true:
007366    **    (1) We found one or more cells (cntNew[0])>0), or
007367    **    (2) pPage is a virtual root page.  A virtual root page is when
007368    **        the real root page is page 1 and we are the only child of
007369    **        that page.
007370    */
007371    assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB);
007372    TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n",
007373      apOld[0]->pgno, apOld[0]->nCell,
007374      nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,
007375      nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0
007376    ));
007377  
007378    /*
007379    ** Allocate k new pages.  Reuse old pages where possible.
007380    */
007381    pageFlags = apOld[0]->aData[0];
007382    for(i=0; i<k; i++){
007383      MemPage *pNew;
007384      if( i<nOld ){
007385        pNew = apNew[i] = apOld[i];
007386        apOld[i] = 0;
007387        rc = sqlite3PagerWrite(pNew->pDbPage);
007388        nNew++;
007389        if( rc ) goto balance_cleanup;
007390      }else{
007391        assert( i>0 );
007392        rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
007393        if( rc ) goto balance_cleanup;
007394        zeroPage(pNew, pageFlags);
007395        apNew[i] = pNew;
007396        nNew++;
007397        cntOld[i] = b.nCell;
007398  
007399        /* Set the pointer-map entry for the new sibling page. */
007400        if( ISAUTOVACUUM ){
007401          ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
007402          if( rc!=SQLITE_OK ){
007403            goto balance_cleanup;
007404          }
007405        }
007406      }
007407    }
007408  
007409    /*
007410    ** Reassign page numbers so that the new pages are in ascending order. 
007411    ** This helps to keep entries in the disk file in order so that a scan
007412    ** of the table is closer to a linear scan through the file. That in turn 
007413    ** helps the operating system to deliver pages from the disk more rapidly.
007414    **
007415    ** An O(n^2) insertion sort algorithm is used, but since n is never more 
007416    ** than (NB+2) (a small constant), that should not be a problem.
007417    **
007418    ** When NB==3, this one optimization makes the database about 25% faster 
007419    ** for large insertions and deletions.
007420    */
007421    for(i=0; i<nNew; i++){
007422      aPgOrder[i] = aPgno[i] = apNew[i]->pgno;
007423      aPgFlags[i] = apNew[i]->pDbPage->flags;
007424      for(j=0; j<i; j++){
007425        if( aPgno[j]==aPgno[i] ){
007426          /* This branch is taken if the set of sibling pages somehow contains
007427          ** duplicate entries. This can happen if the database is corrupt. 
007428          ** It would be simpler to detect this as part of the loop below, but
007429          ** we do the detection here in order to avoid populating the pager
007430          ** cache with two separate objects associated with the same
007431          ** page number.  */
007432          assert( CORRUPT_DB );
007433          rc = SQLITE_CORRUPT_BKPT;
007434          goto balance_cleanup;
007435        }
007436      }
007437    }
007438    for(i=0; i<nNew; i++){
007439      int iBest = 0;                /* aPgno[] index of page number to use */
007440      for(j=1; j<nNew; j++){
007441        if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j;
007442      }
007443      pgno = aPgOrder[iBest];
007444      aPgOrder[iBest] = 0xffffffff;
007445      if( iBest!=i ){
007446        if( iBest>i ){
007447          sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0);
007448        }
007449        sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]);
007450        apNew[i]->pgno = pgno;
007451      }
007452    }
007453  
007454    TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "
007455           "%d(%d nc=%d) %d(%d nc=%d)\n",
007456      apNew[0]->pgno, szNew[0], cntNew[0],
007457      nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
007458      nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,
007459      nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
007460      nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,
007461      nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
007462      nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,
007463      nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,
007464      nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0
007465    ));
007466  
007467    assert( sqlite3PagerIswriteable(pParent->pDbPage) );
007468    put4byte(pRight, apNew[nNew-1]->pgno);
007469  
007470    /* If the sibling pages are not leaves, ensure that the right-child pointer
007471    ** of the right-most new sibling page is set to the value that was 
007472    ** originally in the same field of the right-most old sibling page. */
007473    if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){
007474      MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];
007475      memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);
007476    }
007477  
007478    /* Make any required updates to pointer map entries associated with 
007479    ** cells stored on sibling pages following the balance operation. Pointer
007480    ** map entries associated with divider cells are set by the insertCell()
007481    ** routine. The associated pointer map entries are:
007482    **
007483    **   a) if the cell contains a reference to an overflow chain, the
007484    **      entry associated with the first page in the overflow chain, and
007485    **
007486    **   b) if the sibling pages are not leaves, the child page associated
007487    **      with the cell.
007488    **
007489    ** If the sibling pages are not leaves, then the pointer map entry 
007490    ** associated with the right-child of each sibling may also need to be 
007491    ** updated. This happens below, after the sibling pages have been 
007492    ** populated, not here.
007493    */
007494    if( ISAUTOVACUUM ){
007495      MemPage *pNew = apNew[0];
007496      u8 *aOld = pNew->aData;
007497      int cntOldNext = pNew->nCell + pNew->nOverflow;
007498      int usableSize = pBt->usableSize;
007499      int iNew = 0;
007500      int iOld = 0;
007501  
007502      for(i=0; i<b.nCell; i++){
007503        u8 *pCell = b.apCell[i];
007504        if( i==cntOldNext ){
007505          MemPage *pOld = (++iOld)<nNew ? apNew[iOld] : apOld[iOld];
007506          cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;
007507          aOld = pOld->aData;
007508        }
007509        if( i==cntNew[iNew] ){
007510          pNew = apNew[++iNew];
007511          if( !leafData ) continue;
007512        }
007513  
007514        /* Cell pCell is destined for new sibling page pNew. Originally, it
007515        ** was either part of sibling page iOld (possibly an overflow cell), 
007516        ** or else the divider cell to the left of sibling page iOld. So,
007517        ** if sibling page iOld had the same page number as pNew, and if
007518        ** pCell really was a part of sibling page iOld (not a divider or
007519        ** overflow cell), we can skip updating the pointer map entries.  */
007520        if( iOld>=nNew
007521         || pNew->pgno!=aPgno[iOld]
007522         || !SQLITE_WITHIN(pCell,aOld,&aOld[usableSize])
007523        ){
007524          if( !leafCorrection ){
007525            ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
007526          }
007527          if( cachedCellSize(&b,i)>pNew->minLocal ){
007528            ptrmapPutOvflPtr(pNew, pCell, &rc);
007529          }
007530          if( rc ) goto balance_cleanup;
007531        }
007532      }
007533    }
007534  
007535    /* Insert new divider cells into pParent. */
007536    for(i=0; i<nNew-1; i++){
007537      u8 *pCell;
007538      u8 *pTemp;
007539      int sz;
007540      MemPage *pNew = apNew[i];
007541      j = cntNew[i];
007542  
007543      assert( j<nMaxCells );
007544      assert( b.apCell[j]!=0 );
007545      pCell = b.apCell[j];
007546      sz = b.szCell[j] + leafCorrection;
007547      pTemp = &aOvflSpace[iOvflSpace];
007548      if( !pNew->leaf ){
007549        memcpy(&pNew->aData[8], pCell, 4);
007550      }else if( leafData ){
007551        /* If the tree is a leaf-data tree, and the siblings are leaves, 
007552        ** then there is no divider cell in b.apCell[]. Instead, the divider 
007553        ** cell consists of the integer key for the right-most cell of 
007554        ** the sibling-page assembled above only.
007555        */
007556        CellInfo info;
007557        j--;
007558        pNew->xParseCell(pNew, b.apCell[j], &info);
007559        pCell = pTemp;
007560        sz = 4 + putVarint(&pCell[4], info.nKey);
007561        pTemp = 0;
007562      }else{
007563        pCell -= 4;
007564        /* Obscure case for non-leaf-data trees: If the cell at pCell was
007565        ** previously stored on a leaf node, and its reported size was 4
007566        ** bytes, then it may actually be smaller than this 
007567        ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
007568        ** any cell). But it is important to pass the correct size to 
007569        ** insertCell(), so reparse the cell now.
007570        **
007571        ** This can only happen for b-trees used to evaluate "IN (SELECT ...)"
007572        ** and WITHOUT ROWID tables with exactly one column which is the
007573        ** primary key.
007574        */
007575        if( b.szCell[j]==4 ){
007576          assert(leafCorrection==4);
007577          sz = pParent->xCellSize(pParent, pCell);
007578        }
007579      }
007580      iOvflSpace += sz;
007581      assert( sz<=pBt->maxLocal+23 );
007582      assert( iOvflSpace <= (int)pBt->pageSize );
007583      insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);
007584      if( rc!=SQLITE_OK ) goto balance_cleanup;
007585      assert( sqlite3PagerIswriteable(pParent->pDbPage) );
007586    }
007587  
007588    /* Now update the actual sibling pages. The order in which they are updated
007589    ** is important, as this code needs to avoid disrupting any page from which
007590    ** cells may still need to be read. In practice, this means:
007591    **
007592    **  (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])
007593    **      then it is not safe to update page apNew[iPg] until after
007594    **      the left-hand sibling apNew[iPg-1] has been updated.
007595    **
007596    **  (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])
007597    **      then it is not safe to update page apNew[iPg] until after
007598    **      the right-hand sibling apNew[iPg+1] has been updated.
007599    **
007600    ** If neither of the above apply, the page is safe to update.
007601    **
007602    ** The iPg value in the following loop starts at nNew-1 goes down
007603    ** to 0, then back up to nNew-1 again, thus making two passes over
007604    ** the pages.  On the initial downward pass, only condition (1) above
007605    ** needs to be tested because (2) will always be true from the previous
007606    ** step.  On the upward pass, both conditions are always true, so the
007607    ** upwards pass simply processes pages that were missed on the downward
007608    ** pass.
007609    */
007610    for(i=1-nNew; i<nNew; i++){
007611      int iPg = i<0 ? -i : i;
007612      assert( iPg>=0 && iPg<nNew );
007613      if( abDone[iPg] ) continue;         /* Skip pages already processed */
007614      if( i>=0                            /* On the upwards pass, or... */
007615       || cntOld[iPg-1]>=cntNew[iPg-1]    /* Condition (1) is true */
007616      ){
007617        int iNew;
007618        int iOld;
007619        int nNewCell;
007620  
007621        /* Verify condition (1):  If cells are moving left, update iPg
007622        ** only after iPg-1 has already been updated. */
007623        assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] );
007624  
007625        /* Verify condition (2):  If cells are moving right, update iPg
007626        ** only after iPg+1 has already been updated. */
007627        assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] );
007628  
007629        if( iPg==0 ){
007630          iNew = iOld = 0;
007631          nNewCell = cntNew[0];
007632        }else{
007633          iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell;
007634          iNew = cntNew[iPg-1] + !leafData;
007635          nNewCell = cntNew[iPg] - iNew;
007636        }
007637  
007638        rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b);
007639        if( rc ) goto balance_cleanup;
007640        abDone[iPg]++;
007641        apNew[iPg]->nFree = usableSpace-szNew[iPg];
007642        assert( apNew[iPg]->nOverflow==0 );
007643        assert( apNew[iPg]->nCell==nNewCell );
007644      }
007645    }
007646  
007647    /* All pages have been processed exactly once */
007648    assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );
007649  
007650    assert( nOld>0 );
007651    assert( nNew>0 );
007652  
007653    if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
007654      /* The root page of the b-tree now contains no cells. The only sibling
007655      ** page is the right-child of the parent. Copy the contents of the
007656      ** child page into the parent, decreasing the overall height of the
007657      ** b-tree structure by one. This is described as the "balance-shallower"
007658      ** sub-algorithm in some documentation.
007659      **
007660      ** If this is an auto-vacuum database, the call to copyNodeContent() 
007661      ** sets all pointer-map entries corresponding to database image pages 
007662      ** for which the pointer is stored within the content being copied.
007663      **
007664      ** It is critical that the child page be defragmented before being
007665      ** copied into the parent, because if the parent is page 1 then it will
007666      ** be smaller than the child due to the database header, and so all the
007667      ** free space needs to be up front.
007668      */
007669      assert( nNew==1 || CORRUPT_DB );
007670      rc = defragmentPage(apNew[0]);
007671      testcase( rc!=SQLITE_OK );
007672      assert( apNew[0]->nFree == 
007673          (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
007674        || rc!=SQLITE_OK
007675      );
007676      copyNodeContent(apNew[0], pParent, &rc);
007677      freePage(apNew[0], &rc);
007678    }else if( ISAUTOVACUUM && !leafCorrection ){
007679      /* Fix the pointer map entries associated with the right-child of each
007680      ** sibling page. All other pointer map entries have already been taken
007681      ** care of.  */
007682      for(i=0; i<nNew; i++){
007683        u32 key = get4byte(&apNew[i]->aData[8]);
007684        ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
007685      }
007686    }
007687  
007688    assert( pParent->isInit );
007689    TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
007690            nOld, nNew, b.nCell));
007691  
007692    /* Free any old pages that were not reused as new pages.
007693    */
007694    for(i=nNew; i<nOld; i++){
007695      freePage(apOld[i], &rc);
007696    }
007697  
007698  #if 0
007699    if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){
007700      /* The ptrmapCheckPages() contains assert() statements that verify that
007701      ** all pointer map pages are set correctly. This is helpful while 
007702      ** debugging. This is usually disabled because a corrupt database may
007703      ** cause an assert() statement to fail.  */
007704      ptrmapCheckPages(apNew, nNew);
007705      ptrmapCheckPages(&pParent, 1);
007706    }
007707  #endif
007708  
007709    /*
007710    ** Cleanup before returning.
007711    */
007712  balance_cleanup:
007713    sqlite3ScratchFree(b.apCell);
007714    for(i=0; i<nOld; i++){
007715      releasePage(apOld[i]);
007716    }
007717    for(i=0; i<nNew; i++){
007718      releasePage(apNew[i]);
007719    }
007720  
007721    return rc;
007722  }
007723  
007724  
007725  /*
007726  ** This function is called when the root page of a b-tree structure is
007727  ** overfull (has one or more overflow pages).
007728  **
007729  ** A new child page is allocated and the contents of the current root
007730  ** page, including overflow cells, are copied into the child. The root
007731  ** page is then overwritten to make it an empty page with the right-child 
007732  ** pointer pointing to the new page.
007733  **
007734  ** Before returning, all pointer-map entries corresponding to pages 
007735  ** that the new child-page now contains pointers to are updated. The
007736  ** entry corresponding to the new right-child pointer of the root
007737  ** page is also updated.
007738  **
007739  ** If successful, *ppChild is set to contain a reference to the child 
007740  ** page and SQLITE_OK is returned. In this case the caller is required
007741  ** to call releasePage() on *ppChild exactly once. If an error occurs,
007742  ** an error code is returned and *ppChild is set to 0.
007743  */
007744  static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
007745    int rc;                        /* Return value from subprocedures */
007746    MemPage *pChild = 0;           /* Pointer to a new child page */
007747    Pgno pgnoChild = 0;            /* Page number of the new child page */
007748    BtShared *pBt = pRoot->pBt;    /* The BTree */
007749  
007750    assert( pRoot->nOverflow>0 );
007751    assert( sqlite3_mutex_held(pBt->mutex) );
007752  
007753    /* Make pRoot, the root page of the b-tree, writable. Allocate a new 
007754    ** page that will become the new right-child of pPage. Copy the contents
007755    ** of the node stored on pRoot into the new child page.
007756    */
007757    rc = sqlite3PagerWrite(pRoot->pDbPage);
007758    if( rc==SQLITE_OK ){
007759      rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
007760      copyNodeContent(pRoot, pChild, &rc);
007761      if( ISAUTOVACUUM ){
007762        ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);
007763      }
007764    }
007765    if( rc ){
007766      *ppChild = 0;
007767      releasePage(pChild);
007768      return rc;
007769    }
007770    assert( sqlite3PagerIswriteable(pChild->pDbPage) );
007771    assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
007772    assert( pChild->nCell==pRoot->nCell );
007773  
007774    TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));
007775  
007776    /* Copy the overflow cells from pRoot to pChild */
007777    memcpy(pChild->aiOvfl, pRoot->aiOvfl,
007778           pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));
007779    memcpy(pChild->apOvfl, pRoot->apOvfl,
007780           pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));
007781    pChild->nOverflow = pRoot->nOverflow;
007782  
007783    /* Zero the contents of pRoot. Then install pChild as the right-child. */
007784    zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
007785    put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
007786  
007787    *ppChild = pChild;
007788    return SQLITE_OK;
007789  }
007790  
007791  /*
007792  ** The page that pCur currently points to has just been modified in
007793  ** some way. This function figures out if this modification means the
007794  ** tree needs to be balanced, and if so calls the appropriate balancing 
007795  ** routine. Balancing routines are:
007796  **
007797  **   balance_quick()
007798  **   balance_deeper()
007799  **   balance_nonroot()
007800  */
007801  static int balance(BtCursor *pCur){
007802    int rc = SQLITE_OK;
007803    const int nMin = pCur->pBt->usableSize * 2 / 3;
007804    u8 aBalanceQuickSpace[13];
007805    u8 *pFree = 0;
007806  
007807    VVA_ONLY( int balance_quick_called = 0 );
007808    VVA_ONLY( int balance_deeper_called = 0 );
007809  
007810    do {
007811      int iPage = pCur->iPage;
007812      MemPage *pPage = pCur->apPage[iPage];
007813  
007814      if( iPage==0 ){
007815        if( pPage->nOverflow ){
007816          /* The root page of the b-tree is overfull. In this case call the
007817          ** balance_deeper() function to create a new child for the root-page
007818          ** and copy the current contents of the root-page to it. The
007819          ** next iteration of the do-loop will balance the child page.
007820          */ 
007821          assert( balance_deeper_called==0 );
007822          VVA_ONLY( balance_deeper_called++ );
007823          rc = balance_deeper(pPage, &pCur->apPage[1]);
007824          if( rc==SQLITE_OK ){
007825            pCur->iPage = 1;
007826            pCur->aiIdx[0] = 0;
007827            pCur->aiIdx[1] = 0;
007828            assert( pCur->apPage[1]->nOverflow );
007829          }
007830        }else{
007831          break;
007832        }
007833      }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
007834        break;
007835      }else{
007836        MemPage * const pParent = pCur->apPage[iPage-1];
007837        int const iIdx = pCur->aiIdx[iPage-1];
007838  
007839        rc = sqlite3PagerWrite(pParent->pDbPage);
007840        if( rc==SQLITE_OK ){
007841  #ifndef SQLITE_OMIT_QUICKBALANCE
007842          if( pPage->intKeyLeaf
007843           && pPage->nOverflow==1
007844           && pPage->aiOvfl[0]==pPage->nCell
007845           && pParent->pgno!=1
007846           && pParent->nCell==iIdx
007847          ){
007848            /* Call balance_quick() to create a new sibling of pPage on which
007849            ** to store the overflow cell. balance_quick() inserts a new cell
007850            ** into pParent, which may cause pParent overflow. If this
007851            ** happens, the next iteration of the do-loop will balance pParent 
007852            ** use either balance_nonroot() or balance_deeper(). Until this
007853            ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
007854            ** buffer. 
007855            **
007856            ** The purpose of the following assert() is to check that only a
007857            ** single call to balance_quick() is made for each call to this
007858            ** function. If this were not verified, a subtle bug involving reuse
007859            ** of the aBalanceQuickSpace[] might sneak in.
007860            */
007861            assert( balance_quick_called==0 ); 
007862            VVA_ONLY( balance_quick_called++ );
007863            rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
007864          }else
007865  #endif
007866          {
007867            /* In this case, call balance_nonroot() to redistribute cells
007868            ** between pPage and up to 2 of its sibling pages. This involves
007869            ** modifying the contents of pParent, which may cause pParent to
007870            ** become overfull or underfull. The next iteration of the do-loop
007871            ** will balance the parent page to correct this.
007872            ** 
007873            ** If the parent page becomes overfull, the overflow cell or cells
007874            ** are stored in the pSpace buffer allocated immediately below. 
007875            ** A subsequent iteration of the do-loop will deal with this by
007876            ** calling balance_nonroot() (balance_deeper() may be called first,
007877            ** but it doesn't deal with overflow cells - just moves them to a
007878            ** different page). Once this subsequent call to balance_nonroot() 
007879            ** has completed, it is safe to release the pSpace buffer used by
007880            ** the previous call, as the overflow cell data will have been 
007881            ** copied either into the body of a database page or into the new
007882            ** pSpace buffer passed to the latter call to balance_nonroot().
007883            */
007884            u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
007885            rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1,
007886                                 pCur->hints&BTREE_BULKLOAD);
007887            if( pFree ){
007888              /* If pFree is not NULL, it points to the pSpace buffer used 
007889              ** by a previous call to balance_nonroot(). Its contents are
007890              ** now stored either on real database pages or within the 
007891              ** new pSpace buffer, so it may be safely freed here. */
007892              sqlite3PageFree(pFree);
007893            }
007894  
007895            /* The pSpace buffer will be freed after the next call to
007896            ** balance_nonroot(), or just before this function returns, whichever
007897            ** comes first. */
007898            pFree = pSpace;
007899          }
007900        }
007901  
007902        pPage->nOverflow = 0;
007903  
007904        /* The next iteration of the do-loop balances the parent page. */
007905        releasePage(pPage);
007906        pCur->iPage--;
007907        assert( pCur->iPage>=0 );
007908      }
007909    }while( rc==SQLITE_OK );
007910  
007911    if( pFree ){
007912      sqlite3PageFree(pFree);
007913    }
007914    return rc;
007915  }
007916  
007917  
007918  /*
007919  ** Insert a new record into the BTree.  The content of the new record
007920  ** is described by the pX object.  The pCur cursor is used only to
007921  ** define what table the record should be inserted into, and is left
007922  ** pointing at a random location.
007923  **
007924  ** For a table btree (used for rowid tables), only the pX.nKey value of
007925  ** the key is used. The pX.pKey value must be NULL.  The pX.nKey is the
007926  ** rowid or INTEGER PRIMARY KEY of the row.  The pX.nData,pData,nZero fields
007927  ** hold the content of the row.
007928  **
007929  ** For an index btree (used for indexes and WITHOUT ROWID tables), the
007930  ** key is an arbitrary byte sequence stored in pX.pKey,nKey.  The 
007931  ** pX.pData,nData,nZero fields must be zero.
007932  **
007933  ** If the seekResult parameter is non-zero, then a successful call to
007934  ** MovetoUnpacked() to seek cursor pCur to (pKey,nKey) has already
007935  ** been performed.  In other words, if seekResult!=0 then the cursor
007936  ** is currently pointing to a cell that will be adjacent to the cell
007937  ** to be inserted.  If seekResult<0 then pCur points to a cell that is
007938  ** smaller then (pKey,nKey).  If seekResult>0 then pCur points to a cell
007939  ** that is larger than (pKey,nKey).
007940  **
007941  ** If seekResult==0, that means pCur is pointing at some unknown location.
007942  ** In that case, this routine must seek the cursor to the correct insertion
007943  ** point for (pKey,nKey) before doing the insertion.  For index btrees,
007944  ** if pX->nMem is non-zero, then pX->aMem contains pointers to the unpacked
007945  ** key values and pX->aMem can be used instead of pX->pKey to avoid having
007946  ** to decode the key.
007947  */
007948  int sqlite3BtreeInsert(
007949    BtCursor *pCur,                /* Insert data into the table of this cursor */
007950    const BtreePayload *pX,        /* Content of the row to be inserted */
007951    int appendBias,                /* True if this is likely an append */
007952    int seekResult                 /* Result of prior MovetoUnpacked() call */
007953  ){
007954    int rc;
007955    int loc = seekResult;          /* -1: before desired location  +1: after */
007956    int szNew = 0;
007957    int idx;
007958    MemPage *pPage;
007959    Btree *p = pCur->pBtree;
007960    BtShared *pBt = p->pBt;
007961    unsigned char *oldCell;
007962    unsigned char *newCell = 0;
007963  
007964    if( pCur->eState==CURSOR_FAULT ){
007965      assert( pCur->skipNext!=SQLITE_OK );
007966      return pCur->skipNext;
007967    }
007968  
007969    assert( cursorOwnsBtShared(pCur) );
007970    assert( (pCur->curFlags & BTCF_WriteFlag)!=0
007971                && pBt->inTransaction==TRANS_WRITE
007972                && (pBt->btsFlags & BTS_READ_ONLY)==0 );
007973    assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
007974  
007975    /* Assert that the caller has been consistent. If this cursor was opened
007976    ** expecting an index b-tree, then the caller should be inserting blob
007977    ** keys with no associated data. If the cursor was opened expecting an
007978    ** intkey table, the caller should be inserting integer keys with a
007979    ** blob of associated data.  */
007980    assert( (pX->pKey==0)==(pCur->pKeyInfo==0) );
007981  
007982    /* Save the positions of any other cursors open on this table.
007983    **
007984    ** In some cases, the call to btreeMoveto() below is a no-op. For
007985    ** example, when inserting data into a table with auto-generated integer
007986    ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the 
007987    ** integer key to use. It then calls this function to actually insert the 
007988    ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
007989    ** that the cursor is already where it needs to be and returns without
007990    ** doing any work. To avoid thwarting these optimizations, it is important
007991    ** not to clear the cursor here.
007992    */
007993    if( pCur->curFlags & BTCF_Multiple ){
007994      rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
007995      if( rc ) return rc;
007996    }
007997  
007998    if( pCur->pKeyInfo==0 ){
007999      assert( pX->pKey==0 );
008000      /* If this is an insert into a table b-tree, invalidate any incrblob 
008001      ** cursors open on the row being replaced */
008002      invalidateIncrblobCursors(p, pX->nKey, 0);
008003  
008004      /* If the cursor is currently on the last row and we are appending a
008005      ** new row onto the end, set the "loc" to avoid an unnecessary
008006      ** btreeMoveto() call */
008007      if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey ){
008008        loc = 0;
008009      }else if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey>0
008010                 && pCur->info.nKey==pX->nKey-1 ){
008011        loc = -1;
008012      }else if( loc==0 ){
008013        rc = sqlite3BtreeMovetoUnpacked(pCur, 0, pX->nKey, appendBias, &loc);
008014        if( rc ) return rc;
008015      }
008016    }else if( loc==0 ){
008017      if( pX->nMem ){
008018        UnpackedRecord r;
008019        r.pKeyInfo = pCur->pKeyInfo;
008020        r.aMem = pX->aMem;
008021        r.nField = pX->nMem;
008022        r.default_rc = 0;
008023        r.errCode = 0;
008024        r.r1 = 0;
008025        r.r2 = 0;
008026        r.eqSeen = 0;
008027        rc = sqlite3BtreeMovetoUnpacked(pCur, &r, 0, appendBias, &loc);
008028      }else{
008029        rc = btreeMoveto(pCur, pX->pKey, pX->nKey, appendBias, &loc);
008030      }
008031      if( rc ) return rc;
008032    }
008033    assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) );
008034  
008035    pPage = pCur->apPage[pCur->iPage];
008036    assert( pPage->intKey || pX->nKey>=0 );
008037    assert( pPage->leaf || !pPage->intKey );
008038  
008039    TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
008040            pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno,
008041            loc==0 ? "overwrite" : "new entry"));
008042    assert( pPage->isInit );
008043    newCell = pBt->pTmpSpace;
008044    assert( newCell!=0 );
008045    rc = fillInCell(pPage, newCell, pX, &szNew);
008046    if( rc ) goto end_insert;
008047    assert( szNew==pPage->xCellSize(pPage, newCell) );
008048    assert( szNew <= MX_CELL_SIZE(pBt) );
008049    idx = pCur->aiIdx[pCur->iPage];
008050    if( loc==0 ){
008051      CellInfo info;
008052      assert( idx<pPage->nCell );
008053      rc = sqlite3PagerWrite(pPage->pDbPage);
008054      if( rc ){
008055        goto end_insert;
008056      }
008057      oldCell = findCell(pPage, idx);
008058      if( !pPage->leaf ){
008059        memcpy(newCell, oldCell, 4);
008060      }
008061      rc = clearCell(pPage, oldCell, &info);
008062      if( info.nSize==szNew && info.nLocal==info.nPayload ){
008063        /* Overwrite the old cell with the new if they are the same size.
008064        ** We could also try to do this if the old cell is smaller, then add
008065        ** the leftover space to the free list.  But experiments show that
008066        ** doing that is no faster then skipping this optimization and just
008067        ** calling dropCell() and insertCell(). */
008068        assert( rc==SQLITE_OK ); /* clearCell never fails when nLocal==nPayload */
008069        if( oldCell+szNew > pPage->aDataEnd ) return SQLITE_CORRUPT_BKPT;
008070        memcpy(oldCell, newCell, szNew);
008071        return SQLITE_OK;
008072      }
008073      dropCell(pPage, idx, info.nSize, &rc);
008074      if( rc ) goto end_insert;
008075    }else if( loc<0 && pPage->nCell>0 ){
008076      assert( pPage->leaf );
008077      idx = ++pCur->aiIdx[pCur->iPage];
008078    }else{
008079      assert( pPage->leaf );
008080    }
008081    insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
008082    assert( pPage->nOverflow==0 || rc==SQLITE_OK );
008083    assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );
008084  
008085    /* If no error has occurred and pPage has an overflow cell, call balance() 
008086    ** to redistribute the cells within the tree. Since balance() may move
008087    ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey
008088    ** variables.
008089    **
008090    ** Previous versions of SQLite called moveToRoot() to move the cursor
008091    ** back to the root page as balance() used to invalidate the contents
008092    ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
008093    ** set the cursor state to "invalid". This makes common insert operations
008094    ** slightly faster.
008095    **
008096    ** There is a subtle but important optimization here too. When inserting
008097    ** multiple records into an intkey b-tree using a single cursor (as can
008098    ** happen while processing an "INSERT INTO ... SELECT" statement), it
008099    ** is advantageous to leave the cursor pointing to the last entry in
008100    ** the b-tree if possible. If the cursor is left pointing to the last
008101    ** entry in the table, and the next row inserted has an integer key
008102    ** larger than the largest existing key, it is possible to insert the
008103    ** row without seeking the cursor. This can be a big performance boost.
008104    */
008105    pCur->info.nSize = 0;
008106    if( pPage->nOverflow ){
008107      assert( rc==SQLITE_OK );
008108      pCur->curFlags &= ~(BTCF_ValidNKey);
008109      rc = balance(pCur);
008110  
008111      /* Must make sure nOverflow is reset to zero even if the balance()
008112      ** fails. Internal data structure corruption will result otherwise. 
008113      ** Also, set the cursor state to invalid. This stops saveCursorPosition()
008114      ** from trying to save the current position of the cursor.  */
008115      pCur->apPage[pCur->iPage]->nOverflow = 0;
008116      pCur->eState = CURSOR_INVALID;
008117    }
008118    assert( pCur->apPage[pCur->iPage]->nOverflow==0 );
008119  
008120  end_insert:
008121    return rc;
008122  }
008123  
008124  /*
008125  ** Delete the entry that the cursor is pointing to. 
008126  **
008127  ** If the BTREE_SAVEPOSITION bit of the flags parameter is zero, then
008128  ** the cursor is left pointing at an arbitrary location after the delete.
008129  ** But if that bit is set, then the cursor is left in a state such that
008130  ** the next call to BtreeNext() or BtreePrev() moves it to the same row
008131  ** as it would have been on if the call to BtreeDelete() had been omitted.
008132  **
008133  ** The BTREE_AUXDELETE bit of flags indicates that is one of several deletes
008134  ** associated with a single table entry and its indexes.  Only one of those
008135  ** deletes is considered the "primary" delete.  The primary delete occurs
008136  ** on a cursor that is not a BTREE_FORDELETE cursor.  All but one delete
008137  ** operation on non-FORDELETE cursors is tagged with the AUXDELETE flag.
008138  ** The BTREE_AUXDELETE bit is a hint that is not used by this implementation,
008139  ** but which might be used by alternative storage engines.
008140  */
008141  int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){
008142    Btree *p = pCur->pBtree;
008143    BtShared *pBt = p->pBt;              
008144    int rc;                              /* Return code */
008145    MemPage *pPage;                      /* Page to delete cell from */
008146    unsigned char *pCell;                /* Pointer to cell to delete */
008147    int iCellIdx;                        /* Index of cell to delete */
008148    int iCellDepth;                      /* Depth of node containing pCell */ 
008149    CellInfo info;                       /* Size of the cell being deleted */
008150    int bSkipnext = 0;                   /* Leaf cursor in SKIPNEXT state */
008151    u8 bPreserve = flags & BTREE_SAVEPOSITION;  /* Keep cursor valid */
008152  
008153    assert( cursorOwnsBtShared(pCur) );
008154    assert( pBt->inTransaction==TRANS_WRITE );
008155    assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
008156    assert( pCur->curFlags & BTCF_WriteFlag );
008157    assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
008158    assert( !hasReadConflicts(p, pCur->pgnoRoot) );
008159    assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
008160    assert( pCur->eState==CURSOR_VALID );
008161    assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 );
008162  
008163    iCellDepth = pCur->iPage;
008164    iCellIdx = pCur->aiIdx[iCellDepth];
008165    pPage = pCur->apPage[iCellDepth];
008166    pCell = findCell(pPage, iCellIdx);
008167  
008168    /* If the bPreserve flag is set to true, then the cursor position must
008169    ** be preserved following this delete operation. If the current delete
008170    ** will cause a b-tree rebalance, then this is done by saving the cursor
008171    ** key and leaving the cursor in CURSOR_REQUIRESEEK state before 
008172    ** returning. 
008173    **
008174    ** Or, if the current delete will not cause a rebalance, then the cursor
008175    ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately
008176    ** before or after the deleted entry. In this case set bSkipnext to true.  */
008177    if( bPreserve ){
008178      if( !pPage->leaf 
008179       || (pPage->nFree+cellSizePtr(pPage,pCell)+2)>(int)(pBt->usableSize*2/3)
008180      ){
008181        /* A b-tree rebalance will be required after deleting this entry.
008182        ** Save the cursor key.  */
008183        rc = saveCursorKey(pCur);
008184        if( rc ) return rc;
008185      }else{
008186        bSkipnext = 1;
008187      }
008188    }
008189  
008190    /* If the page containing the entry to delete is not a leaf page, move
008191    ** the cursor to the largest entry in the tree that is smaller than
008192    ** the entry being deleted. This cell will replace the cell being deleted
008193    ** from the internal node. The 'previous' entry is used for this instead
008194    ** of the 'next' entry, as the previous entry is always a part of the
008195    ** sub-tree headed by the child page of the cell being deleted. This makes
008196    ** balancing the tree following the delete operation easier.  */
008197    if( !pPage->leaf ){
008198      int notUsed = 0;
008199      rc = sqlite3BtreePrevious(pCur, &notUsed);
008200      if( rc ) return rc;
008201    }
008202  
008203    /* Save the positions of any other cursors open on this table before
008204    ** making any modifications.  */
008205    if( pCur->curFlags & BTCF_Multiple ){
008206      rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
008207      if( rc ) return rc;
008208    }
008209  
008210    /* If this is a delete operation to remove a row from a table b-tree,
008211    ** invalidate any incrblob cursors open on the row being deleted.  */
008212    if( pCur->pKeyInfo==0 ){
008213      invalidateIncrblobCursors(p, pCur->info.nKey, 0);
008214    }
008215  
008216    /* Make the page containing the entry to be deleted writable. Then free any
008217    ** overflow pages associated with the entry and finally remove the cell
008218    ** itself from within the page.  */
008219    rc = sqlite3PagerWrite(pPage->pDbPage);
008220    if( rc ) return rc;
008221    rc = clearCell(pPage, pCell, &info);
008222    dropCell(pPage, iCellIdx, info.nSize, &rc);
008223    if( rc ) return rc;
008224  
008225    /* If the cell deleted was not located on a leaf page, then the cursor
008226    ** is currently pointing to the largest entry in the sub-tree headed
008227    ** by the child-page of the cell that was just deleted from an internal
008228    ** node. The cell from the leaf node needs to be moved to the internal
008229    ** node to replace the deleted cell.  */
008230    if( !pPage->leaf ){
008231      MemPage *pLeaf = pCur->apPage[pCur->iPage];
008232      int nCell;
008233      Pgno n = pCur->apPage[iCellDepth+1]->pgno;
008234      unsigned char *pTmp;
008235  
008236      pCell = findCell(pLeaf, pLeaf->nCell-1);
008237      if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT;
008238      nCell = pLeaf->xCellSize(pLeaf, pCell);
008239      assert( MX_CELL_SIZE(pBt) >= nCell );
008240      pTmp = pBt->pTmpSpace;
008241      assert( pTmp!=0 );
008242      rc = sqlite3PagerWrite(pLeaf->pDbPage);
008243      if( rc==SQLITE_OK ){
008244        insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
008245      }
008246      dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
008247      if( rc ) return rc;
008248    }
008249  
008250    /* Balance the tree. If the entry deleted was located on a leaf page,
008251    ** then the cursor still points to that page. In this case the first
008252    ** call to balance() repairs the tree, and the if(...) condition is
008253    ** never true.
008254    **
008255    ** Otherwise, if the entry deleted was on an internal node page, then
008256    ** pCur is pointing to the leaf page from which a cell was removed to
008257    ** replace the cell deleted from the internal node. This is slightly
008258    ** tricky as the leaf node may be underfull, and the internal node may
008259    ** be either under or overfull. In this case run the balancing algorithm
008260    ** on the leaf node first. If the balance proceeds far enough up the
008261    ** tree that we can be sure that any problem in the internal node has
008262    ** been corrected, so be it. Otherwise, after balancing the leaf node,
008263    ** walk the cursor up the tree to the internal node and balance it as 
008264    ** well.  */
008265    rc = balance(pCur);
008266    if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
008267      while( pCur->iPage>iCellDepth ){
008268        releasePage(pCur->apPage[pCur->iPage--]);
008269      }
008270      rc = balance(pCur);
008271    }
008272  
008273    if( rc==SQLITE_OK ){
008274      if( bSkipnext ){
008275        assert( bPreserve && (pCur->iPage==iCellDepth || CORRUPT_DB) );
008276        assert( pPage==pCur->apPage[pCur->iPage] || CORRUPT_DB );
008277        assert( (pPage->nCell>0 || CORRUPT_DB) && iCellIdx<=pPage->nCell );
008278        pCur->eState = CURSOR_SKIPNEXT;
008279        if( iCellIdx>=pPage->nCell ){
008280          pCur->skipNext = -1;
008281          pCur->aiIdx[iCellDepth] = pPage->nCell-1;
008282        }else{
008283          pCur->skipNext = 1;
008284        }
008285      }else{
008286        rc = moveToRoot(pCur);
008287        if( bPreserve ){
008288          pCur->eState = CURSOR_REQUIRESEEK;
008289        }
008290      }
008291    }
008292    return rc;
008293  }
008294  
008295  /*
008296  ** Create a new BTree table.  Write into *piTable the page
008297  ** number for the root page of the new table.
008298  **
008299  ** The type of type is determined by the flags parameter.  Only the
008300  ** following values of flags are currently in use.  Other values for
008301  ** flags might not work:
008302  **
008303  **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
008304  **     BTREE_ZERODATA                  Used for SQL indices
008305  */
008306  static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){
008307    BtShared *pBt = p->pBt;
008308    MemPage *pRoot;
008309    Pgno pgnoRoot;
008310    int rc;
008311    int ptfFlags;          /* Page-type flage for the root page of new table */
008312  
008313    assert( sqlite3BtreeHoldsMutex(p) );
008314    assert( pBt->inTransaction==TRANS_WRITE );
008315    assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
008316  
008317  #ifdef SQLITE_OMIT_AUTOVACUUM
008318    rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
008319    if( rc ){
008320      return rc;
008321    }
008322  #else
008323    if( pBt->autoVacuum ){
008324      Pgno pgnoMove;      /* Move a page here to make room for the root-page */
008325      MemPage *pPageMove; /* The page to move to. */
008326  
008327      /* Creating a new table may probably require moving an existing database
008328      ** to make room for the new tables root page. In case this page turns
008329      ** out to be an overflow page, delete all overflow page-map caches
008330      ** held by open cursors.
008331      */
008332      invalidateAllOverflowCache(pBt);
008333  
008334      /* Read the value of meta[3] from the database to determine where the
008335      ** root page of the new table should go. meta[3] is the largest root-page
008336      ** created so far, so the new root-page is (meta[3]+1).
008337      */
008338      sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
008339      pgnoRoot++;
008340  
008341      /* The new root-page may not be allocated on a pointer-map page, or the
008342      ** PENDING_BYTE page.
008343      */
008344      while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
008345          pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
008346        pgnoRoot++;
008347      }
008348      assert( pgnoRoot>=3 || CORRUPT_DB );
008349      testcase( pgnoRoot<3 );
008350  
008351      /* Allocate a page. The page that currently resides at pgnoRoot will
008352      ** be moved to the allocated page (unless the allocated page happens
008353      ** to reside at pgnoRoot).
008354      */
008355      rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT);
008356      if( rc!=SQLITE_OK ){
008357        return rc;
008358      }
008359  
008360      if( pgnoMove!=pgnoRoot ){
008361        /* pgnoRoot is the page that will be used for the root-page of
008362        ** the new table (assuming an error did not occur). But we were
008363        ** allocated pgnoMove. If required (i.e. if it was not allocated
008364        ** by extending the file), the current page at position pgnoMove
008365        ** is already journaled.
008366        */
008367        u8 eType = 0;
008368        Pgno iPtrPage = 0;
008369  
008370        /* Save the positions of any open cursors. This is required in
008371        ** case they are holding a reference to an xFetch reference
008372        ** corresponding to page pgnoRoot.  */
008373        rc = saveAllCursors(pBt, 0, 0);
008374        releasePage(pPageMove);
008375        if( rc!=SQLITE_OK ){
008376          return rc;
008377        }
008378  
008379        /* Move the page currently at pgnoRoot to pgnoMove. */
008380        rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
008381        if( rc!=SQLITE_OK ){
008382          return rc;
008383        }
008384        rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
008385        if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
008386          rc = SQLITE_CORRUPT_BKPT;
008387        }
008388        if( rc!=SQLITE_OK ){
008389          releasePage(pRoot);
008390          return rc;
008391        }
008392        assert( eType!=PTRMAP_ROOTPAGE );
008393        assert( eType!=PTRMAP_FREEPAGE );
008394        rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
008395        releasePage(pRoot);
008396  
008397        /* Obtain the page at pgnoRoot */
008398        if( rc!=SQLITE_OK ){
008399          return rc;
008400        }
008401        rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
008402        if( rc!=SQLITE_OK ){
008403          return rc;
008404        }
008405        rc = sqlite3PagerWrite(pRoot->pDbPage);
008406        if( rc!=SQLITE_OK ){
008407          releasePage(pRoot);
008408          return rc;
008409        }
008410      }else{
008411        pRoot = pPageMove;
008412      } 
008413  
008414      /* Update the pointer-map and meta-data with the new root-page number. */
008415      ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
008416      if( rc ){
008417        releasePage(pRoot);
008418        return rc;
008419      }
008420  
008421      /* When the new root page was allocated, page 1 was made writable in
008422      ** order either to increase the database filesize, or to decrement the
008423      ** freelist count.  Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
008424      */
008425      assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
008426      rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
008427      if( NEVER(rc) ){
008428        releasePage(pRoot);
008429        return rc;
008430      }
008431  
008432    }else{
008433      rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
008434      if( rc ) return rc;
008435    }
008436  #endif
008437    assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
008438    if( createTabFlags & BTREE_INTKEY ){
008439      ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
008440    }else{
008441      ptfFlags = PTF_ZERODATA | PTF_LEAF;
008442    }
008443    zeroPage(pRoot, ptfFlags);
008444    sqlite3PagerUnref(pRoot->pDbPage);
008445    assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
008446    *piTable = (int)pgnoRoot;
008447    return SQLITE_OK;
008448  }
008449  int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
008450    int rc;
008451    sqlite3BtreeEnter(p);
008452    rc = btreeCreateTable(p, piTable, flags);
008453    sqlite3BtreeLeave(p);
008454    return rc;
008455  }
008456  
008457  /*
008458  ** Erase the given database page and all its children.  Return
008459  ** the page to the freelist.
008460  */
008461  static int clearDatabasePage(
008462    BtShared *pBt,           /* The BTree that contains the table */
008463    Pgno pgno,               /* Page number to clear */
008464    int freePageFlag,        /* Deallocate page if true */
008465    int *pnChange            /* Add number of Cells freed to this counter */
008466  ){
008467    MemPage *pPage;
008468    int rc;
008469    unsigned char *pCell;
008470    int i;
008471    int hdr;
008472    CellInfo info;
008473  
008474    assert( sqlite3_mutex_held(pBt->mutex) );
008475    if( pgno>btreePagecount(pBt) ){
008476      return SQLITE_CORRUPT_BKPT;
008477    }
008478    rc = getAndInitPage(pBt, pgno, &pPage, 0, 0);
008479    if( rc ) return rc;
008480    if( pPage->bBusy ){
008481      rc = SQLITE_CORRUPT_BKPT;
008482      goto cleardatabasepage_out;
008483    }
008484    pPage->bBusy = 1;
008485    hdr = pPage->hdrOffset;
008486    for(i=0; i<pPage->nCell; i++){
008487      pCell = findCell(pPage, i);
008488      if( !pPage->leaf ){
008489        rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
008490        if( rc ) goto cleardatabasepage_out;
008491      }
008492      rc = clearCell(pPage, pCell, &info);
008493      if( rc ) goto cleardatabasepage_out;
008494    }
008495    if( !pPage->leaf ){
008496      rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange);
008497      if( rc ) goto cleardatabasepage_out;
008498    }else if( pnChange ){
008499      assert( pPage->intKey || CORRUPT_DB );
008500      testcase( !pPage->intKey );
008501      *pnChange += pPage->nCell;
008502    }
008503    if( freePageFlag ){
008504      freePage(pPage, &rc);
008505    }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
008506      zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF);
008507    }
008508  
008509  cleardatabasepage_out:
008510    pPage->bBusy = 0;
008511    releasePage(pPage);
008512    return rc;
008513  }
008514  
008515  /*
008516  ** Delete all information from a single table in the database.  iTable is
008517  ** the page number of the root of the table.  After this routine returns,
008518  ** the root page is empty, but still exists.
008519  **
008520  ** This routine will fail with SQLITE_LOCKED if there are any open
008521  ** read cursors on the table.  Open write cursors are moved to the
008522  ** root of the table.
008523  **
008524  ** If pnChange is not NULL, then table iTable must be an intkey table. The
008525  ** integer value pointed to by pnChange is incremented by the number of
008526  ** entries in the table.
008527  */
008528  int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
008529    int rc;
008530    BtShared *pBt = p->pBt;
008531    sqlite3BtreeEnter(p);
008532    assert( p->inTrans==TRANS_WRITE );
008533  
008534    rc = saveAllCursors(pBt, (Pgno)iTable, 0);
008535  
008536    if( SQLITE_OK==rc ){
008537      /* Invalidate all incrblob cursors open on table iTable (assuming iTable
008538      ** is the root of a table b-tree - if it is not, the following call is
008539      ** a no-op).  */
008540      invalidateIncrblobCursors(p, 0, 1);
008541      rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
008542    }
008543    sqlite3BtreeLeave(p);
008544    return rc;
008545  }
008546  
008547  /*
008548  ** Delete all information from the single table that pCur is open on.
008549  **
008550  ** This routine only work for pCur on an ephemeral table.
008551  */
008552  int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){
008553    return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0);
008554  }
008555  
008556  /*
008557  ** Erase all information in a table and add the root of the table to
008558  ** the freelist.  Except, the root of the principle table (the one on
008559  ** page 1) is never added to the freelist.
008560  **
008561  ** This routine will fail with SQLITE_LOCKED if there are any open
008562  ** cursors on the table.
008563  **
008564  ** If AUTOVACUUM is enabled and the page at iTable is not the last
008565  ** root page in the database file, then the last root page 
008566  ** in the database file is moved into the slot formerly occupied by
008567  ** iTable and that last slot formerly occupied by the last root page
008568  ** is added to the freelist instead of iTable.  In this say, all
008569  ** root pages are kept at the beginning of the database file, which
008570  ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the 
008571  ** page number that used to be the last root page in the file before
008572  ** the move.  If no page gets moved, *piMoved is set to 0.
008573  ** The last root page is recorded in meta[3] and the value of
008574  ** meta[3] is updated by this procedure.
008575  */
008576  static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
008577    int rc;
008578    MemPage *pPage = 0;
008579    BtShared *pBt = p->pBt;
008580  
008581    assert( sqlite3BtreeHoldsMutex(p) );
008582    assert( p->inTrans==TRANS_WRITE );
008583    assert( iTable>=2 );
008584  
008585    rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
008586    if( rc ) return rc;
008587    rc = sqlite3BtreeClearTable(p, iTable, 0);
008588    if( rc ){
008589      releasePage(pPage);
008590      return rc;
008591    }
008592  
008593    *piMoved = 0;
008594  
008595  #ifdef SQLITE_OMIT_AUTOVACUUM
008596    freePage(pPage, &rc);
008597    releasePage(pPage);
008598  #else
008599    if( pBt->autoVacuum ){
008600      Pgno maxRootPgno;
008601      sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);
008602  
008603      if( iTable==maxRootPgno ){
008604        /* If the table being dropped is the table with the largest root-page
008605        ** number in the database, put the root page on the free list. 
008606        */
008607        freePage(pPage, &rc);
008608        releasePage(pPage);
008609        if( rc!=SQLITE_OK ){
008610          return rc;
008611        }
008612      }else{
008613        /* The table being dropped does not have the largest root-page
008614        ** number in the database. So move the page that does into the 
008615        ** gap left by the deleted root-page.
008616        */
008617        MemPage *pMove;
008618        releasePage(pPage);
008619        rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
008620        if( rc!=SQLITE_OK ){
008621          return rc;
008622        }
008623        rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
008624        releasePage(pMove);
008625        if( rc!=SQLITE_OK ){
008626          return rc;
008627        }
008628        pMove = 0;
008629        rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
008630        freePage(pMove, &rc);
008631        releasePage(pMove);
008632        if( rc!=SQLITE_OK ){
008633          return rc;
008634        }
008635        *piMoved = maxRootPgno;
008636      }
008637  
008638      /* Set the new 'max-root-page' value in the database header. This
008639      ** is the old value less one, less one more if that happens to
008640      ** be a root-page number, less one again if that is the
008641      ** PENDING_BYTE_PAGE.
008642      */
008643      maxRootPgno--;
008644      while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
008645             || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
008646        maxRootPgno--;
008647      }
008648      assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
008649  
008650      rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
008651    }else{
008652      freePage(pPage, &rc);
008653      releasePage(pPage);
008654    }
008655  #endif
008656    return rc;  
008657  }
008658  int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
008659    int rc;
008660    sqlite3BtreeEnter(p);
008661    rc = btreeDropTable(p, iTable, piMoved);
008662    sqlite3BtreeLeave(p);
008663    return rc;
008664  }
008665  
008666  
008667  /*
008668  ** This function may only be called if the b-tree connection already
008669  ** has a read or write transaction open on the database.
008670  **
008671  ** Read the meta-information out of a database file.  Meta[0]
008672  ** is the number of free pages currently in the database.  Meta[1]
008673  ** through meta[15] are available for use by higher layers.  Meta[0]
008674  ** is read-only, the others are read/write.
008675  ** 
008676  ** The schema layer numbers meta values differently.  At the schema
008677  ** layer (and the SetCookie and ReadCookie opcodes) the number of
008678  ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
008679  **
008680  ** This routine treats Meta[BTREE_DATA_VERSION] as a special case.  Instead
008681  ** of reading the value out of the header, it instead loads the "DataVersion"
008682  ** from the pager.  The BTREE_DATA_VERSION value is not actually stored in the
008683  ** database file.  It is a number computed by the pager.  But its access
008684  ** pattern is the same as header meta values, and so it is convenient to
008685  ** read it from this routine.
008686  */
008687  void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
008688    BtShared *pBt = p->pBt;
008689  
008690    sqlite3BtreeEnter(p);
008691    assert( p->inTrans>TRANS_NONE );
008692    assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );
008693    assert( pBt->pPage1 );
008694    assert( idx>=0 && idx<=15 );
008695  
008696    if( idx==BTREE_DATA_VERSION ){
008697      *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iDataVersion;
008698    }else{
008699      *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
008700    }
008701  
008702    /* If auto-vacuum is disabled in this build and this is an auto-vacuum
008703    ** database, mark the database as read-only.  */
008704  #ifdef SQLITE_OMIT_AUTOVACUUM
008705    if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){
008706      pBt->btsFlags |= BTS_READ_ONLY;
008707    }
008708  #endif
008709  
008710    sqlite3BtreeLeave(p);
008711  }
008712  
008713  /*
008714  ** Write meta-information back into the database.  Meta[0] is
008715  ** read-only and may not be written.
008716  */
008717  int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
008718    BtShared *pBt = p->pBt;
008719    unsigned char *pP1;
008720    int rc;
008721    assert( idx>=1 && idx<=15 );
008722    sqlite3BtreeEnter(p);
008723    assert( p->inTrans==TRANS_WRITE );
008724    assert( pBt->pPage1!=0 );
008725    pP1 = pBt->pPage1->aData;
008726    rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
008727    if( rc==SQLITE_OK ){
008728      put4byte(&pP1[36 + idx*4], iMeta);
008729  #ifndef SQLITE_OMIT_AUTOVACUUM
008730      if( idx==BTREE_INCR_VACUUM ){
008731        assert( pBt->autoVacuum || iMeta==0 );
008732        assert( iMeta==0 || iMeta==1 );
008733        pBt->incrVacuum = (u8)iMeta;
008734      }
008735  #endif
008736    }
008737    sqlite3BtreeLeave(p);
008738    return rc;
008739  }
008740  
008741  #ifndef SQLITE_OMIT_BTREECOUNT
008742  /*
008743  ** The first argument, pCur, is a cursor opened on some b-tree. Count the
008744  ** number of entries in the b-tree and write the result to *pnEntry.
008745  **
008746  ** SQLITE_OK is returned if the operation is successfully executed. 
008747  ** Otherwise, if an error is encountered (i.e. an IO error or database
008748  ** corruption) an SQLite error code is returned.
008749  */
008750  int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){
008751    i64 nEntry = 0;                      /* Value to return in *pnEntry */
008752    int rc;                              /* Return code */
008753  
008754    if( pCur->pgnoRoot==0 ){
008755      *pnEntry = 0;
008756      return SQLITE_OK;
008757    }
008758    rc = moveToRoot(pCur);
008759  
008760    /* Unless an error occurs, the following loop runs one iteration for each
008761    ** page in the B-Tree structure (not including overflow pages). 
008762    */
008763    while( rc==SQLITE_OK ){
008764      int iIdx;                          /* Index of child node in parent */
008765      MemPage *pPage;                    /* Current page of the b-tree */
008766  
008767      /* If this is a leaf page or the tree is not an int-key tree, then 
008768      ** this page contains countable entries. Increment the entry counter
008769      ** accordingly.
008770      */
008771      pPage = pCur->apPage[pCur->iPage];
008772      if( pPage->leaf || !pPage->intKey ){
008773        nEntry += pPage->nCell;
008774      }
008775  
008776      /* pPage is a leaf node. This loop navigates the cursor so that it 
008777      ** points to the first interior cell that it points to the parent of
008778      ** the next page in the tree that has not yet been visited. The
008779      ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
008780      ** of the page, or to the number of cells in the page if the next page
008781      ** to visit is the right-child of its parent.
008782      **
008783      ** If all pages in the tree have been visited, return SQLITE_OK to the
008784      ** caller.
008785      */
008786      if( pPage->leaf ){
008787        do {
008788          if( pCur->iPage==0 ){
008789            /* All pages of the b-tree have been visited. Return successfully. */
008790            *pnEntry = nEntry;
008791            return moveToRoot(pCur);
008792          }
008793          moveToParent(pCur);
008794        }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );
008795  
008796        pCur->aiIdx[pCur->iPage]++;
008797        pPage = pCur->apPage[pCur->iPage];
008798      }
008799  
008800      /* Descend to the child node of the cell that the cursor currently 
008801      ** points at. This is the right-child if (iIdx==pPage->nCell).
008802      */
008803      iIdx = pCur->aiIdx[pCur->iPage];
008804      if( iIdx==pPage->nCell ){
008805        rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
008806      }else{
008807        rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
008808      }
008809    }
008810  
008811    /* An error has occurred. Return an error code. */
008812    return rc;
008813  }
008814  #endif
008815  
008816  /*
008817  ** Return the pager associated with a BTree.  This routine is used for
008818  ** testing and debugging only.
008819  */
008820  Pager *sqlite3BtreePager(Btree *p){
008821    return p->pBt->pPager;
008822  }
008823  
008824  #ifndef SQLITE_OMIT_INTEGRITY_CHECK
008825  /*
008826  ** Append a message to the error message string.
008827  */
008828  static void checkAppendMsg(
008829    IntegrityCk *pCheck,
008830    const char *zFormat,
008831    ...
008832  ){
008833    va_list ap;
008834    if( !pCheck->mxErr ) return;
008835    pCheck->mxErr--;
008836    pCheck->nErr++;
008837    va_start(ap, zFormat);
008838    if( pCheck->errMsg.nChar ){
008839      sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
008840    }
008841    if( pCheck->zPfx ){
008842      sqlite3XPrintf(&pCheck->errMsg, pCheck->zPfx, pCheck->v1, pCheck->v2);
008843    }
008844    sqlite3VXPrintf(&pCheck->errMsg, zFormat, ap);
008845    va_end(ap);
008846    if( pCheck->errMsg.accError==STRACCUM_NOMEM ){
008847      pCheck->mallocFailed = 1;
008848    }
008849  }
008850  #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
008851  
008852  #ifndef SQLITE_OMIT_INTEGRITY_CHECK
008853  
008854  /*
008855  ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that
008856  ** corresponds to page iPg is already set.
008857  */
008858  static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){
008859    assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
008860    return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));
008861  }
008862  
008863  /*
008864  ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.
008865  */
008866  static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){
008867    assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
008868    pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07));
008869  }
008870  
008871  
008872  /*
008873  ** Add 1 to the reference count for page iPage.  If this is the second
008874  ** reference to the page, add an error message to pCheck->zErrMsg.
008875  ** Return 1 if there are 2 or more references to the page and 0 if
008876  ** if this is the first reference to the page.
008877  **
008878  ** Also check that the page number is in bounds.
008879  */
008880  static int checkRef(IntegrityCk *pCheck, Pgno iPage){
008881    if( iPage==0 ) return 1;
008882    if( iPage>pCheck->nPage ){
008883      checkAppendMsg(pCheck, "invalid page number %d", iPage);
008884      return 1;
008885    }
008886    if( getPageReferenced(pCheck, iPage) ){
008887      checkAppendMsg(pCheck, "2nd reference to page %d", iPage);
008888      return 1;
008889    }
008890    setPageReferenced(pCheck, iPage);
008891    return 0;
008892  }
008893  
008894  #ifndef SQLITE_OMIT_AUTOVACUUM
008895  /*
008896  ** Check that the entry in the pointer-map for page iChild maps to 
008897  ** page iParent, pointer type ptrType. If not, append an error message
008898  ** to pCheck.
008899  */
008900  static void checkPtrmap(
008901    IntegrityCk *pCheck,   /* Integrity check context */
008902    Pgno iChild,           /* Child page number */
008903    u8 eType,              /* Expected pointer map type */
008904    Pgno iParent           /* Expected pointer map parent page number */
008905  ){
008906    int rc;
008907    u8 ePtrmapType;
008908    Pgno iPtrmapParent;
008909  
008910    rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
008911    if( rc!=SQLITE_OK ){
008912      if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;
008913      checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild);
008914      return;
008915    }
008916  
008917    if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
008918      checkAppendMsg(pCheck,
008919        "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)", 
008920        iChild, eType, iParent, ePtrmapType, iPtrmapParent);
008921    }
008922  }
008923  #endif
008924  
008925  /*
008926  ** Check the integrity of the freelist or of an overflow page list.
008927  ** Verify that the number of pages on the list is N.
008928  */
008929  static void checkList(
008930    IntegrityCk *pCheck,  /* Integrity checking context */
008931    int isFreeList,       /* True for a freelist.  False for overflow page list */
008932    int iPage,            /* Page number for first page in the list */
008933    int N                 /* Expected number of pages in the list */
008934  ){
008935    int i;
008936    int expected = N;
008937    int iFirst = iPage;
008938    while( N-- > 0 && pCheck->mxErr ){
008939      DbPage *pOvflPage;
008940      unsigned char *pOvflData;
008941      if( iPage<1 ){
008942        checkAppendMsg(pCheck,
008943           "%d of %d pages missing from overflow list starting at %d",
008944            N+1, expected, iFirst);
008945        break;
008946      }
008947      if( checkRef(pCheck, iPage) ) break;
008948      if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){
008949        checkAppendMsg(pCheck, "failed to get page %d", iPage);
008950        break;
008951      }
008952      pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
008953      if( isFreeList ){
008954        int n = get4byte(&pOvflData[4]);
008955  #ifndef SQLITE_OMIT_AUTOVACUUM
008956        if( pCheck->pBt->autoVacuum ){
008957          checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0);
008958        }
008959  #endif
008960        if( n>(int)pCheck->pBt->usableSize/4-2 ){
008961          checkAppendMsg(pCheck,
008962             "freelist leaf count too big on page %d", iPage);
008963          N--;
008964        }else{
008965          for(i=0; i<n; i++){
008966            Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
008967  #ifndef SQLITE_OMIT_AUTOVACUUM
008968            if( pCheck->pBt->autoVacuum ){
008969              checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0);
008970            }
008971  #endif
008972            checkRef(pCheck, iFreePage);
008973          }
008974          N -= n;
008975        }
008976      }
008977  #ifndef SQLITE_OMIT_AUTOVACUUM
008978      else{
008979        /* If this database supports auto-vacuum and iPage is not the last
008980        ** page in this overflow list, check that the pointer-map entry for
008981        ** the following page matches iPage.
008982        */
008983        if( pCheck->pBt->autoVacuum && N>0 ){
008984          i = get4byte(pOvflData);
008985          checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage);
008986        }
008987      }
008988  #endif
008989      iPage = get4byte(pOvflData);
008990      sqlite3PagerUnref(pOvflPage);
008991  
008992      if( isFreeList && N<(iPage!=0) ){
008993        checkAppendMsg(pCheck, "free-page count in header is too small");
008994      }
008995    }
008996  }
008997  #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
008998  
008999  /*
009000  ** An implementation of a min-heap.
009001  **
009002  ** aHeap[0] is the number of elements on the heap.  aHeap[1] is the
009003  ** root element.  The daughter nodes of aHeap[N] are aHeap[N*2]
009004  ** and aHeap[N*2+1].
009005  **
009006  ** The heap property is this:  Every node is less than or equal to both
009007  ** of its daughter nodes.  A consequence of the heap property is that the
009008  ** root node aHeap[1] is always the minimum value currently in the heap.
009009  **
009010  ** The btreeHeapInsert() routine inserts an unsigned 32-bit number onto
009011  ** the heap, preserving the heap property.  The btreeHeapPull() routine
009012  ** removes the root element from the heap (the minimum value in the heap)
009013  ** and then moves other nodes around as necessary to preserve the heap
009014  ** property.
009015  **
009016  ** This heap is used for cell overlap and coverage testing.  Each u32
009017  ** entry represents the span of a cell or freeblock on a btree page.  
009018  ** The upper 16 bits are the index of the first byte of a range and the
009019  ** lower 16 bits are the index of the last byte of that range.
009020  */
static void btreeHeapInsert(u32 *aHeap, u32 x){
  /* Append x as the new last element, then bubble it up toward the
  ** root for as long as its parent holds a strictly larger value. */
  u32 iSlot;
  aHeap[0]++;
  iSlot = aHeap[0];
  aHeap[iSlot] = x;
  while( iSlot>1 ){
    u32 iParent = iSlot/2;
    u32 parentVal = aHeap[iParent];
    if( parentVal<=aHeap[iSlot] ) break;
    aHeap[iParent] = aHeap[iSlot];
    aHeap[iSlot] = parentVal;
    iSlot = iParent;
  }
}
static int btreeHeapPull(u32 *aHeap, u32 *pOut){
  /* Remove the minimum (root) element and store it in *pOut.  Return 1
  ** on success or 0, leaving *pOut untouched, if the heap is empty. */
  u32 nElem = aHeap[0];
  u32 iNode;
  u32 iChild;

  if( nElem==0 ) return 0;
  *pOut = aHeap[1];

  /* Move the last element to the root.  The vacated slot is filled with
  ** the largest possible value so that it can serve as the "missing"
  ** right-hand child in the comparison below. */
  aHeap[1] = aHeap[nElem];
  aHeap[nElem] = 0xffffffff;
  nElem--;
  aHeap[0] = nElem;

  /* Sift the new root down until neither child is smaller than it */
  for(iNode=1; (iChild = iNode*2)<=nElem; iNode=iChild){
    u32 tmp;
    if( aHeap[iChild]>aHeap[iChild+1] ) iChild++;
    if( aHeap[iNode]<aHeap[iChild] ) break;
    tmp = aHeap[iNode];
    aHeap[iNode] = aHeap[iChild];
    aHeap[iChild] = tmp;
  }
  return 1;
}
009049  
009050  #ifndef SQLITE_OMIT_INTEGRITY_CHECK
/*
** Do various sanity checks on a single page of a tree.  Return
** the tree depth.  Leaf pages return 0.  Parents of leaf pages
** return 1, and so forth.  Zero is also returned if iPage is zero,
** if checkRef() rejects the page, or if the page cannot be fetched
** or initialized.
**
** These checks are done:
**
**      1.  Make sure that cells and freeblocks do not overlap
**          but combine to completely cover the page.
**      2.  Make sure integer cell keys are in order.
**      3.  Check the integrity of overflow pages.
**      4.  Recursively call checkTreePage on all children.
**      5.  Verify that the depth of all children is the same.
**
** Cells are visited from the last cell of the page to the first, so
** maxKey acts as a running upper bound that shrinks as the scan
** proceeds.  After the cell loop finishes, the final bound is written
** to *piMinKey.  *piMinKey is not written on the early-exit paths.
**
** pCheck->zPfx, pCheck->v1 and pCheck->v2 are used to prefix messages
** while this page is being examined and are restored before returning.
*/
static int checkTreePage(
  IntegrityCk *pCheck,  /* Context for the sanity check */
  int iPage,            /* Page number of the page to check */
  i64 *piMinKey,        /* Write minimum integer primary key here */
  i64 maxKey            /* Error if integer primary key greater than this */
){
  MemPage *pPage = 0;      /* The page being analyzed */
  int i;                   /* Loop counter */
  int rc;                  /* Result code from subroutine call */
  int depth = -1, d2;      /* Depth of a subtree */
  int pgno;                /* Page number */
  int nFrag;               /* Number of fragmented bytes on the page */
  int hdr;                 /* Offset to the page header */
  int cellStart;           /* Offset to the start of the cell pointer array */
  int nCell;               /* Number of cells */
  int doCoverageCheck = 1; /* True if cell coverage checking should be done */
  int keyCanBeEqual = 1;   /* True if IPK can be equal to maxKey
                           ** False if IPK must be strictly less than maxKey */
  u8 *data;                /* Page content */
  u8 *pCell;               /* Cell content */
  u8 *pCellIdx;            /* Next element of the cell pointer array */
  BtShared *pBt;           /* The BtShared object that owns pPage */
  u32 pc;                  /* Address of a cell */
  u32 usableSize;          /* Usable size of the page */
  u32 contentOffset;       /* Offset to the start of the cell content area */
  u32 *heap = 0;           /* Min-heap used for checking cell coverage */
  u32 x, prev = 0;         /* Next and previous entry on the min-heap */
  const char *saved_zPfx = pCheck->zPfx;  /* Caller's message prefix */
  int saved_v1 = pCheck->v1;              /* Caller's first prefix argument */
  int saved_v2 = pCheck->v2;              /* Caller's second prefix argument */
  u8 savedIsInit = 0;      /* Value of pPage->isInit before it was cleared */

  /* Check that the page exists
  */
  pBt = pCheck->pBt;
  usableSize = pBt->usableSize;
  if( iPage==0 ) return 0;
  if( checkRef(pCheck, iPage) ) return 0;
  pCheck->zPfx = "Page %d: ";
  pCheck->v1 = iPage;
  if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
    checkAppendMsg(pCheck,
       "unable to get the page. error code=%d", rc);
    goto end_of_check;
  }

  /* Clear MemPage.isInit to make sure the corruption detection code in
  ** btreeInitPage() is executed.  */
  savedIsInit = pPage->isInit;
  pPage->isInit = 0;
  if( (rc = btreeInitPage(pPage))!=0 ){
    assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
    checkAppendMsg(pCheck,
                   "btreeInitPage() returns error code %d", rc);
    goto end_of_check;
  }
  data = pPage->aData;
  hdr = pPage->hdrOffset;

  /* Set up for cell analysis */
  pCheck->zPfx = "On tree page %d cell %d: ";
  contentOffset = get2byteNotZero(&data[hdr+5]);
  assert( contentOffset<=usableSize );  /* Enforced by btreeInitPage() */

  /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
  ** number of cells on the page. */
  nCell = get2byte(&data[hdr+3]);
  assert( pPage->nCell==nCell );

  /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page
  ** immediately follows the b-tree page header. */
  cellStart = hdr + 12 - 4*pPage->leaf;
  assert( pPage->aCellIdx==&data[cellStart] );
  /* Start at the last entry of the cell pointer array; the loop below
  ** walks backwards through the array. */
  pCellIdx = &data[cellStart + 2*(nCell-1)];

  if( !pPage->leaf ){
    /* Analyze the right-child page of internal pages */
    pgno = get4byte(&data[hdr+8]);
#ifndef SQLITE_OMIT_AUTOVACUUM
    if( pBt->autoVacuum ){
      pCheck->zPfx = "On page %d at right child: ";
      checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
    }
#endif
    /* The right child's minimum key becomes the new upper bound, and
    ** its depth becomes the depth every other child must match. */
    depth = checkTreePage(pCheck, pgno, &maxKey, maxKey);
    keyCanBeEqual = 0;
  }else{
    /* For leaf pages, the coverage check will occur in the same loop
    ** as the other cell checks, so initialize the heap.  */
    heap = pCheck->heap;
    heap[0] = 0;
  }

  /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte
  ** integer offsets to the cell contents. */
  for(i=nCell-1; i>=0 && pCheck->mxErr; i--){
    CellInfo info;

    /* Check cell size */
    pCheck->v2 = i;
    assert( pCellIdx==&data[cellStart + i*2] );
    pc = get2byteAligned(pCellIdx);
    pCellIdx -= 2;
    if( pc<contentOffset || pc>usableSize-4 ){
      checkAppendMsg(pCheck, "Offset %d out of range %d..%d",
                             pc, contentOffset, usableSize-4);
      /* A cell with a bad offset cannot be placed on the heap, so the
      ** coverage check is skipped for the whole page. */
      doCoverageCheck = 0;
      continue;
    }
    pCell = &data[pc];
    pPage->xParseCell(pPage, pCell, &info);
    if( pc+info.nSize>usableSize ){
      checkAppendMsg(pCheck, "Extends off end of page");
      doCoverageCheck = 0;
      continue;
    }

    /* Check for integer primary key out of range */
    if( pPage->intKey ){
      if( keyCanBeEqual ? (info.nKey > maxKey) : (info.nKey >= maxKey) ){
        checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey);
      }
      maxKey = info.nKey;
    }

    /* Check the content overflow list */
    if( info.nPayload>info.nLocal ){
      int nPage;       /* Number of pages on the overflow chain */
      Pgno pgnoOvfl;   /* First page of the overflow chain */
      assert( pc + info.nSize - 4 <= usableSize );
      /* Each overflow page holds usableSize-4 bytes of payload; round up */
      nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4);
      /* The overflow page number is the last 4 bytes of the cell */
      pgnoOvfl = get4byte(&pCell[info.nSize - 4]);
#ifndef SQLITE_OMIT_AUTOVACUUM
      if( pBt->autoVacuum ){
        checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage);
      }
#endif
      checkList(pCheck, 0, pgnoOvfl, nPage);
    }

    if( !pPage->leaf ){
      /* Check sanity of left child page for internal pages */
      pgno = get4byte(pCell);
#ifndef SQLITE_OMIT_AUTOVACUUM
      if( pBt->autoVacuum ){
        checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
      }
#endif
      d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey);
      keyCanBeEqual = 0;
      if( d2!=depth ){
        checkAppendMsg(pCheck, "Child page depth differs");
        depth = d2;
      }
    }else{
      /* Populate the coverage-checking heap for leaf pages */
      btreeHeapInsert(heap, (pc<<16)|(pc+info.nSize-1));
    }
  }
  /* Report the smallest key bound reached on this page to the caller */
  *piMinKey = maxKey;

  /* Check for complete coverage of the page
  */
  pCheck->zPfx = 0;
  if( doCoverageCheck && pCheck->mxErr>0 ){
    /* For leaf pages, the min-heap has already been initialized and the
    ** cells have already been inserted.  But for internal pages, that has
    ** not yet been done, so do it now */
    if( !pPage->leaf ){
      heap = pCheck->heap;
      heap[0] = 0;
      for(i=nCell-1; i>=0; i--){
        u32 size;
        pc = get2byteAligned(&data[cellStart+i*2]);
        size = pPage->xCellSize(pPage, &data[pc]);
        btreeHeapInsert(heap, (pc<<16)|(pc+size-1));
      }
    }
    /* Add the freeblocks to the min-heap
    **
    ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header
    ** is the offset of the first freeblock, or zero if there are no
    ** freeblocks on the page.
    */
    i = get2byte(&data[hdr+1]);
    while( i>0 ){
      int size, j;
      assert( (u32)i<=usableSize-4 );     /* Enforced by btreeInitPage() */
      /* Bytes 2 and 3 of a freeblock are read as its size in bytes */
      size = get2byte(&data[i+2]);
      assert( (u32)(i+size)<=usableSize );  /* Enforced by btreeInitPage() */
      btreeHeapInsert(heap, (((u32)i)<<16)|(i+size-1));
      /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a
      ** big-endian integer which is the offset in the b-tree page of the next
      ** freeblock in the chain, or zero if the freeblock is the last on the
      ** chain. */
      j = get2byte(&data[i]);
      /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
      ** increasing offset. */
      assert( j==0 || j>i+size );  /* Enforced by btreeInitPage() */
      assert( (u32)j<=usableSize-4 );   /* Enforced by btreeInitPage() */
      i = j;
    }
    /* Analyze the min-heap looking for overlap between cells and/or
    ** freeblocks, and counting the number of untracked bytes in nFrag.
    **
    ** Each min-heap entry is of the form:    (start_address<<16)|end_address.
    ** There is an implied first entry that covers the page header, the cell
    ** pointer index, and the gap between the cell pointer index and the start
    ** of cell content.
    **
    ** The loop below pulls entries from the min-heap in order and compares
    ** the start_address against the previous end_address.  If there is an
    ** overlap, that means bytes are used multiple times.  If there is a gap,
    ** that gap is added to the fragmentation count.
    */
    nFrag = 0;
    prev = contentOffset - 1;   /* Implied first min-heap entry */
    while( btreeHeapPull(heap,&x) ){
      if( (prev&0xffff)>=(x>>16) ){
        checkAppendMsg(pCheck,
          "Multiple uses for byte %u of page %d", x>>16, iPage);
        break;
      }else{
        nFrag += (x>>16) - (prev&0xffff) - 1;
        prev = x;
      }
    }
    /* Count the gap between the last range and the end of the usable area */
    nFrag += usableSize - (prev&0xffff) - 1;
    /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments
    ** is stored in the fifth field of the b-tree page header.
    ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the
    ** number of fragmented free bytes within the cell content area.
    **
    ** The comparison is only made when the heap was fully drained, that is,
    ** when the loop above did not stop early because of an overlap.
    */
    if( heap[0]==0 && nFrag!=data[hdr+7] ){
      checkAppendMsg(pCheck,
          "Fragmentation of %d bytes reported as %d on page %d",
          nFrag, data[hdr+7], iPage);
    }
  }

end_of_check:
  /* doCoverageCheck is only cleared after pPage has been loaded, so pPage
  ** is never dereferenced here on the path where the page fetch failed. */
  if( !doCoverageCheck ) pPage->isInit = savedIsInit;
  releasePage(pPage);
  pCheck->zPfx = saved_zPfx;
  pCheck->v1 = saved_v1;
  pCheck->v2 = saved_v2;
  return depth+1;
}
009313  #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
009314  
009315  #ifndef SQLITE_OMIT_INTEGRITY_CHECK
/*
** This routine does a complete check of the given BTree file.  aRoot[] is
** an array of page numbers where each page number is the root page of
** a table.  nRoot is the number of entries in aRoot.
**
** A read-only or read-write transaction must be opened before calling
** this function.
**
** Write the number of errors seen in *pnErr.  Except for some memory
** allocation errors, an error message held in memory obtained from
** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
** returned.  If a memory allocation error occurs, NULL is returned.
**
** The check proceeds in three stages: the freelist is walked, then each
** tree listed in aRoot[] is checked recursively, and finally every page
** number from 1 up to the page count is tested to see whether something
** referenced it.  Reporting stops once mxErr errors have been recorded.
*/
char *sqlite3BtreeIntegrityCheck(
  Btree *p,     /* The btree to be checked */
  int *aRoot,   /* An array of root pages numbers for individual trees */
  int nRoot,    /* Number of entries in aRoot[] */
  int mxErr,    /* Stop reporting errors after this many */
  int *pnErr    /* Write number of errors seen to this variable */
){
  Pgno i;
  IntegrityCk sCheck;
  BtShared *pBt = p->pBt;
  int savedDbFlags = pBt->db->flags;  /* Restored after the tree checks */
  char zErr[100];                     /* Initial space for the error text */
  VVA_ONLY( int nRef );

  sqlite3BtreeEnter(p);
  assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
  VVA_ONLY( nRef = sqlite3PagerRefcount(pBt->pPager) );
  assert( nRef>=0 );
  sCheck.pBt = pBt;
  sCheck.pPager = pBt->pPager;
  sCheck.nPage = btreePagecount(sCheck.pBt);
  sCheck.mxErr = mxErr;
  sCheck.nErr = 0;
  sCheck.mallocFailed = 0;
  sCheck.zPfx = 0;
  sCheck.v1 = 0;
  sCheck.v2 = 0;
  sCheck.aPgRef = 0;
  sCheck.heap = 0;
  sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH);
  sCheck.errMsg.printfFlags = SQLITE_PRINTF_INTERNAL;
  /* Nothing to check when the page count is zero */
  if( sCheck.nPage==0 ){
    goto integrity_ck_cleanup;
  }

  /* One bit per page, indexed by page number (see getPageReferenced()) */
  sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1);
  if( !sCheck.aPgRef ){
    sCheck.mallocFailed = 1;
    goto integrity_ck_cleanup;
  }
  /* Scratch space for the min-heap used by checkTreePage() */
  sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize );
  if( sCheck.heap==0 ){
    sCheck.mallocFailed = 1;
    goto integrity_ck_cleanup;
  }

  /* Mark the pending-byte page as referenced up front so that it is not
  ** reported as unused by the final pass below. */
  i = PENDING_BYTE_PAGE(pBt);
  if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);

  /* Check the integrity of the freelist.  The first freelist page number
  ** and the freelist page count are read from offsets 32 and 36 of page 1.
  */
  sCheck.zPfx = "Main freelist: ";
  checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
            get4byte(&pBt->pPage1->aData[36]));
  sCheck.zPfx = 0;

  /* Check all the tables.  The SQLITE_CellSizeCk flag is cleared for the
  ** duration of the tree checks and restored afterwards.
  */
  testcase( pBt->db->flags & SQLITE_CellSizeCk );
  pBt->db->flags &= ~SQLITE_CellSizeCk;
  for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
    i64 notUsed;
    if( aRoot[i]==0 ) continue;
#ifndef SQLITE_OMIT_AUTOVACUUM
    if( pBt->autoVacuum && aRoot[i]>1 ){
      checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0);
    }
#endif
    checkTreePage(&sCheck, aRoot[i], &notUsed, LARGEST_INT64);
  }
  pBt->db->flags = savedDbFlags;

  /* Make sure every page in the file is referenced
  */
  for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
#ifdef SQLITE_OMIT_AUTOVACUUM
    if( getPageReferenced(&sCheck, i)==0 ){
      checkAppendMsg(&sCheck, "Page %d is never used", i);
    }
#else
    /* If the database supports auto-vacuum, make sure no tables contain
    ** references to pointer-map pages.
    */
    if( getPageReferenced(&sCheck, i)==0 && 
       (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
      checkAppendMsg(&sCheck, "Page %d is never used", i);
    }
    if( getPageReferenced(&sCheck, i)!=0 && 
       (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
      checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i);
    }
#endif
  }

  /* Clean up and report errors.
  */
integrity_ck_cleanup:
  sqlite3PageFree(sCheck.heap);
  sqlite3_free(sCheck.aPgRef);
  if( sCheck.mallocFailed ){
    /* Discard any partial message text but still count the failure */
    sqlite3StrAccumReset(&sCheck.errMsg);
    sCheck.nErr++;
  }
  *pnErr = sCheck.nErr;
  if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
  /* Make sure this analysis did not leave any unref() pages. */
  assert( nRef==sqlite3PagerRefcount(pBt->pPager) );
  sqlite3BtreeLeave(p);
  return sqlite3StrAccumFinish(&sCheck.errMsg);
}
009439  #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
009440  
009441  /*
009442  ** Return the full pathname of the underlying database file.  Return
009443  ** an empty string if the database is in-memory or a TEMP database.
009444  **
009445  ** The pager filename is invariant as long as the pager is
009446  ** open so it is safe to access without the BtShared mutex.
009447  */
009448  const char *sqlite3BtreeGetFilename(Btree *p){
009449    assert( p->pBt->pPager!=0 );
009450    return sqlite3PagerFilename(p->pBt->pPager, 1);
009451  }
009452  
009453  /*
009454  ** Return the pathname of the journal file for this database. The return
009455  ** value of this routine is the same regardless of whether the journal file
009456  ** has been created or not.
009457  **
009458  ** The pager journal filename is invariant as long as the pager is
009459  ** open so it is safe to access without the BtShared mutex.
009460  */
009461  const char *sqlite3BtreeGetJournalname(Btree *p){
009462    assert( p->pBt->pPager!=0 );
009463    return sqlite3PagerJournalname(p->pBt->pPager);
009464  }
009465  
009466  /*
009467  ** Return non-zero if a transaction is active.
009468  */
009469  int sqlite3BtreeIsInTrans(Btree *p){
009470    assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
009471    return (p && (p->inTrans==TRANS_WRITE));
009472  }
009473  
009474  #ifndef SQLITE_OMIT_WAL
009475  /*
009476  ** Run a checkpoint on the Btree passed as the first argument.
009477  **
009478  ** Return SQLITE_LOCKED if this or any other connection has an open 
009479  ** transaction on the shared-cache the argument Btree is connected to.
009480  **
009481  ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
009482  */
009483  int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){
009484    int rc = SQLITE_OK;
009485    if( p ){
009486      BtShared *pBt = p->pBt;
009487      sqlite3BtreeEnter(p);
009488      if( pBt->inTransaction!=TRANS_NONE ){
009489        rc = SQLITE_LOCKED;
009490      }else{
009491        rc = sqlite3PagerCheckpoint(pBt->pPager, p->db, eMode, pnLog, pnCkpt);
009492      }
009493      sqlite3BtreeLeave(p);
009494    }
009495    return rc;
009496  }
009497  #endif
009498  
009499  /*
009500  ** Return non-zero if a read (or write) transaction is active.
009501  */
009502  int sqlite3BtreeIsInReadTrans(Btree *p){
009503    assert( p );
009504    assert( sqlite3_mutex_held(p->db->mutex) );
009505    return p->inTrans!=TRANS_NONE;
009506  }
009507  
009508  int sqlite3BtreeIsInBackup(Btree *p){
009509    assert( p );
009510    assert( sqlite3_mutex_held(p->db->mutex) );
009511    return p->nBackup!=0;
009512  }
009513  
009514  /*
009515  ** This function returns a pointer to a blob of memory associated with
009516  ** a single shared-btree. The memory is used by client code for its own
009517  ** purposes (for example, to store a high-level schema associated with 
009518  ** the shared-btree). The btree layer manages reference counting issues.
009519  **
009520  ** The first time this is called on a shared-btree, nBytes bytes of memory
009521  ** are allocated, zeroed, and returned to the caller. For each subsequent 
009522  ** call the nBytes parameter is ignored and a pointer to the same blob
009523  ** of memory returned. 
009524  **
009525  ** If the nBytes parameter is 0 and the blob of memory has not yet been
009526  ** allocated, a null pointer is returned. If the blob has already been
009527  ** allocated, it is returned as normal.
009528  **
009529  ** Just before the shared-btree is closed, the function passed as the 
009530  ** xFree argument when the memory allocation was made is invoked on the 
009531  ** blob of allocated memory. The xFree function should not call sqlite3_free()
009532  ** on the memory, the btree layer does that.
009533  */
009534  void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
009535    BtShared *pBt = p->pBt;
009536    sqlite3BtreeEnter(p);
009537    if( !pBt->pSchema && nBytes ){
009538      pBt->pSchema = sqlite3DbMallocZero(0, nBytes);
009539      pBt->xFreeSchema = xFree;
009540    }
009541    sqlite3BtreeLeave(p);
009542    return pBt->pSchema;
009543  }
009544  
009545  /*
009546  ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared 
009547  ** btree as the argument handle holds an exclusive lock on the 
009548  ** sqlite_master table. Otherwise SQLITE_OK.
009549  */
009550  int sqlite3BtreeSchemaLocked(Btree *p){
009551    int rc;
009552    assert( sqlite3_mutex_held(p->db->mutex) );
009553    sqlite3BtreeEnter(p);
009554    rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
009555    assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
009556    sqlite3BtreeLeave(p);
009557    return rc;
009558  }
009559  
009560  
009561  #ifndef SQLITE_OMIT_SHARED_CACHE
009562  /*
009563  ** Obtain a lock on the table whose root page is iTab.  The
009564  ** lock is a write lock if isWritelock is true or a read lock
009565  ** if it is false.
009566  */
009567  int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
009568    int rc = SQLITE_OK;
009569    assert( p->inTrans!=TRANS_NONE );
009570    if( p->sharable ){
009571      u8 lockType = READ_LOCK + isWriteLock;
009572      assert( READ_LOCK+1==WRITE_LOCK );
009573      assert( isWriteLock==0 || isWriteLock==1 );
009574  
009575      sqlite3BtreeEnter(p);
009576      rc = querySharedCacheTableLock(p, iTab, lockType);
009577      if( rc==SQLITE_OK ){
009578        rc = setSharedCacheTableLock(p, iTab, lockType);
009579      }
009580      sqlite3BtreeLeave(p);
009581    }
009582    return rc;
009583  }
009584  #endif
009585  
009586  #ifndef SQLITE_OMIT_INCRBLOB
009587  /*
009588  ** Argument pCsr must be a cursor opened for writing on an 
009589  ** INTKEY table currently pointing at a valid table entry. 
009590  ** This function modifies the data stored as part of that entry.
009591  **
009592  ** Only the data content may only be modified, it is not possible to 
009593  ** change the length of the data stored. If this function is called with
009594  ** parameters that attempt to write past the end of the existing data,
009595  ** no modifications are made and SQLITE_CORRUPT is returned.
009596  */
009597  int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
009598    int rc;
009599    assert( cursorOwnsBtShared(pCsr) );
009600    assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
009601    assert( pCsr->curFlags & BTCF_Incrblob );
009602  
009603    rc = restoreCursorPosition(pCsr);
009604    if( rc!=SQLITE_OK ){
009605      return rc;
009606    }
009607    assert( pCsr->eState!=CURSOR_REQUIRESEEK );
009608    if( pCsr->eState!=CURSOR_VALID ){
009609      return SQLITE_ABORT;
009610    }
009611  
009612    /* Save the positions of all other cursors open on this table. This is
009613    ** required in case any of them are holding references to an xFetch
009614    ** version of the b-tree page modified by the accessPayload call below.
009615    **
009616    ** Note that pCsr must be open on a INTKEY table and saveCursorPosition()
009617    ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence
009618    ** saveAllCursors can only return SQLITE_OK.
009619    */
009620    VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr);
009621    assert( rc==SQLITE_OK );
009622  
009623    /* Check some assumptions: 
009624    **   (a) the cursor is open for writing,
009625    **   (b) there is a read/write transaction open,
009626    **   (c) the connection holds a write-lock on the table (if required),
009627    **   (d) there are no conflicting read-locks, and
009628    **   (e) the cursor points at a valid row of an intKey table.
009629    */
009630    if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){
009631      return SQLITE_READONLY;
009632    }
009633    assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0
009634                && pCsr->pBt->inTransaction==TRANS_WRITE );
009635    assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
009636    assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
009637    assert( pCsr->apPage[pCsr->iPage]->intKey );
009638  
009639    return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
009640  }
009641  
009642  /* 
009643  ** Mark this cursor as an incremental blob cursor.
009644  */
009645  void sqlite3BtreeIncrblobCursor(BtCursor *pCur){
009646    pCur->curFlags |= BTCF_Incrblob;
009647    pCur->pBtree->hasIncrblobCur = 1;
009648  }
009649  #endif
009650  
009651  /*
009652  ** Set both the "write version" (single byte at byte offset 18) and
009653  ** "read version" (single byte at byte offset 19) fields in the database
009654  ** header to iVersion.
009655  */
009656  int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
009657    BtShared *pBt = pBtree->pBt;
009658    int rc;                         /* Return code */
009659   
009660    assert( iVersion==1 || iVersion==2 );
009661  
009662    /* If setting the version fields to 1, do not automatically open the
009663    ** WAL connection, even if the version fields are currently set to 2.
009664    */
009665    pBt->btsFlags &= ~BTS_NO_WAL;
009666    if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL;
009667  
009668    rc = sqlite3BtreeBeginTrans(pBtree, 0);
009669    if( rc==SQLITE_OK ){
009670      u8 *aData = pBt->pPage1->aData;
009671      if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){
009672        rc = sqlite3BtreeBeginTrans(pBtree, 2);
009673        if( rc==SQLITE_OK ){
009674          rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
009675          if( rc==SQLITE_OK ){
009676            aData[18] = (u8)iVersion;
009677            aData[19] = (u8)iVersion;
009678          }
009679        }
009680      }
009681    }
009682  
009683    pBt->btsFlags &= ~BTS_NO_WAL;
009684    return rc;
009685  }
009686  
009687  /*
009688  ** Return true if the cursor has a hint specified.  This routine is
009689  ** only used from within assert() statements
009690  */
009691  int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){
009692    return (pCsr->hints & mask)!=0;
009693  }
009694  
009695  /*
009696  ** Return true if the given Btree is read-only.
009697  */
009698  int sqlite3BtreeIsReadonly(Btree *p){
009699    return (p->pBt->btsFlags & BTS_READ_ONLY)!=0;
009700  }
009701  
009702  /*
009703  ** Return the size of the header added to each page by this module.
009704  */
009705  int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); }
009706  
009707  #if !defined(SQLITE_OMIT_SHARED_CACHE)
009708  /*
009709  ** Return true if the Btree passed as the only argument is sharable.
009710  */
009711  int sqlite3BtreeSharable(Btree *p){
009712    return p->sharable;
009713  }
009714  
009715  /*
009716  ** Return the number of connections to the BtShared object accessed by
009717  ** the Btree handle passed as the only argument. For private caches 
009718  ** this is always 1. For shared caches it may be 1 or greater.
009719  */
009720  int sqlite3BtreeConnectionCount(Btree *p){
009721    testcase( p->sharable );
009722    return p->pBt->nRef;
009723  }
009724  #endif