node.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600
  1. package bolt
  2. import (
  3. "bytes"
  4. "fmt"
  5. "sort"
  6. "unsafe"
  7. )
  8. // node represents an in-memory, deserialized page.
  9. type node struct {
  10. bucket *Bucket
  11. isLeaf bool
  12. unbalanced bool
  13. spilled bool
  14. key []byte
  15. pgid pgid
  16. parent *node
  17. children nodes
  18. inodes inodes
  19. }
  20. // root returns the top-level node this node is attached to.
  21. func (n *node) root() *node {
  22. if n.parent == nil {
  23. return n
  24. }
  25. return n.parent.root()
  26. }
  27. // minKeys returns the minimum number of inodes this node should have.
  28. func (n *node) minKeys() int {
  29. if n.isLeaf {
  30. return 1
  31. }
  32. return 2
  33. }
  34. // size returns the size of the node after serialization.
  35. func (n *node) size() int {
  36. sz, elsz := pageHeaderSize, n.pageElementSize()
  37. for i := 0; i < len(n.inodes); i++ {
  38. item := &n.inodes[i]
  39. sz += elsz + len(item.key) + len(item.value)
  40. }
  41. return sz
  42. }
  43. // sizeLessThan returns true if the node is less than a given size.
  44. // This is an optimization to avoid calculating a large node when we only need
  45. // to know if it fits inside a certain page size.
  46. func (n *node) sizeLessThan(v int) bool {
  47. sz, elsz := pageHeaderSize, n.pageElementSize()
  48. for i := 0; i < len(n.inodes); i++ {
  49. item := &n.inodes[i]
  50. sz += elsz + len(item.key) + len(item.value)
  51. if sz >= v {
  52. return false
  53. }
  54. }
  55. return true
  56. }
  57. // pageElementSize returns the size of each page element based on the type of node.
  58. func (n *node) pageElementSize() int {
  59. if n.isLeaf {
  60. return leafPageElementSize
  61. }
  62. return branchPageElementSize
  63. }
  64. // childAt returns the child node at a given index.
  65. func (n *node) childAt(index int) *node {
  66. if n.isLeaf {
  67. panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
  68. }
  69. return n.bucket.node(n.inodes[index].pgid, n)
  70. }
  71. // childIndex returns the index of a given child node.
  72. func (n *node) childIndex(child *node) int {
  73. index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
  74. return index
  75. }
  76. // numChildren returns the number of children.
  77. func (n *node) numChildren() int {
  78. return len(n.inodes)
  79. }
  80. // nextSibling returns the next node with the same parent.
  81. func (n *node) nextSibling() *node {
  82. if n.parent == nil {
  83. return nil
  84. }
  85. index := n.parent.childIndex(n)
  86. if index >= n.parent.numChildren()-1 {
  87. return nil
  88. }
  89. return n.parent.childAt(index + 1)
  90. }
  91. // prevSibling returns the previous node with the same parent.
  92. func (n *node) prevSibling() *node {
  93. if n.parent == nil {
  94. return nil
  95. }
  96. index := n.parent.childIndex(n)
  97. if index == 0 {
  98. return nil
  99. }
  100. return n.parent.childAt(index - 1)
  101. }
  102. // put inserts a key/value.
  103. func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
  104. if pgid >= n.bucket.tx.meta.pgid {
  105. panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
  106. } else if len(oldKey) <= 0 {
  107. panic("put: zero-length old key")
  108. } else if len(newKey) <= 0 {
  109. panic("put: zero-length new key")
  110. }
  111. // Find insertion index.
  112. index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })
  113. // Add capacity and shift nodes if we don't have an exact match and need to insert.
  114. exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
  115. if !exact {
  116. n.inodes = append(n.inodes, inode{})
  117. copy(n.inodes[index+1:], n.inodes[index:])
  118. }
  119. inode := &n.inodes[index]
  120. inode.flags = flags
  121. inode.key = newKey
  122. inode.value = value
  123. inode.pgid = pgid
  124. _assert(len(inode.key) > 0, "put: zero-length inode key")
  125. }
  126. // del removes a key from the node.
  127. func (n *node) del(key []byte) {
  128. // Find index of key.
  129. index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })
  130. // Exit if the key isn't found.
  131. if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
  132. return
  133. }
  134. // Delete inode from the node.
  135. n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
  136. // Mark the node as needing rebalancing.
  137. n.unbalanced = true
  138. }
  139. // read initializes the node from a page.
  140. func (n *node) read(p *page) {
  141. n.pgid = p.id
  142. n.isLeaf = ((p.flags & leafPageFlag) != 0)
  143. n.inodes = make(inodes, int(p.count))
  144. for i := 0; i < int(p.count); i++ {
  145. inode := &n.inodes[i]
  146. if n.isLeaf {
  147. elem := p.leafPageElement(uint16(i))
  148. inode.flags = elem.flags
  149. inode.key = elem.key()
  150. inode.value = elem.value()
  151. } else {
  152. elem := p.branchPageElement(uint16(i))
  153. inode.pgid = elem.pgid
  154. inode.key = elem.key()
  155. }
  156. _assert(len(inode.key) > 0, "read: zero-length inode key")
  157. }
  158. // Save first key so we can find the node in the parent when we spill.
  159. if len(n.inodes) > 0 {
  160. n.key = n.inodes[0].key
  161. _assert(len(n.key) > 0, "read: zero-length node key")
  162. } else {
  163. n.key = nil
  164. }
  165. }
  166. // write writes the items onto one or more pages.
  167. func (n *node) write(p *page) {
  168. // Initialize page.
  169. if n.isLeaf {
  170. p.flags |= leafPageFlag
  171. } else {
  172. p.flags |= branchPageFlag
  173. }
  174. if len(n.inodes) >= 0xFFFF {
  175. panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
  176. }
  177. p.count = uint16(len(n.inodes))
  178. // Loop over each item and write it to the page.
  179. b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
  180. for i, item := range n.inodes {
  181. _assert(len(item.key) > 0, "write: zero-length inode key")
  182. // Write the page element.
  183. if n.isLeaf {
  184. elem := p.leafPageElement(uint16(i))
  185. elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
  186. elem.flags = item.flags
  187. elem.ksize = uint32(len(item.key))
  188. elem.vsize = uint32(len(item.value))
  189. } else {
  190. elem := p.branchPageElement(uint16(i))
  191. elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
  192. elem.ksize = uint32(len(item.key))
  193. elem.pgid = item.pgid
  194. _assert(elem.pgid != p.id, "write: circular dependency occurred")
  195. }
  196. // If the length of key+value is larger than the max allocation size
  197. // then we need to reallocate the byte array pointer.
  198. //
  199. // See: https://github.com/boltdb/bolt/pull/335
  200. klen, vlen := len(item.key), len(item.value)
  201. if len(b) < klen+vlen {
  202. b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:]
  203. }
  204. // Write data for the element to the end of the page.
  205. copy(b[0:], item.key)
  206. b = b[klen:]
  207. copy(b[0:], item.value)
  208. b = b[vlen:]
  209. }
  210. // DEBUG ONLY: n.dump()
  211. }
  212. // split breaks up a node into multiple smaller nodes, if appropriate.
  213. // This should only be called from the spill() function.
  214. func (n *node) split(pageSize int) []*node {
  215. var nodes []*node
  216. node := n
  217. for {
  218. // Split node into two.
  219. a, b := node.splitTwo(pageSize)
  220. nodes = append(nodes, a)
  221. // If we can't split then exit the loop.
  222. if b == nil {
  223. break
  224. }
  225. // Set node to b so it gets split on the next iteration.
  226. node = b
  227. }
  228. return nodes
  229. }
  230. // splitTwo breaks up a node into two smaller nodes, if appropriate.
  231. // This should only be called from the split() function.
  232. func (n *node) splitTwo(pageSize int) (*node, *node) {
  233. // Ignore the split if the page doesn't have at least enough nodes for
  234. // two pages or if the nodes can fit in a single page.
  235. if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
  236. return n, nil
  237. }
  238. // Determine the threshold before starting a new node.
  239. var fillPercent = n.bucket.FillPercent
  240. if fillPercent < minFillPercent {
  241. fillPercent = minFillPercent
  242. } else if fillPercent > maxFillPercent {
  243. fillPercent = maxFillPercent
  244. }
  245. threshold := int(float64(pageSize) * fillPercent)
  246. // Determine split position and sizes of the two pages.
  247. splitIndex, _ := n.splitIndex(threshold)
  248. // Split node into two separate nodes.
  249. // If there's no parent then we'll need to create one.
  250. if n.parent == nil {
  251. n.parent = &node{bucket: n.bucket, children: []*node{n}}
  252. }
  253. // Create a new node and add it to the parent.
  254. next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
  255. n.parent.children = append(n.parent.children, next)
  256. // Split inodes across two nodes.
  257. next.inodes = n.inodes[splitIndex:]
  258. n.inodes = n.inodes[:splitIndex]
  259. // Update the statistics.
  260. n.bucket.tx.stats.Split++
  261. return n, next
  262. }
  263. // splitIndex finds the position where a page will fill a given threshold.
  264. // It returns the index as well as the size of the first page.
  265. // This is only be called from split().
  266. func (n *node) splitIndex(threshold int) (index, sz int) {
  267. sz = pageHeaderSize
  268. // Loop until we only have the minimum number of keys required for the second page.
  269. for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
  270. index = i
  271. inode := n.inodes[i]
  272. elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
  273. // If we have at least the minimum number of keys and adding another
  274. // node would put us over the threshold then exit and return.
  275. if i >= minKeysPerPage && sz+elsize > threshold {
  276. break
  277. }
  278. // Add the element size to the total size.
  279. sz += elsize
  280. }
  281. return
  282. }
  283. // spill writes the nodes to dirty pages and splits nodes as it goes.
  284. // Returns an error if dirty pages cannot be allocated.
  285. func (n *node) spill() error {
  286. var tx = n.bucket.tx
  287. if n.spilled {
  288. return nil
  289. }
  290. // Spill child nodes first. Child nodes can materialize sibling nodes in
  291. // the case of split-merge so we cannot use a range loop. We have to check
  292. // the children size on every loop iteration.
  293. sort.Sort(n.children)
  294. for i := 0; i < len(n.children); i++ {
  295. if err := n.children[i].spill(); err != nil {
  296. return err
  297. }
  298. }
  299. // We no longer need the child list because it's only used for spill tracking.
  300. n.children = nil
  301. // Split nodes into appropriate sizes. The first node will always be n.
  302. var nodes = n.split(tx.db.pageSize)
  303. for _, node := range nodes {
  304. // Add node's page to the freelist if it's not new.
  305. if node.pgid > 0 {
  306. tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
  307. node.pgid = 0
  308. }
  309. // Allocate contiguous space for the node.
  310. p, err := tx.allocate((node.size() / tx.db.pageSize) + 1)
  311. if err != nil {
  312. return err
  313. }
  314. // Write the node.
  315. if p.id >= tx.meta.pgid {
  316. panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
  317. }
  318. node.pgid = p.id
  319. node.write(p)
  320. node.spilled = true
  321. // Insert into parent inodes.
  322. if node.parent != nil {
  323. var key = node.key
  324. if key == nil {
  325. key = node.inodes[0].key
  326. }
  327. node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
  328. node.key = node.inodes[0].key
  329. _assert(len(node.key) > 0, "spill: zero-length node key")
  330. }
  331. // Update the statistics.
  332. tx.stats.Spill++
  333. }
  334. // If the root node split and created a new root then we need to spill that
  335. // as well. We'll clear out the children to make sure it doesn't try to respill.
  336. if n.parent != nil && n.parent.pgid == 0 {
  337. n.children = nil
  338. return n.parent.spill()
  339. }
  340. return nil
  341. }
  342. // rebalance attempts to combine the node with sibling nodes if the node fill
  343. // size is below a threshold or if there are not enough keys.
  344. func (n *node) rebalance() {
  345. if !n.unbalanced {
  346. return
  347. }
  348. n.unbalanced = false
  349. // Update statistics.
  350. n.bucket.tx.stats.Rebalance++
  351. // Ignore if node is above threshold (25%) and has enough keys.
  352. var threshold = n.bucket.tx.db.pageSize / 4
  353. if n.size() > threshold && len(n.inodes) > n.minKeys() {
  354. return
  355. }
  356. // Root node has special handling.
  357. if n.parent == nil {
  358. // If root node is a branch and only has one node then collapse it.
  359. if !n.isLeaf && len(n.inodes) == 1 {
  360. // Move root's child up.
  361. child := n.bucket.node(n.inodes[0].pgid, n)
  362. n.isLeaf = child.isLeaf
  363. n.inodes = child.inodes[:]
  364. n.children = child.children
  365. // Reparent all child nodes being moved.
  366. for _, inode := range n.inodes {
  367. if child, ok := n.bucket.nodes[inode.pgid]; ok {
  368. child.parent = n
  369. }
  370. }
  371. // Remove old child.
  372. child.parent = nil
  373. delete(n.bucket.nodes, child.pgid)
  374. child.free()
  375. }
  376. return
  377. }
  378. // If node has no keys then just remove it.
  379. if n.numChildren() == 0 {
  380. n.parent.del(n.key)
  381. n.parent.removeChild(n)
  382. delete(n.bucket.nodes, n.pgid)
  383. n.free()
  384. n.parent.rebalance()
  385. return
  386. }
  387. _assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
  388. // Destination node is right sibling if idx == 0, otherwise left sibling.
  389. var target *node
  390. var useNextSibling = (n.parent.childIndex(n) == 0)
  391. if useNextSibling {
  392. target = n.nextSibling()
  393. } else {
  394. target = n.prevSibling()
  395. }
  396. // If both this node and the target node are too small then merge them.
  397. if useNextSibling {
  398. // Reparent all child nodes being moved.
  399. for _, inode := range target.inodes {
  400. if child, ok := n.bucket.nodes[inode.pgid]; ok {
  401. child.parent.removeChild(child)
  402. child.parent = n
  403. child.parent.children = append(child.parent.children, child)
  404. }
  405. }
  406. // Copy over inodes from target and remove target.
  407. n.inodes = append(n.inodes, target.inodes...)
  408. n.parent.del(target.key)
  409. n.parent.removeChild(target)
  410. delete(n.bucket.nodes, target.pgid)
  411. target.free()
  412. } else {
  413. // Reparent all child nodes being moved.
  414. for _, inode := range n.inodes {
  415. if child, ok := n.bucket.nodes[inode.pgid]; ok {
  416. child.parent.removeChild(child)
  417. child.parent = target
  418. child.parent.children = append(child.parent.children, child)
  419. }
  420. }
  421. // Copy over inodes to target and remove node.
  422. target.inodes = append(target.inodes, n.inodes...)
  423. n.parent.del(n.key)
  424. n.parent.removeChild(n)
  425. delete(n.bucket.nodes, n.pgid)
  426. n.free()
  427. }
  428. // Either this node or the target node was deleted from the parent so rebalance it.
  429. n.parent.rebalance()
  430. }
  431. // removes a node from the list of in-memory children.
  432. // This does not affect the inodes.
  433. func (n *node) removeChild(target *node) {
  434. for i, child := range n.children {
  435. if child == target {
  436. n.children = append(n.children[:i], n.children[i+1:]...)
  437. return
  438. }
  439. }
  440. }
  441. // dereference causes the node to copy all its inode key/value references to heap memory.
  442. // This is required when the mmap is reallocated so inodes are not pointing to stale data.
  443. func (n *node) dereference() {
  444. if n.key != nil {
  445. key := make([]byte, len(n.key))
  446. copy(key, n.key)
  447. n.key = key
  448. _assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
  449. }
  450. for i := range n.inodes {
  451. inode := &n.inodes[i]
  452. key := make([]byte, len(inode.key))
  453. copy(key, inode.key)
  454. inode.key = key
  455. _assert(len(inode.key) > 0, "dereference: zero-length inode key")
  456. value := make([]byte, len(inode.value))
  457. copy(value, inode.value)
  458. inode.value = value
  459. }
  460. // Recursively dereference children.
  461. for _, child := range n.children {
  462. child.dereference()
  463. }
  464. // Update statistics.
  465. n.bucket.tx.stats.NodeDeref++
  466. }
  467. // free adds the node's underlying page to the freelist.
  468. func (n *node) free() {
  469. if n.pgid != 0 {
  470. n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
  471. n.pgid = 0
  472. }
  473. }
  474. // dump writes the contents of the node to STDERR for debugging purposes.
  475. /*
  476. func (n *node) dump() {
  477. // Write node header.
  478. var typ = "branch"
  479. if n.isLeaf {
  480. typ = "leaf"
  481. }
  482. warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))
  483. // Write out abbreviated version of each item.
  484. for _, item := range n.inodes {
  485. if n.isLeaf {
  486. if item.flags&bucketLeafFlag != 0 {
  487. bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
  488. warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
  489. } else {
  490. warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
  491. }
  492. } else {
  493. warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
  494. }
  495. }
  496. warn("")
  497. }
  498. */
  499. type nodes []*node
  500. func (s nodes) Len() int { return len(s) }
  501. func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
  502. func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 }
  503. // inode represents an internal node inside of a node.
  504. // It can be used to point to elements in a page or point
  505. // to an element which hasn't been added to a page yet.
  506. type inode struct {
  507. flags uint32
  508. pgid pgid
  509. key []byte
  510. value []byte
  511. }
  512. type inodes []inode