You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

480 lines
12 KiB

  1. package arbo
  2. import (
  3. "bytes"
  4. "fmt"
  5. "math"
  6. "sort"
  7. )
  8. /*
  9. AddBatch design
  10. ===============
  11. CASE A: Empty Tree --> if tree is empty (root==0)
  12. =================================================
  13. - Build the full tree from bottom to top (from all the leaf to the root)
  14. CASE B: ALMOST CASE A, Almost empty Tree --> if Tree has numLeafs < minLeafsThreshold
  15. ==============================================================================
  16. - Get the Leafs (key & value) (iterate the tree from the current root getting
  17. the leafs)
  18. - Create a new empty Tree
  19. - Do CASE A for the new Tree, giving the already existing key&values (leafs)
  20. from the original Tree + the new key&values to be added from the AddBatch call
  21. R R
  22. / \ / \
  23. A * / \
  24. / \ / \
  25. B C * *
  26. / | / \
  27. / | / \
  28. / | / \
  29. L: A B G D
  30. / \
  31. / \
  32. / \
  33. C *
  34. / \
  35. / \
  36. / \
  37. ... ... (nLeafs < minLeafsThreshold)
  38. CASE C: ALMOST CASE B --> if Tree has few Leafs (but numLeafs>=minLeafsThreshold)
  39. ==============================================================================
  40. - Use A, B, G, D as Roots of subtrees
  41. - Do CASE B for each subtree
  42. - Then go from L to the Root
  43. R
  44. / \
  45. / \
  46. / \
  47. * *
  48. / | / \
  49. / | / \
  50. / | / \
  51. L: A B G D
  52. / \
  53. / \
  54. / \
  55. C *
  56. / \
  57. / \
  58. / \
  59. ... ... (nLeafs >= minLeafsThreshold)
  60. CASE D: Already populated Tree
  61. ==============================
  62. - Use A, B, C, D as subtrees
  63. - Sort the Keys in Buckets that share the initial part of the path
  64. - For each subtree add there the new leafs
  65. R
  66. / \
  67. / \
  68. / \
  69. * *
  70. / | / \
  71. / | / \
  72. / | / \
  73. L: A B C D
  74. /\ /\ / \ / \
  75. ... ... ... ... ... ...
  76. CASE E: Already populated Tree Unbalanced
  77. =========================================
  78. - Need to fill M1 and M2, and then will be able to use CASE D
  79. - Search for M1 & M2 in the inputted Keys
  80. - Add M1 & M2 to the Tree
  81. - From here can use CASE D
  82. R
  83. / \
  84. / \
  85. / \
  86. * *
  87. | \
  88. | \
  89. | \
  90. L: M1 * M2 * (where M1 and M2 are empty)
  91. / | /
  92. / | /
  93. / | /
  94. A * *
  95. / \ | \
  96. / \ | \
  97. / \ | \
  98. B * * C
  99. / \ |\
  100. ... ... | \
  101. | \
  102. D E
  103. Algorithm decision
  104. ==================
  105. - if nLeafs==0 (root==0): CASE A
  106. - if nLeafs<minLeafsThreshold: CASE B
  107. - if nLeafs>=minLeafsThreshold && (nLeafs/nBuckets) < minLeafsThreshold: CASE C
  108. - else: CASE D & CASE E
  109. - Multiple tree.Add calls: O(n log n)
  110. - Used in: cases A, B, C
  111. - Tree from bottom to top: O(log n)
  112. - Used in: cases D, E
  113. */
  // Tuning parameters of the batch-add algorithm. Both values are temporary
  // (WIP) and are meant to be autocalculated in the future.
  const (
  	// minLeafsThreshold is the number of leafs under which the tree is
  	// rebuilt from scratch (CASE B) instead of being split in subtrees.
  	minLeafsThreshold uint64 = 100 // nolint:gomnd // TMP WIP this will be autocalculated
  	// nBuckets is the number of subtrees (buckets) in which the keys are
  	// classified; it is expected to be a power of 2.
  	nBuckets uint64 = 4 // TMP WIP this will be autocalculated
  )
  118. // AddBatchOpt is the WIP implementation of the AddBatch method in a more
  119. // optimized approach.
  120. func (t *Tree) AddBatchOpt(keys, values [][]byte) ([]int, error) {
  121. t.updateAccessTime()
  122. t.Lock()
  123. defer t.Unlock()
  124. // TODO if len(keys) is not a power of 2, add padding of empty
  125. // keys&values. Maybe when len(keyvalues) is not a power of 2, cut at
  126. // the biggest power of 2 under the len(keys), add those 2**n key-values
  127. // using the AddBatch approach, and then add the remaining key-values
  128. // using tree.Add.
  129. kvs, err := t.keysValuesToKvs(keys, values)
  130. if err != nil {
  131. return nil, err
  132. }
  133. t.tx, err = t.db.NewTx() // TODO add t.tx.Commit()
  134. if err != nil {
  135. return nil, err
  136. }
  137. // CASE A: if nLeafs==0 (root==0)
  138. if bytes.Equal(t.root, t.emptyHash) {
  139. // sort keys & values by path
  140. sortKvs(kvs)
  141. return t.buildTreeBottomUp(kvs)
  142. }
  143. // CASE B: if nLeafs<nBuckets
  144. nLeafs, err := t.GetNLeafs()
  145. if err != nil {
  146. return nil, err
  147. }
  148. if nLeafs < minLeafsThreshold { // CASE B
  149. invalids, excedents, err := t.caseB(0, kvs)
  150. if err != nil {
  151. return nil, err
  152. }
  153. // add the excedents
  154. for i := 0; i < len(excedents); i++ {
  155. err = t.add(0, excedents[i].k, excedents[i].v)
  156. if err != nil {
  157. invalids = append(invalids, excedents[i].pos)
  158. }
  159. }
  160. return invalids, nil
  161. }
  162. // CASE C: if nLeafs>=minLeafsThreshold && (nLeafs/nBuckets) < minLeafsThreshold
  163. // available parallelization, will need to be a power of 2 (2**n)
  164. var excedents []kv
  165. l := int(math.Log2(float64(nBuckets)))
  166. if nLeafs >= minLeafsThreshold && (nLeafs/nBuckets) < minLeafsThreshold {
  167. // TODO move to own function
  168. // 1. go down until level L (L=log2(nBuckets))
  169. keysAtL, err := t.getKeysAtLevel(l + 1)
  170. if err != nil {
  171. return nil, err
  172. }
  173. buckets := splitInBuckets(kvs, nBuckets)
  174. // 2. use keys at level L as roots of the subtrees under each one
  175. var subRoots [][]byte
  176. // TODO parallelize
  177. for i := 0; i < len(keysAtL); i++ {
  178. bucketTree := Tree{tx: t.tx, db: t.db, maxLevels: t.maxLevels,
  179. hashFunction: t.hashFunction, root: keysAtL[i]}
  180. // 3. and do CASE B for each
  181. _, bucketExcedents, err := bucketTree.caseB(l, buckets[i])
  182. if err != nil {
  183. return nil, err
  184. }
  185. excedents = append(excedents, bucketExcedents...)
  186. subRoots = append(subRoots, bucketTree.root)
  187. }
  188. // 4. go upFromKeys from the new roots of the subtrees
  189. newRoot, err := t.upFromKeys(subRoots)
  190. if err != nil {
  191. return nil, err
  192. }
  193. t.root = newRoot
  194. var invalids []int
  195. for i := 0; i < len(excedents); i++ {
  196. // Add until the level L
  197. err = t.add(0, excedents[i].k, excedents[i].v)
  198. if err != nil {
  199. invalids = append(invalids, excedents[i].pos) // TODO WIP
  200. }
  201. }
  202. return invalids, nil
  203. }
  204. // TODO store t.root into DB
  205. // TODO update NLeafs from DB
  206. return nil, fmt.Errorf("UNIMPLEMENTED")
  207. }
  208. func (t *Tree) caseB(l int, kvs []kv) ([]int, []kv, error) {
  209. // get already existing keys
  210. aKs, aVs, err := t.getLeafs(t.root)
  211. if err != nil {
  212. return nil, nil, err
  213. }
  214. aKvs, err := t.keysValuesToKvs(aKs, aVs)
  215. if err != nil {
  216. return nil, nil, err
  217. }
  218. // add already existing key-values to the inputted key-values
  219. kvs = append(kvs, aKvs...)
  220. // proceed with CASE A
  221. sortKvs(kvs)
  222. // cutPowerOfTwo, the excedent add it as normal Tree.Add
  223. kvsP2, kvsNonP2 := cutPowerOfTwo(kvs)
  224. invalids, err := t.buildTreeBottomUp(kvsP2)
  225. if err != nil {
  226. return nil, nil, err
  227. }
  228. // return the excedents which will be added at the full tree at the end
  229. return invalids, kvsNonP2, nil
  230. }
  231. func splitInBuckets(kvs []kv, nBuckets uint64) [][]kv {
  232. buckets := make([][]kv, nBuckets)
  233. // 1. classify the keyvalues into buckets
  234. for i := 0; i < len(kvs); i++ {
  235. pair := kvs[i]
  236. bucketnum := keyToBucket(pair.k, int(nBuckets))
  237. buckets[bucketnum] = append(buckets[bucketnum], pair)
  238. }
  239. return buckets
  240. }
  // keyToBucket returns the index of the bucket, in [0, nBuckets), to which
  // the key k belongs.
  //
  // The bucket is found by halving the range of buckets floor(log2(nBuckets))
  // times: at step i the i-th bit of k (bytes from left to right, bits inside
  // each byte from the least significant one) selects the upper half when set
  // and the lower half otherwise. Bits beyond the end of k are read as 0, and
  // nBuckets < 1 yields bucket 0.
  //
  // TODO rename in a more 'real' name (calculate bucket from/for key)
  func keyToBucket(k []byte, nBuckets int) int {
  	if nBuckets < 1 {
  		return 0
  	}
  	nLevels := int(math.Log2(float64(nBuckets)))

  	// [lo, hi) is the range of candidate buckets; no slice is needed as
  	// the candidates are always consecutive integers.
  	lo, hi := 0, nBuckets
  	for i := 0; i < nLevels; i++ {
  		mid := lo + (hi-lo)/2 //nolint:gomnd
  		if i/8 < len(k) && k[i/8]&(1<<(i%8)) != 0 {
  			lo = mid
  		} else {
  			hi = mid
  		}
  	}
  	return lo
  }
  // kv is a key-value pair together with the metadata needed while adding
  // it in a batch.
  type kv struct {
  	// pos is the original position of the pair in the inputted arrays.
  	pos int
  	// keyPath is the value by which the pairs are sorted (see sortKvs).
  	keyPath []byte
  	// k and v are the key and the value of the pair.
  	k, v []byte
  }
  // compareBytes compares byte slices where the bytes are compared from left to
  // right and each byte is compared by bit from right to left (least
  // significant bit first).
  //
  // It returns true when, at the first bit in which a and b differ, a has a 0
  // and b has a 1; it returns false when a has the 1 or when no bit differs.
  // Slices of different length are compared as if the shorter one was padded
  // with zero bytes, so the function never indexes out of range and gives a
  // consistent ordering when used as a sort 'less' function.
  func compareBytes(a, b []byte) bool {
  	n := len(a)
  	if len(b) > n {
  		n = len(b)
  	}
  	for i := 0; i < n; i++ {
  		var aByte, bByte byte
  		if i < len(a) {
  			aByte = a[i]
  		}
  		if i < len(b) {
  			bByte = b[i]
  		}
  		if aByte == bByte {
  			continue
  		}
  		for j := 0; j < 8; j++ {
  			aBit := aByte & (1 << j)
  			bBit := bByte & (1 << j)
  			if aBit != bBit {
  				return aBit < bBit
  			}
  		}
  	}
  	return false
  }
  284. // sortKvs sorts the kv by path
  285. func sortKvs(kvs []kv) {
  286. sort.Slice(kvs, func(i, j int) bool {
  287. return compareBytes(kvs[i].keyPath, kvs[j].keyPath)
  288. })
  289. }
  290. func (t *Tree) keysValuesToKvs(ks, vs [][]byte) ([]kv, error) {
  291. if len(ks) != len(vs) {
  292. return nil, fmt.Errorf("len(keys)!=len(values) (%d!=%d)",
  293. len(ks), len(vs))
  294. }
  295. kvs := make([]kv, len(ks))
  296. for i := 0; i < len(ks); i++ {
  297. keyPath := make([]byte, t.hashFunction.Len())
  298. copy(keyPath[:], ks[i])
  299. kvs[i].pos = i
  300. kvs[i].keyPath = ks[i]
  301. kvs[i].k = ks[i]
  302. kvs[i].v = vs[i]
  303. }
  304. return kvs, nil
  305. }
  306. /*
  307. func (t *Tree) kvsToKeysValues(kvs []kv) ([][]byte, [][]byte) {
  308. ks := make([][]byte, len(kvs))
  309. vs := make([][]byte, len(kvs))
  310. for i := 0; i < len(kvs); i++ {
  311. ks[i] = kvs[i].k
  312. vs[i] = kvs[i].v
  313. }
  314. return ks, vs
  315. }
  316. */
  317. // keys & values must be sorted by path, and the array ks must be length
  318. // multiple of 2
  319. // TODO return index of failed keyvaules
  320. func (t *Tree) buildTreeBottomUp(kvs []kv) ([]int, error) {
  321. // build the leafs
  322. leafKeys := make([][]byte, len(kvs))
  323. for i := 0; i < len(kvs); i++ {
  324. // TODO handle the case where Key&Value == 0
  325. leafKey, leafValue, err := newLeafValue(t.hashFunction, kvs[i].k, kvs[i].v)
  326. if err != nil {
  327. return nil, err
  328. }
  329. // store leafKey & leafValue to db
  330. if err := t.tx.Put(leafKey, leafValue); err != nil {
  331. return nil, err
  332. }
  333. leafKeys[i] = leafKey
  334. }
  335. // TODO parallelize t.upFromKeys until level log2(nBuckets) is reached
  336. r, err := t.upFromKeys(leafKeys)
  337. if err != nil {
  338. return nil, err
  339. }
  340. t.root = r
  341. return nil, nil
  342. }
  343. // keys & values must be sorted by path, and the array ks must be length
  344. // multiple of 2
  345. func (t *Tree) upFromKeys(ks [][]byte) ([]byte, error) {
  346. if len(ks) == 1 {
  347. return ks[0], nil
  348. }
  349. var rKs [][]byte
  350. for i := 0; i < len(ks); i += 2 {
  351. // TODO handle the case where Key&Value == 0
  352. k, v, err := newIntermediate(t.hashFunction, ks[i], ks[i+1])
  353. if err != nil {
  354. return nil, err
  355. }
  356. // store k-v to db
  357. if err = t.tx.Put(k, v); err != nil {
  358. return nil, err
  359. }
  360. rKs = append(rKs, k)
  361. }
  362. return t.upFromKeys(rKs)
  363. }
  364. func (t *Tree) getLeafs(root []byte) ([][]byte, [][]byte, error) {
  365. var ks, vs [][]byte
  366. err := t.iter(root, func(k, v []byte) {
  367. if v[0] != PrefixValueLeaf {
  368. return
  369. }
  370. leafK, leafV := readLeafValue(v)
  371. ks = append(ks, leafK)
  372. vs = append(vs, leafV)
  373. })
  374. return ks, vs, err
  375. }
  376. func (t *Tree) getKeysAtLevel(l int) ([][]byte, error) {
  377. var keys [][]byte
  378. err := t.iterWithStop(t.root, 0, func(currLvl int, k, v []byte) bool {
  379. if currLvl == l {
  380. keys = append(keys, k)
  381. }
  382. if currLvl >= l {
  383. return true // to stop the iter from going down
  384. }
  385. return false
  386. })
  387. return keys, err
  388. }
  389. // cutPowerOfTwo returns []kv of length that is a power of 2, and a second []kv
  390. // with the extra elements that don't fit in a power of 2 length
  391. func cutPowerOfTwo(kvs []kv) ([]kv, []kv) {
  392. x := len(kvs)
  393. if (x & (x - 1)) != 0 {
  394. p2 := highestPowerOfTwo(x)
  395. return kvs[:p2], kvs[p2:]
  396. }
  397. return kvs, nil
  398. }
  // highestPowerOfTwo returns the biggest power of 2 that is lower than or
  // equal to n, or 0 when n < 1.
  func highestPowerOfTwo(n int) int {
  	if n < 1 {
  		return 0
  	}
  	res := 1
  	// double while the result still fits in n; comparing against n/2
  	// keeps res*2 from overflowing
  	for res <= n/2 {
  		res *= 2
  	}
  	return res
  }
  409. // func computeSimpleAddCost(nLeafs int) int {
  410. // // nLvls 2^nLvls
  411. // nLvls := int(math.Log2(float64(nLeafs)))
  412. // return nLvls * int(math.Pow(2, float64(nLvls)))
  413. // }
  414. //
  415. // func computeBottomUpAddCost(nLeafs int) int {
  416. // // 2^nLvls * 2 - 1
  417. // nLvls := int(math.Log2(float64(nLeafs)))
  418. // return (int(math.Pow(2, float64(nLvls))) * 2) - 1
  419. // }