You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

529 lines
12 KiB

  1. // Copyright (c) 2012, Suryandaru Triandana <syndtr@gmail.com>
  2. // All rights reserved.
  3. //
  4. // Use of this source code is governed by a BSD-style license that can be
  5. // found in the LICENSE file.
  6. package leveldb
  7. import (
  8. "fmt"
  9. "sort"
  10. "sync/atomic"
  11. "github.com/syndtr/goleveldb/leveldb/cache"
  12. "github.com/syndtr/goleveldb/leveldb/iterator"
  13. "github.com/syndtr/goleveldb/leveldb/opt"
  14. "github.com/syndtr/goleveldb/leveldb/storage"
  15. "github.com/syndtr/goleveldb/leveldb/table"
  16. "github.com/syndtr/goleveldb/leveldb/util"
  17. )
  18. // tFile holds basic information about a table.
  19. type tFile struct {
  20. fd storage.FileDesc
  21. seekLeft int32
  22. size int64
  23. imin, imax internalKey
  24. }
  25. // Returns true if given key is after largest key of this table.
  26. func (t *tFile) after(icmp *iComparer, ukey []byte) bool {
  27. return ukey != nil && icmp.uCompare(ukey, t.imax.ukey()) > 0
  28. }
  29. // Returns true if given key is before smallest key of this table.
  30. func (t *tFile) before(icmp *iComparer, ukey []byte) bool {
  31. return ukey != nil && icmp.uCompare(ukey, t.imin.ukey()) < 0
  32. }
  33. // Returns true if given key range overlaps with this table key range.
  34. func (t *tFile) overlaps(icmp *iComparer, umin, umax []byte) bool {
  35. return !t.after(icmp, umin) && !t.before(icmp, umax)
  36. }
  37. // Cosumes one seek and return current seeks left.
  38. func (t *tFile) consumeSeek() int32 {
  39. return atomic.AddInt32(&t.seekLeft, -1)
  40. }
  41. // Creates new tFile.
  42. func newTableFile(fd storage.FileDesc, size int64, imin, imax internalKey) *tFile {
  43. f := &tFile{
  44. fd: fd,
  45. size: size,
  46. imin: imin,
  47. imax: imax,
  48. }
  49. // We arrange to automatically compact this file after
  50. // a certain number of seeks. Let's assume:
  51. // (1) One seek costs 10ms
  52. // (2) Writing or reading 1MB costs 10ms (100MB/s)
  53. // (3) A compaction of 1MB does 25MB of IO:
  54. // 1MB read from this level
  55. // 10-12MB read from next level (boundaries may be misaligned)
  56. // 10-12MB written to next level
  57. // This implies that 25 seeks cost the same as the compaction
  58. // of 1MB of data. I.e., one seek costs approximately the
  59. // same as the compaction of 40KB of data. We are a little
  60. // conservative and allow approximately one seek for every 16KB
  61. // of data before triggering a compaction.
  62. f.seekLeft = int32(size / 16384)
  63. if f.seekLeft < 100 {
  64. f.seekLeft = 100
  65. }
  66. return f
  67. }
  68. func tableFileFromRecord(r atRecord) *tFile {
  69. return newTableFile(storage.FileDesc{storage.TypeTable, r.num}, r.size, r.imin, r.imax)
  70. }
  71. // tFiles hold multiple tFile.
  72. type tFiles []*tFile
  73. func (tf tFiles) Len() int { return len(tf) }
  74. func (tf tFiles) Swap(i, j int) { tf[i], tf[j] = tf[j], tf[i] }
  75. func (tf tFiles) nums() string {
  76. x := "[ "
  77. for i, f := range tf {
  78. if i != 0 {
  79. x += ", "
  80. }
  81. x += fmt.Sprint(f.fd.Num)
  82. }
  83. x += " ]"
  84. return x
  85. }
  86. // Returns true if i smallest key is less than j.
  87. // This used for sort by key in ascending order.
  88. func (tf tFiles) lessByKey(icmp *iComparer, i, j int) bool {
  89. a, b := tf[i], tf[j]
  90. n := icmp.Compare(a.imin, b.imin)
  91. if n == 0 {
  92. return a.fd.Num < b.fd.Num
  93. }
  94. return n < 0
  95. }
  96. // Returns true if i file number is greater than j.
  97. // This used for sort by file number in descending order.
  98. func (tf tFiles) lessByNum(i, j int) bool {
  99. return tf[i].fd.Num > tf[j].fd.Num
  100. }
  101. // Sorts tables by key in ascending order.
  102. func (tf tFiles) sortByKey(icmp *iComparer) {
  103. sort.Sort(&tFilesSortByKey{tFiles: tf, icmp: icmp})
  104. }
  105. // Sorts tables by file number in descending order.
  106. func (tf tFiles) sortByNum() {
  107. sort.Sort(&tFilesSortByNum{tFiles: tf})
  108. }
  109. // Returns sum of all tables size.
  110. func (tf tFiles) size() (sum int64) {
  111. for _, t := range tf {
  112. sum += t.size
  113. }
  114. return sum
  115. }
  116. // Searches smallest index of tables whose its smallest
  117. // key is after or equal with given key.
  118. func (tf tFiles) searchMin(icmp *iComparer, ikey internalKey) int {
  119. return sort.Search(len(tf), func(i int) bool {
  120. return icmp.Compare(tf[i].imin, ikey) >= 0
  121. })
  122. }
  123. // Searches smallest index of tables whose its largest
  124. // key is after or equal with given key.
  125. func (tf tFiles) searchMax(icmp *iComparer, ikey internalKey) int {
  126. return sort.Search(len(tf), func(i int) bool {
  127. return icmp.Compare(tf[i].imax, ikey) >= 0
  128. })
  129. }
  130. // Returns true if given key range overlaps with one or more
  131. // tables key range. If unsorted is true then binary search will not be used.
  132. func (tf tFiles) overlaps(icmp *iComparer, umin, umax []byte, unsorted bool) bool {
  133. if unsorted {
  134. // Check against all files.
  135. for _, t := range tf {
  136. if t.overlaps(icmp, umin, umax) {
  137. return true
  138. }
  139. }
  140. return false
  141. }
  142. i := 0
  143. if len(umin) > 0 {
  144. // Find the earliest possible internal key for min.
  145. i = tf.searchMax(icmp, makeInternalKey(nil, umin, keyMaxSeq, keyTypeSeek))
  146. }
  147. if i >= len(tf) {
  148. // Beginning of range is after all files, so no overlap.
  149. return false
  150. }
  151. return !tf[i].before(icmp, umax)
  152. }
  153. // Returns tables whose its key range overlaps with given key range.
  154. // Range will be expanded if ukey found hop across tables.
  155. // If overlapped is true then the search will be restarted if umax
  156. // expanded.
  157. // The dst content will be overwritten.
  158. func (tf tFiles) getOverlaps(dst tFiles, icmp *iComparer, umin, umax []byte, overlapped bool) tFiles {
  159. dst = dst[:0]
  160. for i := 0; i < len(tf); {
  161. t := tf[i]
  162. if t.overlaps(icmp, umin, umax) {
  163. if umin != nil && icmp.uCompare(t.imin.ukey(), umin) < 0 {
  164. umin = t.imin.ukey()
  165. dst = dst[:0]
  166. i = 0
  167. continue
  168. } else if umax != nil && icmp.uCompare(t.imax.ukey(), umax) > 0 {
  169. umax = t.imax.ukey()
  170. // Restart search if it is overlapped.
  171. if overlapped {
  172. dst = dst[:0]
  173. i = 0
  174. continue
  175. }
  176. }
  177. dst = append(dst, t)
  178. }
  179. i++
  180. }
  181. return dst
  182. }
  183. // Returns tables key range.
  184. func (tf tFiles) getRange(icmp *iComparer) (imin, imax internalKey) {
  185. for i, t := range tf {
  186. if i == 0 {
  187. imin, imax = t.imin, t.imax
  188. continue
  189. }
  190. if icmp.Compare(t.imin, imin) < 0 {
  191. imin = t.imin
  192. }
  193. if icmp.Compare(t.imax, imax) > 0 {
  194. imax = t.imax
  195. }
  196. }
  197. return
  198. }
  199. // Creates iterator index from tables.
  200. func (tf tFiles) newIndexIterator(tops *tOps, icmp *iComparer, slice *util.Range, ro *opt.ReadOptions) iterator.IteratorIndexer {
  201. if slice != nil {
  202. var start, limit int
  203. if slice.Start != nil {
  204. start = tf.searchMax(icmp, internalKey(slice.Start))
  205. }
  206. if slice.Limit != nil {
  207. limit = tf.searchMin(icmp, internalKey(slice.Limit))
  208. } else {
  209. limit = tf.Len()
  210. }
  211. tf = tf[start:limit]
  212. }
  213. return iterator.NewArrayIndexer(&tFilesArrayIndexer{
  214. tFiles: tf,
  215. tops: tops,
  216. icmp: icmp,
  217. slice: slice,
  218. ro: ro,
  219. })
  220. }
  221. // Tables iterator index.
  222. type tFilesArrayIndexer struct {
  223. tFiles
  224. tops *tOps
  225. icmp *iComparer
  226. slice *util.Range
  227. ro *opt.ReadOptions
  228. }
  229. func (a *tFilesArrayIndexer) Search(key []byte) int {
  230. return a.searchMax(a.icmp, internalKey(key))
  231. }
  232. func (a *tFilesArrayIndexer) Get(i int) iterator.Iterator {
  233. if i == 0 || i == a.Len()-1 {
  234. return a.tops.newIterator(a.tFiles[i], a.slice, a.ro)
  235. }
  236. return a.tops.newIterator(a.tFiles[i], nil, a.ro)
  237. }
  238. // Helper type for sortByKey.
  239. type tFilesSortByKey struct {
  240. tFiles
  241. icmp *iComparer
  242. }
  243. func (x *tFilesSortByKey) Less(i, j int) bool {
  244. return x.lessByKey(x.icmp, i, j)
  245. }
  246. // Helper type for sortByNum.
  247. type tFilesSortByNum struct {
  248. tFiles
  249. }
  250. func (x *tFilesSortByNum) Less(i, j int) bool {
  251. return x.lessByNum(i, j)
  252. }
  253. // Table operations.
  254. type tOps struct {
  255. s *session
  256. noSync bool
  257. cache *cache.Cache
  258. bcache *cache.Cache
  259. bpool *util.BufferPool
  260. }
  261. // Creates an empty table and returns table writer.
  262. func (t *tOps) create() (*tWriter, error) {
  263. fd := storage.FileDesc{storage.TypeTable, t.s.allocFileNum()}
  264. fw, err := t.s.stor.Create(fd)
  265. if err != nil {
  266. return nil, err
  267. }
  268. return &tWriter{
  269. t: t,
  270. fd: fd,
  271. w: fw,
  272. tw: table.NewWriter(fw, t.s.o.Options),
  273. }, nil
  274. }
  275. // Builds table from src iterator.
  276. func (t *tOps) createFrom(src iterator.Iterator) (f *tFile, n int, err error) {
  277. w, err := t.create()
  278. if err != nil {
  279. return
  280. }
  281. defer func() {
  282. if err != nil {
  283. w.drop()
  284. }
  285. }()
  286. for src.Next() {
  287. err = w.append(src.Key(), src.Value())
  288. if err != nil {
  289. return
  290. }
  291. }
  292. err = src.Error()
  293. if err != nil {
  294. return
  295. }
  296. n = w.tw.EntriesLen()
  297. f, err = w.finish()
  298. return
  299. }
  300. // Opens table. It returns a cache handle, which should
  301. // be released after use.
  302. func (t *tOps) open(f *tFile) (ch *cache.Handle, err error) {
  303. ch = t.cache.Get(0, uint64(f.fd.Num), func() (size int, value cache.Value) {
  304. var r storage.Reader
  305. r, err = t.s.stor.Open(f.fd)
  306. if err != nil {
  307. return 0, nil
  308. }
  309. var bcache *cache.NamespaceGetter
  310. if t.bcache != nil {
  311. bcache = &cache.NamespaceGetter{Cache: t.bcache, NS: uint64(f.fd.Num)}
  312. }
  313. var tr *table.Reader
  314. tr, err = table.NewReader(r, f.size, f.fd, bcache, t.bpool, t.s.o.Options)
  315. if err != nil {
  316. r.Close()
  317. return 0, nil
  318. }
  319. return 1, tr
  320. })
  321. if ch == nil && err == nil {
  322. err = ErrClosed
  323. }
  324. return
  325. }
  326. // Finds key/value pair whose key is greater than or equal to the
  327. // given key.
  328. func (t *tOps) find(f *tFile, key []byte, ro *opt.ReadOptions) (rkey, rvalue []byte, err error) {
  329. ch, err := t.open(f)
  330. if err != nil {
  331. return nil, nil, err
  332. }
  333. defer ch.Release()
  334. return ch.Value().(*table.Reader).Find(key, true, ro)
  335. }
  336. // Finds key that is greater than or equal to the given key.
  337. func (t *tOps) findKey(f *tFile, key []byte, ro *opt.ReadOptions) (rkey []byte, err error) {
  338. ch, err := t.open(f)
  339. if err != nil {
  340. return nil, err
  341. }
  342. defer ch.Release()
  343. return ch.Value().(*table.Reader).FindKey(key, true, ro)
  344. }
  345. // Returns approximate offset of the given key.
  346. func (t *tOps) offsetOf(f *tFile, key []byte) (offset int64, err error) {
  347. ch, err := t.open(f)
  348. if err != nil {
  349. return
  350. }
  351. defer ch.Release()
  352. return ch.Value().(*table.Reader).OffsetOf(key)
  353. }
  354. // Creates an iterator from the given table.
  355. func (t *tOps) newIterator(f *tFile, slice *util.Range, ro *opt.ReadOptions) iterator.Iterator {
  356. ch, err := t.open(f)
  357. if err != nil {
  358. return iterator.NewEmptyIterator(err)
  359. }
  360. iter := ch.Value().(*table.Reader).NewIterator(slice, ro)
  361. iter.SetReleaser(ch)
  362. return iter
  363. }
  364. // Removes table from persistent storage. It waits until
  365. // no one use the the table.
  366. func (t *tOps) remove(f *tFile) {
  367. t.cache.Delete(0, uint64(f.fd.Num), func() {
  368. if err := t.s.stor.Remove(f.fd); err != nil {
  369. t.s.logf("table@remove removing @%d %q", f.fd.Num, err)
  370. } else {
  371. t.s.logf("table@remove removed @%d", f.fd.Num)
  372. }
  373. if t.bcache != nil {
  374. t.bcache.EvictNS(uint64(f.fd.Num))
  375. }
  376. })
  377. }
  378. // Closes the table ops instance. It will close all tables,
  379. // regadless still used or not.
  380. func (t *tOps) close() {
  381. t.bpool.Close()
  382. t.cache.Close()
  383. if t.bcache != nil {
  384. t.bcache.CloseWeak()
  385. }
  386. }
  387. // Creates new initialized table ops instance.
  388. func newTableOps(s *session) *tOps {
  389. var (
  390. cacher cache.Cacher
  391. bcache *cache.Cache
  392. bpool *util.BufferPool
  393. )
  394. if s.o.GetOpenFilesCacheCapacity() > 0 {
  395. cacher = cache.NewLRU(s.o.GetOpenFilesCacheCapacity())
  396. }
  397. if !s.o.GetDisableBlockCache() {
  398. var bcacher cache.Cacher
  399. if s.o.GetBlockCacheCapacity() > 0 {
  400. bcacher = cache.NewLRU(s.o.GetBlockCacheCapacity())
  401. }
  402. bcache = cache.NewCache(bcacher)
  403. }
  404. if !s.o.GetDisableBufferPool() {
  405. bpool = util.NewBufferPool(s.o.GetBlockSize() + 5)
  406. }
  407. return &tOps{
  408. s: s,
  409. noSync: s.o.GetNoSync(),
  410. cache: cache.NewCache(cacher),
  411. bcache: bcache,
  412. bpool: bpool,
  413. }
  414. }
  415. // tWriter wraps the table writer. It keep track of file descriptor
  416. // and added key range.
  417. type tWriter struct {
  418. t *tOps
  419. fd storage.FileDesc
  420. w storage.Writer
  421. tw *table.Writer
  422. first, last []byte
  423. }
  424. // Append key/value pair to the table.
  425. func (w *tWriter) append(key, value []byte) error {
  426. if w.first == nil {
  427. w.first = append([]byte{}, key...)
  428. }
  429. w.last = append(w.last[:0], key...)
  430. return w.tw.Append(key, value)
  431. }
  432. // Returns true if the table is empty.
  433. func (w *tWriter) empty() bool {
  434. return w.first == nil
  435. }
  436. // Closes the storage.Writer.
  437. func (w *tWriter) close() {
  438. if w.w != nil {
  439. w.w.Close()
  440. w.w = nil
  441. }
  442. }
  443. // Finalizes the table and returns table file.
  444. func (w *tWriter) finish() (f *tFile, err error) {
  445. defer w.close()
  446. err = w.tw.Close()
  447. if err != nil {
  448. return
  449. }
  450. if !w.t.noSync {
  451. err = w.w.Sync()
  452. if err != nil {
  453. return
  454. }
  455. }
  456. f = newTableFile(w.fd, int64(w.tw.BytesLen()), internalKey(w.first), internalKey(w.last))
  457. return
  458. }
  459. // Drops the table.
  460. func (w *tWriter) drop() {
  461. w.close()
  462. w.t.s.stor.Remove(w.fd)
  463. w.t.s.reuseFileNum(w.fd.Num)
  464. w.tw = nil
  465. w.first = nil
  466. w.last = nil
  467. }