You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

460 lines
11 KiB

  1. // Copyright (c) 2012, Suryandaru Triandana <syndtr@gmail.com>
  2. // All rights reserved.
  3. //
  4. // Use of this source code is governed by a BSD-style license that can be
  5. // found in the LICENSE file.
  6. package leveldb
  7. import (
  8. "sync/atomic"
  9. "time"
  10. "github.com/syndtr/goleveldb/leveldb/memdb"
  11. "github.com/syndtr/goleveldb/leveldb/opt"
  12. "github.com/syndtr/goleveldb/leveldb/util"
  13. )
  14. func (db *DB) writeJournal(batches []*Batch, seq uint64, sync bool) error {
  15. wr, err := db.journal.Next()
  16. if err != nil {
  17. return err
  18. }
  19. if err := writeBatchesWithHeader(wr, batches, seq); err != nil {
  20. return err
  21. }
  22. if err := db.journal.Flush(); err != nil {
  23. return err
  24. }
  25. if sync {
  26. return db.journalWriter.Sync()
  27. }
  28. return nil
  29. }
  30. func (db *DB) rotateMem(n int, wait bool) (mem *memDB, err error) {
  31. retryLimit := 3
  32. retry:
  33. // Wait for pending memdb compaction.
  34. err = db.compTriggerWait(db.mcompCmdC)
  35. if err != nil {
  36. return
  37. }
  38. retryLimit--
  39. // Create new memdb and journal.
  40. mem, err = db.newMem(n)
  41. if err != nil {
  42. if err == errHasFrozenMem {
  43. if retryLimit <= 0 {
  44. panic("BUG: still has frozen memdb")
  45. }
  46. goto retry
  47. }
  48. return
  49. }
  50. // Schedule memdb compaction.
  51. if wait {
  52. err = db.compTriggerWait(db.mcompCmdC)
  53. } else {
  54. db.compTrigger(db.mcompCmdC)
  55. }
  56. return
  57. }
// flush makes room for a write of n internal bytes in the effective
// memdb, throttling the writer while level-0 table compaction lags.
// It returns the effective memdb (with a reference held — the caller
// must decref it), the number of free bytes in it, and any error.
func (db *DB) flush(n int) (mdb *memDB, mdbFree int, err error) {
	delayed := false
	slowdownTrigger := db.s.o.GetWriteL0SlowdownTrigger()
	pauseTrigger := db.s.o.GetWriteL0PauseTrigger()
	// flush performs one throttling step; it returns true when the
	// caller should loop and try again.
	flush := func() (retry bool) {
		mdb = db.getEffectiveMem()
		if mdb == nil {
			err = ErrClosed
			return false
		}
		defer func() {
			// When retrying, drop our reference; the next iteration
			// re-acquires the (possibly rotated) effective memdb.
			if retry {
				mdb.decref()
				mdb = nil
			}
		}()
		tLen := db.s.tLen(0)
		mdbFree = mdb.Free()
		switch {
		case tLen >= slowdownTrigger && !delayed:
			// Level-0 is getting full: delay this write once by 1ms,
			// then retry (this case falls through to "return true").
			delayed = true
			time.Sleep(time.Millisecond)
		case mdbFree >= n:
			// Enough room in the current memdb; done.
			return false
		case tLen >= pauseTrigger:
			// Level-0 is full: block until table compaction finishes.
			delayed = true
			err = db.compTriggerWait(db.tcompCmdC)
			if err != nil {
				return false
			}
		default:
			// Allow memdb to grow if it has no entry.
			if mdb.Len() == 0 {
				mdbFree = n
			} else {
				// Memdb cannot fit n bytes: rotate to a fresh one.
				mdb.decref()
				mdb, err = db.rotateMem(n, false)
				if err == nil {
					mdbFree = mdb.Free()
				} else {
					mdbFree = 0
				}
			}
			return false
		}
		return true
	}
	start := time.Now()
	for flush() {
	}
	if delayed {
		// Accumulate per-DB write-delay statistics.
		db.writeDelay += time.Since(start)
		db.writeDelayN++
	} else if db.writeDelayN > 0 {
		// First non-delayed write after a delayed streak: log and
		// publish the accumulated counters, then reset them.
		db.logf("db@write was delayed N·%d T·%v", db.writeDelayN, db.writeDelay)
		atomic.AddInt32(&db.cWriteDelayN, int32(db.writeDelayN))
		atomic.AddInt64(&db.cWriteDelay, int64(db.writeDelay))
		db.writeDelay = 0
		db.writeDelayN = 0
	}
	return
}
// writeMerge is a write request handed to the current write-lock
// holder for merging. Either batch is non-nil (merge a whole batch),
// or keyType/key/value describe a single record to merge.
type writeMerge struct {
	sync bool // the requester asked for a journal sync
	batch *Batch
	keyType keyType
	key, value []byte
}
  126. func (db *DB) unlockWrite(overflow bool, merged int, err error) {
  127. for i := 0; i < merged; i++ {
  128. db.writeAckC <- err
  129. }
  130. if overflow {
  131. // Pass lock to the next write (that failed to merge).
  132. db.writeMergedC <- false
  133. } else {
  134. // Release lock.
  135. <-db.writeLockC
  136. }
  137. }
// writeLocked journals and applies batch while holding the write lock.
// ourBatch is a batch that we can modify (it may equal batch, or be
// nil); merge enables pulling concurrent writes from writeMergeC into
// this journal record. The write lock is released (or handed over)
// before returning, via unlockWrite.
func (db *DB) writeLocked(batch, ourBatch *Batch, merge, sync bool) error {
	// Try to flush memdb. This method would also trying to throttle writes
	// if it is too fast and compaction cannot catch-up.
	mdb, mdbFree, err := db.flush(batch.internalLen)
	if err != nil {
		db.unlockWrite(false, 0, err)
		return err
	}
	defer mdb.decref()
	var (
		overflow bool   // a pending writer could not fit; hand it the lock
		merged int      // number of writers merged into this write
		batches = []*Batch{batch}
	)
	if merge {
		// Merge limit: cap how many extra bytes we accept, bounded by
		// both a fixed budget and the memdb's remaining free space.
		var mergeLimit int
		if batch.internalLen > 128<<10 {
			mergeLimit = (1 << 20) - batch.internalLen
		} else {
			mergeLimit = 128 << 10
		}
		mergeCap := mdbFree - batch.internalLen
		if mergeLimit > mergeCap {
			mergeLimit = mergeCap
		}
	merge:
		for mergeLimit > 0 {
			select {
			case incoming := <-db.writeMergeC:
				if incoming.batch != nil {
					// Merge batch.
					if incoming.batch.internalLen > mergeLimit {
						overflow = true
						break merge
					}
					batches = append(batches, incoming.batch)
					mergeLimit -= incoming.batch.internalLen
				} else {
					// Merge put.
					internalLen := len(incoming.key) + len(incoming.value) + 8
					if internalLen > mergeLimit {
						overflow = true
						break merge
					}
					if ourBatch == nil {
						ourBatch = db.batchPool.Get().(*Batch)
						ourBatch.Reset()
						batches = append(batches, ourBatch)
					}
					// We can use same batch since concurrent write doesn't
					// guarantee write order.
					ourBatch.appendRec(incoming.keyType, incoming.key, incoming.value)
					mergeLimit -= internalLen
				}
				sync = sync || incoming.sync
				merged++
				// Tell the merged writer its request was accepted.
				db.writeMergedC <- true
			default:
				// No writer waiting; stop merging.
				break merge
			}
		}
	}
	// Release ourBatch if any.
	if ourBatch != nil {
		defer db.batchPool.Put(ourBatch)
	}
	// Seq number.
	seq := db.seq + 1
	// Write journal.
	if err := db.writeJournal(batches, seq, sync); err != nil {
		db.unlockWrite(overflow, merged, err)
		return err
	}
	// Put batches.
	for _, batch := range batches {
		if err := batch.putMem(seq, mdb.DB); err != nil {
			// Journal already committed this record; a memdb failure
			// here is unrecoverable.
			panic(err)
		}
		seq += uint64(batch.Len())
	}
	// Incr seq number.
	db.addSeq(uint64(batchesLen(batches)))
	// Rotate memdb if it's reach the threshold.
	if batch.internalLen >= mdbFree {
		// NOTE(review): rotateMem's error is ignored here — presumably
		// compaction errors surface through compPerErrC; confirm intent.
		db.rotateMem(0, false)
	}
	db.unlockWrite(overflow, merged, nil)
	return nil
}
// Write apply the given batch to the DB. The batch records will be applied
// sequentially. Write might be used concurrently, when used concurrently and
// batch is small enough, write will try to merge the batches. Set NoWriteMerge
// option to true to disable write merge.
//
// It is safe to modify the contents of the arguments after Write returns but
// not before. Write will not modify content of the batch.
func (db *DB) Write(batch *Batch, wo *opt.WriteOptions) error {
	// A nil or empty batch is a no-op (returns nil when the DB is ok).
	if err := db.ok(); err != nil || batch == nil || batch.Len() == 0 {
		return err
	}
	// If the batch size is larger than write buffer, it may justified to write
	// using transaction instead. Using transaction the batch will be written
	// into tables directly, skipping the journaling.
	if batch.internalLen > db.s.o.GetWriteBuffer() && !db.s.o.GetDisableLargeBatchTransaction() {
		tr, err := db.OpenTransaction()
		if err != nil {
			return err
		}
		if err := tr.Write(batch, wo); err != nil {
			tr.Discard()
			return err
		}
		return tr.Commit()
	}
	merge := !wo.GetNoWriteMerge() && !db.s.o.GetNoWriteMerge()
	sync := wo.GetSync() && !db.s.o.GetNoSync()
	// Acquire write lock.
	if merge {
		// Offer this batch to the current lock holder for merging, or
		// take the lock ourselves — whichever becomes possible first.
		select {
		case db.writeMergeC <- writeMerge{sync: sync, batch: batch}:
			if <-db.writeMergedC {
				// Write is merged.
				return <-db.writeAckC
			}
			// Write is not merged, the write lock is handed to us. Continue.
		case db.writeLockC <- struct{}{}:
			// Write lock acquired.
		case err := <-db.compPerErrC:
			// Compaction error.
			return err
		case <-db.closeC:
			// Closed
			return ErrClosed
		}
	} else {
		select {
		case db.writeLockC <- struct{}{}:
			// Write lock acquired.
		case err := <-db.compPerErrC:
			// Compaction error.
			return err
		case <-db.closeC:
			// Closed
			return ErrClosed
		}
	}
	return db.writeLocked(batch, nil, merge, sync)
}
// putRec applies a single record of the given keyType. Like Write, it
// either merges the record into the current write-lock holder's write
// or acquires the lock and writes via writeLocked with a pooled batch.
func (db *DB) putRec(kt keyType, key, value []byte, wo *opt.WriteOptions) error {
	if err := db.ok(); err != nil {
		return err
	}
	merge := !wo.GetNoWriteMerge() && !db.s.o.GetNoWriteMerge()
	sync := wo.GetSync() && !db.s.o.GetNoSync()
	// Acquire write lock.
	if merge {
		// Offer the record to the current lock holder for merging, or
		// take the lock ourselves — whichever becomes possible first.
		select {
		case db.writeMergeC <- writeMerge{sync: sync, keyType: kt, key: key, value: value}:
			if <-db.writeMergedC {
				// Write is merged.
				return <-db.writeAckC
			}
			// Write is not merged, the write lock is handed to us. Continue.
		case db.writeLockC <- struct{}{}:
			// Write lock acquired.
		case err := <-db.compPerErrC:
			// Compaction error.
			return err
		case <-db.closeC:
			// Closed
			return ErrClosed
		}
	} else {
		select {
		case db.writeLockC <- struct{}{}:
			// Write lock acquired.
		case err := <-db.compPerErrC:
			// Compaction error.
			return err
		case <-db.closeC:
			// Closed
			return ErrClosed
		}
	}
	// Build the record in a pooled batch; writeLocked returns it to the
	// pool (ourBatch == batch here).
	batch := db.batchPool.Get().(*Batch)
	batch.Reset()
	batch.appendRec(kt, key, value)
	return db.writeLocked(batch, batch, merge, sync)
}
// Put sets the value for the given key. It overwrites any previous value
// for that key; a DB is not a multi-map. Write merge also applies for Put, see
// Write.
//
// It is safe to modify the contents of the arguments after Put returns but not
// before.
func (db *DB) Put(key, value []byte, wo *opt.WriteOptions) error {
	// Delegates to putRec with a value-type record.
	return db.putRec(keyTypeVal, key, value, wo)
}
// Delete deletes the value for the given key. Delete will not return an
// error if the key doesn't exist. Write merge also applies for Delete, see
// Write.
//
// It is safe to modify the contents of the arguments after Delete returns but
// not before.
func (db *DB) Delete(key []byte, wo *opt.WriteOptions) error {
	// Delegates to putRec with a delete-type record and nil value.
	return db.putRec(keyTypeDel, key, nil, wo)
}
  346. func isMemOverlaps(icmp *iComparer, mem *memdb.DB, min, max []byte) bool {
  347. iter := mem.NewIterator(nil)
  348. defer iter.Release()
  349. return (max == nil || (iter.First() && icmp.uCompare(max, internalKey(iter.Key()).ukey()) >= 0)) &&
  350. (min == nil || (iter.Last() && icmp.uCompare(min, internalKey(iter.Key()).ukey()) <= 0))
  351. }
// CompactRange compacts the underlying DB for the given key range.
// In particular, deleted and overwritten versions are discarded,
// and the data is rearranged to reduce the cost of operations
// needed to access the data. This operation should typically only
// be invoked by users who understand the underlying implementation.
//
// A nil Range.Start is treated as a key before all keys in the DB.
// And a nil Range.Limit is treated as a key after all keys in the DB.
// Therefore if both is nil then it will compact entire DB.
func (db *DB) CompactRange(r util.Range) error {
	if err := db.ok(); err != nil {
		return err
	}
	// Lock writer.
	select {
	case db.writeLockC <- struct{}{}:
	case err := <-db.compPerErrC:
		return err
	case <-db.closeC:
		return ErrClosed
	}
	// Check for overlaps in memdb.
	mdb := db.getEffectiveMem()
	if mdb == nil {
		// NOTE(review): the write lock acquired above is not released
		// on this path; presumably benign because the DB is closed —
		// confirm.
		return ErrClosed
	}
	defer mdb.decref()
	if isMemOverlaps(db.s.icmp, mdb.DB, r.Start, r.Limit) {
		// Memdb compaction: rotate the memdb so its contents can be
		// flushed to a table, releasing the write lock either way.
		if _, err := db.rotateMem(0, false); err != nil {
			<-db.writeLockC
			return err
		}
		<-db.writeLockC
		if err := db.compTriggerWait(db.mcompCmdC); err != nil {
			return err
		}
	} else {
		<-db.writeLockC
	}
	// Table compaction.
	return db.compTriggerRange(db.tcompCmdC, -1, r.Start, r.Limit)
}
// SetReadOnly makes DB read-only. It will stay read-only until reopened.
func (db *DB) SetReadOnly() error {
	if err := db.ok(); err != nil {
		return err
	}
	// Lock writer. The lock is kept held; compWriteLocking records that
	// it is owned for compaction-locking purposes.
	select {
	case db.writeLockC <- struct{}{}:
		db.compWriteLocking = true
	case err := <-db.compPerErrC:
		return err
	case <-db.closeC:
		return ErrClosed
	}
	// Set compaction read-only: inject ErrReadOnly as the persistent
	// compaction error so subsequent writes are rejected.
	select {
	case db.compErrSetC <- ErrReadOnly:
	case perr := <-db.compPerErrC:
		return perr
	case <-db.closeC:
		return ErrClosed
	}
	return nil
}