aboutsummaryrefslogtreecommitdiff
path: root/drivers/block/drbd/drbd_main.c
diff options
context:
space:
mode:
authorLars Ellenberg <lars.ellenberg@linbit.com>2011-11-28 15:04:49 +0100
committerPhilipp Reisner <philipp.reisner@linbit.com>2012-11-08 16:58:35 +0100
commitb6dd1a89767bc33e9c98b3195f8925b46c5c95f3 (patch)
treee82371062171f5cade79cb0c4a6cd22486b5f082 /drivers/block/drbd/drbd_main.c
parentd5b27b01f17ef1f0badc45f9eea521be3457c9cb (diff)
downloadlinux-b6dd1a89767bc33e9c98b3195f8925b46c5c95f3.tar.gz
drbd: remove struct drbd_tl_epoch objects (barrier works)
cherry-picked and adapted from drbd 9 devel branch DRBD requests (struct drbd_request) are already on the per resource transfer log list, and carry their epoch number. We do not need to additionally link them on other ring lists in other structs. The drbd sender thread can recognize itself when to send a P_BARRIER, by tracking the currently processed epoch, and how many writes have been processed for that epoch. If the epoch of the request to be processed does not match the currently processed epoch, any writes have been processed in it, a P_BARRIER for this last processed epoch is send out first. The new epoch then becomes the currently processed epoch. To not get stuck in drbd_al_begin_io() waiting for P_BARRIER_ACK, the sender thread also needs to handle the case when the current epoch was closed already, but no new requests are queued yet, and send out P_BARRIER as soon as possible. This is done by comparing the per resource "current transfer log epoch" (tconn->current_tle_nr) with the per connection "currently processed epoch number" (tconn->send.current_epoch_nr), while waiting for new requests to be processed in wait_for_work(). Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com> Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Diffstat (limited to 'drivers/block/drbd/drbd_main.c')
-rw-r--r--drivers/block/drbd/drbd_main.c316
1 files changed, 66 insertions, 250 deletions
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 7e37149684e..8c6c48e363c 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -188,147 +188,75 @@ int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
#endif
/**
- * DOC: The transfer log
- *
- * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
- * mdev->tconn->newest_tle points to the head, mdev->tconn->oldest_tle points to the tail
- * of the list. There is always at least one &struct drbd_tl_epoch object.
- *
- * Each &struct drbd_tl_epoch has a circular double linked list of requests
- * attached.
- */
-static int tl_init(struct drbd_tconn *tconn)
-{
- struct drbd_tl_epoch *b;
-
- /* during device minor initialization, we may well use GFP_KERNEL */
- b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
- if (!b)
- return 0;
- INIT_LIST_HEAD(&b->requests);
- INIT_LIST_HEAD(&b->w.list);
- b->next = NULL;
- b->br_number = atomic_inc_return(&tconn->current_tle_nr);
- b->n_writes = 0;
- b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
-
- tconn->oldest_tle = b;
- tconn->newest_tle = b;
- INIT_LIST_HEAD(&tconn->out_of_sequence_requests);
- INIT_LIST_HEAD(&tconn->barrier_acked_requests);
-
- return 1;
-}
-
-static void tl_cleanup(struct drbd_tconn *tconn)
-{
- if (tconn->oldest_tle != tconn->newest_tle)
- conn_err(tconn, "ASSERT FAILED: oldest_tle == newest_tle\n");
- if (!list_empty(&tconn->out_of_sequence_requests))
- conn_err(tconn, "ASSERT FAILED: list_empty(out_of_sequence_requests)\n");
- kfree(tconn->oldest_tle);
- tconn->oldest_tle = NULL;
- kfree(tconn->unused_spare_tle);
- tconn->unused_spare_tle = NULL;
-}
-
-/**
- * _tl_add_barrier() - Adds a barrier to the transfer log
- * @mdev: DRBD device.
- * @new: Barrier to be added before the current head of the TL.
- *
- * The caller must hold the req_lock.
- */
-void _tl_add_barrier(struct drbd_tconn *tconn, struct drbd_tl_epoch *new)
-{
- INIT_LIST_HEAD(&new->requests);
- INIT_LIST_HEAD(&new->w.list);
- new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
- new->next = NULL;
- new->n_writes = 0;
-
- new->br_number = atomic_inc_return(&tconn->current_tle_nr);
- if (tconn->newest_tle != new) {
- tconn->newest_tle->next = new;
- tconn->newest_tle = new;
- }
-}
-
-/**
- * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
- * @mdev: DRBD device.
+ * tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch
+ * @tconn: DRBD connection.
* @barrier_nr: Expected identifier of the DRBD write barrier packet.
* @set_size: Expected number of requests before that barrier.
*
* In case the passed barrier_nr or set_size does not match the oldest
- * &struct drbd_tl_epoch objects this function will cause a termination
- * of the connection.
+ * epoch of not yet barrier-acked requests, this function will cause a
+ * termination of the connection.
*/
void tl_release(struct drbd_tconn *tconn, unsigned int barrier_nr,
unsigned int set_size)
{
- struct drbd_conf *mdev;
- struct drbd_tl_epoch *b, *nob; /* next old barrier */
- struct list_head *le, *tle;
struct drbd_request *r;
+ struct drbd_request *req = NULL;
+ int expect_epoch = 0;
+ int expect_size = 0;
spin_lock_irq(&tconn->req_lock);
- b = tconn->oldest_tle;
+ /* find latest not yet barrier-acked write request,
+ * count writes in its epoch. */
+ list_for_each_entry(r, &tconn->transfer_log, tl_requests) {
+ const unsigned long s = r->rq_state;
+ if (!req) {
+ if (!(s & RQ_WRITE))
+ continue;
+ if (!(s & RQ_NET_MASK))
+ continue;
+ if (s & RQ_NET_DONE)
+ continue;
+ req = r;
+ expect_epoch = req->epoch;
+ expect_size ++;
+ } else {
+ if (r->epoch != expect_epoch)
+ break;
+ if (!(s & RQ_WRITE))
+ continue;
+ /* if (s & RQ_DONE): not expected */
+ /* if (!(s & RQ_NET_MASK)): not expected */
+ expect_size++;
+ }
+ }
/* first some paranoia code */
- if (b == NULL) {
+ if (req == NULL) {
conn_err(tconn, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
barrier_nr);
goto bail;
}
- if (b->br_number != barrier_nr) {
+ if (expect_epoch != barrier_nr) {
conn_err(tconn, "BAD! BarrierAck #%u received, expected #%u!\n",
- barrier_nr, b->br_number);
+ barrier_nr, expect_epoch);
goto bail;
}
- if (b->n_writes != set_size) {
+
+ if (expect_size != set_size) {
conn_err(tconn, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
- barrier_nr, set_size, b->n_writes);
+ barrier_nr, set_size, expect_size);
goto bail;
}
/* Clean up list of requests processed during current epoch */
- list_for_each_safe(le, tle, &b->requests) {
- r = list_entry(le, struct drbd_request, tl_requests);
- _req_mod(r, BARRIER_ACKED);
- }
- /* There could be requests on the list waiting for completion
- of the write to the local disk. To avoid corruptions of
- slab's data structures we have to remove the lists head.
-
- Also there could have been a barrier ack out of sequence, overtaking
- the write acks - which would be a bug and violating write ordering.
- To not deadlock in case we lose connection while such requests are
- still pending, we need some way to find them for the
- _req_mode(CONNECTION_LOST_WHILE_PENDING).
-
- These have been list_move'd to the out_of_sequence_requests list in
- _req_mod(, BARRIER_ACKED) above.
- */
- list_splice_init(&b->requests, &tconn->barrier_acked_requests);
- mdev = b->w.mdev;
-
- nob = b->next;
- if (test_and_clear_bit(CREATE_BARRIER, &tconn->flags)) {
- _tl_add_barrier(tconn, b);
- if (nob)
- tconn->oldest_tle = nob;
- /* if nob == NULL b was the only barrier, and becomes the new
- barrier. Therefore tconn->oldest_tle points already to b */
- } else {
- D_ASSERT(nob != NULL);
- tconn->oldest_tle = nob;
- kfree(b);
+ list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests) {
+ if (req->epoch != expect_epoch)
+ break;
+ _req_mod(req, BARRIER_ACKED);
}
-
spin_unlock_irq(&tconn->req_lock);
- dec_ap_pending(mdev);
return;
@@ -346,91 +274,20 @@ bail:
* @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
* RESTART_FROZEN_DISK_IO.
*/
+/* must hold resource->req_lock */
void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
{
- struct drbd_tl_epoch *b, *tmp, **pn;
- struct list_head *le, *tle, carry_reads;
- struct drbd_request *req;
- int rv, n_writes, n_reads;
-
- b = tconn->oldest_tle;
- pn = &tconn->oldest_tle;
- while (b) {
- n_writes = 0;
- n_reads = 0;
- INIT_LIST_HEAD(&carry_reads);
- list_for_each_safe(le, tle, &b->requests) {
- req = list_entry(le, struct drbd_request, tl_requests);
- rv = _req_mod(req, what);
-
- if (rv & MR_WRITE)
- n_writes++;
- if (rv & MR_READ)
- n_reads++;
- }
- tmp = b->next;
-
- if (n_writes) {
- if (what == RESEND) {
- b->n_writes = n_writes;
- if (b->w.cb == NULL) {
- b->w.cb = w_send_barrier;
- inc_ap_pending(b->w.mdev);
- set_bit(CREATE_BARRIER, &tconn->flags);
- }
-
- drbd_queue_work(&tconn->sender_work, &b->w);
- }
- pn = &b->next;
- } else {
- if (n_reads)
- list_add(&carry_reads, &b->requests);
- /* there could still be requests on that ring list,
- * in case local io is still pending */
- list_del(&b->requests);
-
- /* dec_ap_pending corresponding to queue_barrier.
- * the newest barrier may not have been queued yet,
- * in which case w.cb is still NULL. */
- if (b->w.cb != NULL)
- dec_ap_pending(b->w.mdev);
-
- if (b == tconn->newest_tle) {
- /* recycle, but reinit! */
- if (tmp != NULL)
- conn_err(tconn, "ASSERT FAILED tmp == NULL");
- INIT_LIST_HEAD(&b->requests);
- list_splice(&carry_reads, &b->requests);
- INIT_LIST_HEAD(&b->w.list);
- b->w.cb = NULL;
- b->br_number = atomic_inc_return(&tconn->current_tle_nr);
- b->n_writes = 0;
-
- *pn = b;
- break;
- }
- *pn = tmp;
- kfree(b);
- }
- b = tmp;
- list_splice(&carry_reads, &b->requests);
- }
-
- /* Actions operating on the disk state, also want to work on
- requests that got barrier acked. */
- switch (what) {
- case FAIL_FROZEN_DISK_IO:
- case RESTART_FROZEN_DISK_IO:
- list_for_each_safe(le, tle, &tconn->barrier_acked_requests) {
- req = list_entry(le, struct drbd_request, tl_requests);
- _req_mod(req, what);
- }
- case CONNECTION_LOST_WHILE_PENDING:
- case RESEND:
- break;
- default:
- conn_err(tconn, "what = %d in _tl_restart()\n", what);
- }
+ struct drbd_request *req, *r;
+
+ list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests)
+ _req_mod(req, what);
+}
+
+void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
+{
+ spin_lock_irq(&tconn->req_lock);
+ _tl_restart(tconn, what);
+ spin_unlock_irq(&tconn->req_lock);
}
/**
@@ -443,36 +300,7 @@ void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
*/
void tl_clear(struct drbd_tconn *tconn)
{
- struct list_head *le, *tle;
- struct drbd_request *r;
-
- spin_lock_irq(&tconn->req_lock);
-
- _tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING);
-
- /* we expect this list to be empty. */
- if (!list_empty(&tconn->out_of_sequence_requests))
- conn_err(tconn, "ASSERT FAILED list_empty(&out_of_sequence_requests)\n");
-
- /* but just in case, clean it up anyways! */
- list_for_each_safe(le, tle, &tconn->out_of_sequence_requests) {
- r = list_entry(le, struct drbd_request, tl_requests);
- /* It would be nice to complete outside of spinlock.
- * But this is easier for now. */
- _req_mod(r, CONNECTION_LOST_WHILE_PENDING);
- }
-
- /* ensure bit indicating barrier is required is clear */
- clear_bit(CREATE_BARRIER, &tconn->flags);
-
- spin_unlock_irq(&tconn->req_lock);
-}
-
-void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
-{
- spin_lock_irq(&tconn->req_lock);
- _tl_restart(tconn, what);
- spin_unlock_irq(&tconn->req_lock);
+ tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING);
}
/**
@@ -482,31 +310,16 @@ void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
void tl_abort_disk_io(struct drbd_conf *mdev)
{
struct drbd_tconn *tconn = mdev->tconn;
- struct drbd_tl_epoch *b;
- struct list_head *le, *tle;
- struct drbd_request *req;
+ struct drbd_request *req, *r;
spin_lock_irq(&tconn->req_lock);
- b = tconn->oldest_tle;
- while (b) {
- list_for_each_safe(le, tle, &b->requests) {
- req = list_entry(le, struct drbd_request, tl_requests);
- if (!(req->rq_state & RQ_LOCAL_PENDING))
- continue;
- if (req->w.mdev == mdev)
- _req_mod(req, ABORT_DISK_IO);
- }
- b = b->next;
- }
-
- list_for_each_safe(le, tle, &tconn->barrier_acked_requests) {
- req = list_entry(le, struct drbd_request, tl_requests);
+ list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests) {
if (!(req->rq_state & RQ_LOCAL_PENDING))
continue;
- if (req->w.mdev == mdev)
- _req_mod(req, ABORT_DISK_IO);
+ if (req->w.mdev != mdev)
+ continue;
+ _req_mod(req, ABORT_DISK_IO);
}
-
spin_unlock_irq(&tconn->req_lock);
}
@@ -2680,17 +2493,21 @@ struct drbd_tconn *conn_create(const char *name, struct res_opts *res_opts)
if (set_resource_options(tconn, res_opts))
goto fail;
- if (!tl_init(tconn))
- goto fail;
-
tconn->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
if (!tconn->current_epoch)
goto fail;
+
+ INIT_LIST_HEAD(&tconn->transfer_log);
+
INIT_LIST_HEAD(&tconn->current_epoch->list);
tconn->epochs = 1;
spin_lock_init(&tconn->epoch_lock);
tconn->write_ordering = WO_bdev_flush;
+ tconn->send.seen_any_write_yet = false;
+ tconn->send.current_epoch_nr = 0;
+ tconn->send.current_epoch_writes = 0;
+
tconn->cstate = C_STANDALONE;
mutex_init(&tconn->cstate_mutex);
spin_lock_init(&tconn->req_lock);
@@ -2713,7 +2530,6 @@ struct drbd_tconn *conn_create(const char *name, struct res_opts *res_opts)
fail:
kfree(tconn->current_epoch);
- tl_cleanup(tconn);
free_cpumask_var(tconn->cpu_mask);
drbd_free_socket(&tconn->meta);
drbd_free_socket(&tconn->data);