aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/block/drbd/Makefile2
-rw-r--r--drivers/block/drbd/drbd_actlog.c689
-rw-r--r--drivers/block/drbd/drbd_bitmap.c219
-rw-r--r--drivers/block/drbd/drbd_int.h1398
-rw-r--r--drivers/block/drbd/drbd_interval.c207
-rw-r--r--drivers/block/drbd/drbd_interval.h40
-rw-r--r--drivers/block/drbd/drbd_main.c3773
-rw-r--r--drivers/block/drbd/drbd_nl.c3318
-rw-r--r--drivers/block/drbd/drbd_nla.c55
-rw-r--r--drivers/block/drbd/drbd_nla.h8
-rw-r--r--drivers/block/drbd/drbd_proc.c33
-rw-r--r--drivers/block/drbd/drbd_receiver.c3869
-rw-r--r--drivers/block/drbd/drbd_req.c1569
-rw-r--r--drivers/block/drbd/drbd_req.h187
-rw-r--r--drivers/block/drbd/drbd_state.c1857
-rw-r--r--drivers/block/drbd/drbd_state.h161
-rw-r--r--drivers/block/drbd/drbd_strings.c1
-rw-r--r--drivers/block/drbd/drbd_worker.c1168
-rw-r--r--drivers/block/drbd/drbd_wrappers.h11
-rw-r--r--include/linux/drbd.h81
-rw-r--r--include/linux/drbd_genl.h378
-rw-r--r--include/linux/drbd_genl_api.h55
-rw-r--r--include/linux/drbd_limits.h90
-rw-r--r--include/linux/drbd_nl.h164
-rw-r--r--include/linux/drbd_tag_magic.h84
-rw-r--r--include/linux/genl_magic_func.h422
-rw-r--r--include/linux/genl_magic_struct.h277
-rw-r--r--include/linux/idr.h11
-rw-r--r--include/linux/lru_cache.h67
-rw-r--r--lib/lru_cache.c359
30 files changed, 12020 insertions, 8533 deletions
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
index 0d3f337ff5f..8b450338075 100644
--- a/drivers/block/drbd/Makefile
+++ b/drivers/block/drbd/Makefile
@@ -1,5 +1,7 @@
drbd-y := drbd_bitmap.o drbd_proc.o
drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
+drbd-y += drbd_interval.o drbd_state.o
+drbd-y += drbd_nla.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index d4dd563d0d5..92510f8ad01 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -24,21 +24,73 @@
*/
#include <linux/slab.h>
+#include <linux/crc32c.h>
#include <linux/drbd.h>
+#include <linux/drbd_limits.h>
+#include <linux/dynamic_debug.h>
#include "drbd_int.h"
#include "drbd_wrappers.h"
-/* We maintain a trivial checksum in our on disk activity log.
- * With that we can ensure correct operation even when the storage
- * device might do a partial (last) sector write while losing power.
- */
-struct __packed al_transaction {
- u32 magic;
- u32 tr_number;
- struct __packed {
- u32 pos;
- u32 extent; } updates[1 + AL_EXTENTS_PT];
- u32 xor_sum;
+
+enum al_transaction_types {
+ AL_TR_UPDATE = 0,
+ AL_TR_INITIALIZED = 0xffff
+};
+/* all fields on disc in big endian */
+struct __packed al_transaction_on_disk {
+ /* don't we all like magic */
+ __be32 magic;
+
+ /* to identify the most recent transaction block
+ * in the on disk ring buffer */
+ __be32 tr_number;
+
+ /* checksum on the full 4k block, with this field set to 0. */
+ __be32 crc32c;
+
+ /* type of transaction, special transaction types like:
+ * purge-all, set-all-idle, set-all-active, ... to-be-defined
+ * see also enum al_transaction_types */
+ __be16 transaction_type;
+
+ /* we currently allow only a few thousand extents,
+ * so 16bit will be enough for the slot number. */
+
+ /* how many updates in this transaction */
+ __be16 n_updates;
+
+ /* maximum slot number, "al-extents" in drbd.conf speak.
+ * Having this in each transaction should make reconfiguration
+ * of that parameter easier. */
+ __be16 context_size;
+
+ /* slot number the context starts with */
+ __be16 context_start_slot_nr;
+
+ /* Some reserved bytes. Expected usage is a 64bit counter of
+ * sectors-written since device creation, and other data generation tag
+ * supporting usage */
+ __be32 __reserved[4];
+
+ /* --- 36 byte used --- */
+
+ /* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes
+ * in one transaction, then use the remaining byte in the 4k block for
+ * context information. "Flexible" number of updates per transaction
+ * does not help, as we have to account for the case when all update
+ * slots are used anyways, so it would only complicate code without
+ * additional benefit.
+ */
+ __be16 update_slot_nr[AL_UPDATES_PER_TRANSACTION];
+
+ /* but the extent number is 32bit, which at an extent size of 4 MiB
+ * allows to cover device sizes of up to 2**54 Byte (16 PiB) */
+ __be32 update_extent_nr[AL_UPDATES_PER_TRANSACTION];
+
+ /* --- 420 bytes used (36 + 64*6) --- */
+
+ /* 4096 - 420 = 3676 = 919 * 4 */
+ __be32 context[AL_CONTEXT_PER_TRANSACTION];
};
struct update_odbm_work {
@@ -48,22 +100,11 @@ struct update_odbm_work {
struct update_al_work {
struct drbd_work w;
- struct lc_element *al_ext;
struct completion event;
- unsigned int enr;
- /* if old_enr != LC_FREE, write corresponding bitmap sector, too */
- unsigned int old_enr;
-};
-
-struct drbd_atodb_wait {
- atomic_t count;
- struct completion io_done;
- struct drbd_conf *mdev;
- int error;
+ int err;
};
-
-int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);
+static int al_write_transaction(struct drbd_conf *mdev);
void *drbd_md_get_buffer(struct drbd_conf *mdev)
{
@@ -85,12 +126,17 @@ void drbd_md_put_buffer(struct drbd_conf *mdev)
void wait_until_done_or_force_detached(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
unsigned int *done)
{
- long dt = bdev->dc.disk_timeout * HZ / 10;
+ long dt;
+
+ rcu_read_lock();
+ dt = rcu_dereference(bdev->disk_conf)->disk_timeout;
+ rcu_read_unlock();
+ dt = dt * HZ / 10;
if (dt == 0)
dt = MAX_SCHEDULE_TIMEOUT;
dt = wait_event_timeout(mdev->misc_wait,
- *done || drbd_test_flag(mdev, FORCE_DETACH), dt);
+ *done || test_bit(FORCE_DETACH, &mdev->flags), dt);
if (dt == 0) {
dev_err(DEV, "meta-data IO operation timed out\n");
drbd_chk_io_error(mdev, 1, DRBD_FORCE_DETACH);
@@ -103,20 +149,20 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
int rw, int size)
{
struct bio *bio;
- int ok;
+ int err;
mdev->md_io.done = 0;
mdev->md_io.error = -ENODEV;
- if ((rw & WRITE) && !drbd_test_flag(mdev, MD_NO_FUA))
+ if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))
rw |= REQ_FUA | REQ_FLUSH;
rw |= REQ_SYNC;
bio = bio_alloc_drbd(GFP_NOIO);
bio->bi_bdev = bdev->md_bdev;
bio->bi_sector = sector;
- ok = (bio_add_page(bio, page, size, 0) == size);
- if (!ok)
+ err = -EIO;
+ if (bio_add_page(bio, page, size, 0) != size)
goto out;
bio->bi_private = &mdev->md_io;
bio->bi_end_io = drbd_md_io_complete;
@@ -124,7 +170,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */
dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
- ok = 0;
+ err = -ENODEV;
goto out;
}
@@ -135,85 +181,46 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
else
submit_bio(rw, bio);
wait_until_done_or_force_detached(mdev, bdev, &mdev->md_io.done);
- ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0;
+ if (bio_flagged(bio, BIO_UPTODATE))
+ err = mdev->md_io.error;
out:
bio_put(bio);
- return ok;
+ return err;
}
int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
sector_t sector, int rw)
{
- int logical_block_size, mask, ok;
- int offset = 0;
+ int err;
struct page *iop = mdev->md_io_page;
D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1);
BUG_ON(!bdev->md_bdev);
- logical_block_size = bdev_logical_block_size(bdev->md_bdev);
- if (logical_block_size == 0)
- logical_block_size = MD_SECTOR_SIZE;
-
- /* in case logical_block_size != 512 [ s390 only? ] */
- if (logical_block_size != MD_SECTOR_SIZE) {
- mask = (logical_block_size / MD_SECTOR_SIZE) - 1;
- D_ASSERT(mask == 1 || mask == 3 || mask == 7);
- D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE);
- offset = sector & mask;
- sector = sector & ~mask;
- iop = mdev->md_io_tmpp;
-
- if (rw & WRITE) {
- /* these are GFP_KERNEL pages, pre-allocated
- * on device initialization */
- void *p = page_address(mdev->md_io_page);
- void *hp = page_address(mdev->md_io_tmpp);
-
- ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector,
- READ, logical_block_size);
-
- if (unlikely(!ok)) {
- dev_err(DEV, "drbd_md_sync_page_io(,%llus,"
- "READ [logical_block_size!=512]) failed!\n",
- (unsigned long long)sector);
- return 0;
- }
-
- memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE);
- }
- }
+ dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n",
+ current->comm, current->pid, __func__,
+ (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
if (sector < drbd_md_first_sector(bdev) ||
- sector > drbd_md_last_sector(bdev))
+ sector + 7 > drbd_md_last_sector(bdev))
dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
current->comm, current->pid, __func__,
(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
- ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size);
- if (unlikely(!ok)) {
- dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n",
- (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
- return 0;
+ err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE);
+ if (err) {
+ dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
+ (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
}
-
- if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) {
- void *p = page_address(mdev->md_io_page);
- void *hp = page_address(mdev->md_io_tmpp);
-
- memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE);
- }
-
- return ok;
+ return err;
}
static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
{
struct lc_element *al_ext;
struct lc_element *tmp;
- unsigned long al_flags = 0;
int wake;
spin_lock_irq(&mdev->al_lock);
@@ -228,76 +235,92 @@ static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
return NULL;
}
}
- al_ext = lc_get(mdev->act_log, enr);
- al_flags = mdev->act_log->flags;
+ al_ext = lc_get(mdev->act_log, enr);
spin_unlock_irq(&mdev->al_lock);
-
- /*
- if (!al_ext) {
- if (al_flags & LC_STARVING)
- dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n");
- if (al_flags & LC_DIRTY)
- dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n");
- }
- */
-
return al_ext;
}
-void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector)
+void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
{
- unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
- struct lc_element *al_ext;
- struct update_al_work al_work;
+ /* for bios crossing activity log extent boundaries,
+ * we may need to activate two extents in one go */
+ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+ unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+ unsigned enr;
+ bool locked = false;
+
+ D_ASSERT(first <= last);
D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
- wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)));
+ for (enr = first; enr <= last; enr++)
+ wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL);
- if (al_ext->lc_number != enr) {
+ /* Serialize multiple transactions.
+ * This uses test_and_set_bit, memory barrier is implicit.
+ */
+ wait_event(mdev->al_wait,
+ mdev->act_log->pending_changes == 0 ||
+ (locked = lc_try_lock_for_transaction(mdev->act_log)));
+
+ if (locked) {
/* drbd_al_write_transaction(mdev,al_ext,enr);
* recurses into generic_make_request(), which
* disallows recursion, bios being serialized on the
* current->bio_tail list now.
* we have to delegate updates to the activity log
* to the worker thread. */
- init_completion(&al_work.event);
- al_work.al_ext = al_ext;
- al_work.enr = enr;
- al_work.old_enr = al_ext->lc_number;
- al_work.w.cb = w_al_write_transaction;
- drbd_queue_work_front(&mdev->data.work, &al_work.w);
- wait_for_completion(&al_work.event);
-
- mdev->al_writ_cnt++;
-
- spin_lock_irq(&mdev->al_lock);
- lc_changed(mdev->act_log, al_ext);
- spin_unlock_irq(&mdev->al_lock);
+
+ /* Double check: it may have been committed by someone else,
+ * while we have been waiting for the lock. */
+ if (mdev->act_log->pending_changes) {
+ bool write_al_updates;
+
+ rcu_read_lock();
+ write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
+ rcu_read_unlock();
+
+ if (write_al_updates) {
+ al_write_transaction(mdev);
+ mdev->al_writ_cnt++;
+ }
+
+ spin_lock_irq(&mdev->al_lock);
+ /* FIXME
+ if (err)
+ we need an "lc_cancel" here;
+ */
+ lc_committed(mdev->act_log);
+ spin_unlock_irq(&mdev->al_lock);
+ }
+ lc_unlock(mdev->act_log);
wake_up(&mdev->al_wait);
}
}
-void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
+void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i)
{
- unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
+ /* for bios crossing activity log extent boundaries,
+ * we may need to activate two extents in one go */
+ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+ unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+ unsigned enr;
struct lc_element *extent;
unsigned long flags;
+ D_ASSERT(first <= last);
spin_lock_irqsave(&mdev->al_lock, flags);
- extent = lc_find(mdev->act_log, enr);
-
- if (!extent) {
- spin_unlock_irqrestore(&mdev->al_lock, flags);
- dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
- return;
+ for (enr = first; enr <= last; enr++) {
+ extent = lc_find(mdev->act_log, enr);
+ if (!extent) {
+ dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
+ continue;
+ }
+ lc_put(mdev->act_log, extent);
}
-
- if (lc_put(mdev->act_log, extent) == 0)
- wake_up(&mdev->al_wait);
-
spin_unlock_irqrestore(&mdev->al_lock, flags);
+ wake_up(&mdev->al_wait);
}
#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
@@ -323,296 +346,148 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
return rs_enr >>
/* bit to page */
((PAGE_SHIFT + 3) -
- /* al extent number to bit */
+ /* resync extent number to bit */
(BM_EXT_SHIFT - BM_BLOCK_SHIFT));
}
-int
-w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+static int
+_al_write_transaction(struct drbd_conf *mdev)
{
- struct update_al_work *aw = container_of(w, struct update_al_work, w);
- struct lc_element *updated = aw->al_ext;
- const unsigned int new_enr = aw->enr;
- const unsigned int evicted = aw->old_enr;
- struct al_transaction *buffer;
+ struct al_transaction_on_disk *buffer;
+ struct lc_element *e;
sector_t sector;
- int i, n, mx;
- unsigned int extent_nr;
- u32 xor_sum = 0;
+ int i, mx;
+ unsigned extent_nr;
+ unsigned crc = 0;
+ int err = 0;
if (!get_ldev(mdev)) {
- dev_err(DEV,
- "disk is %s, cannot start al transaction (-%d +%d)\n",
- drbd_disk_str(mdev->state.disk), evicted, new_enr);
- complete(&((struct update_al_work *)w)->event);
- return 1;
+ dev_err(DEV, "disk is %s, cannot start al transaction\n",
+ drbd_disk_str(mdev->state.disk));
+ return -EIO;
}
- /* do we have to do a bitmap write, first?
- * TODO reduce maximum latency:
- * submit both bios, then wait for both,
- * instead of doing two synchronous sector writes.
- * For now, we must not write the transaction,
- * if we cannot write out the bitmap of the evicted extent. */
- if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
- drbd_bm_write_page(mdev, al_extent_to_bm_page(evicted));
/* The bitmap write may have failed, causing a state change. */
if (mdev->state.disk < D_INCONSISTENT) {
dev_err(DEV,
- "disk is %s, cannot write al transaction (-%d +%d)\n",
- drbd_disk_str(mdev->state.disk), evicted, new_enr);
- complete(&((struct update_al_work *)w)->event);
+ "disk is %s, cannot write al transaction\n",
+ drbd_disk_str(mdev->state.disk));
put_ldev(mdev);
- return 1;
+ return -EIO;
}
buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */
if (!buffer) {
dev_err(DEV, "disk failed while waiting for md_io buffer\n");
- complete(&((struct update_al_work *)w)->event);
put_ldev(mdev);
- return 1;
+ return -ENODEV;
}
- buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
+ memset(buffer, 0, sizeof(*buffer));
+ buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
- n = lc_index_of(mdev->act_log, updated);
+ i = 0;
- buffer->updates[0].pos = cpu_to_be32(n);
- buffer->updates[0].extent = cpu_to_be32(new_enr);
+ /* Even though no one can start to change this list
+ * once we set the LC_LOCKED -- from drbd_al_begin_io(),
+ * lc_try_lock_for_transaction() --, someone may still
+ * be in the process of changing it. */
+ spin_lock_irq(&mdev->al_lock);
+ list_for_each_entry(e, &mdev->act_log->to_be_changed, list) {
+ if (i == AL_UPDATES_PER_TRANSACTION) {
+ i++;
+ break;
+ }
+ buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
+ buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
+ if (e->lc_number != LC_FREE)
+ drbd_bm_mark_for_writeout(mdev,
+ al_extent_to_bm_page(e->lc_number));
+ i++;
+ }
+ spin_unlock_irq(&mdev->al_lock);
+ BUG_ON(i > AL_UPDATES_PER_TRANSACTION);
- xor_sum ^= new_enr;
+ buffer->n_updates = cpu_to_be16(i);
+ for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
+ buffer->update_slot_nr[i] = cpu_to_be16(-1);
+ buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
+ }
+
+ buffer->context_size = cpu_to_be16(mdev->act_log->nr_elements);
+ buffer->context_start_slot_nr = cpu_to_be16(mdev->al_tr_cycle);
- mx = min_t(int, AL_EXTENTS_PT,
+ mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
mdev->act_log->nr_elements - mdev->al_tr_cycle);
for (i = 0; i < mx; i++) {
unsigned idx = mdev->al_tr_cycle + i;
extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;
- buffer->updates[i+1].pos = cpu_to_be32(idx);
- buffer->updates[i+1].extent = cpu_to_be32(extent_nr);
- xor_sum ^= extent_nr;
- }
- for (; i < AL_EXTENTS_PT; i++) {
- buffer->updates[i+1].pos = __constant_cpu_to_be32(-1);
- buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE);
- xor_sum ^= LC_FREE;
+ buffer->context[i] = cpu_to_be32(extent_nr);
}
- mdev->al_tr_cycle += AL_EXTENTS_PT;
+ for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
+ buffer->context[i] = cpu_to_be32(LC_FREE);
+
+ mdev->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
mdev->al_tr_cycle = 0;
- buffer->xor_sum = cpu_to_be32(xor_sum);
-
sector = mdev->ldev->md.md_offset
- + mdev->ldev->md.al_offset + mdev->al_tr_pos;
+ + mdev->ldev->md.al_offset
+ + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9);
- if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
- drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
-
- if (++mdev->al_tr_pos >
- div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
- mdev->al_tr_pos = 0;
+ crc = crc32c(0, buffer, 4096);
+ buffer->crc32c = cpu_to_be32(crc);
- D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
- mdev->al_tr_number++;
+ if (drbd_bm_write_hinted(mdev))
+ err = -EIO;
+ /* drbd_chk_io_error done already */
+ else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
+ err = -EIO;
+ drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
+ } else {
+ /* advance ringbuffer position and transaction counter */
+ mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE);
+ mdev->al_tr_number++;
+ }
drbd_md_put_buffer(mdev);
-
- complete(&((struct update_al_work *)w)->event);
put_ldev(mdev);
- return 1;
+ return err;
}
-/**
- * drbd_al_read_tr() - Read a single transaction from the on disk activity log
- * @mdev: DRBD device.
- * @bdev: Block device to read form.
- * @b: pointer to an al_transaction.
- * @index: On disk slot of the transaction to read.
- *
- * Returns -1 on IO error, 0 on checksum error and 1 upon success.
- */
-static int drbd_al_read_tr(struct drbd_conf *mdev,
- struct drbd_backing_dev *bdev,
- struct al_transaction *b,
- int index)
-{
- sector_t sector;
- int rv, i;
- u32 xor_sum = 0;
-
- sector = bdev->md.md_offset + bdev->md.al_offset + index;
-
- /* Dont process error normally,
- * as this is done before disk is attached! */
- if (!drbd_md_sync_page_io(mdev, bdev, sector, READ))
- return -1;
- rv = (be32_to_cpu(b->magic) == DRBD_MAGIC);
-
- for (i = 0; i < AL_EXTENTS_PT + 1; i++)
- xor_sum ^= be32_to_cpu(b->updates[i].extent);
- rv &= (xor_sum == be32_to_cpu(b->xor_sum));
-
- return rv;
-}
-
-/**
- * drbd_al_read_log() - Restores the activity log from its on disk representation.
- * @mdev: DRBD device.
- * @bdev: Block device to read form.
- *
- * Returns 1 on success, returns 0 when reading the log failed due to IO errors.
- */
-int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
+static int w_al_write_transaction(struct drbd_work *w, int unused)
{
- struct al_transaction *buffer;
- int i;
- int rv;
- int mx;
- int active_extents = 0;
- int transactions = 0;
- int found_valid = 0;
- int from = 0;
- int to = 0;
- u32 from_tnr = 0;
- u32 to_tnr = 0;
- u32 cnr;
-
- mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT);
-
- /* lock out all other meta data io for now,
- * and make sure the page is mapped.
- */
- buffer = drbd_md_get_buffer(mdev);
- if (!buffer)
- return 0;
-
- /* Find the valid transaction in the log */
- for (i = 0; i <= mx; i++) {
- rv = drbd_al_read_tr(mdev, bdev, buffer, i);
- if (rv == 0)
- continue;
- if (rv == -1) {
- drbd_md_put_buffer(mdev);
- return 0;
- }
- cnr = be32_to_cpu(buffer->tr_number);
-
- if (++found_valid == 1) {
- from = i;
- to = i;
- from_tnr = cnr;
- to_tnr = cnr;
- continue;
- }
- if ((int)cnr - (int)from_tnr < 0) {
- D_ASSERT(from_tnr - cnr + i - from == mx+1);
- from = i;
- from_tnr = cnr;
- }
- if ((int)cnr - (int)to_tnr > 0) {
- D_ASSERT(cnr - to_tnr == i - to);
- to = i;
- to_tnr = cnr;
- }
- }
-
- if (!found_valid) {
- dev_warn(DEV, "No usable activity log found.\n");
- drbd_md_put_buffer(mdev);
- return 1;
- }
-
- /* Read the valid transactions.
- * dev_info(DEV, "Reading from %d to %d.\n",from,to); */
- i = from;
- while (1) {
- int j, pos;
- unsigned int extent_nr;
- unsigned int trn;
-
- rv = drbd_al_read_tr(mdev, bdev, buffer, i);
- ERR_IF(rv == 0) goto cancel;
- if (rv == -1) {
- drbd_md_put_buffer(mdev);
- return 0;
- }
-
- trn = be32_to_cpu(buffer->tr_number);
-
- spin_lock_irq(&mdev->al_lock);
-
- /* This loop runs backwards because in the cyclic
- elements there might be an old version of the
- updated element (in slot 0). So the element in slot 0
- can overwrite old versions. */
- for (j = AL_EXTENTS_PT; j >= 0; j--) {
- pos = be32_to_cpu(buffer->updates[j].pos);
- extent_nr = be32_to_cpu(buffer->updates[j].extent);
-
- if (extent_nr == LC_FREE)
- continue;
-
- lc_set(mdev->act_log, extent_nr, pos);
- active_extents++;
- }
- spin_unlock_irq(&mdev->al_lock);
-
- transactions++;
-
-cancel:
- if (i == to)
- break;
- i++;
- if (i > mx)
- i = 0;
- }
-
- mdev->al_tr_number = to_tnr+1;
- mdev->al_tr_pos = to;
- if (++mdev->al_tr_pos >
- div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
- mdev->al_tr_pos = 0;
-
- /* ok, we are done with it */
- drbd_md_put_buffer(mdev);
+ struct update_al_work *aw = container_of(w, struct update_al_work, w);
+ struct drbd_conf *mdev = w->mdev;
+ int err;
- dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
- transactions, active_extents);
+ err = _al_write_transaction(mdev);
+ aw->err = err;
+ complete(&aw->event);
- return 1;
+ return err != -EIO ? err : 0;
}
-/**
- * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents
- * @mdev: DRBD device.
- */
-void drbd_al_apply_to_bm(struct drbd_conf *mdev)
+/* Calls from worker context (see w_restart_disk_io()) need to write the
+ transaction directly. Others came through generic_make_request(),
+ those need to delegate it to the worker. */
+static int al_write_transaction(struct drbd_conf *mdev)
{
- unsigned int enr;
- unsigned long add = 0;
- char ppb[10];
- int i, tmp;
+ struct update_al_work al_work;
- wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+ if (current == mdev->tconn->worker.task)
+ return _al_write_transaction(mdev);
- for (i = 0; i < mdev->act_log->nr_elements; i++) {
- enr = lc_element_by_index(mdev->act_log, i)->lc_number;
- if (enr == LC_FREE)
- continue;
- tmp = drbd_bm_ALe_set_all(mdev, enr);
- dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr);
- add += tmp;
- }
+ init_completion(&al_work.event);
+ al_work.w.cb = w_al_write_transaction;
+ al_work.w.mdev = mdev;
+ drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
+ wait_for_completion(&al_work.event);
- lc_unlock(mdev->act_log);
- wake_up(&mdev->al_wait);
-
- dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n",
- ppsize(ppb, Bit2KB(add)));
+ return al_work.err;
}
static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
@@ -642,7 +517,7 @@ void drbd_al_shrink(struct drbd_conf *mdev)
struct lc_element *al_ext;
int i;
- D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags));
+ D_ASSERT(test_bit(__LC_LOCKED, &mdev->act_log->flags));
for (i = 0; i < mdev->act_log->nr_elements; i++) {
al_ext = lc_element_by_index(mdev->act_log, i);
@@ -654,15 +529,17 @@ void drbd_al_shrink(struct drbd_conf *mdev)
wake_up(&mdev->al_wait);
}
-static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+static int w_update_odbm(struct drbd_work *w, int unused)
{
struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
+ struct drbd_conf *mdev = w->mdev;
+ struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
if (!get_ldev(mdev)) {
if (__ratelimit(&drbd_ratelimit_state))
dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");
kfree(udw);
- return 1;
+ return 0;
}
drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr));
@@ -680,9 +557,9 @@ static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused
break;
}
}
- drbd_bcast_sync_progress(mdev);
+ drbd_bcast_event(mdev, &sib);
- return 1;
+ return 0;
}
@@ -752,7 +629,9 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
}
ext->rs_left = rs_left;
ext->rs_failed = success ? 0 : count;
- lc_changed(mdev->resync, &ext->lce);
+ /* we don't keep a persistent log of the resync lru,
+ * we can commit any change right away. */
+ lc_committed(mdev->resync);
}
lc_put(mdev->resync, &ext->lce);
/* no race, we are within the al_lock! */
@@ -764,7 +643,8 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
if (udw) {
udw->enr = ext->lce.lc_number;
udw->w.cb = w_update_odbm;
- drbd_queue_work_front(&mdev->data.work, &udw->w);
+ udw->w.mdev = mdev;
+ drbd_queue_work_front(&mdev->tconn->sender_work, &udw->w);
} else {
dev_warn(DEV, "Could not kmalloc an udw\n");
}
@@ -810,16 +690,22 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
int wake_up = 0;
unsigned long flags;
- if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
+ if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
(unsigned long long)sector, size);
return;
}
+
+ if (!get_ldev(mdev))
+ return; /* no disk, no metadata, no bitmap to clear bits in */
+
nr_sectors = drbd_get_capacity(mdev->this_bdev);
esector = sector + (size >> 9) - 1;
- ERR_IF(sector >= nr_sectors) return;
- ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
+ if (!expect(sector < nr_sectors))
+ goto out;
+ if (!expect(esector < nr_sectors))
+ esector = nr_sectors - 1;
lbnr = BM_SECT_TO_BIT(nr_sectors-1);
@@ -827,7 +713,7 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
* round up start sector, round down end sector. we make sure we only
* clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
if (unlikely(esector < BM_SECT_PER_BIT-1))
- return;
+ goto out;
if (unlikely(esector == (nr_sectors-1)))
ebnr = lbnr;
else
@@ -835,14 +721,14 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
if (sbnr > ebnr)
- return;
+ goto out;
/*
* ok, (capacity & 7) != 0 sometimes, but who cares...
* we count rs_{total,left} in bits, not sectors.
*/
count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
- if (count && get_ldev(mdev)) {
+ if (count) {
drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev));
spin_lock_irqsave(&mdev->al_lock, flags);
drbd_try_clear_on_disk_bm(mdev, sector, count, true);
@@ -851,8 +737,9 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
/* just wake_up unconditional now, various lc_chaged(),
* lc_put() in drbd_try_clear_on_disk_bm(). */
wake_up = 1;
- put_ldev(mdev);
}
+out:
+ put_ldev(mdev);
if (wake_up)
wake_up(&mdev->al_wait);
}
@@ -868,7 +755,7 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
const char *file, const unsigned int line)
{
- unsigned long sbnr, ebnr, lbnr, flags;
+ unsigned long sbnr, ebnr, flags;
sector_t esector, nr_sectors;
unsigned int enr, count = 0;
struct lc_element *e;
@@ -877,7 +764,7 @@ int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
if (size == 0)
return 0;
- if (size < 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
+ if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
dev_err(DEV, "sector: %llus, size: %d\n",
(unsigned long long)sector, size);
return 0;
@@ -889,12 +776,10 @@ int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
nr_sectors = drbd_get_capacity(mdev->this_bdev);
esector = sector + (size >> 9) - 1;
- ERR_IF(sector >= nr_sectors)
+ if (!expect(sector < nr_sectors))
goto out;
- ERR_IF(esector >= nr_sectors)
- esector = (nr_sectors-1);
-
- lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+ if (!expect(esector < nr_sectors))
+ esector = nr_sectors - 1;
/* we set it out of sync,
* we do not need to round anything here */
@@ -937,7 +822,7 @@ struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
if (bm_ext->lce.lc_number != enr) {
bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
bm_ext->rs_failed = 0;
- lc_changed(mdev->resync, &bm_ext->lce);
+ lc_committed(mdev->resync);
wakeup = 1;
}
if (bm_ext->lce.refcnt == 1)
@@ -953,7 +838,7 @@ struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
if (rs_flags & LC_STARVING)
dev_warn(DEV, "Have to wait for element"
" (resync LRU too small?)\n");
- BUG_ON(rs_flags & LC_DIRTY);
+ BUG_ON(rs_flags & LC_LOCKED);
}
return bm_ext;
@@ -961,26 +846,12 @@ struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
{
- struct lc_element *al_ext;
- int rv = 0;
+ int rv;
spin_lock_irq(&mdev->al_lock);
- if (unlikely(enr == mdev->act_log->new_number))
- rv = 1;
- else {
- al_ext = lc_find(mdev->act_log, enr);
- if (al_ext) {
- if (al_ext->refcnt)
- rv = 1;
- }
- }
+ rv = lc_is_used(mdev->act_log, enr);
spin_unlock_irq(&mdev->al_lock);
- /*
- if (unlikely(rv)) {
- dev_info(DEV, "Delaying sync read until app's write is done\n");
- }
- */
return rv;
}
@@ -1110,13 +981,13 @@ int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
if (rs_flags & LC_STARVING)
dev_warn(DEV, "Have to wait for element"
" (resync LRU too small?)\n");
- BUG_ON(rs_flags & LC_DIRTY);
+ BUG_ON(rs_flags & LC_LOCKED);
goto try_again;
}
if (bm_ext->lce.lc_number != enr) {
bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
bm_ext->rs_failed = 0;
- lc_changed(mdev->resync, &bm_ext->lce);
+ lc_committed(mdev->resync);
wake_up(&mdev->al_wait);
D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);
}
@@ -1127,8 +998,6 @@ int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
}
check_al:
for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
- if (unlikely(al_enr+i == mdev->act_log->new_number))
- goto try_again;
if (lc_is_used(mdev->act_log, al_enr+i))
goto try_again;
}
@@ -1263,7 +1132,7 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
sector_t esector, nr_sectors;
int wake_up = 0;
- if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
+ if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
(unsigned long long)sector, size);
return;
@@ -1271,8 +1140,10 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
nr_sectors = drbd_get_capacity(mdev->this_bdev);
esector = sector + (size >> 9) - 1;
- ERR_IF(sector >= nr_sectors) return;
- ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
+ if (!expect(sector < nr_sectors))
+ return;
+ if (!expect(esector < nr_sectors))
+ esector = nr_sectors - 1;
lbnr = BM_SECT_TO_BIT(nr_sectors-1);
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 8d806975804..1ab205a4bf6 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -119,13 +119,9 @@ static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
if (!__ratelimit(&drbd_ratelimit_state))
return;
dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",
- current == mdev->receiver.task ? "receiver" :
- current == mdev->asender.task ? "asender" :
- current == mdev->worker.task ? "worker" : current->comm,
- func, b->bm_why ?: "?",
- b->bm_task == mdev->receiver.task ? "receiver" :
- b->bm_task == mdev->asender.task ? "asender" :
- b->bm_task == mdev->worker.task ? "worker" : "?");
+ drbd_task_to_thread_name(mdev->tconn, current),
+ func, b->bm_why ?: "?",
+ drbd_task_to_thread_name(mdev->tconn, b->bm_task));
}
void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags)
@@ -142,13 +138,9 @@ void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags)
if (trylock_failed) {
dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",
- current == mdev->receiver.task ? "receiver" :
- current == mdev->asender.task ? "asender" :
- current == mdev->worker.task ? "worker" : current->comm,
- why, b->bm_why ?: "?",
- b->bm_task == mdev->receiver.task ? "receiver" :
- b->bm_task == mdev->asender.task ? "asender" :
- b->bm_task == mdev->worker.task ? "worker" : "?");
+ drbd_task_to_thread_name(mdev->tconn, current),
+ why, b->bm_why ?: "?",
+ drbd_task_to_thread_name(mdev->tconn, b->bm_task));
mutex_lock(&b->bm_change);
}
if (BM_LOCKED_MASK & b->bm_flags)
@@ -196,6 +188,9 @@ void drbd_bm_unlock(struct drbd_conf *mdev)
/* to mark for lazy writeout once syncer cleared all clearable bits,
* we if bits have been cleared since last IO. */
#define BM_PAGE_LAZY_WRITEOUT 28
+/* pages marked with this "HINT" will be considered for writeout
+ * on activity log transactions */
+#define BM_PAGE_HINT_WRITEOUT 27
/* store_page_idx uses non-atomic assignment. It is only used directly after
* allocating the page. All other bm_set_page_* and bm_clear_page_* need to
@@ -227,8 +222,7 @@ static void bm_page_unlock_io(struct drbd_conf *mdev, int page_nr)
{
struct drbd_bitmap *b = mdev->bitmap;
void *addr = &page_private(b->bm_pages[page_nr]);
- clear_bit(BM_PAGE_IO_LOCK, addr);
- smp_mb__after_clear_bit();
+ clear_bit_unlock(BM_PAGE_IO_LOCK, addr);
wake_up(&mdev->bitmap->bm_io_wait);
}
@@ -246,6 +240,27 @@ static void bm_set_page_need_writeout(struct page *page)
set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
}
+/**
+ * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout
+ * @mdev: DRBD device.
+ * @page_nr: the bitmap page to mark with the "hint" flag
+ *
+ * From within an activity log transaction, we mark a few pages with these
+ * hints, then call drbd_bm_write_hinted(), which will only write out changed
+ * pages which are flagged with this mark.
+ */
+void drbd_bm_mark_for_writeout(struct drbd_conf *mdev, int page_nr)
+{
+ struct page *page;
+ if (page_nr >= mdev->bitmap->bm_number_of_pages) {
+ dev_warn(DEV, "BAD: page_nr: %u, number_of_pages: %u\n",
+ page_nr, (int)mdev->bitmap->bm_number_of_pages);
+ return;
+ }
+ page = mdev->bitmap->bm_pages[page_nr];
+ set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page));
+}
+
static int bm_test_page_unchanged(struct page *page)
{
volatile const unsigned long *addr = &page_private(page);
@@ -376,7 +391,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
* GFP_NOIO, as this is called while drbd IO is "suspended",
* and during resize or attach on diskless Primary,
* we must not block on IO to ourselves.
- * Context is receiver thread or cqueue thread/dmsetup. */
+ * Context is receiver thread or dmsetup. */
bytes = sizeof(struct page *)*want;
new_pages = kzalloc(bytes, GFP_NOIO);
if (!new_pages) {
@@ -441,7 +456,8 @@ int drbd_bm_init(struct drbd_conf *mdev)
sector_t drbd_bm_capacity(struct drbd_conf *mdev)
{
- ERR_IF(!mdev->bitmap) return 0;
+ if (!expect(mdev->bitmap))
+ return 0;
return mdev->bitmap->bm_dev_capacity;
}
@@ -449,7 +465,8 @@ sector_t drbd_bm_capacity(struct drbd_conf *mdev)
*/
void drbd_bm_cleanup(struct drbd_conf *mdev)
{
- ERR_IF (!mdev->bitmap) return;
+ if (!expect(mdev->bitmap))
+ return;
bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags));
kfree(mdev->bitmap);
@@ -612,7 +629,8 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
int err = 0, growing;
int opages_vmalloced;
- ERR_IF(!b) return -ENOMEM;
+ if (!expect(b))
+ return -ENOMEM;
drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK);
@@ -734,8 +752,10 @@ unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
unsigned long s;
unsigned long flags;
- ERR_IF(!b) return 0;
- ERR_IF(!b->bm_pages) return 0;
+ if (!expect(b))
+ return 0;
+ if (!expect(b->bm_pages))
+ return 0;
spin_lock_irqsave(&b->bm_lock, flags);
s = b->bm_set;
@@ -758,8 +778,10 @@ unsigned long drbd_bm_total_weight(struct drbd_conf *mdev)
size_t drbd_bm_words(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
- ERR_IF(!b) return 0;
- ERR_IF(!b->bm_pages) return 0;
+ if (!expect(b))
+ return 0;
+ if (!expect(b->bm_pages))
+ return 0;
return b->bm_words;
}
@@ -767,7 +789,8 @@ size_t drbd_bm_words(struct drbd_conf *mdev)
unsigned long drbd_bm_bits(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
- ERR_IF(!b) return 0;
+ if (!expect(b))
+ return 0;
return b->bm_bits;
}
@@ -788,8 +811,10 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
end = offset + number;
- ERR_IF(!b) return;
- ERR_IF(!b->bm_pages) return;
+ if (!expect(b))
+ return;
+ if (!expect(b->bm_pages))
+ return;
if (number == 0)
return;
WARN_ON(offset >= b->bm_words);
@@ -833,8 +858,10 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
end = offset + number;
- ERR_IF(!b) return;
- ERR_IF(!b->bm_pages) return;
+ if (!expect(b))
+ return;
+ if (!expect(b->bm_pages))
+ return;
spin_lock_irq(&b->bm_lock);
if ((offset >= b->bm_words) ||
@@ -862,8 +889,10 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
void drbd_bm_set_all(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
- ERR_IF(!b) return;
- ERR_IF(!b->bm_pages) return;
+ if (!expect(b))
+ return;
+ if (!expect(b->bm_pages))
+ return;
spin_lock_irq(&b->bm_lock);
bm_memset(b, 0, 0xff, b->bm_words);
@@ -876,8 +905,10 @@ void drbd_bm_set_all(struct drbd_conf *mdev)
void drbd_bm_clear_all(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
- ERR_IF(!b) return;
- ERR_IF(!b->bm_pages) return;
+ if (!expect(b))
+ return;
+ if (!expect(b->bm_pages))
+ return;
spin_lock_irq(&b->bm_lock);
bm_memset(b, 0, 0, b->bm_words);
@@ -891,7 +922,8 @@ struct bm_aio_ctx {
unsigned int done;
unsigned flags;
#define BM_AIO_COPY_PAGES 1
-#define BM_WRITE_ALL_PAGES 2
+#define BM_AIO_WRITE_HINTED 2
+#define BM_WRITE_ALL_PAGES 4
int error;
struct kref kref;
};
@@ -1062,6 +1094,11 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w
if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
break;
if (rw & WRITE) {
+ if ((flags & BM_AIO_WRITE_HINTED) &&
+ !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
+ &page_private(b->bm_pages[i])))
+ continue;
+
if (!(flags & BM_WRITE_ALL_PAGES) &&
bm_test_page_unchanged(b->bm_pages[i])) {
dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
@@ -1094,9 +1131,11 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w
else
kref_put(&ctx->kref, &bm_aio_ctx_destroy);
- dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
- rw == WRITE ? "WRITE" : "READ",
- count, jiffies - now);
+ /* summary for global bitmap IO */
+ if (flags == 0)
+ dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
+ rw == WRITE ? "WRITE" : "READ",
+ count, jiffies - now);
if (ctx->error) {
dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
@@ -1117,8 +1156,9 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w
}
now = b->bm_set;
- dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
- ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
+ if (flags == 0)
+ dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
+ ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
kref_put(&ctx->kref, &bm_aio_ctx_destroy);
return err;
@@ -1181,9 +1221,17 @@ int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local)
return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0);
}
+/**
+ * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed.
+ * @mdev: DRBD device.
+ */
+int drbd_bm_write_hinted(struct drbd_conf *mdev) __must_hold(local)
+{
+ return bm_rw(mdev, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
+}
/**
- * drbd_bm_write_page: Writes a PAGE_SIZE aligned piece of bitmap
+ * drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap
* @mdev: DRBD device.
* @idx: bitmap page index
*
@@ -1291,8 +1339,10 @@ static unsigned long bm_find_next(struct drbd_conf *mdev,
struct drbd_bitmap *b = mdev->bitmap;
unsigned long i = DRBD_END_OF_BITMAP;
- ERR_IF(!b) return i;
- ERR_IF(!b->bm_pages) return i;
+ if (!expect(b))
+ return i;
+ if (!expect(b->bm_pages))
+ return i;
spin_lock_irq(&b->bm_lock);
if (BM_DONT_TEST & b->bm_flags)
@@ -1393,8 +1443,10 @@ static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
struct drbd_bitmap *b = mdev->bitmap;
int c = 0;
- ERR_IF(!b) return 1;
- ERR_IF(!b->bm_pages) return 0;
+ if (!expect(b))
+ return 1;
+ if (!expect(b->bm_pages))
+ return 0;
spin_lock_irqsave(&b->bm_lock, flags);
if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
@@ -1425,13 +1477,21 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
{
int i;
int bits;
+ int changed = 0;
unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
for (i = first_word; i < last_word; i++) {
bits = hweight_long(paddr[i]);
paddr[i] = ~0UL;
- b->bm_set += BITS_PER_LONG - bits;
+ changed += BITS_PER_LONG - bits;
}
kunmap_atomic(paddr);
+ if (changed) {
+ /* We only need lazy writeout, the information is still in the
+ * remote bitmap as well, and is reconstructed during the next
+ * bitmap exchange, if lost locally due to a crash. */
+ bm_set_page_lazy_writeout(b->bm_pages[page_nr]);
+ b->bm_set += changed;
+ }
}
/* Same thing as drbd_bm_set_bits,
@@ -1526,8 +1586,10 @@ int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
unsigned long *p_addr;
int i;
- ERR_IF(!b) return 0;
- ERR_IF(!b->bm_pages) return 0;
+ if (!expect(b))
+ return 0;
+ if (!expect(b->bm_pages))
+ return 0;
spin_lock_irqsave(&b->bm_lock, flags);
if (BM_DONT_TEST & b->bm_flags)
@@ -1561,8 +1623,10 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
* robust in case we screwed up elsewhere, in that case pretend there
* was one dirty bit in the requested area, so we won't try to do a
* local read there (no bitmap probably implies no disk) */
- ERR_IF(!b) return 1;
- ERR_IF(!b->bm_pages) return 1;
+ if (!expect(b))
+ return 1;
+ if (!expect(b->bm_pages))
+ return 1;
spin_lock_irqsave(&b->bm_lock, flags);
if (BM_DONT_TEST & b->bm_flags)
@@ -1575,11 +1639,10 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
bm_unmap(p_addr);
p_addr = bm_map_pidx(b, idx);
}
- ERR_IF (bitnr >= b->bm_bits) {
- dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
- } else {
+ if (expect(bitnr < b->bm_bits))
c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
- }
+ else
+ dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
}
if (p_addr)
bm_unmap(p_addr);
@@ -1609,8 +1672,10 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
unsigned long flags;
unsigned long *p_addr, *bm;
- ERR_IF(!b) return 0;
- ERR_IF(!b->bm_pages) return 0;
+ if (!expect(b))
+ return 0;
+ if (!expect(b->bm_pages))
+ return 0;
spin_lock_irqsave(&b->bm_lock, flags);
if (BM_DONT_TEST & b->bm_flags)
@@ -1632,47 +1697,3 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
spin_unlock_irqrestore(&b->bm_lock, flags);
return count;
}
-
-/* Set all bits covered by the AL-extent al_enr.
- * Returns number of bits changed. */
-unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
-{
- struct drbd_bitmap *b = mdev->bitmap;
- unsigned long *p_addr, *bm;
- unsigned long weight;
- unsigned long s, e;
- int count, i, do_now;
- ERR_IF(!b) return 0;
- ERR_IF(!b->bm_pages) return 0;
-
- spin_lock_irq(&b->bm_lock);
- if (BM_DONT_SET & b->bm_flags)
- bm_print_lock_info(mdev);
- weight = b->bm_set;
-
- s = al_enr * BM_WORDS_PER_AL_EXT;
- e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
- /* assert that s and e are on the same page */
- D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
- == s >> (PAGE_SHIFT - LN2_BPL + 3));
- count = 0;
- if (s < b->bm_words) {
- i = do_now = e-s;
- p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
- bm = p_addr + MLPP(s);
- while (i--) {
- count += hweight_long(*bm);
- *bm = -1UL;
- bm++;
- }
- bm_unmap(p_addr);
- b->bm_set += do_now*BITS_PER_LONG - count;
- if (e == b->bm_words)
- b->bm_set -= bm_clear_surplus(b);
- } else {
- dev_err(DEV, "start offset (%lu) too large in drbd_bm_ALe_set_all\n", s);
- }
- weight = b->bm_set - weight;
- spin_unlock_irq(&b->bm_lock);
- return weight;
-}
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 277c69c9465..ef72a72814c 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -39,9 +39,13 @@
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
+#include <linux/idr.h>
#include <net/tcp.h>
#include <linux/lru_cache.h>
#include <linux/prefetch.h>
+#include <linux/drbd_genl_api.h>
+#include <linux/drbd.h>
+#include "drbd_state.h"
#ifdef __CHECKER__
# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr")))
@@ -61,7 +65,6 @@
extern unsigned int minor_count;
extern bool disable_sendpage;
extern bool allow_oos;
-extern unsigned int cn_idx;
#ifdef CONFIG_DRBD_FAULT_INJECTION
extern int enable_faults;
@@ -86,34 +89,44 @@ extern char usermode_helper[];
*/
#define DRBD_SIGKILL SIGHUP
-/* All EEs on the free list should have ID_VACANT (== 0)
- * freshly allocated EEs get !ID_VACANT (== 1)
- * so if it says "cannot dereference null pointer at address 0x00000001",
- * it is most likely one of these :( */
-
#define ID_IN_SYNC (4711ULL)
#define ID_OUT_OF_SYNC (4712ULL)
-
#define ID_SYNCER (-1ULL)
-#define ID_VACANT 0
-#define is_syncer_block_id(id) ((id) == ID_SYNCER)
+
#define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL)
struct drbd_conf;
+struct drbd_tconn;
/* to shorten dev_warn(DEV, "msg"); and relatives statements */
#define DEV (disk_to_dev(mdev->vdisk))
+#define conn_printk(LEVEL, TCONN, FMT, ARGS...) \
+ printk(LEVEL "d-con %s: " FMT, TCONN->name , ## ARGS)
+#define conn_alert(TCONN, FMT, ARGS...) conn_printk(KERN_ALERT, TCONN, FMT, ## ARGS)
+#define conn_crit(TCONN, FMT, ARGS...) conn_printk(KERN_CRIT, TCONN, FMT, ## ARGS)
+#define conn_err(TCONN, FMT, ARGS...) conn_printk(KERN_ERR, TCONN, FMT, ## ARGS)
+#define conn_warn(TCONN, FMT, ARGS...) conn_printk(KERN_WARNING, TCONN, FMT, ## ARGS)
+#define conn_notice(TCONN, FMT, ARGS...) conn_printk(KERN_NOTICE, TCONN, FMT, ## ARGS)
+#define conn_info(TCONN, FMT, ARGS...) conn_printk(KERN_INFO, TCONN, FMT, ## ARGS)
+#define conn_dbg(TCONN, FMT, ARGS...) conn_printk(KERN_DEBUG, TCONN, FMT, ## ARGS)
+
#define D_ASSERT(exp) if (!(exp)) \
dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__)
-#define ERR_IF(exp) if (({ \
- int _b = (exp) != 0; \
- if (_b) dev_err(DEV, "ASSERT FAILED: %s: (%s) in %s:%d\n", \
- __func__, #exp, __FILE__, __LINE__); \
- _b; \
- }))
+/**
+ * expect - Make an assertion
+ *
+ * Unlike the assert macro, this macro returns a boolean result.
+ */
+#define expect(exp) ({ \
+ bool _bool = (exp); \
+ if (!_bool) \
+ dev_err(DEV, "ASSERTION %s FAILED in %s\n", \
+ #exp, __func__); \
+ _bool; \
+ })
/* Defines to control fault insertion */
enum {
@@ -150,15 +163,12 @@ drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) {
/* usual integer division */
#define div_floor(A, B) ((A)/(B))
-/* drbd_meta-data.c (still in drbd_main.c) */
-/* 4th incarnation of the disk layout. */
-#define DRBD_MD_MAGIC (DRBD_MAGIC+4)
-
-extern struct drbd_conf **minor_table;
extern struct ratelimit_state drbd_ratelimit_state;
+extern struct idr minors; /* RCU, updates: genl_lock() */
+extern struct list_head drbd_tconns; /* RCU, updates: genl_lock() */
/* on the wire */
-enum drbd_packets {
+enum drbd_packet {
/* receiver (data socket) */
P_DATA = 0x00,
P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */
@@ -186,7 +196,7 @@ enum drbd_packets {
P_RECV_ACK = 0x15, /* Used in protocol B */
P_WRITE_ACK = 0x16, /* Used in protocol C */
P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
- P_DISCARD_ACK = 0x18, /* Used in proto C, two-primaries conflict detection */
+ P_SUPERSEDED = 0x18, /* Used in proto C, two-primaries conflict detection */
P_NEG_ACK = 0x19, /* Sent if local disk is unusable */
P_NEG_DREPLY = 0x1a, /* Local disk is broken... */
P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */
@@ -207,77 +217,23 @@ enum drbd_packets {
P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */
P_OUT_OF_SYNC = 0x28, /* Mark as out of sync (Outrunning), data socket */
P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */
+ P_CONN_ST_CHG_REQ = 0x2a, /* data sock: Connection wide state request */
+ P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */
+ P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */
+ P_PROTOCOL_UPDATE = 0x2d, /* data sock: is used in established connections */
- P_MAX_CMD = 0x2A,
P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
P_MAX_OPT_CMD = 0x101,
/* special command ids for handshake */
- P_HAND_SHAKE_M = 0xfff1, /* First Packet on the MetaSock */
- P_HAND_SHAKE_S = 0xfff2, /* First Packet on the Socket */
+ P_INITIAL_META = 0xfff1, /* First Packet on the MetaSock */
+ P_INITIAL_DATA = 0xfff2, /* First Packet on the Socket */
- P_HAND_SHAKE = 0xfffe /* FIXED for the next century! */
+ P_CONNECTION_FEATURES = 0xfffe /* FIXED for the next century! */
};
-static inline const char *cmdname(enum drbd_packets cmd)
-{
- /* THINK may need to become several global tables
- * when we want to support more than
- * one PRO_VERSION */
- static const char *cmdnames[] = {
- [P_DATA] = "Data",
- [P_DATA_REPLY] = "DataReply",
- [P_RS_DATA_REPLY] = "RSDataReply",
- [P_BARRIER] = "Barrier",
- [P_BITMAP] = "ReportBitMap",
- [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
- [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
- [P_UNPLUG_REMOTE] = "UnplugRemote",
- [P_DATA_REQUEST] = "DataRequest",
- [P_RS_DATA_REQUEST] = "RSDataRequest",
- [P_SYNC_PARAM] = "SyncParam",
- [P_SYNC_PARAM89] = "SyncParam89",
- [P_PROTOCOL] = "ReportProtocol",
- [P_UUIDS] = "ReportUUIDs",
- [P_SIZES] = "ReportSizes",
- [P_STATE] = "ReportState",
- [P_SYNC_UUID] = "ReportSyncUUID",
- [P_AUTH_CHALLENGE] = "AuthChallenge",
- [P_AUTH_RESPONSE] = "AuthResponse",
- [P_PING] = "Ping",
- [P_PING_ACK] = "PingAck",
- [P_RECV_ACK] = "RecvAck",
- [P_WRITE_ACK] = "WriteAck",
- [P_RS_WRITE_ACK] = "RSWriteAck",
- [P_DISCARD_ACK] = "DiscardAck",
- [P_NEG_ACK] = "NegAck",
- [P_NEG_DREPLY] = "NegDReply",
- [P_NEG_RS_DREPLY] = "NegRSDReply",
- [P_BARRIER_ACK] = "BarrierAck",
- [P_STATE_CHG_REQ] = "StateChgRequest",
- [P_STATE_CHG_REPLY] = "StateChgReply",
- [P_OV_REQUEST] = "OVRequest",
- [P_OV_REPLY] = "OVReply",
- [P_OV_RESULT] = "OVResult",
- [P_CSUM_RS_REQUEST] = "CsumRSRequest",
- [P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
- [P_COMPRESSED_BITMAP] = "CBitmap",
- [P_DELAY_PROBE] = "DelayProbe",
- [P_OUT_OF_SYNC] = "OutOfSync",
- [P_MAX_CMD] = NULL,
- };
-
- if (cmd == P_HAND_SHAKE_M)
- return "HandShakeM";
- if (cmd == P_HAND_SHAKE_S)
- return "HandShakeS";
- if (cmd == P_HAND_SHAKE)
- return "HandShake";
- if (cmd >= P_MAX_CMD)
- return "Unknown";
- return cmdnames[cmd];
-}
+extern const char *cmdname(enum drbd_packet cmd);
/* for sending/receiving the bitmap,
* possibly in some encoding scheme */
@@ -337,37 +293,24 @@ struct p_header80 {
u32 magic;
u16 command;
u16 length; /* bytes of data after this header */
- u8 payload[0];
} __packed;
/* Header for big packets, Used for data packets exceeding 64kB */
struct p_header95 {
u16 magic; /* use DRBD_MAGIC_BIG here */
u16 command;
- u32 length; /* Use only 24 bits of that. Ignore the highest 8 bit. */
- u8 payload[0];
+ u32 length;
} __packed;
-union p_header {
- struct p_header80 h80;
- struct p_header95 h95;
-};
-
-/*
- * short commands, packets without payload, plain p_header:
- * P_PING
- * P_PING_ACK
- * P_BECOME_SYNC_TARGET
- * P_BECOME_SYNC_SOURCE
- * P_UNPLUG_REMOTE
- */
+struct p_header100 {
+ u32 magic;
+ u16 volume;
+ u16 command;
+ u32 length;
+ u32 pad;
+} __packed;
-/*
- * commands with out-of-struct payload:
- * P_BITMAP (no additional fields)
- * P_DATA, P_DATA_REPLY (see p_data)
- * P_COMPRESSED_BITMAP (see receive_compressed_bitmap)
- */
+extern unsigned int drbd_header_size(struct drbd_tconn *tconn);
/* these defines must not be changed without changing the protocol version */
#define DP_HARDBARRIER 1 /* depricated */
@@ -377,9 +320,10 @@ union p_header {
#define DP_FUA 16 /* equals REQ_FUA */
#define DP_FLUSH 32 /* equals REQ_FLUSH */
#define DP_DISCARD 64 /* equals REQ_DISCARD */
+#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */
+#define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */
struct p_data {
- union p_header head;
u64 sector; /* 64 bits sector number */
u64 block_id; /* to identify the request in protocol B&C */
u32 seq_num;
@@ -390,21 +334,18 @@ struct p_data {
* commands which share a struct:
* p_block_ack:
* P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
- * P_DISCARD_ACK (proto C, two-primaries conflict detection)
+ * P_SUPERSEDED (proto C, two-primaries conflict detection)
* p_block_req:
* P_DATA_REQUEST, P_RS_DATA_REQUEST
*/
struct p_block_ack {
- struct p_header80 head;
u64 sector;
u64 block_id;
u32 blksize;
u32 seq_num;
} __packed;
-
struct p_block_req {
- struct p_header80 head;
u64 sector;
u64 block_id;
u32 blksize;
@@ -413,59 +354,52 @@ struct p_block_req {
/*
* commands with their own struct for additional fields:
- * P_HAND_SHAKE
+ * P_CONNECTION_FEATURES
* P_BARRIER
* P_BARRIER_ACK
* P_SYNC_PARAM
* ReportParams
*/
-struct p_handshake {
- struct p_header80 head; /* 8 bytes */
+struct p_connection_features {
u32 protocol_min;
u32 feature_flags;
u32 protocol_max;
/* should be more than enough for future enhancements
- * for now, feature_flags and the reserverd array shall be zero.
+ * for now, feature_flags and the reserved array shall be zero.
*/
u32 _pad;
- u64 reserverd[7];
+ u64 reserved[7];
} __packed;
-/* 80 bytes, FIXED for the next century */
struct p_barrier {
- struct p_header80 head;
u32 barrier; /* barrier number _handle_ only */
u32 pad; /* to multiple of 8 Byte */
} __packed;
struct p_barrier_ack {
- struct p_header80 head;
u32 barrier;
u32 set_size;
} __packed;
struct p_rs_param {
- struct p_header80 head;
- u32 rate;
+ u32 resync_rate;
/* Since protocol version 88 and higher. */
char verify_alg[0];
} __packed;
struct p_rs_param_89 {
- struct p_header80 head;
- u32 rate;
+ u32 resync_rate;
/* protocol version 89: */
char verify_alg[SHARED_SECRET_MAX];
char csums_alg[SHARED_SECRET_MAX];
} __packed;
struct p_rs_param_95 {
- struct p_header80 head;
- u32 rate;
+ u32 resync_rate;
char verify_alg[SHARED_SECRET_MAX];
char csums_alg[SHARED_SECRET_MAX];
u32 c_plan_ahead;
@@ -475,12 +409,11 @@ struct p_rs_param_95 {
} __packed;
enum drbd_conn_flags {
- CF_WANT_LOSE = 1,
+ CF_DISCARD_MY_DATA = 1,
CF_DRY_RUN = 2,
};
struct p_protocol {
- struct p_header80 head;
u32 protocol;
u32 after_sb_0p;
u32 after_sb_1p;
@@ -494,17 +427,14 @@ struct p_protocol {
} __packed;
struct p_uuids {
- struct p_header80 head;
u64 uuid[UI_EXTENDED_SIZE];
} __packed;
struct p_rs_uuid {
- struct p_header80 head;
u64 uuid;
} __packed;
struct p_sizes {
- struct p_header80 head;
u64 d_size; /* size of disk */
u64 u_size; /* user requested size */
u64 c_size; /* current exported size */
@@ -514,18 +444,15 @@ struct p_sizes {
} __packed;
struct p_state {
- struct p_header80 head;
u32 state;
} __packed;
struct p_req_state {
- struct p_header80 head;
u32 mask;
u32 val;
} __packed;
struct p_req_state_reply {
- struct p_header80 head;
u32 retcode;
} __packed;
@@ -539,15 +466,7 @@ struct p_drbd06_param {
u32 bit_map_gen[5];
} __packed;
-struct p_discard {
- struct p_header80 head;
- u64 block_id;
- u32 seq_num;
- u32 pad;
-} __packed;
-
struct p_block_desc {
- struct p_header80 head;
u64 sector;
u32 blksize;
u32 pad; /* to multiple of 8 Byte */
@@ -563,7 +482,6 @@ enum drbd_bitmap_code {
};
struct p_compressed_bm {
- struct p_header80 head;
/* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
* (encoding & 0x80): polarity (set/unset) of first runlength
* ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
@@ -575,90 +493,22 @@ struct p_compressed_bm {
} __packed;
struct p_delay_probe93 {
- struct p_header80 head;
u32 seq_num; /* sequence number to match the two probe packets */
u32 offset; /* usecs the probe got sent after the reference time point */
} __packed;
-/* DCBP: Drbd Compressed Bitmap Packet ... */
-static inline enum drbd_bitmap_code
-DCBP_get_code(struct p_compressed_bm *p)
-{
- return (enum drbd_bitmap_code)(p->encoding & 0x0f);
-}
-
-static inline void
-DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
-{
- BUG_ON(code & ~0xf);
- p->encoding = (p->encoding & ~0xf) | code;
-}
-
-static inline int
-DCBP_get_start(struct p_compressed_bm *p)
-{
- return (p->encoding & 0x80) != 0;
-}
-
-static inline void
-DCBP_set_start(struct p_compressed_bm *p, int set)
-{
- p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
-}
-
-static inline int
-DCBP_get_pad_bits(struct p_compressed_bm *p)
-{
- return (p->encoding >> 4) & 0x7;
-}
-
-static inline void
-DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
-{
- BUG_ON(n & ~0x7);
- p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
-}
-
-/* one bitmap packet, including the p_header,
- * should fit within one _architecture independend_ page.
- * so we need to use the fixed size 4KiB page size
- * most architectures have used for a long time.
+/*
+ * Bitmap packets need to fit within a single page on the sender and receiver,
+ * so we are limited to 4 KiB (and not to PAGE_SIZE, which can be bigger).
*/
-#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80))
-#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
-#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
-#if (PAGE_SIZE < 4096)
-/* drbd_send_bitmap / receive_bitmap would break horribly */
-#error "PAGE_SIZE too small"
-#endif
-
-union p_polymorph {
- union p_header header;
- struct p_handshake handshake;
- struct p_data data;
- struct p_block_ack block_ack;
- struct p_barrier barrier;
- struct p_barrier_ack barrier_ack;
- struct p_rs_param_89 rs_param_89;
- struct p_rs_param_95 rs_param_95;
- struct p_protocol protocol;
- struct p_sizes sizes;
- struct p_uuids uuids;
- struct p_state state;
- struct p_req_state req_state;
- struct p_req_state_reply req_state_reply;
- struct p_block_req block_req;
- struct p_delay_probe93 delay_probe93;
- struct p_rs_uuid rs_uuid;
- struct p_block_desc block_desc;
-} __packed;
+#define DRBD_SOCKET_BUFFER_SIZE 4096
/**********************************************************************/
enum drbd_thread_state {
- None,
- Running,
- Exiting,
- Restarting
+ NONE,
+ RUNNING,
+ EXITING,
+ RESTARTING
};
struct drbd_thread {
@@ -667,8 +517,9 @@ struct drbd_thread {
struct completion stop;
enum drbd_thread_state t_state;
int (*function) (struct drbd_thread *);
- struct drbd_conf *mdev;
+ struct drbd_tconn *tconn;
int reset_cpu_mask;
+ char name[9];
};
static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
@@ -681,58 +532,54 @@ static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
return thi->t_state;
}
-struct drbd_work;
-typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel);
struct drbd_work {
struct list_head list;
- drbd_work_cb cb;
+ int (*cb)(struct drbd_work *, int cancel);
+ union {
+ struct drbd_conf *mdev;
+ struct drbd_tconn *tconn;
+ };
};
-struct drbd_tl_epoch;
+#include "drbd_interval.h"
+
+extern int drbd_wait_misc(struct drbd_conf *, struct drbd_interval *);
+
struct drbd_request {
struct drbd_work w;
- struct drbd_conf *mdev;
/* if local IO is not allowed, will be NULL.
* if local IO _is_ allowed, holds the locally submitted bio clone,
* or, after local IO completion, the ERR_PTR(error).
- * see drbd_endio_pri(). */
+ * see drbd_request_endio(). */
struct bio *private_bio;
- struct hlist_node collision;
- sector_t sector;
- unsigned int size;
- unsigned int epoch; /* barrier_nr */
+ struct drbd_interval i;
- /* barrier_nr: used to check on "completion" whether this req was in
+ /* epoch: used to check on "completion" whether this req was in
* the current epoch, and we therefore have to close it,
- * starting a new epoch...
+ * causing a p_barrier packet to be send, starting a new epoch.
+ *
+ * This corresponds to "barrier" in struct p_barrier[_ack],
+ * and to "barrier_nr" in struct drbd_epoch (and various
+ * comments/function parameters/local variable names).
*/
+ unsigned int epoch;
struct list_head tl_requests; /* ring list in the transfer log */
struct bio *master_bio; /* master bio pointer */
- unsigned long rq_state; /* see comments above _req_mod() */
unsigned long start_time;
-};
-struct drbd_tl_epoch {
- struct drbd_work w;
- struct list_head requests; /* requests before */
- struct drbd_tl_epoch *next; /* pointer to the next barrier */
- unsigned int br_number; /* the barriers identifier. */
- int n_writes; /* number of requests attached before this barrier */
-};
+ /* once it hits 0, we may complete the master_bio */
+ atomic_t completion_ref;
+ /* once it hits 0, we may destroy this drbd_request object */
+ struct kref kref;
-struct drbd_request;
-
-/* These Tl_epoch_entries may be in one of 6 lists:
- active_ee .. data packet being written
- sync_ee .. syncer block being written
- done_ee .. block written, need to send P_WRITE_ACK
- read_ee .. [RS]P_DATA_REQUEST being read
-*/
+ unsigned rq_state; /* see comments above _req_mod() */
+};
struct drbd_epoch {
+ struct drbd_tconn *tconn;
struct list_head list;
unsigned int barrier_nr;
atomic_t epoch_size; /* increased on every request added. */
@@ -762,17 +609,14 @@ struct digest_info {
void *digest;
};
-struct drbd_epoch_entry {
+struct drbd_peer_request {
struct drbd_work w;
- struct hlist_node collision;
struct drbd_epoch *epoch; /* for writes */
- struct drbd_conf *mdev;
struct page *pages;
atomic_t pending_bios;
- unsigned int size;
+ struct drbd_interval i;
/* see comments on ee flag bits below */
unsigned long flags;
- sector_t sector;
union {
u64 block_id;
struct digest_info *digest;
@@ -793,31 +637,37 @@ enum {
* we need to resubmit without the barrier flag. */
__EE_RESUBMITTED,
- /* we may have several bios per epoch entry.
+ /* we may have several bios per peer request.
* if any of those fail, we set this flag atomically
* from the endio callback */
__EE_WAS_ERROR,
/* This ee has a pointer to a digest instead of a block id */
__EE_HAS_DIGEST,
+
+ /* Conflicting local requests need to be restarted after this request */
+ __EE_RESTART_REQUESTS,
+
+ /* The peer wants a write ACK for this (wire proto C) */
+ __EE_SEND_WRITE_ACK,
+
+ /* Is set when net_conf had two_primaries set while creating this peer_req */
+ __EE_IN_INTERVAL_TREE,
};
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
+#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS)
+#define EE_SEND_WRITE_ACK (1<<__EE_SEND_WRITE_ACK)
+#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE)
-/* global flag bits */
-enum drbd_flag {
- CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */
- SIGNAL_ASENDER, /* whether asender wants to be interrupted */
- SEND_PING, /* whether asender should send a ping asap */
-
+/* flag bits per mdev */
+enum {
UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */
MD_DIRTY, /* current uuids and flags not yet on disk */
- DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */
USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */
- CLUSTER_ST_CHANGE, /* Cluster wide state change going on... */
CL_ST_CHG_SUCCESS,
CL_ST_CHG_FAIL,
CRASHED_PRIMARY, /* This node was a crashed primary.
@@ -835,33 +685,14 @@ enum drbd_flag {
WAS_READ_ERROR, /* Local disk READ failed (set additionally to the above) */
FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */
RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
- NET_CONGESTED, /* The data socket is congested */
-
- CONFIG_PENDING, /* serialization of (re)configuration requests.
- * if set, also prevents the device from dying */
- DEVICE_DYING, /* device became unconfigured,
- * but worker thread is still handling the cleanup.
- * reconfiguring (nl_disk_conf, nl_net_conf) is dissalowed,
- * while this is set. */
RESIZE_PENDING, /* Size change detected locally, waiting for the response from
* the peer, if it changed there as well. */
- CONN_DRY_RUN, /* Expect disconnect after resync handshake. */
- GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */
NEW_CUR_UUID, /* Create new current UUID when thawing IO */
AL_SUSPENDED, /* Activity logging is currently suspended. */
AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */
- STATE_SENT, /* Do not change state/UUIDs while this is set */
-
- CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC)
- * pending, from drbd worker context.
- * If set, bdi_write_congested() returns true,
- * so shrink_page_list() would not recurse into,
- * and potentially deadlock on, this drbd worker.
- */
- DISCONNECT_SENT, /* Currently the last bit in this 32bit word */
-
- /* keep last */
- DRBD_N_FLAGS,
+ B_RS_H_DONE, /* Before resync handler done (already executed) */
+ DISCARD_MY_DATA, /* discard_my_data flag per volume */
+ READ_BALANCE_RR,
};
struct drbd_bitmap; /* opaque for drbd_conf */
@@ -899,18 +730,17 @@ enum bm_flag {
struct drbd_work_queue {
struct list_head q;
- struct semaphore s; /* producers up it, worker down()s it */
spinlock_t q_lock; /* to protect the list. */
+ wait_queue_head_t q_wait;
};
struct drbd_socket {
- struct drbd_work_queue work;
struct mutex mutex;
struct socket *socket;
/* this way we get our
* send/receive buffers off the stack */
- union p_polymorph sbuf;
- union p_polymorph rbuf;
+ void *sbuf;
+ void *rbuf;
};
struct drbd_md {
@@ -927,24 +757,16 @@ struct drbd_md {
s32 bm_offset; /* signed relative sector offset to bitmap */
/* u32 al_nr_extents; important for restoring the AL
- * is stored into sync_conf.al_extents, which in turn
+ * is stored into ldev->dc.al_extents, which in turn
* gets applied to act_log->nr_elements
*/
};
-/* for sync_conf and other types... */
-#define NL_PACKET(name, number, fields) struct name { fields };
-#define NL_INTEGER(pn,pr,member) int member;
-#define NL_INT64(pn,pr,member) __u64 member;
-#define NL_BIT(pn,pr,member) unsigned member:1;
-#define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len;
-#include <linux/drbd_nl.h>
-
struct drbd_backing_dev {
struct block_device *backing_bdev;
struct block_device *md_bdev;
struct drbd_md md;
- struct disk_conf dc; /* The user provided config... */
+ struct disk_conf *disk_conf; /* RCU, for updates: mdev->tconn->conf_update */
sector_t known_size; /* last known size of that backing device */
};
@@ -968,17 +790,116 @@ enum write_ordering_e {
};
struct fifo_buffer {
- int *values;
unsigned int head_index;
unsigned int size;
+ int total; /* sum of all values */
+ int values[0];
+};
+extern struct fifo_buffer *fifo_alloc(int fifo_size);
+
+/* flag bits per tconn */
+enum {
+ NET_CONGESTED, /* The data socket is congested */
+ RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */
+ SEND_PING, /* whether asender should send a ping asap */
+ SIGNAL_ASENDER, /* whether asender wants to be interrupted */
+ GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */
+ CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */
+ CONN_WD_ST_CHG_OKAY,
+ CONN_WD_ST_CHG_FAIL,
+ CONN_DRY_RUN, /* Expect disconnect after resync handshake. */
+ CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */
+ STATE_SENT, /* Do not change state/UUIDs while this is set */
+ CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC)
+ * pending, from drbd worker context.
+ * If set, bdi_write_congested() returns true,
+ * so shrink_page_list() would not recurse into,
+ * and potentially deadlock on, this drbd worker.
+ */
+ DISCONNECT_SENT,
+};
+
+struct drbd_tconn { /* is a resource from the config file */
+ char *name; /* Resource name */
+ struct list_head all_tconn; /* linked on global drbd_tconns */
+ struct kref kref;
+ struct idr volumes; /* <tconn, vnr> to mdev mapping */
+ enum drbd_conns cstate; /* Only C_STANDALONE to C_WF_REPORT_PARAMS */
+ unsigned susp:1; /* IO suspended by user */
+ unsigned susp_nod:1; /* IO suspended because no data */
+ unsigned susp_fen:1; /* IO suspended because fence peer handler runs */
+ struct mutex cstate_mutex; /* Protects graceful disconnects */
+
+ unsigned long flags;
+ struct net_conf *net_conf; /* content protected by rcu */
+ struct mutex conf_update; /* mutex for ready-copy-update of net_conf and disk_conf */
+ wait_queue_head_t ping_wait; /* Woken upon reception of a ping, and a state change */
+ struct res_opts res_opts;
+
+ struct sockaddr_storage my_addr;
+ int my_addr_len;
+ struct sockaddr_storage peer_addr;
+ int peer_addr_len;
+
+ struct drbd_socket data; /* data/barrier/cstate/parameter packets */
+ struct drbd_socket meta; /* ping/ack (metadata) packets */
+ int agreed_pro_version; /* actually used protocol version */
+ unsigned long last_received; /* in jiffies, either socket */
+ unsigned int ko_count;
+
+ spinlock_t req_lock;
+
+ struct list_head transfer_log; /* all requests not yet fully processed */
+
+ struct crypto_hash *cram_hmac_tfm;
+ struct crypto_hash *integrity_tfm; /* checksums we compute, updates protected by tconn->data->mutex */
+ struct crypto_hash *peer_integrity_tfm; /* checksums we verify, only accessed from receiver thread */
+ struct crypto_hash *csums_tfm;
+ struct crypto_hash *verify_tfm;
+ void *int_dig_in;
+ void *int_dig_vv;
+
+ /* receiver side */
+ struct drbd_epoch *current_epoch;
+ spinlock_t epoch_lock;
+ unsigned int epochs;
+ enum write_ordering_e write_ordering;
+ atomic_t current_tle_nr; /* transfer log epoch number */
+ unsigned current_tle_writes; /* writes seen within this tl epoch */
+
+ unsigned long last_reconnect_jif;
+ struct drbd_thread receiver;
+ struct drbd_thread worker;
+ struct drbd_thread asender;
+ cpumask_var_t cpu_mask;
+
+ /* sender side */
+ struct drbd_work_queue sender_work;
+
+ struct {
+ /* whether this sender thread
+ * has processed a single write yet. */
+ bool seen_any_write_yet;
+
+ /* Which barrier number to send with the next P_BARRIER */
+ int current_epoch_nr;
+
+ /* how many write requests have been sent
+ * with req->epoch == current_epoch_nr.
+ * If none, no P_BARRIER will be sent. */
+ unsigned current_epoch_writes;
+ } send;
};
struct drbd_conf {
- unsigned long drbd_flags[(DRBD_N_FLAGS + BITS_PER_LONG -1)/BITS_PER_LONG];
+ struct drbd_tconn *tconn;
+ int vnr; /* volume number within the connection */
+ struct kref kref;
+
+ /* things that are stored as / read from meta data on disk */
+ unsigned long flags;
/* configured by drbdsetup */
- struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */
- struct syncer_conf sync_conf;
struct drbd_backing_dev *ldev __protected_by(local);
sector_t p_size; /* partner's disk size */
@@ -986,11 +907,7 @@ struct drbd_conf {
struct block_device *this_bdev;
struct gendisk *vdisk;
- struct drbd_socket data; /* data/barrier/cstate/parameter packets */
- struct drbd_socket meta; /* ping/ack (metadata) packets */
- int agreed_pro_version; /* actually used protocol version */
- unsigned long last_received; /* in jiffies, either socket */
- unsigned int ko_count;
+ unsigned long last_reattach_jif;
struct drbd_work resync_work,
unplug_work,
go_diskless,
@@ -1010,10 +927,9 @@ struct drbd_conf {
/* Used after attach while negotiating new disk state. */
union drbd_state new_state_tmp;
- union drbd_state state;
+ union drbd_dev_state state;
wait_queue_head_t misc_wait;
wait_queue_head_t state_wait; /* upon each state change. */
- wait_queue_head_t net_cnt_wait;
unsigned int send_cnt;
unsigned int recv_cnt;
unsigned int read_cnt;
@@ -1023,17 +939,12 @@ struct drbd_conf {
atomic_t ap_bio_cnt; /* Requests we need to complete */
atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
- atomic_t unacked_cnt; /* Need to send replys for */
+ atomic_t unacked_cnt; /* Need to send replies for */
atomic_t local_cnt; /* Waiting for local completion */
- atomic_t net_cnt; /* Users of net_conf */
- spinlock_t req_lock;
- struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */
- struct drbd_tl_epoch *newest_tle;
- struct drbd_tl_epoch *oldest_tle;
- struct list_head out_of_sequence_requests;
- struct list_head barrier_acked_requests;
- struct hlist_head *tl_hash;
- unsigned int tl_hash_s;
+
+ /* Interval tree of pending local requests */
+ struct rb_root read_requests;
+ struct rb_root write_requests;
/* blocks to resync in this run [unit BM_BLOCK_SIZE] */
unsigned long rs_total;
@@ -1053,6 +964,7 @@ struct drbd_conf {
unsigned long rs_mark_time[DRBD_SYNC_MARKS];
/* current index into rs_mark_{left,time} */
int rs_last_mark;
+ unsigned long rs_last_bcast; /* [unit jiffies] */
/* where does the admin want us to start? (sector) */
sector_t ov_start_sector;
@@ -1064,14 +976,7 @@ struct drbd_conf {
/* size of out-of-sync range in sectors. */
sector_t ov_last_oos_size;
unsigned long ov_left; /* in bits */
- struct crypto_hash *csums_tfm;
- struct crypto_hash *verify_tfm;
- unsigned long last_reattach_jif;
- unsigned long last_reconnect_jif;
- struct drbd_thread receiver;
- struct drbd_thread worker;
- struct drbd_thread asender;
struct drbd_bitmap *bitmap;
unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */
@@ -1084,29 +989,19 @@ struct drbd_conf {
int open_cnt;
u64 *p_uuid;
- struct drbd_epoch *current_epoch;
- spinlock_t epoch_lock;
- unsigned int epochs;
- enum write_ordering_e write_ordering;
+
struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */
struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
- struct list_head done_ee; /* send ack */
- struct list_head read_ee; /* IO in progress (any read) */
+ struct list_head done_ee; /* need to send P_WRITE_ACK */
+ struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */
struct list_head net_ee; /* zero-copy network send in progress */
- struct hlist_head *ee_hash; /* is proteced by req_lock! */
- unsigned int ee_hash_s;
-
- /* this one is protected by ee_lock, single thread */
- struct drbd_epoch_entry *last_write_w_barrier;
int next_barrier_nr;
- struct hlist_head *app_reads_hash; /* is proteced by req_lock */
struct list_head resync_reads;
atomic_t pp_in_use; /* allocated from page pool */
atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */
wait_queue_head_t ee_wait;
struct page *md_io_page; /* one page buffer for md_io */
- struct page *md_io_tmpp; /* for logical_block_size != 512 */
struct drbd_md_io md_io;
atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */
spinlock_t al_lock;
@@ -1115,22 +1010,16 @@ struct drbd_conf {
unsigned int al_tr_number;
int al_tr_cycle;
int al_tr_pos; /* position of the next transaction in the journal */
- struct crypto_hash *cram_hmac_tfm;
- struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */
- struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */
- void *int_dig_out;
- void *int_dig_in;
- void *int_dig_vv;
wait_queue_head_t seq_wait;
atomic_t packet_seq;
unsigned int peer_seq;
spinlock_t peer_seq_lock;
unsigned int minor;
unsigned long comm_bm_set; /* communicated number of set bits. */
- cpumask_var_t cpu_mask;
struct bm_io_work bm_io_work;
u64 ed_uuid; /* UUID of the exposed data */
- struct mutex state_mutex;
+ struct mutex own_state_mutex;
+ struct mutex *state_mutex; /* either own_state_mutex or mdev->tconn->cstate_mutex */
char congestion_reason; /* Why we where congested... */
atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
atomic_t rs_sect_ev; /* for submitted resync data rate, both */
@@ -1138,46 +1027,16 @@ struct drbd_conf {
int rs_last_events; /* counter of read or write "events" (unit sectors)
* on the lower level device when we last looked. */
int c_sync_rate; /* current resync rate after syncer throttle magic */
- struct fifo_buffer rs_plan_s; /* correction values of resync planer */
+ struct fifo_buffer *rs_plan_s; /* correction values of resync planer (RCU, tconn->conn_update) */
int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
- int rs_planed; /* resync sectors already planned */
atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
unsigned int peer_max_bio_size;
unsigned int local_max_bio_size;
};
-static inline void drbd_set_flag(struct drbd_conf *mdev, enum drbd_flag f)
-{
- set_bit(f, &mdev->drbd_flags[0]);
-}
-
-static inline void drbd_clear_flag(struct drbd_conf *mdev, enum drbd_flag f)
-{
- clear_bit(f, &mdev->drbd_flags[0]);
-}
-
-static inline int drbd_test_flag(struct drbd_conf *mdev, enum drbd_flag f)
-{
- return test_bit(f, &mdev->drbd_flags[0]);
-}
-
-static inline int drbd_test_and_set_flag(struct drbd_conf *mdev, enum drbd_flag f)
-{
- return test_and_set_bit(f, &mdev->drbd_flags[0]);
-}
-
-static inline int drbd_test_and_clear_flag(struct drbd_conf *mdev, enum drbd_flag f)
-{
- return test_and_clear_bit(f, &mdev->drbd_flags[0]);
-}
-
static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
{
- struct drbd_conf *mdev;
-
- mdev = minor < minor_count ? minor_table[minor] : NULL;
-
- return mdev;
+ return (struct drbd_conf *)idr_find(&minors, minor);
}
static inline unsigned int mdev_to_minor(struct drbd_conf *mdev)
@@ -1185,29 +1044,9 @@ static inline unsigned int mdev_to_minor(struct drbd_conf *mdev)
return mdev->minor;
}
-/* returns 1 if it was successful,
- * returns 0 if there was no data socket.
- * so wherever you are going to use the data.socket, e.g. do
- * if (!drbd_get_data_sock(mdev))
- * return 0;
- * CODE();
- * drbd_put_data_sock(mdev);
- */
-static inline int drbd_get_data_sock(struct drbd_conf *mdev)
-{
- mutex_lock(&mdev->data.mutex);
- /* drbd_disconnect() could have called drbd_free_sock()
- * while we were waiting in down()... */
- if (unlikely(mdev->data.socket == NULL)) {
- mutex_unlock(&mdev->data.mutex);
- return 0;
- }
- return 1;
-}
-
-static inline void drbd_put_data_sock(struct drbd_conf *mdev)
+static inline struct drbd_conf *vnr_to_mdev(struct drbd_tconn *tconn, int vnr)
{
- mutex_unlock(&mdev->data.mutex);
+ return (struct drbd_conf *)idr_find(&tconn->volumes, vnr);
}
/*
@@ -1216,99 +1055,69 @@ static inline void drbd_put_data_sock(struct drbd_conf *mdev)
/* drbd_main.c */
-enum chg_state_flags {
- CS_HARD = 1,
- CS_VERBOSE = 2,
- CS_WAIT_COMPLETE = 4,
- CS_SERIALIZE = 8,
- CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE,
-};
-
enum dds_flags {
DDSF_FORCED = 1,
DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */
};
extern void drbd_init_set_defaults(struct drbd_conf *mdev);
-extern enum drbd_state_rv drbd_change_state(struct drbd_conf *mdev,
- enum chg_state_flags f,
- union drbd_state mask,
- union drbd_state val);
-extern void drbd_force_state(struct drbd_conf *, union drbd_state,
- union drbd_state);
-extern enum drbd_state_rv _drbd_request_state(struct drbd_conf *,
- union drbd_state,
- union drbd_state,
- enum chg_state_flags);
-extern enum drbd_state_rv __drbd_set_state(struct drbd_conf *, union drbd_state,
- enum chg_state_flags,
- struct completion *done);
-extern void print_st_err(struct drbd_conf *, union drbd_state,
- union drbd_state, int);
extern int drbd_thread_start(struct drbd_thread *thi);
extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait);
+extern char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task);
#ifdef CONFIG_SMP
-extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev);
-extern void drbd_calc_cpu_mask(struct drbd_conf *mdev);
+extern void drbd_thread_current_set_cpu(struct drbd_thread *thi);
+extern void drbd_calc_cpu_mask(struct drbd_tconn *tconn);
#else
#define drbd_thread_current_set_cpu(A) ({})
#define drbd_calc_cpu_mask(A) ({})
#endif
-extern void drbd_free_resources(struct drbd_conf *mdev);
-extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
+extern void tl_release(struct drbd_tconn *, unsigned int barrier_nr,
unsigned int set_size);
-extern void tl_clear(struct drbd_conf *mdev);
-extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
-extern void drbd_free_sock(struct drbd_conf *mdev);
-extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
- void *buf, size_t size, unsigned msg_flags);
-extern int drbd_send_protocol(struct drbd_conf *mdev);
+extern void tl_clear(struct drbd_tconn *);
+extern void drbd_free_sock(struct drbd_tconn *tconn);
+extern int drbd_send(struct drbd_tconn *tconn, struct socket *sock,
+ void *buf, size_t size, unsigned msg_flags);
+extern int drbd_send_all(struct drbd_tconn *, struct socket *, void *, size_t,
+ unsigned);
+
+extern int __drbd_send_protocol(struct drbd_tconn *tconn, enum drbd_packet cmd);
+extern int drbd_send_protocol(struct drbd_tconn *tconn);
extern int drbd_send_uuids(struct drbd_conf *mdev);
extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
-extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev);
+extern void drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev);
extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags);
extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s);
extern int drbd_send_current_state(struct drbd_conf *mdev);
-extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
- enum drbd_packets cmd, struct p_header80 *h,
- size_t size, unsigned msg_flags);
-#define USE_DATA_SOCKET 1
-#define USE_META_SOCKET 0
-extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
- enum drbd_packets cmd, struct p_header80 *h,
- size_t size);
-extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
- char *data, size_t size);
-extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc);
-extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr,
- u32 set_size);
-extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct drbd_epoch_entry *e);
-extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct p_block_req *rp);
-extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct p_data *dp, int data_size);
-extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
+extern int drbd_send_sync_param(struct drbd_conf *mdev);
+extern void drbd_send_b_ack(struct drbd_tconn *tconn, u32 barrier_nr,
+ u32 set_size);
+extern int drbd_send_ack(struct drbd_conf *, enum drbd_packet,
+ struct drbd_peer_request *);
+extern void drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd,
+ struct p_block_req *rp);
+extern void drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd,
+ struct p_data *dp, int data_size);
+extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd,
sector_t sector, int blksize, u64 block_id);
-extern int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req);
-extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct drbd_epoch_entry *e);
+extern int drbd_send_out_of_sync(struct drbd_conf *, struct drbd_request *);
+extern int drbd_send_block(struct drbd_conf *, enum drbd_packet,
+ struct drbd_peer_request *);
extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req);
extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
sector_t sector, int size, u64 block_id);
-extern int drbd_send_drequest_csum(struct drbd_conf *mdev,
- sector_t sector,int size,
- void *digest, int digest_size,
- enum drbd_packets cmd);
+extern int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector,
+ int size, void *digest, int digest_size,
+ enum drbd_packet cmd);
extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size);
extern int drbd_send_bitmap(struct drbd_conf *mdev);
-extern int _drbd_send_bitmap(struct drbd_conf *mdev);
-extern int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode);
+extern void drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode);
+extern void conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode);
extern void drbd_free_bc(struct drbd_backing_dev *ldev);
extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
void drbd_print_uuids(struct drbd_conf *mdev, const char *text);
+extern void conn_md_sync(struct drbd_tconn *tconn);
extern void drbd_md_sync(struct drbd_conf *mdev);
extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
@@ -1334,33 +1143,52 @@ extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
extern int drbd_bitmap_io(struct drbd_conf *mdev,
int (*io_fn)(struct drbd_conf *),
char *why, enum bm_flag flags);
+extern int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
+ int (*io_fn)(struct drbd_conf *),
+ char *why, enum bm_flag flags);
extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
extern void drbd_go_diskless(struct drbd_conf *mdev);
extern void drbd_ldev_destroy(struct drbd_conf *mdev);
-
/* Meta data layout
We reserve a 128MB Block (4k aligned)
* either at the end of the backing device
* or on a separate meta data device. */
-#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */
/* The following numbers are sectors */
-#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */
-#define MD_AL_MAX_SIZE 64 /* = 32 kb LOG ~ 3776 extents ~ 14 GB Storage */
-/* Allows up to about 3.8TB */
-#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE)
-
-/* Since the smalles IO unit is usually 512 byte */
-#define MD_SECTOR_SHIFT 9
-#define MD_SECTOR_SIZE (1<<MD_SECTOR_SHIFT)
-
-/* activity log */
-#define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */
-#define AL_EXTENT_SHIFT 22 /* One extent represents 4M Storage */
+/* Allows up to about 3.8TB, so if you want more,
+ * you need to use the "flexible" meta data format. */
+#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */
+#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */
+#define MD_AL_SECTORS 64 /* = 32 kB on disk activity log ring buffer */
+#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS)
+
+/* we do all meta data IO in 4k blocks */
+#define MD_BLOCK_SHIFT 12
+#define MD_BLOCK_SIZE (1<<MD_BLOCK_SHIFT)
+
+/* One activity log extent represents 4M of storage */
+#define AL_EXTENT_SHIFT 22
#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
+/* We could make these currently hardcoded constants configurable
+ * variables at create-md time (or even re-configurable at runtime?).
+ * Which will require some more changes to the DRBD "super block"
+ * and attach code.
+ *
+ * updates per transaction:
+ * This many changes to the active set can be logged with one transaction.
+ * This number is arbitrary.
+ * context per transaction:
+ * This many context extent numbers are logged with each transaction.
+ * This number is resulting from the transaction block size (4k), the layout
+ * of the transaction header, and the number of updates per transaction.
+ * See drbd_actlog.c:struct al_transaction_on_disk
+ * */
+#define AL_UPDATES_PER_TRANSACTION 64 // arbitrary
+#define AL_CONTEXT_PER_TRANSACTION 919 // (4096 - 36 - 6*64)/4
+
#if BITS_PER_LONG == 32
#define LN2_BPL 5
#define cpu_to_lel(A) cpu_to_le32(A)
@@ -1396,11 +1224,14 @@ struct bm_extent {
#define SLEEP_TIME (HZ/10)
-#define BM_BLOCK_SHIFT 12 /* 4k per bit */
+/* We do bitmap IO in units of 4k blocks.
+ * We also still have a hardcoded 4k per bit relation. */
+#define BM_BLOCK_SHIFT 12 /* 4k per bit */
#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT)
-/* (9+3) : 512 bytes @ 8 bits; representing 16M storage
- * per sector of on disk bitmap */
-#define BM_EXT_SHIFT (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3) /* = 24 */
+/* mostly arbitrarily set the represented size of one bitmap extent,
+ * aka resync extent, to 16 MiB (which is also 512 Byte worth of bitmap
+ * at 4k per bit resolution) */
+#define BM_EXT_SHIFT 24 /* 16 MiB per resync extent */
#define BM_EXT_SIZE (1<<BM_EXT_SHIFT)
#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12)
@@ -1468,17 +1299,20 @@ struct bm_extent {
#endif
#endif
-/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
- * With a value of 8 all IO in one 128K block make it to the same slot of the
- * hash table. */
-#define HT_SHIFT 8
-#define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT))
+/* BIO_MAX_SIZE is 256 * PAGE_CACHE_SIZE,
+ * so for typical PAGE_CACHE_SIZE of 4k, that is (1<<20) Byte.
+ * Since we may live in a mixed-platform cluster,
+ * we limit us to a platform agnostic constant here for now.
+ * A followup commit may allow even bigger BIO sizes,
+ * once we thought that through. */
+#define DRBD_MAX_BIO_SIZE (1U << 20)
+#if DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+#error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+#endif
#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */
-#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* The old header only allows packets up to 32Kib data */
-
-/* Number of elements in the app_reads_hash */
-#define APP_R_HSIZE 15
+#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */
+#define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
extern int drbd_bm_init(struct drbd_conf *mdev);
extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits);
@@ -1500,11 +1334,11 @@ extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr);
extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
+extern void drbd_bm_mark_for_writeout(struct drbd_conf *mdev, int page_nr);
extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
+extern int drbd_bm_write_hinted(struct drbd_conf *mdev) __must_hold(local);
extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local);
extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local);
-extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
- unsigned long al_enr);
extern size_t drbd_bm_words(struct drbd_conf *mdev);
extern unsigned long drbd_bm_bits(struct drbd_conf *mdev);
extern sector_t drbd_bm_capacity(struct drbd_conf *mdev);
@@ -1529,7 +1363,7 @@ extern void drbd_bm_unlock(struct drbd_conf *mdev);
/* drbd_main.c */
extern struct kmem_cache *drbd_request_cache;
-extern struct kmem_cache *drbd_ee_cache; /* epoch entries */
+extern struct kmem_cache *drbd_ee_cache; /* peer requests */
extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
extern mempool_t *drbd_request_mempool;
@@ -1569,12 +1403,22 @@ extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
extern rwlock_t global_state_lock;
-extern struct drbd_conf *drbd_new_device(unsigned int minor);
-extern void drbd_free_mdev(struct drbd_conf *mdev);
+extern int conn_lowest_minor(struct drbd_tconn *tconn);
+enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr);
+extern void drbd_minor_destroy(struct kref *kref);
+
+extern int set_resource_options(struct drbd_tconn *tconn, struct res_opts *res_opts);
+extern struct drbd_tconn *conn_create(const char *name, struct res_opts *res_opts);
+extern void conn_destroy(struct kref *kref);
+struct drbd_tconn *conn_get_by_name(const char *name);
+extern struct drbd_tconn *conn_get_by_addrs(void *my_addr, int my_addr_len,
+ void *peer_addr, int peer_addr_len);
+extern void conn_free_crypto(struct drbd_tconn *tconn);
extern int proc_details;
/* drbd_req */
+extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long);
extern void drbd_make_request(struct request_queue *q, struct bio *bio);
extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
@@ -1582,10 +1426,11 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t);
/* drbd_nl.c */
+extern int drbd_msg_put_info(const char *info);
extern void drbd_suspend_io(struct drbd_conf *mdev);
extern void drbd_resume_io(struct drbd_conf *mdev);
extern char *ppsize(char *buf, unsigned long long size);
-extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int);
+extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, sector_t, int);
enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local);
extern void resync_after_online_grow(struct drbd_conf *);
@@ -1593,13 +1438,14 @@ extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev);
extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev,
enum drbd_role new_role,
int force);
-extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
-extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev);
+extern bool conn_try_outdate_peer(struct drbd_tconn *tconn);
+extern void conn_try_outdate_peer_async(struct drbd_tconn *tconn);
extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
/* drbd_worker.c */
extern int drbd_worker(struct drbd_thread *thi);
-extern int drbd_alter_sa(struct drbd_conf *mdev, int na);
+enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor);
+void drbd_resync_after_changed(struct drbd_conf *mdev);
extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side);
extern void resume_next_sg(struct drbd_conf *mdev);
extern void suspend_other_sg(struct drbd_conf *mdev);
@@ -1608,13 +1454,13 @@ extern int drbd_resync_finished(struct drbd_conf *mdev);
extern void *drbd_md_get_buffer(struct drbd_conf *mdev);
extern void drbd_md_put_buffer(struct drbd_conf *mdev);
extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
- struct drbd_backing_dev *bdev, sector_t sector, int rw);
+ struct drbd_backing_dev *bdev, sector_t sector, int rw);
+extern void drbd_ov_out_of_sync_found(struct drbd_conf *, sector_t, int);
extern void wait_until_done_or_force_detached(struct drbd_conf *mdev,
struct drbd_backing_dev *bdev, unsigned int *done);
-extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
extern void drbd_rs_controller_reset(struct drbd_conf *mdev);
-static inline void ov_oos_print(struct drbd_conf *mdev)
+static inline void ov_out_of_sync_print(struct drbd_conf *mdev)
{
if (mdev->ov_last_oos_size) {
dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n",
@@ -1626,97 +1472,102 @@ static inline void ov_oos_print(struct drbd_conf *mdev)
extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);
-extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, struct drbd_epoch_entry *, void *);
+extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *,
+ struct drbd_peer_request *, void *);
/* worker callbacks */
-extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
-extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
-extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int);
-extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int);
-extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int);
-extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int);
-extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
-extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
-extern int w_resync_timer(struct drbd_conf *, struct drbd_work *, int);
-extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int);
-extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int);
-extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
-extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
-extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
-extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
-extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
-extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int);
-extern int w_send_oos(struct drbd_conf *, struct drbd_work *, int);
-extern int w_start_resync(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_data_req(struct drbd_work *, int);
+extern int w_e_end_rsdata_req(struct drbd_work *, int);
+extern int w_e_end_csum_rs_req(struct drbd_work *, int);
+extern int w_e_end_ov_reply(struct drbd_work *, int);
+extern int w_e_end_ov_req(struct drbd_work *, int);
+extern int w_ov_finished(struct drbd_work *, int);
+extern int w_resync_timer(struct drbd_work *, int);
+extern int w_send_write_hint(struct drbd_work *, int);
+extern int w_make_resync_request(struct drbd_work *, int);
+extern int w_send_dblock(struct drbd_work *, int);
+extern int w_send_read_req(struct drbd_work *, int);
+extern int w_prev_work_done(struct drbd_work *, int);
+extern int w_e_reissue(struct drbd_work *, int);
+extern int w_restart_disk_io(struct drbd_work *, int);
+extern int w_send_out_of_sync(struct drbd_work *, int);
+extern int w_start_resync(struct drbd_work *, int);
extern void resync_timer_fn(unsigned long data);
extern void start_resync_timer_fn(unsigned long data);
/* drbd_receiver.c */
extern int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector);
-extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
- const unsigned rw, const int fault_type);
-extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
-extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
- u64 id,
- sector_t sector,
- unsigned int data_size,
- gfp_t gfp_mask) __must_hold(local);
-extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
- int is_net);
-#define drbd_free_ee(m,e) drbd_free_some_ee(m, e, 0)
-#define drbd_free_net_ee(m,e) drbd_free_some_ee(m, e, 1)
-extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
- struct list_head *head);
-extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
- struct list_head *head);
+extern int drbd_submit_peer_request(struct drbd_conf *,
+ struct drbd_peer_request *, const unsigned,
+ const int);
+extern int drbd_free_peer_reqs(struct drbd_conf *, struct list_head *);
+extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_conf *, u64,
+ sector_t, unsigned int,
+ gfp_t) __must_hold(local);
+extern void __drbd_free_peer_req(struct drbd_conf *, struct drbd_peer_request *,
+ int);
+#define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0)
+#define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1)
+extern struct page *drbd_alloc_pages(struct drbd_conf *, unsigned int, bool);
extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
-extern void drbd_flush_workqueue(struct drbd_conf *mdev);
-extern void drbd_free_tl_hash(struct drbd_conf *mdev);
+extern void conn_flush_workqueue(struct drbd_tconn *tconn);
+extern int drbd_connected(struct drbd_conf *mdev);
+static inline void drbd_flush_workqueue(struct drbd_conf *mdev)
+{
+ conn_flush_workqueue(mdev->tconn);
+}
-/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
- * mess with get_fs/set_fs, we know we are KERNEL_DS always. */
+/* Yes, there is kernel_setsockopt, but only since 2.6.18.
+ * So we have our own copy of it here. */
static inline int drbd_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int optlen)
+ char *optval, int optlen)
{
+ mm_segment_t oldfs = get_fs();
+ char __user *uoptval;
int err;
+
+ uoptval = (char __user __force *)optval;
+
+ set_fs(KERNEL_DS);
if (level == SOL_SOCKET)
- err = sock_setsockopt(sock, level, optname, optval, optlen);
+ err = sock_setsockopt(sock, level, optname, uoptval, optlen);
else
- err = sock->ops->setsockopt(sock, level, optname, optval,
+ err = sock->ops->setsockopt(sock, level, optname, uoptval,
optlen);
+ set_fs(oldfs);
return err;
}
static inline void drbd_tcp_cork(struct socket *sock)
{
- int __user val = 1;
+ int val = 1;
(void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
- (char __user *)&val, sizeof(val));
+ (char*)&val, sizeof(val));
}
static inline void drbd_tcp_uncork(struct socket *sock)
{
- int __user val = 0;
+ int val = 0;
(void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
- (char __user *)&val, sizeof(val));
+ (char*)&val, sizeof(val));
}
static inline void drbd_tcp_nodelay(struct socket *sock)
{
- int __user val = 1;
+ int val = 1;
(void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY,
- (char __user *)&val, sizeof(val));
+ (char*)&val, sizeof(val));
}
static inline void drbd_tcp_quickack(struct socket *sock)
{
- int __user val = 2;
+ int val = 2;
(void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
- (char __user *)&val, sizeof(val));
+ (char*)&val, sizeof(val));
}
-void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo);
+void drbd_bump_write_ordering(struct drbd_tconn *tconn, enum write_ordering_e wo);
/* drbd_proc.c */
extern struct proc_dir_entry *drbd_proc;
@@ -1725,8 +1576,8 @@ extern const char *drbd_conn_str(enum drbd_conns s);
extern const char *drbd_role_str(enum drbd_role s);
/* drbd_actlog.c */
-extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector);
-extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector);
+extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i);
+extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i);
extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector);
extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
@@ -1734,7 +1585,6 @@ extern void drbd_rs_cancel_all(struct drbd_conf *mdev);
extern int drbd_rs_del_all(struct drbd_conf *mdev);
extern void drbd_rs_failed_io(struct drbd_conf *mdev,
sector_t sector, int size);
-extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *);
extern void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go);
extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector,
int size, const char *file, const unsigned int line);
@@ -1744,73 +1594,24 @@ extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
int size, const char *file, const unsigned int line);
#define drbd_set_out_of_sync(mdev, sector, size) \
__drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
-extern void drbd_al_apply_to_bm(struct drbd_conf *mdev);
extern void drbd_al_shrink(struct drbd_conf *mdev);
-
/* drbd_nl.c */
-
-void drbd_nl_cleanup(void);
-int __init drbd_nl_init(void);
-void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state);
-void drbd_bcast_sync_progress(struct drbd_conf *mdev);
-void drbd_bcast_ee(struct drbd_conf *mdev,
- const char *reason, const int dgs,
- const char* seen_hash, const char* calc_hash,
- const struct drbd_epoch_entry* e);
-
-
-/**
- * DOC: DRBD State macros
- *
- * These macros are used to express state changes in easily readable form.
- *
- * The NS macros expand to a mask and a value, that can be bit ored onto the
- * current state as soon as the spinlock (req_lock) was taken.
- *
- * The _NS macros are used for state functions that get called with the
- * spinlock. These macros expand directly to the new state value.
- *
- * Besides the basic forms NS() and _NS() additional _?NS[23] are defined
- * to express state changes that affect more than one aspect of the state.
- *
- * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
- * Means that the network connection was established and that the peer
- * is in secondary role.
- */
-#define role_MASK R_MASK
-#define peer_MASK R_MASK
-#define disk_MASK D_MASK
-#define pdsk_MASK D_MASK
-#define conn_MASK C_MASK
-#define susp_MASK 1
-#define user_isp_MASK 1
-#define aftr_isp_MASK 1
-#define susp_nod_MASK 1
-#define susp_fen_MASK 1
-
-#define NS(T, S) \
- ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
- ({ union drbd_state val; val.i = 0; val.T = (S); val; })
-#define NS2(T1, S1, T2, S2) \
- ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
- mask.T2 = T2##_MASK; mask; }), \
- ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
- val.T2 = (S2); val; })
-#define NS3(T1, S1, T2, S2, T3, S3) \
- ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
- mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
- ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
- val.T2 = (S2); val.T3 = (S3); val; })
-
-#define _NS(D, T, S) \
- D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T = (S); __ns; })
-#define _NS2(D, T1, S1, T2, S2) \
- D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
- __ns.T2 = (S2); __ns; })
-#define _NS3(D, T1, S1, T2, S2, T3, S3) \
- D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
- __ns.T2 = (S2); __ns.T3 = (S3); __ns; })
+/* state info broadcast */
+struct sib_info {
+ enum drbd_state_info_bcast_reason sib_reason;
+ union {
+ struct {
+ char *helper_name;
+ unsigned helper_exit_code;
+ };
+ struct {
+ union drbd_state os;
+ union drbd_state ns;
+ };
+ };
+};
+void drbd_bcast_event(struct drbd_conf *mdev, const struct sib_info *sib);
/*
* inline helper functions
@@ -1827,9 +1628,10 @@ static inline struct page *page_chain_next(struct page *page)
#define page_chain_for_each_safe(page, n) \
for (; page && ({ n = page_chain_next(page); 1; }); page = n)
-static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
+
+static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req)
{
- struct page *page = e->pages;
+ struct page *page = peer_req->pages;
page_chain_for_each(page) {
if (page_count(page) > 1)
return 1;
@@ -1837,18 +1639,6 @@ static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
return 0;
}
-static inline void drbd_state_lock(struct drbd_conf *mdev)
-{
- wait_event(mdev->misc_wait,
- !drbd_test_and_set_flag(mdev, CLUSTER_ST_CHANGE));
-}
-
-static inline void drbd_state_unlock(struct drbd_conf *mdev)
-{
- drbd_clear_flag(mdev, CLUSTER_ST_CHANGE);
- wake_up(&mdev->misc_wait);
-}
-
static inline enum drbd_state_rv
_drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
enum chg_state_flags flags, struct completion *done)
@@ -1862,21 +1652,16 @@ _drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
return rv;
}
-/**
- * drbd_request_state() - Reqest a state change
- * @mdev: DRBD device.
- * @mask: mask of state bits to change.
- * @val: value of new state bits.
- *
- * This is the most graceful way of requesting a state change. It is verbose
- * quite verbose in case the state change is not possible, and all those
- * state changes are globally serialized.
- */
-static inline int drbd_request_state(struct drbd_conf *mdev,
- union drbd_state mask,
- union drbd_state val)
+static inline union drbd_state drbd_read_state(struct drbd_conf *mdev)
{
- return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
+ union drbd_state rv;
+
+ rv.i = mdev->state.i;
+ rv.susp = mdev->tconn->susp;
+ rv.susp_nod = mdev->tconn->susp_nod;
+ rv.susp_fen = mdev->tconn->susp_fen;
+
+ return rv;
}
enum drbd_force_detach_flags {
@@ -1891,8 +1676,13 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev,
enum drbd_force_detach_flags df,
const char *where)
{
- switch (mdev->ldev->dc.on_io_error) {
- case EP_PASS_ON:
+ enum drbd_io_error_p ep;
+
+ rcu_read_lock();
+ ep = rcu_dereference(mdev->ldev->disk_conf)->on_io_error;
+ rcu_read_unlock();
+ switch (ep) {
+ case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */
if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Local IO failed in %s.\n", where);
@@ -1923,11 +1713,11 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev,
* we read meta data only once during attach,
* which will fail in case of errors.
*/
- drbd_set_flag(mdev, WAS_IO_ERROR);
+ set_bit(WAS_IO_ERROR, &mdev->flags);
if (df == DRBD_READ_ERROR)
- drbd_set_flag(mdev, WAS_READ_ERROR);
+ set_bit(WAS_READ_ERROR, &mdev->flags);
if (df == DRBD_FORCE_DETACH)
- drbd_set_flag(mdev, FORCE_DETACH);
+ set_bit(FORCE_DETACH, &mdev->flags);
if (mdev->state.disk > D_FAILED) {
_drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
dev_err(DEV,
@@ -1951,9 +1741,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
{
if (error) {
unsigned long flags;
- spin_lock_irqsave(&mdev->req_lock, flags);
+ spin_lock_irqsave(&mdev->tconn->req_lock, flags);
__drbd_chk_io_error_(mdev, forcedetach, where);
- spin_unlock_irqrestore(&mdev->req_lock, flags);
+ spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
}
}
@@ -1965,9 +1755,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
* BTW, for internal meta data, this happens to be the maximum capacity
* we could agree upon with our peer node.
*/
-static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
+static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backing_dev *bdev)
{
- switch (bdev->dc.meta_dev_idx) {
+ switch (meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
return bdev->md.md_offset + bdev->md.bm_offset;
@@ -1977,13 +1767,30 @@ static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
}
}
+static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
+{
+ int meta_dev_idx;
+
+ rcu_read_lock();
+ meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
+ rcu_read_unlock();
+
+ return _drbd_md_first_sector(meta_dev_idx, bdev);
+}
+
/**
* drbd_md_last_sector() - Return the last sector number of the meta data area
* @bdev: Meta data block device.
*/
static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
{
- switch (bdev->dc.meta_dev_idx) {
+ int meta_dev_idx;
+
+ rcu_read_lock();
+ meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
+ rcu_read_unlock();
+
+ switch (meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
return bdev->md.md_offset + MD_AL_OFFSET - 1;
@@ -2011,12 +1818,18 @@ static inline sector_t drbd_get_capacity(struct block_device *bdev)
static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
{
sector_t s;
- switch (bdev->dc.meta_dev_idx) {
+ int meta_dev_idx;
+
+ rcu_read_lock();
+ meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
+ rcu_read_unlock();
+
+ switch (meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
s = drbd_get_capacity(bdev->backing_bdev)
? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
- drbd_md_first_sector(bdev))
+ _drbd_md_first_sector(meta_dev_idx, bdev))
: 0;
break;
case DRBD_MD_INDEX_FLEX_EXT:
@@ -2042,9 +1855,15 @@ static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
struct drbd_backing_dev *bdev)
{
- switch (bdev->dc.meta_dev_idx) {
+ int meta_dev_idx;
+
+ rcu_read_lock();
+ meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
+ rcu_read_unlock();
+
+ switch (meta_dev_idx) {
default: /* external, some index */
- return MD_RESERVED_SECT * bdev->dc.meta_dev_idx;
+ return MD_RESERVED_SECT * meta_dev_idx;
case DRBD_MD_INDEX_INTERNAL:
/* with drbd08, internal meta data is always "flexible" */
case DRBD_MD_INDEX_FLEX_INT:
@@ -2070,9 +1889,8 @@ drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
unsigned long flags;
spin_lock_irqsave(&q->q_lock, flags);
list_add(&w->list, &q->q);
- up(&q->s); /* within the spinlock,
- see comment near end of drbd_worker() */
spin_unlock_irqrestore(&q->q_lock, flags);
+ wake_up(&q->q_wait);
}
static inline void
@@ -2081,41 +1899,35 @@ drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
unsigned long flags;
spin_lock_irqsave(&q->q_lock, flags);
list_add_tail(&w->list, &q->q);
- up(&q->s); /* within the spinlock,
- see comment near end of drbd_worker() */
spin_unlock_irqrestore(&q->q_lock, flags);
+ wake_up(&q->q_wait);
}
-static inline void wake_asender(struct drbd_conf *mdev)
-{
- if (drbd_test_flag(mdev, SIGNAL_ASENDER))
- force_sig(DRBD_SIG, mdev->asender.task);
-}
-
-static inline void request_ping(struct drbd_conf *mdev)
+static inline void wake_asender(struct drbd_tconn *tconn)
{
- drbd_set_flag(mdev, SEND_PING);
- wake_asender(mdev);
+ if (test_bit(SIGNAL_ASENDER, &tconn->flags))
+ force_sig(DRBD_SIG, tconn->asender.task);
}
-static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
- enum drbd_packets cmd)
+static inline void request_ping(struct drbd_tconn *tconn)
{
- struct p_header80 h;
- return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
+ set_bit(SEND_PING, &tconn->flags);
+ wake_asender(tconn);
}
-static inline int drbd_send_ping(struct drbd_conf *mdev)
-{
- struct p_header80 h;
- return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
-}
+extern void *conn_prepare_command(struct drbd_tconn *, struct drbd_socket *);
+extern void *drbd_prepare_command(struct drbd_conf *, struct drbd_socket *);
+extern int conn_send_command(struct drbd_tconn *, struct drbd_socket *,
+ enum drbd_packet, unsigned int, void *,
+ unsigned int);
+extern int drbd_send_command(struct drbd_conf *, struct drbd_socket *,
+ enum drbd_packet, unsigned int, void *,
+ unsigned int);
-static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
-{
- struct p_header80 h;
- return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
-}
+extern int drbd_send_ping(struct drbd_tconn *tconn);
+extern int drbd_send_ping_ack(struct drbd_tconn *tconn);
+extern int drbd_send_state_req(struct drbd_conf *, union drbd_state, union drbd_state);
+extern int conn_send_state_req(struct drbd_tconn *, union drbd_state, union drbd_state);
static inline void drbd_thread_stop(struct drbd_thread *thi)
{
@@ -2137,21 +1949,21 @@ static inline void drbd_thread_restart_nowait(struct drbd_thread *thi)
* or implicit barrier packets as necessary.
* increased:
* w_send_barrier
- * _req_mod(req, queue_for_net_write or queue_for_net_read);
+ * _req_mod(req, QUEUE_FOR_NET_WRITE or QUEUE_FOR_NET_READ);
* it is much easier and equally valid to count what we queue for the
* worker, even before it actually was queued or send.
* (drbd_make_request_common; recovery path on read io-error)
* decreased:
* got_BarrierAck (respective tl_clear, tl_clear_barrier)
- * _req_mod(req, data_received)
+ * _req_mod(req, DATA_RECEIVED)
* [from receive_DataReply]
- * _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked)
+ * _req_mod(req, WRITE_ACKED_BY_PEER or RECV_ACKED_BY_PEER or NEG_ACKED)
* [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)]
* for some reason it is NOT decreased in got_NegAck,
* but in the resulting cleanup code from report_params.
* we should try to remember the reason for that...
- * _req_mod(req, send_failed or send_canceled)
- * _req_mod(req, connection_lost_while_pending)
+ * _req_mod(req, SEND_FAILED or SEND_CANCELED)
+ * _req_mod(req, CONNECTION_LOST_WHILE_PENDING)
* [from tl_clear_barrier]
*/
static inline void inc_ap_pending(struct drbd_conf *mdev)
@@ -2159,17 +1971,19 @@ static inline void inc_ap_pending(struct drbd_conf *mdev)
atomic_inc(&mdev->ap_pending_cnt);
}
-#define ERR_IF_CNT_IS_NEGATIVE(which) \
- if (atomic_read(&mdev->which) < 0) \
+#define ERR_IF_CNT_IS_NEGATIVE(which, func, line) \
+ if (atomic_read(&mdev->which) < 0) \
dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \
- __func__ , __LINE__ , \
- atomic_read(&mdev->which))
+ func, line, \
+ atomic_read(&mdev->which))
-#define dec_ap_pending(mdev) do { \
- typecheck(struct drbd_conf *, mdev); \
- if (atomic_dec_and_test(&mdev->ap_pending_cnt)) \
- wake_up(&mdev->misc_wait); \
- ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0)
+#define dec_ap_pending(mdev) _dec_ap_pending(mdev, __FUNCTION__, __LINE__)
+static inline void _dec_ap_pending(struct drbd_conf *mdev, const char *func, int line)
+{
+ if (atomic_dec_and_test(&mdev->ap_pending_cnt))
+ wake_up(&mdev->misc_wait);
+ ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt, func, line);
+}
/* counts how many resync-related answers we still expect from the peer
* increase decrease
@@ -2182,10 +1996,12 @@ static inline void inc_rs_pending(struct drbd_conf *mdev)
atomic_inc(&mdev->rs_pending_cnt);
}
-#define dec_rs_pending(mdev) do { \
- typecheck(struct drbd_conf *, mdev); \
- atomic_dec(&mdev->rs_pending_cnt); \
- ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0)
+#define dec_rs_pending(mdev) _dec_rs_pending(mdev, __FUNCTION__, __LINE__)
+static inline void _dec_rs_pending(struct drbd_conf *mdev, const char *func, int line)
+{
+ atomic_dec(&mdev->rs_pending_cnt);
+ ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt, func, line);
+}
/* counts how many answers we still need to send to the peer.
* increased on
@@ -2201,38 +2017,18 @@ static inline void inc_unacked(struct drbd_conf *mdev)
atomic_inc(&mdev->unacked_cnt);
}
-#define dec_unacked(mdev) do { \
- typecheck(struct drbd_conf *, mdev); \
- atomic_dec(&mdev->unacked_cnt); \
- ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
-
-#define sub_unacked(mdev, n) do { \
- typecheck(struct drbd_conf *, mdev); \
- atomic_sub(n, &mdev->unacked_cnt); \
- ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
-
-
-static inline void put_net_conf(struct drbd_conf *mdev)
+#define dec_unacked(mdev) _dec_unacked(mdev, __FUNCTION__, __LINE__)
+static inline void _dec_unacked(struct drbd_conf *mdev, const char *func, int line)
{
- if (atomic_dec_and_test(&mdev->net_cnt))
- wake_up(&mdev->net_cnt_wait);
+ atomic_dec(&mdev->unacked_cnt);
+ ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
}
-/**
- * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there
- * @mdev: DRBD device.
- *
- * You have to call put_net_conf() when finished working with mdev->net_conf.
- */
-static inline int get_net_conf(struct drbd_conf *mdev)
+#define sub_unacked(mdev, n) _sub_unacked(mdev, n, __FUNCTION__, __LINE__)
+static inline void _sub_unacked(struct drbd_conf *mdev, int n, const char *func, int line)
{
- int have_net_conf;
-
- atomic_inc(&mdev->net_cnt);
- have_net_conf = mdev->state.conn >= C_UNCONNECTED;
- if (!have_net_conf)
- put_net_conf(mdev);
- return have_net_conf;
+ atomic_sub(n, &mdev->unacked_cnt);
+ ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
}
/**
@@ -2336,17 +2132,20 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
* maybe re-implement using semaphores? */
static inline int drbd_get_max_buffers(struct drbd_conf *mdev)
{
- int mxb = 1000000; /* arbitrary limit on open requests */
- if (get_net_conf(mdev)) {
- mxb = mdev->net_conf->max_buffers;
- put_net_conf(mdev);
- }
+ struct net_conf *nc;
+ int mxb;
+
+ rcu_read_lock();
+ nc = rcu_dereference(mdev->tconn->net_conf);
+ mxb = nc ? nc->max_buffers : 1000000; /* arbitrary limit on open requests */
+ rcu_read_unlock();
+
return mxb;
}
static inline int drbd_state_is_stable(struct drbd_conf *mdev)
{
- union drbd_state s = mdev->state;
+ union drbd_dev_state s = mdev->state;
/* DO NOT add a default clause, we want the compiler to warn us
* for any newly introduced state we may have forgotten to add here */
@@ -2380,7 +2179,7 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev)
/* Allow IO in BM exchange states with new protocols */
case C_WF_BITMAP_S:
- if (mdev->agreed_pro_version < 96)
+ if (mdev->tconn->agreed_pro_version < 96)
return 0;
break;
@@ -2402,7 +2201,7 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev)
/* disk state is stable as well. */
break;
- /* no new io accepted during tansitional states */
+ /* no new io accepted during transitional states */
case D_ATTACHING:
case D_NEGOTIATING:
case D_UNKNOWN:
@@ -2414,18 +2213,20 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev)
return 1;
}
-static inline int is_susp(union drbd_state s)
+static inline int drbd_suspended(struct drbd_conf *mdev)
{
- return s.susp || s.susp_nod || s.susp_fen;
+ struct drbd_tconn *tconn = mdev->tconn;
+
+ return tconn->susp || tconn->susp_fen || tconn->susp_nod;
}
static inline bool may_inc_ap_bio(struct drbd_conf *mdev)
{
int mxb = drbd_get_max_buffers(mdev);
- if (is_susp(mdev->state))
+ if (drbd_suspended(mdev))
return false;
- if (drbd_test_flag(mdev, SUSPEND_IO))
+ if (test_bit(SUSPEND_IO, &mdev->flags))
return false;
/* to avoid potential deadlock or bitmap corruption,
@@ -2440,35 +2241,35 @@ static inline bool may_inc_ap_bio(struct drbd_conf *mdev)
* and we are within the spinlock anyways, we have this workaround. */
if (atomic_read(&mdev->ap_bio_cnt) > mxb)
return false;
- if (drbd_test_flag(mdev, BITMAP_IO))
+ if (test_bit(BITMAP_IO, &mdev->flags))
return false;
return true;
}
-static inline bool inc_ap_bio_cond(struct drbd_conf *mdev, int count)
+static inline bool inc_ap_bio_cond(struct drbd_conf *mdev)
{
bool rv = false;
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
rv = may_inc_ap_bio(mdev);
if (rv)
- atomic_add(count, &mdev->ap_bio_cnt);
- spin_unlock_irq(&mdev->req_lock);
+ atomic_inc(&mdev->ap_bio_cnt);
+ spin_unlock_irq(&mdev->tconn->req_lock);
return rv;
}
-static inline void inc_ap_bio(struct drbd_conf *mdev, int count)
+static inline void inc_ap_bio(struct drbd_conf *mdev)
{
/* we wait here
* as long as the device is suspended
* until the bitmap is no longer on the fly during connection
- * handshake as long as we would exeed the max_buffer limit.
+ * handshake as long as we would exceed the max_buffer limit.
*
* to avoid races with the reconnect code,
* we need to atomic_inc within the spinlock. */
- wait_event(mdev->misc_wait, inc_ap_bio_cond(mdev, count));
+ wait_event(mdev->misc_wait, inc_ap_bio_cond(mdev));
}
static inline void dec_ap_bio(struct drbd_conf *mdev)
@@ -2478,9 +2279,9 @@ static inline void dec_ap_bio(struct drbd_conf *mdev)
D_ASSERT(ap_bio >= 0);
- if (ap_bio == 0 && drbd_test_flag(mdev, BITMAP_IO)) {
- if (!drbd_test_and_set_flag(mdev, BITMAP_IO_QUEUED))
- drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
+ if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) {
+ if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
+ drbd_queue_work(&mdev->tconn->sender_work, &mdev->bm_io_work.w);
}
/* this currently does wake_up for every dec_ap_bio!
@@ -2490,6 +2291,12 @@ static inline void dec_ap_bio(struct drbd_conf *mdev)
wake_up(&mdev->misc_wait);
}
+static inline bool verify_can_do_stop_sector(struct drbd_conf *mdev)
+{
+ return mdev->tconn->agreed_pro_version >= 97 &&
+ mdev->tconn->agreed_pro_version != 100;
+}
+
static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
{
int changed = mdev->ed_uuid != val;
@@ -2497,40 +2304,6 @@ static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
return changed;
}
-static inline int seq_cmp(u32 a, u32 b)
-{
- /* we assume wrap around at 32bit.
- * for wrap around at 24bit (old atomic_t),
- * we'd have to
- * a <<= 8; b <<= 8;
- */
- return (s32)(a) - (s32)(b);
-}
-#define seq_lt(a, b) (seq_cmp((a), (b)) < 0)
-#define seq_gt(a, b) (seq_cmp((a), (b)) > 0)
-#define seq_ge(a, b) (seq_cmp((a), (b)) >= 0)
-#define seq_le(a, b) (seq_cmp((a), (b)) <= 0)
-/* CAUTION: please no side effects in arguments! */
-#define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b)))
-
-static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq)
-{
- unsigned int m;
- spin_lock(&mdev->peer_seq_lock);
- m = seq_max(mdev->peer_seq, new_seq);
- mdev->peer_seq = m;
- spin_unlock(&mdev->peer_seq_lock);
- if (m == new_seq)
- wake_up(&mdev->seq_wait);
-}
-
-static inline void drbd_update_congested(struct drbd_conf *mdev)
-{
- struct sock *sk = mdev->data.socket->sk;
- if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
- drbd_set_flag(mdev, NET_CONGESTED);
-}
-
static inline int drbd_queue_order_type(struct drbd_conf *mdev)
{
/* sorry, we currently have no working implementation
@@ -2545,15 +2318,46 @@ static inline void drbd_md_flush(struct drbd_conf *mdev)
{
int r;
- if (drbd_test_flag(mdev, MD_NO_FUA))
+ if (mdev->ldev == NULL) {
+ dev_warn(DEV, "mdev->ldev == NULL in drbd_md_flush\n");
+ return;
+ }
+
+ if (test_bit(MD_NO_FUA, &mdev->flags))
return;
r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_NOIO, NULL);
if (r) {
- drbd_set_flag(mdev, MD_NO_FUA);
+ set_bit(MD_NO_FUA, &mdev->flags);
dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
}
}
-
#endif
+
+/* This is defined in drivers/md/md.h as well. Should go into wait.h */
+#define __wait_event_lock_irq(wq, condition, lock, cmd) \
+do { \
+ wait_queue_t __wait; \
+ init_waitqueue_entry(&__wait, current); \
+ \
+ add_wait_queue(&wq, &__wait); \
+ for (;;) { \
+ set_current_state(TASK_UNINTERRUPTIBLE); \
+ if (condition) \
+ break; \
+ spin_unlock_irq(&lock); \
+ cmd; \
+ schedule(); \
+ spin_lock_irq(&lock); \
+ } \
+ current->state = TASK_RUNNING; \
+ remove_wait_queue(&wq, &__wait); \
+} while (0)
+
+#define wait_event_lock_irq(wq, condition, lock, cmd) \
+do { \
+ if (condition) \
+ break; \
+ __wait_event_lock_irq(wq, condition, lock, cmd); \
+} while (0)
diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c
new file mode 100644
index 00000000000..89c497c630b
--- /dev/null
+++ b/drivers/block/drbd/drbd_interval.c
@@ -0,0 +1,207 @@
+#include <asm/bug.h>
+#include <linux/rbtree_augmented.h>
+#include "drbd_interval.h"
+
+/**
+ * interval_end - return end of @node
+ */
+static inline
+sector_t interval_end(struct rb_node *node)
+{
+ struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb);
+ return this->end;
+}
+
+/**
+ * compute_subtree_last - compute end of @node
+ *
+ * The end of an interval is the highest (start + (size >> 9)) value of this
+ * node and of its children. Called for @node and its parents whenever the end
+ * may have changed.
+ */
+static inline sector_t
+compute_subtree_last(struct drbd_interval *node)
+{
+ sector_t max = node->sector + (node->size >> 9);
+
+ if (node->rb.rb_left) {
+ sector_t left = interval_end(node->rb.rb_left);
+ if (left > max)
+ max = left;
+ }
+ if (node->rb.rb_right) {
+ sector_t right = interval_end(node->rb.rb_right);
+ if (right > max)
+ max = right;
+ }
+ return max;
+}
+
+static void augment_propagate(struct rb_node *rb, struct rb_node *stop)
+{
+ while (rb != stop) {
+ struct drbd_interval *node = rb_entry(rb, struct drbd_interval, rb);
+ sector_t subtree_last = compute_subtree_last(node);
+ if (node->end == subtree_last)
+ break;
+ node->end = subtree_last;
+ rb = rb_parent(&node->rb);
+ }
+}
+
+static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new)
+{
+ struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb);
+ struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb);
+
+ new->end = old->end;
+}
+
+static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new)
+{
+ struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb);
+ struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb);
+
+ new->end = old->end;
+ old->end = compute_subtree_last(old);
+}
+
+static const struct rb_augment_callbacks augment_callbacks = {
+ augment_propagate,
+ augment_copy,
+ augment_rotate,
+};
+
+/**
+ * drbd_insert_interval - insert a new interval into a tree
+ */
+bool
+drbd_insert_interval(struct rb_root *root, struct drbd_interval *this)
+{
+ struct rb_node **new = &root->rb_node, *parent = NULL;
+
+ BUG_ON(!IS_ALIGNED(this->size, 512));
+
+ while (*new) {
+ struct drbd_interval *here =
+ rb_entry(*new, struct drbd_interval, rb);
+
+ parent = *new;
+ if (this->sector < here->sector)
+ new = &(*new)->rb_left;
+ else if (this->sector > here->sector)
+ new = &(*new)->rb_right;
+ else if (this < here)
+ new = &(*new)->rb_left;
+ else if (this > here)
+ new = &(*new)->rb_right;
+ else
+ return false;
+ }
+
+ rb_link_node(&this->rb, parent, new);
+ rb_insert_augmented(&this->rb, root, &augment_callbacks);
+ return true;
+}
+
+/**
+ * drbd_contains_interval - check if a tree contains a given interval
+ * @sector: start sector of @interval
+ * @interval: may not be a valid pointer
+ *
+ * Returns if the tree contains the node @interval with start sector @start.
+ * Does not dereference @interval until @interval is known to be a valid object
+ * in @tree. Returns %false if @interval is in the tree but with a different
+ * sector number.
+ */
+bool
+drbd_contains_interval(struct rb_root *root, sector_t sector,
+ struct drbd_interval *interval)
+{
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct drbd_interval *here =
+ rb_entry(node, struct drbd_interval, rb);
+
+ if (sector < here->sector)
+ node = node->rb_left;
+ else if (sector > here->sector)
+ node = node->rb_right;
+ else if (interval < here)
+ node = node->rb_left;
+ else if (interval > here)
+ node = node->rb_right;
+ else
+ return true;
+ }
+ return false;
+}
+
+/**
+ * drbd_remove_interval - remove an interval from a tree
+ */
+void
+drbd_remove_interval(struct rb_root *root, struct drbd_interval *this)
+{
+ rb_erase_augmented(&this->rb, root, &augment_callbacks);
+}
+
+/**
+ * drbd_find_overlap - search for an interval overlapping with [sector, sector + size)
+ * @sector: start sector
+ * @size: size, aligned to 512 bytes
+ *
+ * Returns an interval overlapping with [sector, sector + size), or NULL if
+ * there is none. When there is more than one overlapping interval in the
+ * tree, the interval with the lowest start sector is returned, and all other
+ * overlapping intervals will be on the right side of the tree, reachable with
+ * rb_next().
+ */
+struct drbd_interval *
+drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size)
+{
+ struct rb_node *node = root->rb_node;
+ struct drbd_interval *overlap = NULL;
+ sector_t end = sector + (size >> 9);
+
+ BUG_ON(!IS_ALIGNED(size, 512));
+
+ while (node) {
+ struct drbd_interval *here =
+ rb_entry(node, struct drbd_interval, rb);
+
+ if (node->rb_left &&
+ sector < interval_end(node->rb_left)) {
+ /* Overlap if any must be on left side */
+ node = node->rb_left;
+ } else if (here->sector < end &&
+ sector < here->sector + (here->size >> 9)) {
+ overlap = here;
+ break;
+ } else if (sector >= here->sector) {
+ /* Overlap if any must be on right side */
+ node = node->rb_right;
+ } else
+ break;
+ }
+ return overlap;
+}
+
+struct drbd_interval *
+drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size)
+{
+ sector_t end = sector + (size >> 9);
+ struct rb_node *node;
+
+ for (;;) {
+ node = rb_next(&i->rb);
+ if (!node)
+ return NULL;
+ i = rb_entry(node, struct drbd_interval, rb);
+ if (i->sector >= end)
+ return NULL;
+ if (sector < i->sector + (i->size >> 9))
+ return i;
+ }
+}
diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h
new file mode 100644
index 00000000000..f38fcb00c10
--- /dev/null
+++ b/drivers/block/drbd/drbd_interval.h
@@ -0,0 +1,40 @@
+#ifndef __DRBD_INTERVAL_H
+#define __DRBD_INTERVAL_H
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+struct drbd_interval {
+ struct rb_node rb;
+ sector_t sector; /* start sector of the interval */
+ unsigned int size; /* size in bytes */
+ sector_t end; /* highest interval end in subtree */
+ int local:1 /* local or remote request? */;
+ int waiting:1;
+};
+
+static inline void drbd_clear_interval(struct drbd_interval *i)
+{
+ RB_CLEAR_NODE(&i->rb);
+}
+
+static inline bool drbd_interval_empty(struct drbd_interval *i)
+{
+ return RB_EMPTY_NODE(&i->rb);
+}
+
+extern bool drbd_insert_interval(struct rb_root *, struct drbd_interval *);
+extern bool drbd_contains_interval(struct rb_root *, sector_t,
+ struct drbd_interval *);
+extern void drbd_remove_interval(struct rb_root *, struct drbd_interval *);
+extern struct drbd_interval *drbd_find_overlap(struct rb_root *, sector_t,
+ unsigned int);
+extern struct drbd_interval *drbd_next_overlap(struct drbd_interval *, sector_t,
+ unsigned int);
+
+#define drbd_for_each_overlap(i, root, sector, size) \
+ for (i = drbd_find_overlap(root, sector, size); \
+ i; \
+ i = drbd_next_overlap(i, sector, size))
+
+#endif /* __DRBD_INTERVAL_H */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 9b833e0fb44..52de26daa1f 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -56,14 +56,6 @@
#include "drbd_vli.h"
-struct after_state_chg_work {
- struct drbd_work w;
- union drbd_state os;
- union drbd_state ns;
- enum chg_state_flags flags;
- struct completion *done;
-};
-
static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
@@ -72,21 +64,17 @@ int drbd_asender(struct drbd_thread *);
int drbd_init(void);
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
-static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
-static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, enum chg_state_flags flags);
-static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+static int w_md_sync(struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
-static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
-static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
-static void _tl_clear(struct drbd_conf *mdev);
+static int w_bitmap_io(struct drbd_work *w, int unused);
+static int w_go_diskless(struct drbd_work *w, int unused);
MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
"Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
-MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
+MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices ("
__stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
@@ -98,7 +86,6 @@ MODULE_PARM_DESC(allow_oos, "DONT USE!");
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
-module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);
#ifdef CONFIG_DRBD_FAULT_INJECTION
@@ -120,7 +107,6 @@ module_param(fault_devs, int, 0644);
unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
bool disable_sendpage;
bool allow_oos;
-unsigned int cn_idx = CN_IDX_DRBD;
int proc_details; /* Detail level in proc drbd*/
/* Module parameter for setting the user mode helper program
@@ -132,10 +118,11 @@ module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0
/* in 2.6.x, our device mapping and config info contains our virtual gendisks
* as member "struct gendisk *vdisk;"
*/
-struct drbd_conf **minor_table;
+struct idr minors;
+struct list_head drbd_tconns; /* list of struct drbd_tconn */
struct kmem_cache *drbd_request_cache;
-struct kmem_cache *drbd_ee_cache; /* epoch entries */
+struct kmem_cache *drbd_ee_cache; /* peer requests */
struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
mempool_t *drbd_request_mempool;
@@ -164,10 +151,15 @@ static const struct block_device_operations drbd_ops = {
struct bio *bio_alloc_drbd(gfp_t gfp_mask)
{
+ struct bio *bio;
+
if (!drbd_md_io_bio_set)
return bio_alloc(gfp_mask, 1);
- return bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
+ bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
+ if (!bio)
+ return NULL;
+ return bio;
}
#ifdef __CHECKER__
@@ -190,158 +182,87 @@ int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
#endif
/**
- * DOC: The transfer log
- *
- * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
- * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
- * of the list. There is always at least one &struct drbd_tl_epoch object.
- *
- * Each &struct drbd_tl_epoch has a circular double linked list of requests
- * attached.
- */
-static int tl_init(struct drbd_conf *mdev)
-{
- struct drbd_tl_epoch *b;
-
- /* during device minor initialization, we may well use GFP_KERNEL */
- b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
- if (!b)
- return 0;
- INIT_LIST_HEAD(&b->requests);
- INIT_LIST_HEAD(&b->w.list);
- b->next = NULL;
- b->br_number = 4711;
- b->n_writes = 0;
- b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
-
- mdev->oldest_tle = b;
- mdev->newest_tle = b;
- INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
- INIT_LIST_HEAD(&mdev->barrier_acked_requests);
-
- mdev->tl_hash = NULL;
- mdev->tl_hash_s = 0;
-
- return 1;
-}
-
-static void tl_cleanup(struct drbd_conf *mdev)
-{
- D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
- D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
- kfree(mdev->oldest_tle);
- mdev->oldest_tle = NULL;
- kfree(mdev->unused_spare_tle);
- mdev->unused_spare_tle = NULL;
- kfree(mdev->tl_hash);
- mdev->tl_hash = NULL;
- mdev->tl_hash_s = 0;
-}
-
-/**
- * _tl_add_barrier() - Adds a barrier to the transfer log
- * @mdev: DRBD device.
- * @new: Barrier to be added before the current head of the TL.
- *
- * The caller must hold the req_lock.
- */
-void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
-{
- struct drbd_tl_epoch *newest_before;
-
- INIT_LIST_HEAD(&new->requests);
- INIT_LIST_HEAD(&new->w.list);
- new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
- new->next = NULL;
- new->n_writes = 0;
-
- newest_before = mdev->newest_tle;
- new->br_number = newest_before->br_number+1;
- if (mdev->newest_tle != new) {
- mdev->newest_tle->next = new;
- mdev->newest_tle = new;
- }
-}
-
-/**
- * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
- * @mdev: DRBD device.
+ * tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch
+ * @tconn: DRBD connection.
* @barrier_nr: Expected identifier of the DRBD write barrier packet.
* @set_size: Expected number of requests before that barrier.
*
* In case the passed barrier_nr or set_size does not match the oldest
- * &struct drbd_tl_epoch objects this function will cause a termination
- * of the connection.
+ * epoch of not yet barrier-acked requests, this function will cause a
+ * termination of the connection.
*/
-void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
- unsigned int set_size)
+void tl_release(struct drbd_tconn *tconn, unsigned int barrier_nr,
+ unsigned int set_size)
{
- struct drbd_tl_epoch *b, *nob; /* next old barrier */
- struct list_head *le, *tle;
struct drbd_request *r;
-
- spin_lock_irq(&mdev->req_lock);
-
- b = mdev->oldest_tle;
+ struct drbd_request *req = NULL;
+ int expect_epoch = 0;
+ int expect_size = 0;
+
+ spin_lock_irq(&tconn->req_lock);
+
+ /* find oldest not yet barrier-acked write request,
+ * count writes in its epoch. */
+ list_for_each_entry(r, &tconn->transfer_log, tl_requests) {
+ const unsigned s = r->rq_state;
+ if (!req) {
+ if (!(s & RQ_WRITE))
+ continue;
+ if (!(s & RQ_NET_MASK))
+ continue;
+ if (s & RQ_NET_DONE)
+ continue;
+ req = r;
+ expect_epoch = req->epoch;
+ expect_size ++;
+ } else {
+ if (r->epoch != expect_epoch)
+ break;
+ if (!(s & RQ_WRITE))
+ continue;
+ /* if (s & RQ_DONE): not expected */
+ /* if (!(s & RQ_NET_MASK)): not expected */
+ expect_size++;
+ }
+ }
/* first some paranoia code */
- if (b == NULL) {
- dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
- barrier_nr);
+ if (req == NULL) {
+ conn_err(tconn, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
+ barrier_nr);
goto bail;
}
- if (b->br_number != barrier_nr) {
- dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
- barrier_nr, b->br_number);
+ if (expect_epoch != barrier_nr) {
+ conn_err(tconn, "BAD! BarrierAck #%u received, expected #%u!\n",
+ barrier_nr, expect_epoch);
goto bail;
}
- if (b->n_writes != set_size) {
- dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
- barrier_nr, set_size, b->n_writes);
+
+ if (expect_size != set_size) {
+ conn_err(tconn, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
+ barrier_nr, set_size, expect_size);
goto bail;
}
- /* Clean up list of requests processed during current epoch */
- list_for_each_safe(le, tle, &b->requests) {
- r = list_entry(le, struct drbd_request, tl_requests);
- _req_mod(r, barrier_acked);
- }
- /* There could be requests on the list waiting for completion
- of the write to the local disk. To avoid corruptions of
- slab's data structures we have to remove the lists head.
-
- Also there could have been a barrier ack out of sequence, overtaking
- the write acks - which would be a bug and violating write ordering.
- To not deadlock in case we lose connection while such requests are
- still pending, we need some way to find them for the
- _req_mode(connection_lost_while_pending).
-
- These have been list_move'd to the out_of_sequence_requests list in
- _req_mod(, barrier_acked) above.
- */
- list_splice_init(&b->requests, &mdev->barrier_acked_requests);
-
- nob = b->next;
- if (drbd_test_and_clear_flag(mdev, CREATE_BARRIER)) {
- _tl_add_barrier(mdev, b);
- if (nob)
- mdev->oldest_tle = nob;
- /* if nob == NULL b was the only barrier, and becomes the new
- barrier. Therefore mdev->oldest_tle points already to b */
- } else {
- D_ASSERT(nob != NULL);
- mdev->oldest_tle = nob;
- kfree(b);
+ /* Clean up list of requests processed during current epoch. */
+ /* this extra list walk restart is paranoia,
+ * to catch requests being barrier-acked "unexpectedly".
+ * It usually should find the same req again, or some READ preceding it. */
+ list_for_each_entry(req, &tconn->transfer_log, tl_requests)
+ if (req->epoch == expect_epoch)
+ break;
+ list_for_each_entry_safe_from(req, r, &tconn->transfer_log, tl_requests) {
+ if (req->epoch != expect_epoch)
+ break;
+ _req_mod(req, BARRIER_ACKED);
}
-
- spin_unlock_irq(&mdev->req_lock);
- dec_ap_pending(mdev);
+ spin_unlock_irq(&tconn->req_lock);
return;
bail:
- spin_unlock_irq(&mdev->req_lock);
- drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+ spin_unlock_irq(&tconn->req_lock);
+ conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
}
@@ -350,85 +271,24 @@ bail:
* @mdev: DRBD device.
* @what: The action/event to perform with all request objects
*
- * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
- * restart_frozen_disk_io.
+ * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
+ * RESTART_FROZEN_DISK_IO.
*/
-static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
-{
- struct drbd_tl_epoch *b, *tmp, **pn;
- struct list_head *le, *tle, carry_reads;
- struct drbd_request *req;
- int rv, n_writes, n_reads;
-
- b = mdev->oldest_tle;
- pn = &mdev->oldest_tle;
- while (b) {
- n_writes = 0;
- n_reads = 0;
- INIT_LIST_HEAD(&carry_reads);
- list_for_each_safe(le, tle, &b->requests) {
- req = list_entry(le, struct drbd_request, tl_requests);
- rv = _req_mod(req, what);
-
- n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
- n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
- }
- tmp = b->next;
-
- if (n_writes) {
- if (what == resend) {
- b->n_writes = n_writes;
- if (b->w.cb == NULL) {
- b->w.cb = w_send_barrier;
- inc_ap_pending(mdev);
- drbd_set_flag(mdev, CREATE_BARRIER);
- }
-
- drbd_queue_work(&mdev->data.work, &b->w);
- }
- pn = &b->next;
- } else {
- if (n_reads)
- list_add(&carry_reads, &b->requests);
- /* there could still be requests on that ring list,
- * in case local io is still pending */
- list_del(&b->requests);
-
- /* dec_ap_pending corresponding to queue_barrier.
- * the newest barrier may not have been queued yet,
- * in which case w.cb is still NULL. */
- if (b->w.cb != NULL)
- dec_ap_pending(mdev);
-
- if (b == mdev->newest_tle) {
- /* recycle, but reinit! */
- D_ASSERT(tmp == NULL);
- INIT_LIST_HEAD(&b->requests);
- list_splice(&carry_reads, &b->requests);
- INIT_LIST_HEAD(&b->w.list);
- b->w.cb = NULL;
- b->br_number = net_random();
- b->n_writes = 0;
-
- *pn = b;
- break;
- }
- *pn = tmp;
- kfree(b);
- }
- b = tmp;
- list_splice(&carry_reads, &b->requests);
- }
-
- /* Actions operating on the disk state, also want to work on
- requests that got barrier acked. */
+/* must hold resource->req_lock */
+void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
+{
+ struct drbd_request *req, *r;
- list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
- req = list_entry(le, struct drbd_request, tl_requests);
+ list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests)
_req_mod(req, what);
- }
}
+void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
+{
+ spin_lock_irq(&tconn->req_lock);
+ _tl_restart(tconn, what);
+ spin_unlock_irq(&tconn->req_lock);
+}
/**
* tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
@@ -438,43 +298,9 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
* by the requests on the transfer gets marked as our of sync. Called from the
* receiver thread and the worker thread.
*/
-void tl_clear(struct drbd_conf *mdev)
+void tl_clear(struct drbd_tconn *tconn)
{
- spin_lock_irq(&mdev->req_lock);
- _tl_clear(mdev);
- spin_unlock_irq(&mdev->req_lock);
-}
-
-static void _tl_clear(struct drbd_conf *mdev)
-{
- struct list_head *le, *tle;
- struct drbd_request *r;
-
- _tl_restart(mdev, connection_lost_while_pending);
-
- /* we expect this list to be empty. */
- D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
-
- /* but just in case, clean it up anyways! */
- list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
- r = list_entry(le, struct drbd_request, tl_requests);
- /* It would be nice to complete outside of spinlock.
- * But this is easier for now. */
- _req_mod(r, connection_lost_while_pending);
- }
-
- /* ensure bit indicating barrier is required is clear */
- drbd_clear_flag(mdev, CREATE_BARRIER);
-
- memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
-
-}
-
-void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
-{
- spin_lock_irq(&mdev->req_lock);
- _tl_restart(mdev, what);
- spin_unlock_irq(&mdev->req_lock);
+ tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING);
}
/**
@@ -483,1392 +309,131 @@ void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
*/
void tl_abort_disk_io(struct drbd_conf *mdev)
{
- struct drbd_tl_epoch *b;
- struct list_head *le, *tle;
- struct drbd_request *req;
+ struct drbd_tconn *tconn = mdev->tconn;
+ struct drbd_request *req, *r;
- spin_lock_irq(&mdev->req_lock);
- b = mdev->oldest_tle;
- while (b) {
- list_for_each_safe(le, tle, &b->requests) {
- req = list_entry(le, struct drbd_request, tl_requests);
- if (!(req->rq_state & RQ_LOCAL_PENDING))
- continue;
- _req_mod(req, abort_disk_io);
- }
- b = b->next;
- }
-
- list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
- req = list_entry(le, struct drbd_request, tl_requests);
+ spin_lock_irq(&tconn->req_lock);
+ list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests) {
if (!(req->rq_state & RQ_LOCAL_PENDING))
continue;
- _req_mod(req, abort_disk_io);
- }
-
- spin_unlock_irq(&mdev->req_lock);
-}
-
-/**
- * cl_wide_st_chg() - true if the state change is a cluster wide one
- * @mdev: DRBD device.
- * @os: old (current) state.
- * @ns: new (wanted) state.
- */
-static int cl_wide_st_chg(struct drbd_conf *mdev,
- union drbd_state os, union drbd_state ns)
-{
- return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
- ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
- (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
- (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
- (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
- (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
- (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
-}
-
-enum drbd_state_rv
-drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
- union drbd_state mask, union drbd_state val)
-{
- unsigned long flags;
- union drbd_state os, ns;
- enum drbd_state_rv rv;
-
- spin_lock_irqsave(&mdev->req_lock, flags);
- os = mdev->state;
- ns.i = (os.i & ~mask.i) | val.i;
- rv = _drbd_set_state(mdev, ns, f, NULL);
- ns = mdev->state;
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
- return rv;
-}
-
-/**
- * drbd_force_state() - Impose a change which happens outside our control on our state
- * @mdev: DRBD device.
- * @mask: mask of state bits to change.
- * @val: value of new state bits.
- */
-void drbd_force_state(struct drbd_conf *mdev,
- union drbd_state mask, union drbd_state val)
-{
- drbd_change_state(mdev, CS_HARD, mask, val);
-}
-
-static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
-static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
- union drbd_state,
- union drbd_state);
-enum sanitize_state_warnings {
- NO_WARNING,
- ABORTED_ONLINE_VERIFY,
- ABORTED_RESYNC,
- CONNECTION_LOST_NEGOTIATING,
- IMPLICITLY_UPGRADED_DISK,
- IMPLICITLY_UPGRADED_PDSK,
-};
-static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, enum sanitize_state_warnings *warn);
-int drbd_send_state_req(struct drbd_conf *,
- union drbd_state, union drbd_state);
-
-static enum drbd_state_rv
-_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
- union drbd_state val)
-{
- union drbd_state os, ns;
- unsigned long flags;
- enum drbd_state_rv rv;
-
- if (drbd_test_and_clear_flag(mdev, CL_ST_CHG_SUCCESS))
- return SS_CW_SUCCESS;
-
- if (drbd_test_and_clear_flag(mdev, CL_ST_CHG_FAIL))
- return SS_CW_FAILED_BY_PEER;
-
- rv = 0;
- spin_lock_irqsave(&mdev->req_lock, flags);
- os = mdev->state;
- ns.i = (os.i & ~mask.i) | val.i;
- ns = sanitize_state(mdev, os, ns, NULL);
-
- if (!cl_wide_st_chg(mdev, os, ns))
- rv = SS_CW_NO_NEED;
- if (!rv) {
- rv = is_valid_state(mdev, ns);
- if (rv == SS_SUCCESS) {
- rv = is_valid_state_transition(mdev, ns, os);
- if (rv == SS_SUCCESS)
- rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
- }
- }
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
- return rv;
-}
-
-/**
- * drbd_req_state() - Perform an eventually cluster wide state change
- * @mdev: DRBD device.
- * @mask: mask of state bits to change.
- * @val: value of new state bits.
- * @f: flags
- *
- * Should not be called directly, use drbd_request_state() or
- * _drbd_request_state().
- */
-static enum drbd_state_rv
-drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
- union drbd_state val, enum chg_state_flags f)
-{
- struct completion done;
- unsigned long flags;
- union drbd_state os, ns;
- enum drbd_state_rv rv;
-
- init_completion(&done);
-
- if (f & CS_SERIALIZE)
- mutex_lock(&mdev->state_mutex);
-
- spin_lock_irqsave(&mdev->req_lock, flags);
- os = mdev->state;
- ns.i = (os.i & ~mask.i) | val.i;
- ns = sanitize_state(mdev, os, ns, NULL);
-
- if (cl_wide_st_chg(mdev, os, ns)) {
- rv = is_valid_state(mdev, ns);
- if (rv == SS_SUCCESS)
- rv = is_valid_state_transition(mdev, ns, os);
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
- if (rv < SS_SUCCESS) {
- if (f & CS_VERBOSE)
- print_st_err(mdev, os, ns, rv);
- goto abort;
- }
-
- drbd_state_lock(mdev);
- if (!drbd_send_state_req(mdev, mask, val)) {
- drbd_state_unlock(mdev);
- rv = SS_CW_FAILED_BY_PEER;
- if (f & CS_VERBOSE)
- print_st_err(mdev, os, ns, rv);
- goto abort;
- }
-
- if (mask.conn == C_MASK && val.conn == C_DISCONNECTING)
- drbd_set_flag(mdev, DISCONNECT_SENT);
-
- wait_event(mdev->state_wait,
- (rv = _req_st_cond(mdev, mask, val)));
-
- if (rv < SS_SUCCESS) {
- drbd_state_unlock(mdev);
- if (f & CS_VERBOSE)
- print_st_err(mdev, os, ns, rv);
- goto abort;
- }
- spin_lock_irqsave(&mdev->req_lock, flags);
- os = mdev->state;
- ns.i = (os.i & ~mask.i) | val.i;
- rv = _drbd_set_state(mdev, ns, f, &done);
- drbd_state_unlock(mdev);
- } else {
- rv = _drbd_set_state(mdev, ns, f, &done);
- }
-
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
- if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
- D_ASSERT(current != mdev->worker.task);
- wait_for_completion(&done);
- }
-
-abort:
- if (f & CS_SERIALIZE)
- mutex_unlock(&mdev->state_mutex);
-
- return rv;
-}
-
-/**
- * _drbd_request_state() - Request a state change (with flags)
- * @mdev: DRBD device.
- * @mask: mask of state bits to change.
- * @val: value of new state bits.
- * @f: flags
- *
- * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
- * flag, or when logging of failed state change requests is not desired.
- */
-enum drbd_state_rv
-_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
- union drbd_state val, enum chg_state_flags f)
-{
- enum drbd_state_rv rv;
-
- wait_event(mdev->state_wait,
- (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
-
- return rv;
-}
-
-static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
-{
- dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
- name,
- drbd_conn_str(ns.conn),
- drbd_role_str(ns.role),
- drbd_role_str(ns.peer),
- drbd_disk_str(ns.disk),
- drbd_disk_str(ns.pdsk),
- is_susp(ns) ? 's' : 'r',
- ns.aftr_isp ? 'a' : '-',
- ns.peer_isp ? 'p' : '-',
- ns.user_isp ? 'u' : '-'
- );
-}
-
-void print_st_err(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, enum drbd_state_rv err)
-{
- if (err == SS_IN_TRANSIENT_STATE)
- return;
- dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
- print_st(mdev, " state", os);
- print_st(mdev, "wanted", ns);
-}
-
-
-/**
- * is_valid_state() - Returns an SS_ error code if ns is not valid
- * @mdev: DRBD device.
- * @ns: State to consider.
- */
-static enum drbd_state_rv
-is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
-{
- /* See drbd_state_sw_errors in drbd_strings.c */
-
- enum drbd_fencing_p fp;
- enum drbd_state_rv rv = SS_SUCCESS;
-
- fp = FP_DONT_CARE;
- if (get_ldev(mdev)) {
- fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
- }
-
- if (get_net_conf(mdev)) {
- if (!mdev->net_conf->two_primaries &&
- ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
- rv = SS_TWO_PRIMARIES;
- put_net_conf(mdev);
- }
-
- if (rv <= 0)
- /* already found a reason to abort */;
- else if (ns.role == R_SECONDARY && mdev->open_cnt)
- rv = SS_DEVICE_IN_USE;
-
- else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
- rv = SS_NO_UP_TO_DATE_DISK;
-
- else if (fp >= FP_RESOURCE &&
- ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
- rv = SS_PRIMARY_NOP;
-
- else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
- rv = SS_NO_UP_TO_DATE_DISK;
-
- else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
- rv = SS_NO_LOCAL_DISK;
-
- else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
- rv = SS_NO_REMOTE_DISK;
-
- else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
- rv = SS_NO_UP_TO_DATE_DISK;
-
- else if ((ns.conn == C_CONNECTED ||
- ns.conn == C_WF_BITMAP_S ||
- ns.conn == C_SYNC_SOURCE ||
- ns.conn == C_PAUSED_SYNC_S) &&
- ns.disk == D_OUTDATED)
- rv = SS_CONNECTED_OUTDATES;
-
- else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
- (mdev->sync_conf.verify_alg[0] == 0))
- rv = SS_NO_VERIFY_ALG;
-
- else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
- mdev->agreed_pro_version < 88)
- rv = SS_NOT_SUPPORTED;
-
- else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
- rv = SS_CONNECTED_OUTDATES;
-
- return rv;
-}
-
-/**
- * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
- * @mdev: DRBD device.
- * @ns: new state.
- * @os: old state.
- */
-static enum drbd_state_rv
-is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
- union drbd_state os)
-{
- enum drbd_state_rv rv = SS_SUCCESS;
-
- if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
- os.conn > C_CONNECTED)
- rv = SS_RESYNC_RUNNING;
-
- if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
- rv = SS_ALREADY_STANDALONE;
-
- if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
- rv = SS_IS_DISKLESS;
-
- if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
- rv = SS_NO_NET_CONFIG;
-
- if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
- rv = SS_LOWER_THAN_OUTDATED;
-
- if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
- rv = SS_IN_TRANSIENT_STATE;
-
- if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
- rv = SS_IN_TRANSIENT_STATE;
-
- /* While establishing a connection only allow cstate to change.
- Delay/refuse role changes, detach attach etc... */
- if (drbd_test_flag(mdev, STATE_SENT) &&
- !(os.conn == C_WF_REPORT_PARAMS ||
- (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
- rv = SS_IN_TRANSIENT_STATE;
-
- if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
- rv = SS_NEED_CONNECTION;
-
- if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
- ns.conn != os.conn && os.conn > C_CONNECTED)
- rv = SS_RESYNC_RUNNING;
-
- if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
- os.conn < C_CONNECTED)
- rv = SS_NEED_CONNECTION;
-
- if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
- && os.conn < C_WF_REPORT_PARAMS)
- rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
-
- return rv;
-}
-
-static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
-{
- static const char *msg_table[] = {
- [NO_WARNING] = "",
- [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
- [ABORTED_RESYNC] = "Resync aborted.",
- [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
- [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
- [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
- };
-
- if (warn != NO_WARNING)
- dev_warn(DEV, "%s\n", msg_table[warn]);
-}
-
-/**
- * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
- * @mdev: DRBD device.
- * @os: old state.
- * @ns: new state.
- * @warn_sync_abort:
- *
- * When we loose connection, we have to set the state of the peers disk (pdsk)
- * to D_UNKNOWN. This rule and many more along those lines are in this function.
- */
-static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, enum sanitize_state_warnings *warn)
-{
- enum drbd_fencing_p fp;
- enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
-
- if (warn)
- *warn = NO_WARNING;
-
- fp = FP_DONT_CARE;
- if (get_ldev(mdev)) {
- fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
- }
-
- /* Disallow Network errors to configure a device's network part */
- if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
- os.conn <= C_DISCONNECTING)
- ns.conn = os.conn;
-
- /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
- * If you try to go into some Sync* state, that shall fail (elsewhere). */
- if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
- ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED)
- ns.conn = os.conn;
-
- /* we cannot fail (again) if we already detached */
- if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
- ns.disk = D_DISKLESS;
-
- /* After C_DISCONNECTING only C_STANDALONE may follow */
- if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
- ns.conn = os.conn;
-
- if (ns.conn < C_CONNECTED) {
- ns.peer_isp = 0;
- ns.peer = R_UNKNOWN;
- if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
- ns.pdsk = D_UNKNOWN;
- }
-
- /* Clear the aftr_isp when becoming unconfigured */
- if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
- ns.aftr_isp = 0;
-
- /* Abort resync if a disk fails/detaches */
- if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
- (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
- if (warn)
- *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
- ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
- ns.conn = C_CONNECTED;
- }
-
- /* Connection breaks down before we finished "Negotiating" */
- if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
- get_ldev_if_state(mdev, D_NEGOTIATING)) {
- if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
- ns.disk = mdev->new_state_tmp.disk;
- ns.pdsk = mdev->new_state_tmp.pdsk;
- } else {
- if (warn)
- *warn = CONNECTION_LOST_NEGOTIATING;
- ns.disk = D_DISKLESS;
- ns.pdsk = D_UNKNOWN;
- }
- put_ldev(mdev);
- }
-
- /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
- if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
- if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
- ns.disk = D_UP_TO_DATE;
- if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
- ns.pdsk = D_UP_TO_DATE;
- }
-
- /* Implications of the connection stat on the disk states */
- disk_min = D_DISKLESS;
- disk_max = D_UP_TO_DATE;
- pdsk_min = D_INCONSISTENT;
- pdsk_max = D_UNKNOWN;
- switch ((enum drbd_conns)ns.conn) {
- case C_WF_BITMAP_T:
- case C_PAUSED_SYNC_T:
- case C_STARTING_SYNC_T:
- case C_WF_SYNC_UUID:
- case C_BEHIND:
- disk_min = D_INCONSISTENT;
- disk_max = D_OUTDATED;
- pdsk_min = D_UP_TO_DATE;
- pdsk_max = D_UP_TO_DATE;
- break;
- case C_VERIFY_S:
- case C_VERIFY_T:
- disk_min = D_UP_TO_DATE;
- disk_max = D_UP_TO_DATE;
- pdsk_min = D_UP_TO_DATE;
- pdsk_max = D_UP_TO_DATE;
- break;
- case C_CONNECTED:
- disk_min = D_DISKLESS;
- disk_max = D_UP_TO_DATE;
- pdsk_min = D_DISKLESS;
- pdsk_max = D_UP_TO_DATE;
- break;
- case C_WF_BITMAP_S:
- case C_PAUSED_SYNC_S:
- case C_STARTING_SYNC_S:
- case C_AHEAD:
- disk_min = D_UP_TO_DATE;
- disk_max = D_UP_TO_DATE;
- pdsk_min = D_INCONSISTENT;
- pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
- break;
- case C_SYNC_TARGET:
- disk_min = D_INCONSISTENT;
- disk_max = D_INCONSISTENT;
- pdsk_min = D_UP_TO_DATE;
- pdsk_max = D_UP_TO_DATE;
- break;
- case C_SYNC_SOURCE:
- disk_min = D_UP_TO_DATE;
- disk_max = D_UP_TO_DATE;
- pdsk_min = D_INCONSISTENT;
- pdsk_max = D_INCONSISTENT;
- break;
- case C_STANDALONE:
- case C_DISCONNECTING:
- case C_UNCONNECTED:
- case C_TIMEOUT:
- case C_BROKEN_PIPE:
- case C_NETWORK_FAILURE:
- case C_PROTOCOL_ERROR:
- case C_TEAR_DOWN:
- case C_WF_CONNECTION:
- case C_WF_REPORT_PARAMS:
- case C_MASK:
- break;
- }
- if (ns.disk > disk_max)
- ns.disk = disk_max;
-
- if (ns.disk < disk_min) {
- if (warn)
- *warn = IMPLICITLY_UPGRADED_DISK;
- ns.disk = disk_min;
- }
- if (ns.pdsk > pdsk_max)
- ns.pdsk = pdsk_max;
-
- if (ns.pdsk < pdsk_min) {
- if (warn)
- *warn = IMPLICITLY_UPGRADED_PDSK;
- ns.pdsk = pdsk_min;
- }
-
- if (fp == FP_STONITH &&
- (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
- !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
- ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
-
- if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
- (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
- !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
- ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
-
- if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
- if (ns.conn == C_SYNC_SOURCE)
- ns.conn = C_PAUSED_SYNC_S;
- if (ns.conn == C_SYNC_TARGET)
- ns.conn = C_PAUSED_SYNC_T;
- } else {
- if (ns.conn == C_PAUSED_SYNC_S)
- ns.conn = C_SYNC_SOURCE;
- if (ns.conn == C_PAUSED_SYNC_T)
- ns.conn = C_SYNC_TARGET;
- }
-
- return ns;
-}
-
-/* helper for __drbd_set_state */
-static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
-{
- if (mdev->agreed_pro_version < 90)
- mdev->ov_start_sector = 0;
- mdev->rs_total = drbd_bm_bits(mdev);
- mdev->ov_position = 0;
- if (cs == C_VERIFY_T) {
- /* starting online verify from an arbitrary position
- * does not fit well into the existing protocol.
- * on C_VERIFY_T, we initialize ov_left and friends
- * implicitly in receive_DataRequest once the
- * first P_OV_REQUEST is received */
- mdev->ov_start_sector = ~(sector_t)0;
- } else {
- unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
- if (bit >= mdev->rs_total) {
- mdev->ov_start_sector =
- BM_BIT_TO_SECT(mdev->rs_total - 1);
- mdev->rs_total = 1;
- } else
- mdev->rs_total -= bit;
- mdev->ov_position = mdev->ov_start_sector;
- }
- mdev->ov_left = mdev->rs_total;
-}
-
-static void drbd_resume_al(struct drbd_conf *mdev)
-{
- if (drbd_test_and_clear_flag(mdev, AL_SUSPENDED))
- dev_info(DEV, "Resumed AL updates\n");
-}
-
-/**
- * __drbd_set_state() - Set a new DRBD state
- * @mdev: DRBD device.
- * @ns: new state.
- * @flags: Flags
- * @done: Optional completion, that will get completed after the after_state_ch() finished
- *
- * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
- */
-enum drbd_state_rv
-__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
- enum chg_state_flags flags, struct completion *done)
-{
- union drbd_state os;
- enum drbd_state_rv rv = SS_SUCCESS;
- enum sanitize_state_warnings ssw;
- struct after_state_chg_work *ascw;
-
- os = mdev->state;
-
- ns = sanitize_state(mdev, os, ns, &ssw);
-
- if (ns.i == os.i)
- return SS_NOTHING_TO_DO;
-
- if (!(flags & CS_HARD)) {
- /* pre-state-change checks ; only look at ns */
- /* See drbd_state_sw_errors in drbd_strings.c */
-
- rv = is_valid_state(mdev, ns);
- if (rv < SS_SUCCESS) {
- /* If the old state was illegal as well, then let
- this happen...*/
-
- if (is_valid_state(mdev, os) == rv)
- rv = is_valid_state_transition(mdev, ns, os);
- } else
- rv = is_valid_state_transition(mdev, ns, os);
- }
-
- if (rv < SS_SUCCESS) {
- if (flags & CS_VERBOSE)
- print_st_err(mdev, os, ns, rv);
- return rv;
- }
-
- print_sanitize_warnings(mdev, ssw);
-
- {
- char *pbp, pb[300];
- pbp = pb;
- *pbp = 0;
- if (ns.role != os.role)
- pbp += sprintf(pbp, "role( %s -> %s ) ",
- drbd_role_str(os.role),
- drbd_role_str(ns.role));
- if (ns.peer != os.peer)
- pbp += sprintf(pbp, "peer( %s -> %s ) ",
- drbd_role_str(os.peer),
- drbd_role_str(ns.peer));
- if (ns.conn != os.conn)
- pbp += sprintf(pbp, "conn( %s -> %s ) ",
- drbd_conn_str(os.conn),
- drbd_conn_str(ns.conn));
- if (ns.disk != os.disk)
- pbp += sprintf(pbp, "disk( %s -> %s ) ",
- drbd_disk_str(os.disk),
- drbd_disk_str(ns.disk));
- if (ns.pdsk != os.pdsk)
- pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
- drbd_disk_str(os.pdsk),
- drbd_disk_str(ns.pdsk));
- if (is_susp(ns) != is_susp(os))
- pbp += sprintf(pbp, "susp( %d -> %d ) ",
- is_susp(os),
- is_susp(ns));
- if (ns.aftr_isp != os.aftr_isp)
- pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
- os.aftr_isp,
- ns.aftr_isp);
- if (ns.peer_isp != os.peer_isp)
- pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
- os.peer_isp,
- ns.peer_isp);
- if (ns.user_isp != os.user_isp)
- pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
- os.user_isp,
- ns.user_isp);
- dev_info(DEV, "%s\n", pb);
- }
-
- /* solve the race between becoming unconfigured,
- * worker doing the cleanup, and
- * admin reconfiguring us:
- * on (re)configure, first set CONFIG_PENDING,
- * then wait for a potentially exiting worker,
- * start the worker, and schedule one no_op.
- * then proceed with configuration.
- */
- if (ns.disk == D_DISKLESS &&
- ns.conn == C_STANDALONE &&
- ns.role == R_SECONDARY &&
- !drbd_test_and_set_flag(mdev, CONFIG_PENDING))
- drbd_set_flag(mdev, DEVICE_DYING);
-
- /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
- * on the ldev here, to be sure the transition -> D_DISKLESS resp.
- * drbd_ldev_destroy() won't happen before our corresponding
- * after_state_ch works run, where we put_ldev again. */
- if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
- (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
- atomic_inc(&mdev->local_cnt);
-
- mdev->state = ns;
-
- if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
- drbd_print_uuids(mdev, "attached to UUIDs");
-
- wake_up(&mdev->misc_wait);
- wake_up(&mdev->state_wait);
-
- /* Aborted verify run, or we reached the stop sector.
- * Log the last position, unless end-of-device. */
- if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
- ns.conn <= C_CONNECTED) {
- mdev->ov_start_sector =
- BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
- if (mdev->ov_left)
- dev_info(DEV, "Online Verify reached sector %llu\n",
- (unsigned long long)mdev->ov_start_sector);
- }
-
- if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
- (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
- dev_info(DEV, "Syncer continues.\n");
- mdev->rs_paused += (long)jiffies
- -(long)mdev->rs_mark_time[mdev->rs_last_mark];
- if (ns.conn == C_SYNC_TARGET)
- mod_timer(&mdev->resync_timer, jiffies);
- }
-
- if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
- (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
- dev_info(DEV, "Resync suspended\n");
- mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
- }
-
- if (os.conn == C_CONNECTED &&
- (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
- unsigned long now = jiffies;
- int i;
-
- set_ov_position(mdev, ns.conn);
- mdev->rs_start = now;
- mdev->rs_last_events = 0;
- mdev->rs_last_sect_ev = 0;
- mdev->ov_last_oos_size = 0;
- mdev->ov_last_oos_start = 0;
-
- for (i = 0; i < DRBD_SYNC_MARKS; i++) {
- mdev->rs_mark_left[i] = mdev->ov_left;
- mdev->rs_mark_time[i] = now;
- }
-
- drbd_rs_controller_reset(mdev);
-
- if (ns.conn == C_VERIFY_S) {
- dev_info(DEV, "Starting Online Verify from sector %llu\n",
- (unsigned long long)mdev->ov_position);
- mod_timer(&mdev->resync_timer, jiffies);
- }
- }
-
- if (get_ldev(mdev)) {
- u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
- MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
- MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
-
- if (drbd_test_flag(mdev, CRASHED_PRIMARY))
- mdf |= MDF_CRASHED_PRIMARY;
- if (mdev->state.role == R_PRIMARY ||
- (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
- mdf |= MDF_PRIMARY_IND;
- if (mdev->state.conn > C_WF_REPORT_PARAMS)
- mdf |= MDF_CONNECTED_IND;
- if (mdev->state.disk > D_INCONSISTENT)
- mdf |= MDF_CONSISTENT;
- if (mdev->state.disk > D_OUTDATED)
- mdf |= MDF_WAS_UP_TO_DATE;
- if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
- mdf |= MDF_PEER_OUT_DATED;
- if (mdf != mdev->ldev->md.flags) {
- mdev->ldev->md.flags = mdf;
- drbd_md_mark_dirty(mdev);
- }
- if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
- drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
- put_ldev(mdev);
- }
-
- /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
- if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
- os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
- drbd_set_flag(mdev, CONSIDER_RESYNC);
-
- /* Receiver should clean up itself */
- if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
- drbd_thread_stop_nowait(&mdev->receiver);
-
- /* Now the receiver finished cleaning up itself, it should die */
- if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
- drbd_thread_stop_nowait(&mdev->receiver);
-
- /* Upon network failure, we need to restart the receiver. */
- if (os.conn > C_WF_CONNECTION &&
- ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
- drbd_thread_restart_nowait(&mdev->receiver);
-
- /* Resume AL writing if we get a connection */
- if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
- drbd_resume_al(mdev);
-
- /* remember last connect and attach times so request_timer_fn() won't
- * kill newly established sessions while we are still trying to thaw
- * previously frozen IO */
- if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS)
- mdev->last_reconnect_jif = jiffies;
- if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
- ns.disk > D_NEGOTIATING)
- mdev->last_reattach_jif = jiffies;
-
- ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
- if (ascw) {
- ascw->os = os;
- ascw->ns = ns;
- ascw->flags = flags;
- ascw->w.cb = w_after_state_ch;
- ascw->done = done;
- drbd_queue_work(&mdev->data.work, &ascw->w);
- } else {
- dev_warn(DEV, "Could not kmalloc an ascw\n");
- }
-
- return rv;
-}
-
-static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
-{
- struct after_state_chg_work *ascw =
- container_of(w, struct after_state_chg_work, w);
- after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
- if (ascw->flags & CS_WAIT_COMPLETE) {
- D_ASSERT(ascw->done != NULL);
- complete(ascw->done);
- }
- kfree(ascw);
-
- return 1;
-}
-
-static void abw_start_sync(struct drbd_conf *mdev, int rv)
-{
- if (rv) {
- dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
- _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
- return;
- }
-
- switch (mdev->state.conn) {
- case C_STARTING_SYNC_T:
- _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
- break;
- case C_STARTING_SYNC_S:
- drbd_start_resync(mdev, C_SYNC_SOURCE);
- break;
- }
-}
-
-int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
- int (*io_fn)(struct drbd_conf *),
- char *why, enum bm_flag flags)
-{
- int rv;
-
- D_ASSERT(current == mdev->worker.task);
-
- /* open coded non-blocking drbd_suspend_io(mdev); */
- drbd_set_flag(mdev, SUSPEND_IO);
-
- drbd_bm_lock(mdev, why, flags);
- rv = io_fn(mdev);
- drbd_bm_unlock(mdev);
-
- drbd_resume_io(mdev);
-
- return rv;
-}
-
-/**
- * after_state_ch() - Perform after state change actions that may sleep
- * @mdev: DRBD device.
- * @os: old state.
- * @ns: new state.
- * @flags: Flags
- */
-static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, enum chg_state_flags flags)
-{
- enum drbd_fencing_p fp;
- enum drbd_req_event what = nothing;
- union drbd_state nsm = (union drbd_state){ .i = -1 };
-
- if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
- drbd_clear_flag(mdev, CRASHED_PRIMARY);
- if (mdev->p_uuid)
- mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
- }
-
- fp = FP_DONT_CARE;
- if (get_ldev(mdev)) {
- fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
- }
-
- /* Inform userspace about the change... */
- drbd_bcast_state(mdev, ns);
-
- if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
- (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
- drbd_khelper(mdev, "pri-on-incon-degr");
-
- /* Here we have the actions that are performed after a
- state change. This function might sleep */
-
- if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING)
- mod_timer(&mdev->request_timer, jiffies + HZ);
-
- nsm.i = -1;
- if (ns.susp_nod) {
- if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
- what = resend;
-
- if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
- ns.disk > D_NEGOTIATING)
- what = restart_frozen_disk_io;
-
- if (what != nothing)
- nsm.susp_nod = 0;
- }
-
- if (ns.susp_fen) {
- /* case1: The outdate peer handler is successful: */
- if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
- if (drbd_test_flag(mdev, NEW_CUR_UUID)) {
- drbd_uuid_new_current(mdev);
- drbd_clear_flag(mdev, NEW_CUR_UUID);
- }
- spin_lock_irq(&mdev->req_lock);
- _tl_clear(mdev);
- _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
- spin_unlock_irq(&mdev->req_lock);
- }
- /* case2: The connection was established again: */
- if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
- drbd_clear_flag(mdev, NEW_CUR_UUID);
- what = resend;
- nsm.susp_fen = 0;
- }
- }
-
- if (what != nothing) {
- spin_lock_irq(&mdev->req_lock);
- _tl_restart(mdev, what);
- nsm.i &= mdev->state.i;
- _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
- spin_unlock_irq(&mdev->req_lock);
- }
-
- /* Became sync source. With protocol >= 96, we still need to send out
- * the sync uuid now. Need to do that before any drbd_send_state, or
- * the other side may go "paused sync" before receiving the sync uuids,
- * which is unexpected. */
- if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
- (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
- mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
- drbd_gen_and_send_sync_uuid(mdev);
- put_ldev(mdev);
- }
-
- /* Do not change the order of the if above and the two below... */
- if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
- /* we probably will start a resync soon.
- * make sure those things are properly reset. */
- mdev->rs_total = 0;
- mdev->rs_failed = 0;
- atomic_set(&mdev->rs_pending_cnt, 0);
- drbd_rs_cancel_all(mdev);
-
- drbd_send_uuids(mdev);
- drbd_send_state(mdev, ns);
- }
- /* No point in queuing send_bitmap if we don't have a connection
- * anymore, so check also the _current_ state, not only the new state
- * at the time this work was queued. */
- if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
- mdev->state.conn == C_WF_BITMAP_S)
- drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
- "send_bitmap (WFBitMapS)",
- BM_LOCKED_TEST_ALLOWED);
-
- /* Lost contact to peer's copy of the data */
- if ((os.pdsk >= D_INCONSISTENT &&
- os.pdsk != D_UNKNOWN &&
- os.pdsk != D_OUTDATED)
- && (ns.pdsk < D_INCONSISTENT ||
- ns.pdsk == D_UNKNOWN ||
- ns.pdsk == D_OUTDATED)) {
- if (get_ldev(mdev)) {
- if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
- mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
- if (is_susp(mdev->state)) {
- drbd_set_flag(mdev, NEW_CUR_UUID);
- } else {
- drbd_uuid_new_current(mdev);
- drbd_send_uuids(mdev);
- }
- }
- put_ldev(mdev);
- }
- }
-
- if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
- if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
- mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
- drbd_uuid_new_current(mdev);
- drbd_send_uuids(mdev);
- }
- /* D_DISKLESS Peer becomes secondary */
- if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
- /* We may still be Primary ourselves.
- * No harm done if the bitmap still changes,
- * redirtied pages will follow later. */
- drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
- "demote diskless peer", BM_LOCKED_SET_ALLOWED);
- put_ldev(mdev);
- }
-
- /* Write out all changed bits on demote.
- * Though, no need to da that just yet
- * if there is a resync going on still */
- if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
- mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
- /* No changes to the bitmap expected this time, so assert that,
- * even though no harm was done if it did change. */
- drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
- "demote", BM_LOCKED_TEST_ALLOWED);
- put_ldev(mdev);
- }
-
- /* Last part of the attaching process ... */
- if (ns.conn >= C_CONNECTED &&
- os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
- drbd_send_sizes(mdev, 0, 0); /* to start sync... */
- drbd_send_uuids(mdev);
- drbd_send_state(mdev, ns);
- }
-
- /* We want to pause/continue resync, tell peer. */
- if (ns.conn >= C_CONNECTED &&
- ((os.aftr_isp != ns.aftr_isp) ||
- (os.user_isp != ns.user_isp)))
- drbd_send_state(mdev, ns);
-
- /* In case one of the isp bits got set, suspend other devices. */
- if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
- (ns.aftr_isp || ns.peer_isp || ns.user_isp))
- suspend_other_sg(mdev);
-
- /* Make sure the peer gets informed about eventual state
- changes (ISP bits) while we were in WFReportParams. */
- if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
- drbd_send_state(mdev, ns);
-
- if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
- drbd_send_state(mdev, ns);
-
- /* We are in the progress to start a full sync... */
- if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
- (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
- /* no other bitmap changes expected during this phase */
- drbd_queue_bitmap_io(mdev,
- &drbd_bmio_set_n_write, &abw_start_sync,
- "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
-
- /* We are invalidating our self... */
- if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
- os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
- /* other bitmap operation expected during this phase */
- drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
- "set_n_write from invalidate", BM_LOCKED_MASK);
-
- /* first half of local IO error, failure to attach,
- * or administrative detach */
- if (os.disk != D_FAILED && ns.disk == D_FAILED) {
- /* corresponding get_ldev was in __drbd_set_state, to serialize
- * our cleanup here with the transition to D_DISKLESS.
- * But it is still not safe to dreference ldev here, we may end
- * up here from a failed attach, before ldev was even set. */
- if (mdev->ldev) {
- enum drbd_io_error_p eh = mdev->ldev->dc.on_io_error;
-
- /* In some setups, this handler triggers a suicide,
- * basically mapping IO error to node failure, to
- * reduce the number of different failure scenarios.
- *
- * This handler intentionally runs before we abort IO,
- * notify the peer, or try to update our meta data. */
- if (eh == EP_CALL_HELPER && drbd_test_flag(mdev, WAS_IO_ERROR))
- drbd_khelper(mdev, "local-io-error");
-
- /* Immediately allow completion of all application IO,
- * that waits for completion from the local disk,
- * if this was a force-detach due to disk_timeout
- * or administrator request (drbdsetup detach --force).
- * Do NOT abort otherwise.
- * Aborting local requests may cause serious problems,
- * if requests are completed to upper layers already,
- * and then later the already submitted local bio completes.
- * This can cause DMA into former bio pages that meanwhile
- * have been re-used for other things.
- * So aborting local requests may cause crashes,
- * or even worse, silent data corruption.
- */
- if (drbd_test_flag(mdev, FORCE_DETACH))
- tl_abort_disk_io(mdev);
-
- /* current state still has to be D_FAILED,
- * there is only one way out: to D_DISKLESS,
- * and that may only happen after our put_ldev below. */
- if (mdev->state.disk != D_FAILED)
- dev_err(DEV,
- "ASSERT FAILED: disk is %s during detach\n",
- drbd_disk_str(mdev->state.disk));
-
- if (ns.conn >= C_CONNECTED)
- drbd_send_state(mdev, ns);
-
- drbd_rs_cancel_all(mdev);
-
- /* In case we want to get something to stable storage still,
- * this may be the last chance.
- * Following put_ldev may transition to D_DISKLESS. */
- drbd_md_sync(mdev);
- }
- put_ldev(mdev);
- }
-
- /* second half of local IO error, failure to attach,
- * or administrative detach,
- * after local_cnt references have reached zero again */
- if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
- /* We must still be diskless,
- * re-attach has to be serialized with this! */
- if (mdev->state.disk != D_DISKLESS)
- dev_err(DEV,
- "ASSERT FAILED: disk is %s while going diskless\n",
- drbd_disk_str(mdev->state.disk));
-
- if (ns.conn >= C_CONNECTED)
- drbd_send_state(mdev, ns);
-
- /* corresponding get_ldev in __drbd_set_state
- * this may finally trigger drbd_ldev_destroy. */
- put_ldev(mdev);
- }
-
- /* Notify peer that I had a local IO error, and did not detached.. */
- if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
- drbd_send_state(mdev, ns);
-
- /* Disks got bigger while they were detached */
- if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
- drbd_test_and_clear_flag(mdev, RESYNC_AFTER_NEG)) {
- if (ns.conn == C_CONNECTED)
- resync_after_online_grow(mdev);
- }
-
- /* A resync finished or aborted, wake paused devices... */
- if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
- (os.peer_isp && !ns.peer_isp) ||
- (os.user_isp && !ns.user_isp))
- resume_next_sg(mdev);
-
- /* sync target done with resync. Explicitly notify peer, even though
- * it should (at least for non-empty resyncs) already know itself. */
- if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
- drbd_send_state(mdev, ns);
-
- /* Verify finished, or reached stop sector. Peer did not know about
- * the stop sector, and we may even have changed the stop sector during
- * verify to interrupt/stop early. Send the new state. */
- if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED
- && mdev->agreed_pro_version >= 97)
- drbd_send_state(mdev, ns);
-
- /* Wake up role changes, that were delayed because of connection establishing */
- if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) {
- drbd_clear_flag(mdev, STATE_SENT);
- wake_up(&mdev->state_wait);
- }
-
- /* This triggers bitmap writeout of potentially still unwritten pages
- * if the resync finished cleanly, or aborted because of peer disk
- * failure, or because of connection loss.
- * For resync aborted because of local disk failure, we cannot do
- * any bitmap writeout anymore.
- * No harm done if some bits change during this phase.
- */
- if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
- drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
- "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
- put_ldev(mdev);
- }
-
- /* free tl_hash if we Got thawed and are C_STANDALONE */
- if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
- drbd_free_tl_hash(mdev);
-
- /* Upon network connection, we need to start the receiver */
- if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
- drbd_thread_start(&mdev->receiver);
-
- /* Terminate worker thread if we are unconfigured - it will be
- restarted as needed... */
- if (ns.disk == D_DISKLESS &&
- ns.conn == C_STANDALONE &&
- ns.role == R_SECONDARY) {
- if (os.aftr_isp != ns.aftr_isp)
- resume_next_sg(mdev);
- /* set in __drbd_set_state, unless CONFIG_PENDING was set */
- if (drbd_test_flag(mdev, DEVICE_DYING))
- drbd_thread_stop_nowait(&mdev->worker);
+ if (req->w.mdev != mdev)
+ continue;
+ _req_mod(req, ABORT_DISK_IO);
}
-
- drbd_md_sync(mdev);
+ spin_unlock_irq(&tconn->req_lock);
}
-
static int drbd_thread_setup(void *arg)
{
struct drbd_thread *thi = (struct drbd_thread *) arg;
- struct drbd_conf *mdev = thi->mdev;
+ struct drbd_tconn *tconn = thi->tconn;
unsigned long flags;
int retval;
+ snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s",
+ thi->name[0], thi->tconn->name);
+
restart:
retval = thi->function(thi);
spin_lock_irqsave(&thi->t_lock, flags);
- /* if the receiver has been "Exiting", the last thing it did
+ /* if the receiver has been "EXITING", the last thing it did
* was set the conn state to "StandAlone",
* if now a re-connect request comes in, conn state goes C_UNCONNECTED,
* and receiver thread will be "started".
- * drbd_thread_start needs to set "Restarting" in that case.
+ * drbd_thread_start needs to set "RESTARTING" in that case.
* t_state check and assignment needs to be within the same spinlock,
- * so either thread_start sees Exiting, and can remap to Restarting,
- * or thread_start see None, and can proceed as normal.
+ * so either thread_start sees EXITING, and can remap to RESTARTING,
+ * or thread_start see NONE, and can proceed as normal.
*/
- if (thi->t_state == Restarting) {
- dev_info(DEV, "Restarting %s\n", current->comm);
- thi->t_state = Running;
+ if (thi->t_state == RESTARTING) {
+ conn_info(tconn, "Restarting %s thread\n", thi->name);
+ thi->t_state = RUNNING;
spin_unlock_irqrestore(&thi->t_lock, flags);
goto restart;
}
thi->task = NULL;
- thi->t_state = None;
+ thi->t_state = NONE;
smp_mb();
- complete(&thi->stop);
+ complete_all(&thi->stop);
spin_unlock_irqrestore(&thi->t_lock, flags);
- dev_info(DEV, "Terminating %s\n", current->comm);
+ conn_info(tconn, "Terminating %s\n", current->comm);
/* Release mod reference taken when thread was started */
+
+ kref_put(&tconn->kref, &conn_destroy);
module_put(THIS_MODULE);
return retval;
}
-static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
- int (*func) (struct drbd_thread *))
+static void drbd_thread_init(struct drbd_tconn *tconn, struct drbd_thread *thi,
+ int (*func) (struct drbd_thread *), char *name)
{
spin_lock_init(&thi->t_lock);
thi->task = NULL;
- thi->t_state = None;
+ thi->t_state = NONE;
thi->function = func;
- thi->mdev = mdev;
+ thi->tconn = tconn;
+ strncpy(thi->name, name, ARRAY_SIZE(thi->name));
}
int drbd_thread_start(struct drbd_thread *thi)
{
- struct drbd_conf *mdev = thi->mdev;
+ struct drbd_tconn *tconn = thi->tconn;
struct task_struct *nt;
unsigned long flags;
- const char *me =
- thi == &mdev->receiver ? "receiver" :
- thi == &mdev->asender ? "asender" :
- thi == &mdev->worker ? "worker" : "NONSENSE";
-
/* is used from state engine doing drbd_thread_stop_nowait,
* while holding the req lock irqsave */
spin_lock_irqsave(&thi->t_lock, flags);
switch (thi->t_state) {
- case None:
- dev_info(DEV, "Starting %s thread (from %s [%d])\n",
- me, current->comm, current->pid);
+ case NONE:
+ conn_info(tconn, "Starting %s thread (from %s [%d])\n",
+ thi->name, current->comm, current->pid);
/* Get ref on module for thread - this is released when thread exits */
if (!try_module_get(THIS_MODULE)) {
- dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
+ conn_err(tconn, "Failed to get module reference in drbd_thread_start\n");
spin_unlock_irqrestore(&thi->t_lock, flags);
return false;
}
+ kref_get(&thi->tconn->kref);
+
init_completion(&thi->stop);
- D_ASSERT(thi->task == NULL);
thi->reset_cpu_mask = 1;
- thi->t_state = Running;
+ thi->t_state = RUNNING;
spin_unlock_irqrestore(&thi->t_lock, flags);
flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
nt = kthread_create(drbd_thread_setup, (void *) thi,
- "drbd%d_%s", mdev_to_minor(mdev), me);
+ "drbd_%c_%s", thi->name[0], thi->tconn->name);
if (IS_ERR(nt)) {
- dev_err(DEV, "Couldn't start thread\n");
+ conn_err(tconn, "Couldn't start thread\n");
+ kref_put(&tconn->kref, &conn_destroy);
module_put(THIS_MODULE);
return false;
}
spin_lock_irqsave(&thi->t_lock, flags);
thi->task = nt;
- thi->t_state = Running;
+ thi->t_state = RUNNING;
spin_unlock_irqrestore(&thi->t_lock, flags);
wake_up_process(nt);
break;
- case Exiting:
- thi->t_state = Restarting;
- dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
- me, current->comm, current->pid);
+ case EXITING:
+ thi->t_state = RESTARTING;
+ conn_info(tconn, "Restarting %s thread (from %s [%d])\n",
+ thi->name, current->comm, current->pid);
/* fall through */
- case Running:
- case Restarting:
+ case RUNNING:
+ case RESTARTING:
default:
spin_unlock_irqrestore(&thi->t_lock, flags);
break;
@@ -1882,12 +447,12 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
{
unsigned long flags;
- enum drbd_thread_state ns = restart ? Restarting : Exiting;
+ enum drbd_thread_state ns = restart ? RESTARTING : EXITING;
/* may be called from state engine, holding the req lock irqsave */
spin_lock_irqsave(&thi->t_lock, flags);
- if (thi->t_state == None) {
+ if (thi->t_state == NONE) {
spin_unlock_irqrestore(&thi->t_lock, flags);
if (restart)
drbd_thread_start(thi);
@@ -1905,7 +470,6 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
init_completion(&thi->stop);
if (thi->task != current)
force_sig(DRBD_SIGKILL, thi->task);
-
}
spin_unlock_irqrestore(&thi->t_lock, flags);
@@ -1914,6 +478,35 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
wait_for_completion(&thi->stop);
}
+static struct drbd_thread *drbd_task_to_thread(struct drbd_tconn *tconn, struct task_struct *task)
+{
+ struct drbd_thread *thi =
+ task == tconn->receiver.task ? &tconn->receiver :
+ task == tconn->asender.task ? &tconn->asender :
+ task == tconn->worker.task ? &tconn->worker : NULL;
+
+ return thi;
+}
+
+char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task)
+{
+ struct drbd_thread *thi = drbd_task_to_thread(tconn, task);
+ return thi ? thi->name : task->comm;
+}
+
+int conn_lowest_minor(struct drbd_tconn *tconn)
+{
+ struct drbd_conf *mdev;
+ int vnr = 0, m;
+
+ rcu_read_lock();
+ mdev = idr_get_next(&tconn->volumes, &vnr);
+ m = mdev ? mdev_to_minor(mdev) : -1;
+ rcu_read_unlock();
+
+ return m;
+}
+
#ifdef CONFIG_SMP
/**
* drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
@@ -1922,240 +515,345 @@ void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
* Forces all threads of a device onto the same CPU. This is beneficial for
* DRBD's performance. May be overwritten by user's configuration.
*/
-void drbd_calc_cpu_mask(struct drbd_conf *mdev)
+void drbd_calc_cpu_mask(struct drbd_tconn *tconn)
{
int ord, cpu;
/* user override. */
- if (cpumask_weight(mdev->cpu_mask))
+ if (cpumask_weight(tconn->cpu_mask))
return;
- ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
+ ord = conn_lowest_minor(tconn) % cpumask_weight(cpu_online_mask);
for_each_online_cpu(cpu) {
if (ord-- == 0) {
- cpumask_set_cpu(cpu, mdev->cpu_mask);
+ cpumask_set_cpu(cpu, tconn->cpu_mask);
return;
}
}
/* should not be reached */
- cpumask_setall(mdev->cpu_mask);
+ cpumask_setall(tconn->cpu_mask);
}
/**
* drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
* @mdev: DRBD device.
+ * @thi: drbd_thread object
*
* call in the "main loop" of _all_ threads, no need for any mutex, current won't die
* prematurely.
*/
-void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
+void drbd_thread_current_set_cpu(struct drbd_thread *thi)
{
struct task_struct *p = current;
- struct drbd_thread *thi =
- p == mdev->asender.task ? &mdev->asender :
- p == mdev->receiver.task ? &mdev->receiver :
- p == mdev->worker.task ? &mdev->worker :
- NULL;
- ERR_IF(thi == NULL)
- return;
+
if (!thi->reset_cpu_mask)
return;
thi->reset_cpu_mask = 0;
- set_cpus_allowed_ptr(p, mdev->cpu_mask);
+ set_cpus_allowed_ptr(p, thi->tconn->cpu_mask);
}
#endif
-/* the appropriate socket mutex must be held already */
-int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
- enum drbd_packets cmd, struct p_header80 *h,
- size_t size, unsigned msg_flags)
+/**
+ * drbd_header_size - size of a packet header
+ *
+ * The header size is a multiple of 8, so any payload following the header is
+ * word aligned on 64-bit architectures. (The bitmap send and receive code
+ * relies on this.)
+ */
+unsigned int drbd_header_size(struct drbd_tconn *tconn)
{
- int sent, ok;
+ if (tconn->agreed_pro_version >= 100) {
+ BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header100), 8));
+ return sizeof(struct p_header100);
+ } else {
+ BUILD_BUG_ON(sizeof(struct p_header80) !=
+ sizeof(struct p_header95));
+ BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), 8));
+ return sizeof(struct p_header80);
+ }
+}
- ERR_IF(!h) return false;
- ERR_IF(!size) return false;
+static unsigned int prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size)
+{
+ h->magic = cpu_to_be32(DRBD_MAGIC);
+ h->command = cpu_to_be16(cmd);
+ h->length = cpu_to_be16(size);
+ return sizeof(struct p_header80);
+}
- h->magic = BE_DRBD_MAGIC;
+static unsigned int prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size)
+{
+ h->magic = cpu_to_be16(DRBD_MAGIC_BIG);
h->command = cpu_to_be16(cmd);
- h->length = cpu_to_be16(size-sizeof(struct p_header80));
+ h->length = cpu_to_be32(size);
+ return sizeof(struct p_header95);
+}
- sent = drbd_send(mdev, sock, h, size, msg_flags);
+static unsigned int prepare_header100(struct p_header100 *h, enum drbd_packet cmd,
+ int size, int vnr)
+{
+ h->magic = cpu_to_be32(DRBD_MAGIC_100);
+ h->volume = cpu_to_be16(vnr);
+ h->command = cpu_to_be16(cmd);
+ h->length = cpu_to_be32(size);
+ h->pad = 0;
+ return sizeof(struct p_header100);
+}
- ok = (sent == size);
- if (!ok && !signal_pending(current))
- dev_warn(DEV, "short sent %s size=%d sent=%d\n",
- cmdname(cmd), (int)size, sent);
- return ok;
+static unsigned int prepare_header(struct drbd_tconn *tconn, int vnr,
+ void *buffer, enum drbd_packet cmd, int size)
+{
+ if (tconn->agreed_pro_version >= 100)
+ return prepare_header100(buffer, cmd, size, vnr);
+ else if (tconn->agreed_pro_version >= 95 &&
+ size > DRBD_MAX_SIZE_H80_PACKET)
+ return prepare_header95(buffer, cmd, size);
+ else
+ return prepare_header80(buffer, cmd, size);
}
-/* don't pass the socket. we may only look at it
- * when we hold the appropriate socket mutex.
- */
-int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
- enum drbd_packets cmd, struct p_header80 *h, size_t size)
+static void *__conn_prepare_command(struct drbd_tconn *tconn,
+ struct drbd_socket *sock)
{
- int ok = 0;
- struct socket *sock;
+ if (!sock->socket)
+ return NULL;
+ return sock->sbuf + drbd_header_size(tconn);
+}
- if (use_data_socket) {
- mutex_lock(&mdev->data.mutex);
- sock = mdev->data.socket;
- } else {
- mutex_lock(&mdev->meta.mutex);
- sock = mdev->meta.socket;
- }
+void *conn_prepare_command(struct drbd_tconn *tconn, struct drbd_socket *sock)
+{
+ void *p;
- /* drbd_disconnect() could have called drbd_free_sock()
- * while we were waiting in down()... */
- if (likely(sock != NULL))
- ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
+ mutex_lock(&sock->mutex);
+ p = __conn_prepare_command(tconn, sock);
+ if (!p)
+ mutex_unlock(&sock->mutex);
- if (use_data_socket)
- mutex_unlock(&mdev->data.mutex);
- else
- mutex_unlock(&mdev->meta.mutex);
- return ok;
+ return p;
}
-int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
- size_t size)
+void *drbd_prepare_command(struct drbd_conf *mdev, struct drbd_socket *sock)
{
- struct p_header80 h;
- int ok;
+ return conn_prepare_command(mdev->tconn, sock);
+}
- h.magic = BE_DRBD_MAGIC;
- h.command = cpu_to_be16(cmd);
- h.length = cpu_to_be16(size);
+static int __send_command(struct drbd_tconn *tconn, int vnr,
+ struct drbd_socket *sock, enum drbd_packet cmd,
+ unsigned int header_size, void *data,
+ unsigned int size)
+{
+ int msg_flags;
+ int err;
- if (!drbd_get_data_sock(mdev))
- return 0;
+ /*
+ * Called with @data == NULL and the size of the data blocks in @size
+ * for commands that send data blocks. For those commands, omit the
+ * MSG_MORE flag: this will increase the likelihood that data blocks
+ * which are page aligned on the sender will end up page aligned on the
+ * receiver.
+ */
+ msg_flags = data ? MSG_MORE : 0;
+
+ header_size += prepare_header(tconn, vnr, sock->sbuf, cmd,
+ header_size + size);
+ err = drbd_send_all(tconn, sock->socket, sock->sbuf, header_size,
+ msg_flags);
+ if (data && !err)
+ err = drbd_send_all(tconn, sock->socket, data, size, 0);
+ return err;
+}
- ok = (sizeof(h) ==
- drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
- ok = ok && (size ==
- drbd_send(mdev, mdev->data.socket, data, size, 0));
+static int __conn_send_command(struct drbd_tconn *tconn, struct drbd_socket *sock,
+ enum drbd_packet cmd, unsigned int header_size,
+ void *data, unsigned int size)
+{
+ return __send_command(tconn, 0, sock, cmd, header_size, data, size);
+}
- drbd_put_data_sock(mdev);
+int conn_send_command(struct drbd_tconn *tconn, struct drbd_socket *sock,
+ enum drbd_packet cmd, unsigned int header_size,
+ void *data, unsigned int size)
+{
+ int err;
- return ok;
+ err = __conn_send_command(tconn, sock, cmd, header_size, data, size);
+ mutex_unlock(&sock->mutex);
+ return err;
}
-int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
+int drbd_send_command(struct drbd_conf *mdev, struct drbd_socket *sock,
+ enum drbd_packet cmd, unsigned int header_size,
+ void *data, unsigned int size)
{
+ int err;
+
+ err = __send_command(mdev->tconn, mdev->vnr, sock, cmd, header_size,
+ data, size);
+ mutex_unlock(&sock->mutex);
+ return err;
+}
+
+int drbd_send_ping(struct drbd_tconn *tconn)
+{
+ struct drbd_socket *sock;
+
+ sock = &tconn->meta;
+ if (!conn_prepare_command(tconn, sock))
+ return -EIO;
+ return conn_send_command(tconn, sock, P_PING, 0, NULL, 0);
+}
+
+int drbd_send_ping_ack(struct drbd_tconn *tconn)
+{
+ struct drbd_socket *sock;
+
+ sock = &tconn->meta;
+ if (!conn_prepare_command(tconn, sock))
+ return -EIO;
+ return conn_send_command(tconn, sock, P_PING_ACK, 0, NULL, 0);
+}
+
+int drbd_send_sync_param(struct drbd_conf *mdev)
+{
+ struct drbd_socket *sock;
struct p_rs_param_95 *p;
- struct socket *sock;
- int size, rv;
- const int apv = mdev->agreed_pro_version;
+ int size;
+ const int apv = mdev->tconn->agreed_pro_version;
+ enum drbd_packet cmd;
+ struct net_conf *nc;
+ struct disk_conf *dc;
+
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p)
+ return -EIO;
+
+ rcu_read_lock();
+ nc = rcu_dereference(mdev->tconn->net_conf);
size = apv <= 87 ? sizeof(struct p_rs_param)
: apv == 88 ? sizeof(struct p_rs_param)
- + strlen(mdev->sync_conf.verify_alg) + 1
+ + strlen(nc->verify_alg) + 1
: apv <= 94 ? sizeof(struct p_rs_param_89)
: /* apv >= 95 */ sizeof(struct p_rs_param_95);
- /* used from admin command context and receiver/worker context.
- * to avoid kmalloc, grab the socket right here,
- * then use the pre-allocated sbuf there */
- mutex_lock(&mdev->data.mutex);
- sock = mdev->data.socket;
-
- if (likely(sock != NULL)) {
- enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
+ cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
- p = &mdev->data.sbuf.rs_param_95;
+ /* initialize verify_alg and csums_alg */
+ memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
- /* initialize verify_alg and csums_alg */
- memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
-
- p->rate = cpu_to_be32(sc->rate);
- p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
- p->c_delay_target = cpu_to_be32(sc->c_delay_target);
- p->c_fill_target = cpu_to_be32(sc->c_fill_target);
- p->c_max_rate = cpu_to_be32(sc->c_max_rate);
-
- if (apv >= 88)
- strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
- if (apv >= 89)
- strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
-
- rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
- } else
- rv = 0; /* not ok */
+ if (get_ldev(mdev)) {
+ dc = rcu_dereference(mdev->ldev->disk_conf);
+ p->resync_rate = cpu_to_be32(dc->resync_rate);
+ p->c_plan_ahead = cpu_to_be32(dc->c_plan_ahead);
+ p->c_delay_target = cpu_to_be32(dc->c_delay_target);
+ p->c_fill_target = cpu_to_be32(dc->c_fill_target);
+ p->c_max_rate = cpu_to_be32(dc->c_max_rate);
+ put_ldev(mdev);
+ } else {
+ p->resync_rate = cpu_to_be32(DRBD_RESYNC_RATE_DEF);
+ p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF);
+ p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF);
+ p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF);
+ p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF);
+ }
- mutex_unlock(&mdev->data.mutex);
+ if (apv >= 88)
+ strcpy(p->verify_alg, nc->verify_alg);
+ if (apv >= 89)
+ strcpy(p->csums_alg, nc->csums_alg);
+ rcu_read_unlock();
- return rv;
+ return drbd_send_command(mdev, sock, cmd, size, NULL, 0);
}
-int drbd_send_protocol(struct drbd_conf *mdev)
+int __drbd_send_protocol(struct drbd_tconn *tconn, enum drbd_packet cmd)
{
+ struct drbd_socket *sock;
struct p_protocol *p;
- int size, cf, rv;
+ struct net_conf *nc;
+ int size, cf;
- size = sizeof(struct p_protocol);
+ sock = &tconn->data;
+ p = __conn_prepare_command(tconn, sock);
+ if (!p)
+ return -EIO;
- if (mdev->agreed_pro_version >= 87)
- size += strlen(mdev->net_conf->integrity_alg) + 1;
+ rcu_read_lock();
+ nc = rcu_dereference(tconn->net_conf);
- /* we must not recurse into our own queue,
- * as that is blocked during handshake */
- p = kmalloc(size, GFP_NOIO);
- if (p == NULL)
- return 0;
+ if (nc->tentative && tconn->agreed_pro_version < 92) {
+ rcu_read_unlock();
+ mutex_unlock(&sock->mutex);
+ conn_err(tconn, "--dry-run is not supported by peer");
+ return -EOPNOTSUPP;
+ }
- p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
- p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
- p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
- p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
- p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
+ size = sizeof(*p);
+ if (tconn->agreed_pro_version >= 87)
+ size += strlen(nc->integrity_alg) + 1;
+ p->protocol = cpu_to_be32(nc->wire_protocol);
+ p->after_sb_0p = cpu_to_be32(nc->after_sb_0p);
+ p->after_sb_1p = cpu_to_be32(nc->after_sb_1p);
+ p->after_sb_2p = cpu_to_be32(nc->after_sb_2p);
+ p->two_primaries = cpu_to_be32(nc->two_primaries);
cf = 0;
- if (mdev->net_conf->want_lose)
- cf |= CF_WANT_LOSE;
- if (mdev->net_conf->dry_run) {
- if (mdev->agreed_pro_version >= 92)
- cf |= CF_DRY_RUN;
- else {
- dev_err(DEV, "--dry-run is not supported by peer");
- kfree(p);
- return -1;
- }
- }
+ if (nc->discard_my_data)
+ cf |= CF_DISCARD_MY_DATA;
+ if (nc->tentative)
+ cf |= CF_DRY_RUN;
p->conn_flags = cpu_to_be32(cf);
- if (mdev->agreed_pro_version >= 87)
- strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
+ if (tconn->agreed_pro_version >= 87)
+ strcpy(p->integrity_alg, nc->integrity_alg);
+ rcu_read_unlock();
- rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
- (struct p_header80 *)p, size);
- kfree(p);
- return rv;
+ return __conn_send_command(tconn, sock, cmd, size, NULL, 0);
+}
+
+int drbd_send_protocol(struct drbd_tconn *tconn)
+{
+ int err;
+
+ mutex_lock(&tconn->data.mutex);
+ err = __drbd_send_protocol(tconn, P_PROTOCOL);
+ mutex_unlock(&tconn->data.mutex);
+
+ return err;
}
int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
{
- struct p_uuids p;
+ struct drbd_socket *sock;
+ struct p_uuids *p;
int i;
if (!get_ldev_if_state(mdev, D_NEGOTIATING))
- return 1;
+ return 0;
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p) {
+ put_ldev(mdev);
+ return -EIO;
+ }
spin_lock_irq(&mdev->ldev->md.uuid_lock);
for (i = UI_CURRENT; i < UI_SIZE; i++)
- p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
+ p->uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
spin_unlock_irq(&mdev->ldev->md.uuid_lock);
mdev->comm_bm_set = drbd_bm_total_weight(mdev);
- p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
- uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
- uuid_flags |= drbd_test_flag(mdev, CRASHED_PRIMARY) ? 2 : 0;
+ p->uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
+ rcu_read_lock();
+ uuid_flags |= rcu_dereference(mdev->tconn->net_conf)->discard_my_data ? 1 : 0;
+ rcu_read_unlock();
+ uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
- p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
+ p->uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
put_ldev(mdev);
-
- return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
- (struct p_header80 *)&p, sizeof(p));
+ return drbd_send_command(mdev, sock, P_UUIDS, sizeof(*p), NULL, 0);
}
int drbd_send_uuids(struct drbd_conf *mdev)
@@ -2186,9 +884,10 @@ void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
}
}
-int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
+void drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
{
- struct p_rs_uuid p;
+ struct drbd_socket *sock;
+ struct p_rs_uuid *p;
u64 uuid;
D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
@@ -2201,24 +900,29 @@ int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
drbd_uuid_set(mdev, UI_BITMAP, uuid);
drbd_print_uuids(mdev, "updated sync UUID");
drbd_md_sync(mdev);
- p.uuid = cpu_to_be64(uuid);
- return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
- (struct p_header80 *)&p, sizeof(p));
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (p) {
+ p->uuid = cpu_to_be64(uuid);
+ drbd_send_command(mdev, sock, P_SYNC_UUID, sizeof(*p), NULL, 0);
+ }
}
int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
{
- struct p_sizes p;
+ struct drbd_socket *sock;
+ struct p_sizes *p;
sector_t d_size, u_size;
int q_order_type;
unsigned int max_bio_size;
- int ok;
if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
D_ASSERT(mdev->ldev->backing_bdev);
d_size = drbd_get_max_capacity(mdev->ldev);
- u_size = mdev->ldev->dc.disk_size;
+ rcu_read_lock();
+ u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
+ rcu_read_unlock();
q_order_type = drbd_queue_order_type(mdev);
max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
@@ -2230,20 +934,23 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
}
- /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
- if (mdev->agreed_pro_version <= 94)
- max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p)
+ return -EIO;
- p.d_size = cpu_to_be64(d_size);
- p.u_size = cpu_to_be64(u_size);
- p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
- p.max_bio_size = cpu_to_be32(max_bio_size);
- p.queue_order_type = cpu_to_be16(q_order_type);
- p.dds_flags = cpu_to_be16(flags);
+ if (mdev->tconn->agreed_pro_version <= 94)
+ max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+ else if (mdev->tconn->agreed_pro_version < 100)
+ max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE_P95);
- ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
- (struct p_header80 *)&p, sizeof(p));
- return ok;
+ p->d_size = cpu_to_be64(d_size);
+ p->u_size = cpu_to_be64(u_size);
+ p->c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
+ p->max_bio_size = cpu_to_be32(max_bio_size);
+ p->queue_order_type = cpu_to_be16(q_order_type);
+ p->dds_flags = cpu_to_be16(flags);
+ return drbd_send_command(mdev, sock, P_SIZES, sizeof(*p), NULL, 0);
}
/**
@@ -2252,34 +959,21 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
*/
int drbd_send_current_state(struct drbd_conf *mdev)
{
- struct socket *sock;
- struct p_state p;
- int ok = 0;
-
- /* Grab state lock so we wont send state if we're in the middle
- * of a cluster wide state change on another thread */
- drbd_state_lock(mdev);
-
- mutex_lock(&mdev->data.mutex);
-
- p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
- sock = mdev->data.socket;
-
- if (likely(sock != NULL)) {
- ok = _drbd_send_cmd(mdev, sock, P_STATE,
- (struct p_header80 *)&p, sizeof(p), 0);
- }
+ struct drbd_socket *sock;
+ struct p_state *p;
- mutex_unlock(&mdev->data.mutex);
-
- drbd_state_unlock(mdev);
- return ok;
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p)
+ return -EIO;
+ p->state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
+ return drbd_send_command(mdev, sock, P_STATE, sizeof(*p), NULL, 0);
}
/**
* drbd_send_state() - After a state change, sends the new state to the peer
- * @mdev: DRBD device.
- * @state: the state to send, not necessarily the current state.
+ * @mdev: DRBD device.
+ * @state: the state to send, not necessarily the current state.
*
* Each state change queues an "after_state_ch" work, which will eventually
* send the resulting new state to the peer. If more state changes happen
@@ -2288,50 +982,95 @@ int drbd_send_current_state(struct drbd_conf *mdev)
*/
int drbd_send_state(struct drbd_conf *mdev, union drbd_state state)
{
- struct socket *sock;
- struct p_state p;
- int ok = 0;
+ struct drbd_socket *sock;
+ struct p_state *p;
- mutex_lock(&mdev->data.mutex);
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p)
+ return -EIO;
+ p->state = cpu_to_be32(state.i); /* Within the send mutex */
+ return drbd_send_command(mdev, sock, P_STATE, sizeof(*p), NULL, 0);
+}
- p.state = cpu_to_be32(state.i);
- sock = mdev->data.socket;
+int drbd_send_state_req(struct drbd_conf *mdev, union drbd_state mask, union drbd_state val)
+{
+ struct drbd_socket *sock;
+ struct p_req_state *p;
- if (likely(sock != NULL)) {
- ok = _drbd_send_cmd(mdev, sock, P_STATE,
- (struct p_header80 *)&p, sizeof(p), 0);
- }
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p)
+ return -EIO;
+ p->mask = cpu_to_be32(mask.i);
+ p->val = cpu_to_be32(val.i);
+ return drbd_send_command(mdev, sock, P_STATE_CHG_REQ, sizeof(*p), NULL, 0);
+}
- mutex_unlock(&mdev->data.mutex);
+int conn_send_state_req(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val)
+{
+ enum drbd_packet cmd;
+ struct drbd_socket *sock;
+ struct p_req_state *p;
- return ok;
+ cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REQ : P_CONN_ST_CHG_REQ;
+ sock = &tconn->data;
+ p = conn_prepare_command(tconn, sock);
+ if (!p)
+ return -EIO;
+ p->mask = cpu_to_be32(mask.i);
+ p->val = cpu_to_be32(val.i);
+ return conn_send_command(tconn, sock, cmd, sizeof(*p), NULL, 0);
}
-int drbd_send_state_req(struct drbd_conf *mdev,
- union drbd_state mask, union drbd_state val)
+void drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
{
- struct p_req_state p;
+ struct drbd_socket *sock;
+ struct p_req_state_reply *p;
- p.mask = cpu_to_be32(mask.i);
- p.val = cpu_to_be32(val.i);
+ sock = &mdev->tconn->meta;
+ p = drbd_prepare_command(mdev, sock);
+ if (p) {
+ p->retcode = cpu_to_be32(retcode);
+ drbd_send_command(mdev, sock, P_STATE_CHG_REPLY, sizeof(*p), NULL, 0);
+ }
+}
+
+void conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode)
+{
+ struct drbd_socket *sock;
+ struct p_req_state_reply *p;
+ enum drbd_packet cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY;
- return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
- (struct p_header80 *)&p, sizeof(p));
+ sock = &tconn->meta;
+ p = conn_prepare_command(tconn, sock);
+ if (p) {
+ p->retcode = cpu_to_be32(retcode);
+ conn_send_command(tconn, sock, cmd, sizeof(*p), NULL, 0);
+ }
}
-int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
+static void dcbp_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
{
- struct p_req_state_reply p;
+ BUG_ON(code & ~0xf);
+ p->encoding = (p->encoding & ~0xf) | code;
+}
- p.retcode = cpu_to_be32(retcode);
+static void dcbp_set_start(struct p_compressed_bm *p, int set)
+{
+ p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
+}
- return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
- (struct p_header80 *)&p, sizeof(p));
+static void dcbp_set_pad_bits(struct p_compressed_bm *p, int n)
+{
+ BUG_ON(n & ~0x7);
+ p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
}
int fill_bitmap_rle_bits(struct drbd_conf *mdev,
- struct p_compressed_bm *p,
- struct bm_xfer_ctx *c)
+ struct p_compressed_bm *p,
+ unsigned int size,
+ struct bm_xfer_ctx *c)
{
struct bitstream bs;
unsigned long plain_bits;
@@ -2339,19 +1078,21 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev,
unsigned long rl;
unsigned len;
unsigned toggle;
- int bits;
+ int bits, use_rle;
/* may we use this feature? */
- if ((mdev->sync_conf.use_rle == 0) ||
- (mdev->agreed_pro_version < 90))
- return 0;
+ rcu_read_lock();
+ use_rle = rcu_dereference(mdev->tconn->net_conf)->use_rle;
+ rcu_read_unlock();
+ if (!use_rle || mdev->tconn->agreed_pro_version < 90)
+ return 0;
if (c->bit_offset >= c->bm_bits)
return 0; /* nothing to do. */
/* use at most thus many bytes */
- bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
- memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
+ bitstream_init(&bs, p->code, size, 0);
+ memset(p->code, 0, size);
/* plain bits covered in this code string */
plain_bits = 0;
@@ -2373,12 +1114,12 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev,
if (rl == 0) {
/* the first checked bit was set,
* store start value, */
- DCBP_set_start(p, 1);
+ dcbp_set_start(p, 1);
/* but skip encoding of zero run length */
toggle = !toggle;
continue;
}
- DCBP_set_start(p, 0);
+ dcbp_set_start(p, 0);
}
/* paranoia: catch zero runlength.
@@ -2418,7 +1159,7 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev,
bm_xfer_ctx_bit_to_word_offset(c);
/* store pad_bits */
- DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
+ dcbp_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
return len;
}
@@ -2430,48 +1171,52 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev,
* code upon failure.
*/
static int
-send_bitmap_rle_or_plain(struct drbd_conf *mdev,
- struct p_header80 *h, struct bm_xfer_ctx *c)
+send_bitmap_rle_or_plain(struct drbd_conf *mdev, struct bm_xfer_ctx *c)
{
- struct p_compressed_bm *p = (void*)h;
- unsigned long num_words;
- int len;
- int ok;
-
- len = fill_bitmap_rle_bits(mdev, p, c);
+ struct drbd_socket *sock = &mdev->tconn->data;
+ unsigned int header_size = drbd_header_size(mdev->tconn);
+ struct p_compressed_bm *p = sock->sbuf + header_size;
+ int len, err;
+ len = fill_bitmap_rle_bits(mdev, p,
+ DRBD_SOCKET_BUFFER_SIZE - header_size - sizeof(*p), c);
if (len < 0)
return -EIO;
if (len) {
- DCBP_set_code(p, RLE_VLI_Bits);
- ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
- sizeof(*p) + len, 0);
-
+ dcbp_set_code(p, RLE_VLI_Bits);
+ err = __send_command(mdev->tconn, mdev->vnr, sock,
+ P_COMPRESSED_BITMAP, sizeof(*p) + len,
+ NULL, 0);
c->packets[0]++;
- c->bytes[0] += sizeof(*p) + len;
+ c->bytes[0] += header_size + sizeof(*p) + len;
if (c->bit_offset >= c->bm_bits)
len = 0; /* DONE */
} else {
/* was not compressible.
* send a buffer full of plain text bits instead. */
- num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
- len = num_words * sizeof(long);
+ unsigned int data_size;
+ unsigned long num_words;
+ unsigned long *p = sock->sbuf + header_size;
+
+ data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
+ num_words = min_t(size_t, data_size / sizeof(*p),
+ c->bm_words - c->word_offset);
+ len = num_words * sizeof(*p);
if (len)
- drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
- ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
- h, sizeof(struct p_header80) + len, 0);
+ drbd_bm_get_lel(mdev, c->word_offset, num_words, p);
+ err = __send_command(mdev->tconn, mdev->vnr, sock, P_BITMAP, len, NULL, 0);
c->word_offset += num_words;
c->bit_offset = c->word_offset * BITS_PER_LONG;
c->packets[1]++;
- c->bytes[1] += sizeof(struct p_header80) + len;
+ c->bytes[1] += header_size + len;
if (c->bit_offset > c->bm_bits)
c->bit_offset = c->bm_bits;
}
- if (ok) {
+ if (!err) {
if (len == 0) {
INFO_bm_xfer_stats(mdev, "send", c);
return 0;
@@ -2482,21 +1227,13 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev,
}
/* See the comment at receive_bitmap() */
-int _drbd_send_bitmap(struct drbd_conf *mdev)
+static int _drbd_send_bitmap(struct drbd_conf *mdev)
{
struct bm_xfer_ctx c;
- struct p_header80 *p;
int err;
- ERR_IF(!mdev->bitmap) return false;
-
- /* maybe we should use some per thread scratch page,
- * and allocate that during initial device creation? */
- p = (struct p_header80 *) __get_free_page(GFP_NOIO);
- if (!p) {
- dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
+ if (!expect(mdev->bitmap))
return false;
- }
if (get_ldev(mdev)) {
if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
@@ -2521,37 +1258,39 @@ int _drbd_send_bitmap(struct drbd_conf *mdev)
};
do {
- err = send_bitmap_rle_or_plain(mdev, p, &c);
+ err = send_bitmap_rle_or_plain(mdev, &c);
} while (err > 0);
- free_page((unsigned long) p);
return err == 0;
}
int drbd_send_bitmap(struct drbd_conf *mdev)
{
- int err;
+ struct drbd_socket *sock = &mdev->tconn->data;
+ int err = -1;
- if (!drbd_get_data_sock(mdev))
- return -1;
- err = !_drbd_send_bitmap(mdev);
- drbd_put_data_sock(mdev);
+ mutex_lock(&sock->mutex);
+ if (sock->socket)
+ err = !_drbd_send_bitmap(mdev);
+ mutex_unlock(&sock->mutex);
return err;
}
-int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
+void drbd_send_b_ack(struct drbd_tconn *tconn, u32 barrier_nr, u32 set_size)
{
- int ok;
- struct p_barrier_ack p;
+ struct drbd_socket *sock;
+ struct p_barrier_ack *p;
- p.barrier = barrier_nr;
- p.set_size = cpu_to_be32(set_size);
+ if (tconn->cstate < C_WF_REPORT_PARAMS)
+ return;
- if (mdev->state.conn < C_CONNECTED)
- return false;
- ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
- (struct p_header80 *)&p, sizeof(p));
- return ok;
+ sock = &tconn->meta;
+ p = conn_prepare_command(tconn, sock);
+ if (!p)
+ return;
+ p->barrier = barrier_nr;
+ p->set_size = cpu_to_be32(set_size);
+ conn_send_command(tconn, sock, P_BARRIER_ACK, sizeof(*p), NULL, 0);
}
/**
@@ -2562,62 +1301,62 @@ int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
* @blksize: size in byte, needs to be in big endian byte order
* @block_id: Id, big endian byte order
*/
-static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
- u64 sector,
- u32 blksize,
- u64 block_id)
+static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
+ u64 sector, u32 blksize, u64 block_id)
{
- int ok;
- struct p_block_ack p;
+ struct drbd_socket *sock;
+ struct p_block_ack *p;
- p.sector = sector;
- p.block_id = block_id;
- p.blksize = blksize;
- p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
+ if (mdev->state.conn < C_CONNECTED)
+ return -EIO;
- if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
- return false;
- ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
- (struct p_header80 *)&p, sizeof(p));
- return ok;
+ sock = &mdev->tconn->meta;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p)
+ return -EIO;
+ p->sector = sector;
+ p->block_id = block_id;
+ p->blksize = blksize;
+ p->seq_num = cpu_to_be32(atomic_inc_return(&mdev->packet_seq));
+ return drbd_send_command(mdev, sock, cmd, sizeof(*p), NULL, 0);
}
/* dp->sector and dp->block_id already/still in network byte order,
* data_size is payload size according to dp->head,
* and may need to be corrected for digest size. */
-int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct p_data *dp, int data_size)
+void drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd,
+ struct p_data *dp, int data_size)
{
- data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
- crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
- return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
- dp->block_id);
+ if (mdev->tconn->peer_integrity_tfm)
+ data_size -= crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm);
+ _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
+ dp->block_id);
}
-int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct p_block_req *rp)
+void drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd,
+ struct p_block_req *rp)
{
- return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
+ _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
}
/**
* drbd_send_ack() - Sends an ack packet
- * @mdev: DRBD device.
- * @cmd: Packet command code.
- * @e: Epoch entry.
+ * @mdev: DRBD device
+ * @cmd: packet command code
+ * @peer_req: peer request
*/
-int drbd_send_ack(struct drbd_conf *mdev,
- enum drbd_packets cmd, struct drbd_epoch_entry *e)
+int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
+ struct drbd_peer_request *peer_req)
{
return _drbd_send_ack(mdev, cmd,
- cpu_to_be64(e->sector),
- cpu_to_be32(e->size),
- e->block_id);
+ cpu_to_be64(peer_req->i.sector),
+ cpu_to_be32(peer_req->i.size),
+ peer_req->block_id);
}
/* This function misuses the block_id field to signal if the blocks
* are is sync or not. */
-int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
+int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd,
sector_t sector, int blksize, u64 block_id)
{
return _drbd_send_ack(mdev, cmd,
@@ -2629,85 +1368,87 @@ int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
sector_t sector, int size, u64 block_id)
{
- int ok;
- struct p_block_req p;
+ struct drbd_socket *sock;
+ struct p_block_req *p;
- p.sector = cpu_to_be64(sector);
- p.block_id = block_id;
- p.blksize = cpu_to_be32(size);
-
- ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
- (struct p_header80 *)&p, sizeof(p));
- return ok;
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(sector);
+ p->block_id = block_id;
+ p->blksize = cpu_to_be32(size);
+ return drbd_send_command(mdev, sock, cmd, sizeof(*p), NULL, 0);
}
-int drbd_send_drequest_csum(struct drbd_conf *mdev,
- sector_t sector, int size,
- void *digest, int digest_size,
- enum drbd_packets cmd)
+int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector, int size,
+ void *digest, int digest_size, enum drbd_packet cmd)
{
- int ok;
- struct p_block_req p;
-
- p.sector = cpu_to_be64(sector);
- p.block_id = BE_DRBD_MAGIC + 0xbeef;
- p.blksize = cpu_to_be32(size);
-
- p.head.magic = BE_DRBD_MAGIC;
- p.head.command = cpu_to_be16(cmd);
- p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
-
- mutex_lock(&mdev->data.mutex);
+ struct drbd_socket *sock;
+ struct p_block_req *p;
- ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
- ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
+ /* FIXME: Put the digest into the preallocated socket buffer. */
- mutex_unlock(&mdev->data.mutex);
-
- return ok;
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(sector);
+ p->block_id = ID_SYNCER /* unused */;
+ p->blksize = cpu_to_be32(size);
+ return drbd_send_command(mdev, sock, cmd, sizeof(*p),
+ digest, digest_size);
}
int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
{
- int ok;
- struct p_block_req p;
-
- p.sector = cpu_to_be64(sector);
- p.block_id = BE_DRBD_MAGIC + 0xbabe;
- p.blksize = cpu_to_be32(size);
+ struct drbd_socket *sock;
+ struct p_block_req *p;
- ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
- (struct p_header80 *)&p, sizeof(p));
- return ok;
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(sector);
+ p->block_id = ID_SYNCER /* unused */;
+ p->blksize = cpu_to_be32(size);
+ return drbd_send_command(mdev, sock, P_OV_REQUEST, sizeof(*p), NULL, 0);
}
/* called on sndtimeo
* returns false if we should retry,
* true if we think connection is dead
*/
-static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
+static int we_should_drop_the_connection(struct drbd_tconn *tconn, struct socket *sock)
{
int drop_it;
/* long elapsed = (long)(jiffies - mdev->last_received); */
- drop_it = mdev->meta.socket == sock
- || !mdev->asender.task
- || get_t_state(&mdev->asender) != Running
- || mdev->state.conn < C_CONNECTED;
+ drop_it = tconn->meta.socket == sock
+ || !tconn->asender.task
+ || get_t_state(&tconn->asender) != RUNNING
+ || tconn->cstate < C_WF_REPORT_PARAMS;
if (drop_it)
return true;
- drop_it = !--mdev->ko_count;
+ drop_it = !--tconn->ko_count;
if (!drop_it) {
- dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
- current->comm, current->pid, mdev->ko_count);
- request_ping(mdev);
+ conn_err(tconn, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
+ current->comm, current->pid, tconn->ko_count);
+ request_ping(tconn);
}
return drop_it; /* && (mdev->state == R_PRIMARY) */;
}
+static void drbd_update_congested(struct drbd_tconn *tconn)
+{
+ struct sock *sk = tconn->data.socket->sk;
+ if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
+ set_bit(NET_CONGESTED, &tconn->flags);
+}
+
/* The idea of sendpage seems to be to put some kind of reference
* to the page into the skb, and to hand it over to the NIC. In
* this process get_page() gets called.
@@ -2730,21 +1471,28 @@ static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *
* with page_count == 0 or PageSlab.
*/
static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
- int offset, size_t size, unsigned msg_flags)
+ int offset, size_t size, unsigned msg_flags)
{
- int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
+ struct socket *socket;
+ void *addr;
+ int err;
+
+ socket = mdev->tconn->data.socket;
+ addr = kmap(page) + offset;
+ err = drbd_send_all(mdev->tconn, socket, addr, size, msg_flags);
kunmap(page);
- if (sent == size)
- mdev->send_cnt += size>>9;
- return sent == size;
+ if (!err)
+ mdev->send_cnt += size >> 9;
+ return err;
}
static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
int offset, size_t size, unsigned msg_flags)
{
+ struct socket *socket = mdev->tconn->data.socket;
mm_segment_t oldfs = get_fs();
- int sent, ok;
int len = size;
+ int err = -EIO;
/* e.g. XFS meta- & log-data is in slab pages, which have a
* page_count of 0 and/or have PageSlab() set.
@@ -2756,34 +1504,35 @@ static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
msg_flags |= MSG_NOSIGNAL;
- drbd_update_congested(mdev);
+ drbd_update_congested(mdev->tconn);
set_fs(KERNEL_DS);
do {
- sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
- offset, len,
- msg_flags);
- if (sent == -EAGAIN) {
- if (we_should_drop_the_connection(mdev,
- mdev->data.socket))
- break;
- else
- continue;
- }
+ int sent;
+
+ sent = socket->ops->sendpage(socket, page, offset, len, msg_flags);
if (sent <= 0) {
+ if (sent == -EAGAIN) {
+ if (we_should_drop_the_connection(mdev->tconn, socket))
+ break;
+ continue;
+ }
dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
__func__, (int)size, len, sent);
+ if (sent < 0)
+ err = sent;
break;
}
len -= sent;
offset += sent;
} while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
set_fs(oldfs);
- drbd_clear_flag(mdev, NET_CONGESTED);
+ clear_bit(NET_CONGESTED, &mdev->tconn->flags);
- ok = (len == 0);
- if (likely(ok))
- mdev->send_cnt += size>>9;
- return ok;
+ if (len == 0) {
+ err = 0;
+ mdev->send_cnt += size >> 9;
+ }
+ return err;
}
static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
@@ -2792,12 +1541,15 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
int i;
/* hint all but last page with MSG_MORE */
bio_for_each_segment(bvec, bio, i) {
- if (!_drbd_no_send_page(mdev, bvec->bv_page,
- bvec->bv_offset, bvec->bv_len,
- i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
- return 0;
+ int err;
+
+ err = _drbd_no_send_page(mdev, bvec->bv_page,
+ bvec->bv_offset, bvec->bv_len,
+ i == bio->bi_vcnt - 1 ? 0 : MSG_MORE);
+ if (err)
+ return err;
}
- return 1;
+ return 0;
}
static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
@@ -2806,32 +1558,40 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
int i;
/* hint all but last page with MSG_MORE */
bio_for_each_segment(bvec, bio, i) {
- if (!_drbd_send_page(mdev, bvec->bv_page,
- bvec->bv_offset, bvec->bv_len,
- i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
- return 0;
+ int err;
+
+ err = _drbd_send_page(mdev, bvec->bv_page,
+ bvec->bv_offset, bvec->bv_len,
+ i == bio->bi_vcnt - 1 ? 0 : MSG_MORE);
+ if (err)
+ return err;
}
- return 1;
+ return 0;
}
-static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+static int _drbd_send_zc_ee(struct drbd_conf *mdev,
+ struct drbd_peer_request *peer_req)
{
- struct page *page = e->pages;
- unsigned len = e->size;
+ struct page *page = peer_req->pages;
+ unsigned len = peer_req->i.size;
+ int err;
+
/* hint all but last page with MSG_MORE */
page_chain_for_each(page) {
unsigned l = min_t(unsigned, len, PAGE_SIZE);
- if (!_drbd_send_page(mdev, page, 0, l,
- page_chain_next(page) ? MSG_MORE : 0))
- return 0;
+
+ err = _drbd_send_page(mdev, page, 0, l,
+ page_chain_next(page) ? MSG_MORE : 0);
+ if (err)
+ return err;
len -= l;
}
- return 1;
+ return 0;
}
static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
{
- if (mdev->agreed_pro_version >= 95)
+ if (mdev->tconn->agreed_pro_version >= 95)
return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
(bi_rw & REQ_FUA ? DP_FUA : 0) |
(bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
@@ -2845,50 +1605,36 @@ static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
*/
int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
{
- int ok = 1;
- struct p_data p;
+ struct drbd_socket *sock;
+ struct p_data *p;
unsigned int dp_flags = 0;
- void *dgb;
int dgs;
+ int err;
- if (!drbd_get_data_sock(mdev))
- return 0;
-
- dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
- crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
-
- if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
- p.head.h80.magic = BE_DRBD_MAGIC;
- p.head.h80.command = cpu_to_be16(P_DATA);
- p.head.h80.length =
- cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
- } else {
- p.head.h95.magic = BE_DRBD_MAGIC_BIG;
- p.head.h95.command = cpu_to_be16(P_DATA);
- p.head.h95.length =
- cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
- }
-
- p.sector = cpu_to_be64(req->sector);
- p.block_id = (unsigned long)req;
- p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ dgs = mdev->tconn->integrity_tfm ? crypto_hash_digestsize(mdev->tconn->integrity_tfm) : 0;
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(req->i.sector);
+ p->block_id = (unsigned long)req;
+ p->seq_num = cpu_to_be32(atomic_inc_return(&mdev->packet_seq));
dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
-
if (mdev->state.conn >= C_SYNC_SOURCE &&
mdev->state.conn <= C_PAUSED_SYNC_T)
dp_flags |= DP_MAY_SET_IN_SYNC;
-
- p.dp_flags = cpu_to_be32(dp_flags);
- drbd_set_flag(mdev, UNPLUG_REMOTE);
- ok = (sizeof(p) ==
- drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
- if (ok && dgs) {
- dgb = mdev->int_dig_out;
- drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
- ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
- }
- if (ok) {
+ if (mdev->tconn->agreed_pro_version >= 100) {
+ if (req->rq_state & RQ_EXP_RECEIVE_ACK)
+ dp_flags |= DP_SEND_RECEIVE_ACK;
+ if (req->rq_state & RQ_EXP_WRITE_ACK)
+ dp_flags |= DP_SEND_WRITE_ACK;
+ }
+ p->dp_flags = cpu_to_be32(dp_flags);
+ if (dgs)
+ drbd_csum_bio(mdev, mdev->tconn->integrity_tfm, req->master_bio, p + 1);
+ err = __send_command(mdev->tconn, mdev->vnr, sock, P_DATA, sizeof(*p) + dgs, NULL, req->i.size);
+ if (!err) {
/* For protocol A, we have to memcpy the payload into
* socket buffers, as we may complete right away
* as soon as we handed it over to tcp, at which point the data
@@ -2900,92 +1646,76 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
* out ok after sending on this side, but does not fit on the
* receiving side, we sure have detected corruption elsewhere.
*/
- if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
- ok = _drbd_send_bio(mdev, req->master_bio);
+ if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || dgs)
+ err = _drbd_send_bio(mdev, req->master_bio);
else
- ok = _drbd_send_zc_bio(mdev, req->master_bio);
+ err = _drbd_send_zc_bio(mdev, req->master_bio);
/* double check digest, sometimes buffers have been modified in flight. */
if (dgs > 0 && dgs <= 64) {
/* 64 byte, 512 bit, is the largest digest size
* currently supported in kernel crypto. */
unsigned char digest[64];
- drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
- if (memcmp(mdev->int_dig_out, digest, dgs)) {
+ drbd_csum_bio(mdev, mdev->tconn->integrity_tfm, req->master_bio, digest);
+ if (memcmp(p + 1, digest, dgs)) {
dev_warn(DEV,
"Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
- (unsigned long long)req->sector, req->size);
+ (unsigned long long)req->i.sector, req->i.size);
}
} /* else if (dgs > 64) {
... Be noisy about digest too large ...
} */
}
+ mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */
- drbd_put_data_sock(mdev);
-
- return ok;
+ return err;
}
/* answer packet, used to send data back for read requests:
* Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
* C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
*/
-int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct drbd_epoch_entry *e)
+int drbd_send_block(struct drbd_conf *mdev, enum drbd_packet cmd,
+ struct drbd_peer_request *peer_req)
{
- int ok;
- struct p_data p;
- void *dgb;
+ struct drbd_socket *sock;
+ struct p_data *p;
+ int err;
int dgs;
- dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
- crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
- if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
- p.head.h80.magic = BE_DRBD_MAGIC;
- p.head.h80.command = cpu_to_be16(cmd);
- p.head.h80.length =
- cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
- } else {
- p.head.h95.magic = BE_DRBD_MAGIC_BIG;
- p.head.h95.command = cpu_to_be16(cmd);
- p.head.h95.length =
- cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
- }
+ dgs = mdev->tconn->integrity_tfm ? crypto_hash_digestsize(mdev->tconn->integrity_tfm) : 0;
- p.sector = cpu_to_be64(e->sector);
- p.block_id = e->block_id;
- /* p.seq_num = 0; No sequence numbers here.. */
-
- /* Only called by our kernel thread.
- * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
- * in response to admin command or module unload.
- */
- if (!drbd_get_data_sock(mdev))
- return 0;
-
- ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
- if (ok && dgs) {
- dgb = mdev->int_dig_out;
- drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
- ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
- }
- if (ok)
- ok = _drbd_send_zc_ee(mdev, e);
-
- drbd_put_data_sock(mdev);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(peer_req->i.sector);
+ p->block_id = peer_req->block_id;
+ p->seq_num = 0; /* unused */
+ p->dp_flags = 0;
+ if (dgs)
+ drbd_csum_ee(mdev, mdev->tconn->integrity_tfm, peer_req, p + 1);
+ err = __send_command(mdev->tconn, mdev->vnr, sock, cmd, sizeof(*p) + dgs, NULL, peer_req->i.size);
+ if (!err)
+ err = _drbd_send_zc_ee(mdev, peer_req);
+ mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */
- return ok;
+ return err;
}
-int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
+int drbd_send_out_of_sync(struct drbd_conf *mdev, struct drbd_request *req)
{
- struct p_block_desc p;
-
- p.sector = cpu_to_be64(req->sector);
- p.blksize = cpu_to_be32(req->size);
+ struct drbd_socket *sock;
+ struct p_block_desc *p;
- return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(req->i.sector);
+ p->blksize = cpu_to_be32(req->i.size);
+ return drbd_send_command(mdev, sock, P_OUT_OF_SYNC, sizeof(*p), NULL, 0);
}
/*
@@ -3004,7 +1734,7 @@ int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
/*
* you must have down()ed the appropriate [m]sock_mutex elsewhere!
*/
-int drbd_send(struct drbd_conf *mdev, struct socket *sock,
+int drbd_send(struct drbd_tconn *tconn, struct socket *sock,
void *buf, size_t size, unsigned msg_flags)
{
struct kvec iov;
@@ -3012,7 +1742,7 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock,
int rv, sent = 0;
if (!sock)
- return -1000;
+ return -EBADR;
/* THINK if (signal_pending) return ... ? */
@@ -3025,9 +1755,11 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock,
msg.msg_controllen = 0;
msg.msg_flags = msg_flags | MSG_NOSIGNAL;
- if (sock == mdev->data.socket) {
- mdev->ko_count = mdev->net_conf->ko_count;
- drbd_update_congested(mdev);
+ if (sock == tconn->data.socket) {
+ rcu_read_lock();
+ tconn->ko_count = rcu_dereference(tconn->net_conf)->ko_count;
+ rcu_read_unlock();
+ drbd_update_congested(tconn);
}
do {
/* STRANGE
@@ -3041,12 +1773,11 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock,
*/
rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
if (rv == -EAGAIN) {
- if (we_should_drop_the_connection(mdev, sock))
+ if (we_should_drop_the_connection(tconn, sock))
break;
else
continue;
}
- D_ASSERT(rv != 0);
if (rv == -EINTR) {
flush_signals(current);
rv = 0;
@@ -3058,22 +1789,40 @@ int drbd_send(struct drbd_conf *mdev, struct socket *sock,
iov.iov_len -= rv;
} while (sent < size);
- if (sock == mdev->data.socket)
- drbd_clear_flag(mdev, NET_CONGESTED);
+ if (sock == tconn->data.socket)
+ clear_bit(NET_CONGESTED, &tconn->flags);
if (rv <= 0) {
if (rv != -EAGAIN) {
- dev_err(DEV, "%s_sendmsg returned %d\n",
- sock == mdev->meta.socket ? "msock" : "sock",
- rv);
- drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
+ conn_err(tconn, "%s_sendmsg returned %d\n",
+ sock == tconn->meta.socket ? "msock" : "sock",
+ rv);
+ conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD);
} else
- drbd_force_state(mdev, NS(conn, C_TIMEOUT));
+ conn_request_state(tconn, NS(conn, C_TIMEOUT), CS_HARD);
}
return sent;
}
+/**
+ * drbd_send_all - Send an entire buffer
+ *
+ * Returns 0 upon success and a negative error value otherwise.
+ */
+int drbd_send_all(struct drbd_tconn *tconn, struct socket *sock, void *buffer,
+ size_t size, unsigned msg_flags)
+{
+ int err;
+
+ err = drbd_send(tconn, sock, buffer, size, msg_flags);
+ if (err < 0)
+ return err;
+ if (err != size)
+ return -EIO;
+ return 0;
+}
+
static int drbd_open(struct block_device *bdev, fmode_t mode)
{
struct drbd_conf *mdev = bdev->bd_disk->private_data;
@@ -3081,7 +1830,7 @@ static int drbd_open(struct block_device *bdev, fmode_t mode)
int rv = 0;
mutex_lock(&drbd_main_mutex);
- spin_lock_irqsave(&mdev->req_lock, flags);
+ spin_lock_irqsave(&mdev->tconn->req_lock, flags);
/* to have a stable mdev->state.role
* and no race with updating open_cnt */
@@ -3094,7 +1843,7 @@ static int drbd_open(struct block_device *bdev, fmode_t mode)
if (!rv)
mdev->open_cnt++;
- spin_unlock_irqrestore(&mdev->req_lock, flags);
+ spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
mutex_unlock(&drbd_main_mutex);
return rv;
@@ -3111,35 +1860,14 @@ static int drbd_release(struct gendisk *gd, fmode_t mode)
static void drbd_set_defaults(struct drbd_conf *mdev)
{
- /* This way we get a compile error when sync_conf grows,
- and we forgot to initialize it here */
- mdev->sync_conf = (struct syncer_conf) {
- /* .rate = */ DRBD_RATE_DEF,
- /* .after = */ DRBD_AFTER_DEF,
- /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
- /* .verify_alg = */ {}, 0,
- /* .cpu_mask = */ {}, 0,
- /* .csums_alg = */ {}, 0,
- /* .use_rle = */ 0,
- /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
- /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
- /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
- /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
- /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
- /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
- };
-
- /* Have to use that way, because the layout differs between
- big endian and little endian */
- mdev->state = (union drbd_state) {
+ /* Beware! The actual layout differs
+ * between big endian and little endian */
+ mdev->state = (union drbd_dev_state) {
{ .role = R_SECONDARY,
.peer = R_UNKNOWN,
.conn = C_STANDALONE,
.disk = D_DISKLESS,
.pdsk = D_UNKNOWN,
- .susp = 0,
- .susp_nod = 0,
- .susp_fen = 0
} };
}
@@ -3155,28 +1883,17 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
atomic_set(&mdev->rs_pending_cnt, 0);
atomic_set(&mdev->unacked_cnt, 0);
atomic_set(&mdev->local_cnt, 0);
- atomic_set(&mdev->net_cnt, 0);
- atomic_set(&mdev->packet_seq, 0);
- atomic_set(&mdev->pp_in_use, 0);
atomic_set(&mdev->pp_in_use_by_net, 0);
atomic_set(&mdev->rs_sect_in, 0);
atomic_set(&mdev->rs_sect_ev, 0);
atomic_set(&mdev->ap_in_flight, 0);
atomic_set(&mdev->md_io_in_use, 0);
- mutex_init(&mdev->data.mutex);
- mutex_init(&mdev->meta.mutex);
- sema_init(&mdev->data.work.s, 0);
- sema_init(&mdev->meta.work.s, 0);
- mutex_init(&mdev->state_mutex);
-
- spin_lock_init(&mdev->data.work.q_lock);
- spin_lock_init(&mdev->meta.work.q_lock);
+ mutex_init(&mdev->own_state_mutex);
+ mdev->state_mutex = &mdev->own_state_mutex;
spin_lock_init(&mdev->al_lock);
- spin_lock_init(&mdev->req_lock);
spin_lock_init(&mdev->peer_seq_lock);
- spin_lock_init(&mdev->epoch_lock);
INIT_LIST_HEAD(&mdev->active_ee);
INIT_LIST_HEAD(&mdev->sync_ee);
@@ -3184,8 +1901,6 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
INIT_LIST_HEAD(&mdev->read_ee);
INIT_LIST_HEAD(&mdev->net_ee);
INIT_LIST_HEAD(&mdev->resync_reads);
- INIT_LIST_HEAD(&mdev->data.work.q);
- INIT_LIST_HEAD(&mdev->meta.work.q);
INIT_LIST_HEAD(&mdev->resync_work.list);
INIT_LIST_HEAD(&mdev->unplug_work.list);
INIT_LIST_HEAD(&mdev->go_diskless.list);
@@ -3199,6 +1914,14 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
mdev->md_sync_work.cb = w_md_sync;
mdev->bm_io_work.w.cb = w_bitmap_io;
mdev->start_resync_work.cb = w_start_resync;
+
+ mdev->resync_work.mdev = mdev;
+ mdev->unplug_work.mdev = mdev;
+ mdev->go_diskless.mdev = mdev;
+ mdev->md_sync_work.mdev = mdev;
+ mdev->bm_io_work.w.mdev = mdev;
+ mdev->start_resync_work.mdev = mdev;
+
init_timer(&mdev->resync_timer);
init_timer(&mdev->md_sync_timer);
init_timer(&mdev->start_resync_timer);
@@ -3214,17 +1937,10 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
init_waitqueue_head(&mdev->misc_wait);
init_waitqueue_head(&mdev->state_wait);
- init_waitqueue_head(&mdev->net_cnt_wait);
init_waitqueue_head(&mdev->ee_wait);
init_waitqueue_head(&mdev->al_wait);
init_waitqueue_head(&mdev->seq_wait);
- drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
- drbd_thread_init(mdev, &mdev->worker, drbd_worker);
- drbd_thread_init(mdev, &mdev->asender, drbd_asender);
-
- mdev->agreed_pro_version = PRO_VERSION_MAX;
- mdev->write_ordering = WO_bdev_flush;
mdev->resync_wenr = LC_FREE;
mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
@@ -3233,13 +1949,10 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
void drbd_mdev_cleanup(struct drbd_conf *mdev)
{
int i;
- if (mdev->receiver.t_state != None)
+ if (mdev->tconn->receiver.t_state != NONE)
dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
- mdev->receiver.t_state);
+ mdev->tconn->receiver.t_state);
- /* no need to lock it, I'm the only thread alive */
- if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
- dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
mdev->al_writ_cnt =
mdev->bm_writ_cnt =
mdev->read_cnt =
@@ -3256,7 +1969,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
mdev->rs_mark_left[i] = 0;
mdev->rs_mark_time[i] = 0;
}
- D_ASSERT(mdev->net_conf == NULL);
+ D_ASSERT(mdev->tconn->net_conf == NULL);
drbd_set_my_capacity(mdev, 0);
if (mdev->bitmap) {
@@ -3265,21 +1978,18 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
drbd_bm_cleanup(mdev);
}
- drbd_free_resources(mdev);
- drbd_clear_flag(mdev, AL_SUSPENDED);
+ drbd_free_bc(mdev->ldev);
+ mdev->ldev = NULL;
+
+ clear_bit(AL_SUSPENDED, &mdev->flags);
- /*
- * currently we drbd_init_ee only on module load, so
- * we may do drbd_release_ee only on module unload!
- */
D_ASSERT(list_empty(&mdev->active_ee));
D_ASSERT(list_empty(&mdev->sync_ee));
D_ASSERT(list_empty(&mdev->done_ee));
D_ASSERT(list_empty(&mdev->read_ee));
D_ASSERT(list_empty(&mdev->net_ee));
D_ASSERT(list_empty(&mdev->resync_reads));
- D_ASSERT(list_empty(&mdev->data.work.q));
- D_ASSERT(list_empty(&mdev->meta.work.q));
+ D_ASSERT(list_empty(&mdev->tconn->sender_work.q));
D_ASSERT(list_empty(&mdev->resync_work.list));
D_ASSERT(list_empty(&mdev->unplug_work.list));
D_ASSERT(list_empty(&mdev->go_diskless.list));
@@ -3353,7 +2063,7 @@ static int drbd_create_mempools(void)
goto Enomem;
drbd_ee_cache = kmem_cache_create(
- "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
+ "drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL);
if (drbd_ee_cache == NULL)
goto Enomem;
@@ -3368,11 +2078,9 @@ static int drbd_create_mempools(void)
goto Enomem;
/* mempools */
-#ifdef COMPAT_HAVE_BIOSET_CREATE
drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
if (drbd_md_io_bio_set == NULL)
goto Enomem;
-#endif
drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
if (drbd_md_io_page_pool == NULL)
@@ -3421,73 +2129,53 @@ static struct notifier_block drbd_notifier = {
.notifier_call = drbd_notify_sys,
};
-static void drbd_release_ee_lists(struct drbd_conf *mdev)
+static void drbd_release_all_peer_reqs(struct drbd_conf *mdev)
{
int rr;
- rr = drbd_release_ee(mdev, &mdev->active_ee);
+ rr = drbd_free_peer_reqs(mdev, &mdev->active_ee);
if (rr)
dev_err(DEV, "%d EEs in active list found!\n", rr);
- rr = drbd_release_ee(mdev, &mdev->sync_ee);
+ rr = drbd_free_peer_reqs(mdev, &mdev->sync_ee);
if (rr)
dev_err(DEV, "%d EEs in sync list found!\n", rr);
- rr = drbd_release_ee(mdev, &mdev->read_ee);
+ rr = drbd_free_peer_reqs(mdev, &mdev->read_ee);
if (rr)
dev_err(DEV, "%d EEs in read list found!\n", rr);
- rr = drbd_release_ee(mdev, &mdev->done_ee);
+ rr = drbd_free_peer_reqs(mdev, &mdev->done_ee);
if (rr)
dev_err(DEV, "%d EEs in done list found!\n", rr);
- rr = drbd_release_ee(mdev, &mdev->net_ee);
+ rr = drbd_free_peer_reqs(mdev, &mdev->net_ee);
if (rr)
dev_err(DEV, "%d EEs in net list found!\n", rr);
}
-/* caution. no locking.
- * currently only used from module cleanup code. */
-static void drbd_delete_device(unsigned int minor)
+/* caution. no locking. */
+void drbd_minor_destroy(struct kref *kref)
{
- struct drbd_conf *mdev = minor_to_mdev(minor);
-
- if (!mdev)
- return;
+ struct drbd_conf *mdev = container_of(kref, struct drbd_conf, kref);
+ struct drbd_tconn *tconn = mdev->tconn;
del_timer_sync(&mdev->request_timer);
/* paranoia asserts */
- if (mdev->open_cnt != 0)
- dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
- __FILE__ , __LINE__);
-
- ERR_IF (!list_empty(&mdev->data.work.q)) {
- struct list_head *lp;
- list_for_each(lp, &mdev->data.work.q) {
- dev_err(DEV, "lp = %p\n", lp);
- }
- };
+ D_ASSERT(mdev->open_cnt == 0);
/* end paranoia asserts */
- del_gendisk(mdev->vdisk);
-
/* cleanup stuff that may have been allocated during
* device (re-)configuration or state changes */
if (mdev->this_bdev)
bdput(mdev->this_bdev);
- drbd_free_resources(mdev);
+ drbd_free_bc(mdev->ldev);
+ mdev->ldev = NULL;
- drbd_release_ee_lists(mdev);
-
- /* should be freed on disconnect? */
- kfree(mdev->ee_hash);
- /*
- mdev->ee_hash_s = 0;
- mdev->ee_hash = NULL;
- */
+ drbd_release_all_peer_reqs(mdev);
lc_destroy(mdev->act_log);
lc_destroy(mdev->resync);
@@ -3495,19 +2183,101 @@ static void drbd_delete_device(unsigned int minor)
kfree(mdev->p_uuid);
/* mdev->p_uuid = NULL; */
- kfree(mdev->int_dig_out);
- kfree(mdev->int_dig_in);
- kfree(mdev->int_dig_vv);
+ if (mdev->bitmap) /* should no longer be there. */
+ drbd_bm_cleanup(mdev);
+ __free_page(mdev->md_io_page);
+ put_disk(mdev->vdisk);
+ blk_cleanup_queue(mdev->rq_queue);
+ kfree(mdev->rs_plan_s);
+ kfree(mdev);
- /* cleanup the rest that has been
- * allocated from drbd_new_device
- * and actually free the mdev itself */
- drbd_free_mdev(mdev);
+ kref_put(&tconn->kref, &conn_destroy);
}
+/* One global retry thread, if we need to push back some bio and have it
+ * reinserted through our make request function.
+ */
+static struct retry_worker {
+ struct workqueue_struct *wq;
+ struct work_struct worker;
+
+ spinlock_t lock;
+ struct list_head writes;
+} retry;
+
+static void do_retry(struct work_struct *ws)
+{
+ struct retry_worker *retry = container_of(ws, struct retry_worker, worker);
+ LIST_HEAD(writes);
+ struct drbd_request *req, *tmp;
+
+ spin_lock_irq(&retry->lock);
+ list_splice_init(&retry->writes, &writes);
+ spin_unlock_irq(&retry->lock);
+
+ list_for_each_entry_safe(req, tmp, &writes, tl_requests) {
+ struct drbd_conf *mdev = req->w.mdev;
+ struct bio *bio = req->master_bio;
+ unsigned long start_time = req->start_time;
+ bool expected;
+
+ expected =
+ expect(atomic_read(&req->completion_ref) == 0) &&
+ expect(req->rq_state & RQ_POSTPONED) &&
+ expect((req->rq_state & RQ_LOCAL_PENDING) == 0 ||
+ (req->rq_state & RQ_LOCAL_ABORTED) != 0);
+
+ if (!expected)
+ dev_err(DEV, "req=%p completion_ref=%d rq_state=%x\n",
+ req, atomic_read(&req->completion_ref),
+ req->rq_state);
+
+ /* We still need to put one kref associated with the
+ * "completion_ref" going zero in the code path that queued it
+ * here. The request object may still be referenced by a
+ * frozen local req->private_bio, in case we force-detached.
+ */
+ kref_put(&req->kref, drbd_req_destroy);
+
+ /* A single suspended or otherwise blocking device may stall
+ * all others as well. Fortunately, this code path is to
+ * recover from a situation that "should not happen":
+ * concurrent writes in multi-primary setup.
+ * In a "normal" lifecycle, this workqueue is supposed to be
+ * destroyed without ever doing anything.
+ * If it turns out to be an issue anyways, we can do per
+ * resource (replication group) or per device (minor) retry
+ * workqueues instead.
+ */
+
+ /* We are not just doing generic_make_request(),
+ * as we want to keep the start_time information. */
+ inc_ap_bio(mdev);
+ __drbd_make_request(mdev, bio, start_time);
+ }
+}
+
+void drbd_restart_request(struct drbd_request *req)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&retry.lock, flags);
+ list_move_tail(&req->tl_requests, &retry.writes);
+ spin_unlock_irqrestore(&retry.lock, flags);
+
+ /* Drop the extra reference that would otherwise
+ * have been dropped by complete_master_bio.
+ * do_retry() needs to grab a new one. */
+ dec_ap_bio(req->w.mdev);
+
+ queue_work(retry.wq, &retry.worker);
+}
+
+
static void drbd_cleanup(void)
{
unsigned int i;
+ struct drbd_conf *mdev;
+ struct drbd_tconn *tconn, *tmp;
unregister_reboot_notifier(&drbd_notifier);
@@ -3522,19 +2292,31 @@ static void drbd_cleanup(void)
if (drbd_proc)
remove_proc_entry("drbd", NULL);
- drbd_nl_cleanup();
+ if (retry.wq)
+ destroy_workqueue(retry.wq);
+
+ drbd_genl_unregister();
- if (minor_table) {
- i = minor_count;
- while (i--)
- drbd_delete_device(i);
- drbd_destroy_mempools();
+ idr_for_each_entry(&minors, mdev, i) {
+ idr_remove(&minors, mdev_to_minor(mdev));
+ idr_remove(&mdev->tconn->volumes, mdev->vnr);
+ del_gendisk(mdev->vdisk);
+ /* synchronize_rcu(); No other threads running at this point */
+ kref_put(&mdev->kref, &drbd_minor_destroy);
}
- kfree(minor_table);
+ /* not _rcu since, no other updater anymore. Genl already unregistered */
+ list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) {
+ list_del(&tconn->all_tconn); /* not _rcu no proc, not other threads */
+ /* synchronize_rcu(); */
+ kref_put(&tconn->kref, &conn_destroy);
+ }
+ drbd_destroy_mempools();
unregister_blkdev(DRBD_MAJOR, "drbd");
+ idr_destroy(&minors);
+
printk(KERN_INFO "drbd: module cleanup done.\n");
}
@@ -3559,7 +2341,7 @@ static int drbd_congested(void *congested_data, int bdi_bits)
goto out;
}
- if (drbd_test_flag(mdev, CALLBACK_PENDING)) {
+ if (test_bit(CALLBACK_PENDING, &mdev->tconn->flags)) {
r |= (1 << BDI_async_congested);
/* Without good local data, we would need to read from remote,
* and that would need the worker thread as well, which is
@@ -3583,7 +2365,7 @@ static int drbd_congested(void *congested_data, int bdi_bits)
reason = 'b';
}
- if (bdi_bits & (1 << BDI_async_congested) && drbd_test_flag(mdev, NET_CONGESTED)) {
+ if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->tconn->flags)) {
r |= (1 << BDI_async_congested);
reason = reason == 'b' ? 'a' : 'n';
}
@@ -3593,20 +2375,243 @@ out:
return r;
}
-struct drbd_conf *drbd_new_device(unsigned int minor)
+static void drbd_init_workqueue(struct drbd_work_queue* wq)
+{
+ spin_lock_init(&wq->q_lock);
+ INIT_LIST_HEAD(&wq->q);
+ init_waitqueue_head(&wq->q_wait);
+}
+
+struct drbd_tconn *conn_get_by_name(const char *name)
+{
+ struct drbd_tconn *tconn;
+
+ if (!name || !name[0])
+ return NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tconn, &drbd_tconns, all_tconn) {
+ if (!strcmp(tconn->name, name)) {
+ kref_get(&tconn->kref);
+ goto found;
+ }
+ }
+ tconn = NULL;
+found:
+ rcu_read_unlock();
+ return tconn;
+}
+
+struct drbd_tconn *conn_get_by_addrs(void *my_addr, int my_addr_len,
+ void *peer_addr, int peer_addr_len)
+{
+ struct drbd_tconn *tconn;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tconn, &drbd_tconns, all_tconn) {
+ if (tconn->my_addr_len == my_addr_len &&
+ tconn->peer_addr_len == peer_addr_len &&
+ !memcmp(&tconn->my_addr, my_addr, my_addr_len) &&
+ !memcmp(&tconn->peer_addr, peer_addr, peer_addr_len)) {
+ kref_get(&tconn->kref);
+ goto found;
+ }
+ }
+ tconn = NULL;
+found:
+ rcu_read_unlock();
+ return tconn;
+}
+
+static int drbd_alloc_socket(struct drbd_socket *socket)
+{
+ socket->rbuf = (void *) __get_free_page(GFP_KERNEL);
+ if (!socket->rbuf)
+ return -ENOMEM;
+ socket->sbuf = (void *) __get_free_page(GFP_KERNEL);
+ if (!socket->sbuf)
+ return -ENOMEM;
+ return 0;
+}
+
+static void drbd_free_socket(struct drbd_socket *socket)
+{
+ free_page((unsigned long) socket->sbuf);
+ free_page((unsigned long) socket->rbuf);
+}
+
+void conn_free_crypto(struct drbd_tconn *tconn)
+{
+ drbd_free_sock(tconn);
+
+ crypto_free_hash(tconn->csums_tfm);
+ crypto_free_hash(tconn->verify_tfm);
+ crypto_free_hash(tconn->cram_hmac_tfm);
+ crypto_free_hash(tconn->integrity_tfm);
+ crypto_free_hash(tconn->peer_integrity_tfm);
+ kfree(tconn->int_dig_in);
+ kfree(tconn->int_dig_vv);
+
+ tconn->csums_tfm = NULL;
+ tconn->verify_tfm = NULL;
+ tconn->cram_hmac_tfm = NULL;
+ tconn->integrity_tfm = NULL;
+ tconn->peer_integrity_tfm = NULL;
+ tconn->int_dig_in = NULL;
+ tconn->int_dig_vv = NULL;
+}
+
+int set_resource_options(struct drbd_tconn *tconn, struct res_opts *res_opts)
+{
+ cpumask_var_t new_cpu_mask;
+ int err;
+
+ if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+ /*
+ retcode = ERR_NOMEM;
+ drbd_msg_put_info("unable to allocate cpumask");
+ */
+
+ /* silently ignore cpu mask on UP kernel */
+ if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) {
+ /* FIXME: Get rid of constant 32 here */
+ err = bitmap_parse(res_opts->cpu_mask, 32,
+ cpumask_bits(new_cpu_mask), nr_cpu_ids);
+ if (err) {
+ conn_warn(tconn, "bitmap_parse() failed with %d\n", err);
+ /* retcode = ERR_CPU_MASK_PARSE; */
+ goto fail;
+ }
+ }
+ tconn->res_opts = *res_opts;
+ if (!cpumask_equal(tconn->cpu_mask, new_cpu_mask)) {
+ cpumask_copy(tconn->cpu_mask, new_cpu_mask);
+ drbd_calc_cpu_mask(tconn);
+ tconn->receiver.reset_cpu_mask = 1;
+ tconn->asender.reset_cpu_mask = 1;
+ tconn->worker.reset_cpu_mask = 1;
+ }
+ err = 0;
+
+fail:
+ free_cpumask_var(new_cpu_mask);
+ return err;
+
+}
+
+/* caller must be under genl_lock() */
+struct drbd_tconn *conn_create(const char *name, struct res_opts *res_opts)
+{
+ struct drbd_tconn *tconn;
+
+ tconn = kzalloc(sizeof(struct drbd_tconn), GFP_KERNEL);
+ if (!tconn)
+ return NULL;
+
+ tconn->name = kstrdup(name, GFP_KERNEL);
+ if (!tconn->name)
+ goto fail;
+
+ if (drbd_alloc_socket(&tconn->data))
+ goto fail;
+ if (drbd_alloc_socket(&tconn->meta))
+ goto fail;
+
+ if (!zalloc_cpumask_var(&tconn->cpu_mask, GFP_KERNEL))
+ goto fail;
+
+ if (set_resource_options(tconn, res_opts))
+ goto fail;
+
+ tconn->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
+ if (!tconn->current_epoch)
+ goto fail;
+
+ INIT_LIST_HEAD(&tconn->transfer_log);
+
+ INIT_LIST_HEAD(&tconn->current_epoch->list);
+ tconn->epochs = 1;
+ spin_lock_init(&tconn->epoch_lock);
+ tconn->write_ordering = WO_bdev_flush;
+
+ tconn->send.seen_any_write_yet = false;
+ tconn->send.current_epoch_nr = 0;
+ tconn->send.current_epoch_writes = 0;
+
+ tconn->cstate = C_STANDALONE;
+ mutex_init(&tconn->cstate_mutex);
+ spin_lock_init(&tconn->req_lock);
+ mutex_init(&tconn->conf_update);
+ init_waitqueue_head(&tconn->ping_wait);
+ idr_init(&tconn->volumes);
+
+ drbd_init_workqueue(&tconn->sender_work);
+ mutex_init(&tconn->data.mutex);
+ mutex_init(&tconn->meta.mutex);
+
+ drbd_thread_init(tconn, &tconn->receiver, drbdd_init, "receiver");
+ drbd_thread_init(tconn, &tconn->worker, drbd_worker, "worker");
+ drbd_thread_init(tconn, &tconn->asender, drbd_asender, "asender");
+
+ kref_init(&tconn->kref);
+ list_add_tail_rcu(&tconn->all_tconn, &drbd_tconns);
+
+ return tconn;
+
+fail:
+ kfree(tconn->current_epoch);
+ free_cpumask_var(tconn->cpu_mask);
+ drbd_free_socket(&tconn->meta);
+ drbd_free_socket(&tconn->data);
+ kfree(tconn->name);
+ kfree(tconn);
+
+ return NULL;
+}
+
+void conn_destroy(struct kref *kref)
+{
+ struct drbd_tconn *tconn = container_of(kref, struct drbd_tconn, kref);
+
+ if (atomic_read(&tconn->current_epoch->epoch_size) != 0)
+ conn_err(tconn, "epoch_size:%d\n", atomic_read(&tconn->current_epoch->epoch_size));
+ kfree(tconn->current_epoch);
+
+ idr_destroy(&tconn->volumes);
+
+ free_cpumask_var(tconn->cpu_mask);
+ drbd_free_socket(&tconn->meta);
+ drbd_free_socket(&tconn->data);
+ kfree(tconn->name);
+ kfree(tconn->int_dig_in);
+ kfree(tconn->int_dig_vv);
+ kfree(tconn);
+}
+
+enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr)
{
struct drbd_conf *mdev;
struct gendisk *disk;
struct request_queue *q;
+ int vnr_got = vnr;
+ int minor_got = minor;
+ enum drbd_ret_code err = ERR_NOMEM;
+
+ mdev = minor_to_mdev(minor);
+ if (mdev)
+ return ERR_MINOR_EXISTS;
/* GFP_KERNEL, we are outside of all write-out paths */
mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
if (!mdev)
- return NULL;
- if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
- goto out_no_cpumask;
+ return ERR_NOMEM;
+
+ kref_get(&tconn->kref);
+ mdev->tconn = tconn;
mdev->minor = minor;
+ mdev->vnr = vnr;
drbd_init_set_defaults(mdev);
@@ -3644,7 +2649,7 @@ struct drbd_conf *drbd_new_device(unsigned int minor)
blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
blk_queue_merge_bvec(q, drbd_merge_bvec);
- q->queue_lock = &mdev->req_lock;
+ q->queue_lock = &mdev->tconn->req_lock; /* needed since we use */
mdev->md_io_page = alloc_page(GFP_KERNEL);
if (!mdev->md_io_page)
@@ -3652,30 +2657,44 @@ struct drbd_conf *drbd_new_device(unsigned int minor)
if (drbd_bm_init(mdev))
goto out_no_bitmap;
- /* no need to lock access, we are still initializing this minor device. */
- if (!tl_init(mdev))
- goto out_no_tl;
-
- mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
- if (!mdev->app_reads_hash)
- goto out_no_app_reads;
-
- mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
- if (!mdev->current_epoch)
- goto out_no_epoch;
-
- INIT_LIST_HEAD(&mdev->current_epoch->list);
- mdev->epochs = 1;
-
- return mdev;
-
-/* out_whatever_else:
- kfree(mdev->current_epoch); */
-out_no_epoch:
- kfree(mdev->app_reads_hash);
-out_no_app_reads:
- tl_cleanup(mdev);
-out_no_tl:
+ mdev->read_requests = RB_ROOT;
+ mdev->write_requests = RB_ROOT;
+
+ if (!idr_pre_get(&minors, GFP_KERNEL))
+ goto out_no_minor_idr;
+ if (idr_get_new_above(&minors, mdev, minor, &minor_got))
+ goto out_no_minor_idr;
+ if (minor_got != minor) {
+ err = ERR_MINOR_EXISTS;
+ drbd_msg_put_info("requested minor exists already");
+ goto out_idr_remove_minor;
+ }
+
+ if (!idr_pre_get(&tconn->volumes, GFP_KERNEL))
+ goto out_idr_remove_minor;
+ if (idr_get_new_above(&tconn->volumes, mdev, vnr, &vnr_got))
+ goto out_idr_remove_minor;
+ if (vnr_got != vnr) {
+ err = ERR_INVALID_REQUEST;
+ drbd_msg_put_info("requested volume exists already");
+ goto out_idr_remove_vol;
+ }
+ add_disk(disk);
+ kref_init(&mdev->kref); /* one ref for both idrs and the the add_disk */
+
+ /* inherit the connection state */
+ mdev->state.conn = tconn->cstate;
+ if (mdev->state.conn == C_WF_REPORT_PARAMS)
+ drbd_connected(mdev);
+
+ return NO_ERROR;
+
+out_idr_remove_vol:
+ idr_remove(&tconn->volumes, vnr_got);
+out_idr_remove_minor:
+ idr_remove(&minors, minor_got);
+ synchronize_rcu();
+out_no_minor_idr:
drbd_bm_cleanup(mdev);
out_no_bitmap:
__free_page(mdev->md_io_page);
@@ -3684,55 +2703,25 @@ out_no_io_page:
out_no_disk:
blk_cleanup_queue(q);
out_no_q:
- free_cpumask_var(mdev->cpu_mask);
-out_no_cpumask:
- kfree(mdev);
- return NULL;
-}
-
-/* counterpart of drbd_new_device.
- * last part of drbd_delete_device. */
-void drbd_free_mdev(struct drbd_conf *mdev)
-{
- kfree(mdev->current_epoch);
- kfree(mdev->app_reads_hash);
- tl_cleanup(mdev);
- if (mdev->bitmap) /* should no longer be there. */
- drbd_bm_cleanup(mdev);
- __free_page(mdev->md_io_page);
- put_disk(mdev->vdisk);
- blk_cleanup_queue(mdev->rq_queue);
- free_cpumask_var(mdev->cpu_mask);
- drbd_free_tl_hash(mdev);
kfree(mdev);
+ kref_put(&tconn->kref, &conn_destroy);
+ return err;
}
-
int __init drbd_init(void)
{
int err;
- if (sizeof(struct p_handshake) != 80) {
- printk(KERN_ERR
- "drbd: never change the size or layout "
- "of the HandShake packet.\n");
- return -EINVAL;
- }
-
if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
printk(KERN_ERR
- "drbd: invalid minor_count (%d)\n", minor_count);
+ "drbd: invalid minor_count (%d)\n", minor_count);
#ifdef MODULE
return -EINVAL;
#else
- minor_count = 8;
+ minor_count = DRBD_MINOR_COUNT_DEF;
#endif
}
- err = drbd_nl_init();
- if (err)
- return err;
-
err = register_blkdev(DRBD_MAJOR, "drbd");
if (err) {
printk(KERN_ERR
@@ -3741,6 +2730,13 @@ int __init drbd_init(void)
return err;
}
+ err = drbd_genl_register();
+ if (err) {
+ printk(KERN_ERR "drbd: unable to register generic netlink family\n");
+ goto fail;
+ }
+
+
register_reboot_notifier(&drbd_notifier);
/*
@@ -3751,22 +2747,29 @@ int __init drbd_init(void)
init_waitqueue_head(&drbd_pp_wait);
drbd_proc = NULL; /* play safe for drbd_cleanup */
- minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
- GFP_KERNEL);
- if (!minor_table)
- goto Enomem;
+ idr_init(&minors);
err = drbd_create_mempools();
if (err)
- goto Enomem;
+ goto fail;
drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
if (!drbd_proc) {
printk(KERN_ERR "drbd: unable to register proc file\n");
- goto Enomem;
+ goto fail;
}
rwlock_init(&global_state_lock);
+ INIT_LIST_HEAD(&drbd_tconns);
+
+ retry.wq = create_singlethread_workqueue("drbd-reissue");
+ if (!retry.wq) {
+ printk(KERN_ERR "drbd: unable to create retry workqueue\n");
+ goto fail;
+ }
+ INIT_WORK(&retry.worker, do_retry);
+ spin_lock_init(&retry.lock);
+ INIT_LIST_HEAD(&retry.writes);
printk(KERN_INFO "drbd: initialized. "
"Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
@@ -3774,11 +2777,10 @@ int __init drbd_init(void)
printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
printk(KERN_INFO "drbd: registered as block device major %d\n",
DRBD_MAJOR);
- printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
return 0; /* Success! */
-Enomem:
+fail:
drbd_cleanup();
if (err == -ENOMEM)
/* currently always the case */
@@ -3799,47 +2801,42 @@ void drbd_free_bc(struct drbd_backing_dev *ldev)
kfree(ldev);
}
-void drbd_free_sock(struct drbd_conf *mdev)
+void drbd_free_sock(struct drbd_tconn *tconn)
{
- if (mdev->data.socket) {
- mutex_lock(&mdev->data.mutex);
- kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
- sock_release(mdev->data.socket);
- mdev->data.socket = NULL;
- mutex_unlock(&mdev->data.mutex);
+ if (tconn->data.socket) {
+ mutex_lock(&tconn->data.mutex);
+ kernel_sock_shutdown(tconn->data.socket, SHUT_RDWR);
+ sock_release(tconn->data.socket);
+ tconn->data.socket = NULL;
+ mutex_unlock(&tconn->data.mutex);
}
- if (mdev->meta.socket) {
- mutex_lock(&mdev->meta.mutex);
- kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
- sock_release(mdev->meta.socket);
- mdev->meta.socket = NULL;
- mutex_unlock(&mdev->meta.mutex);
+ if (tconn->meta.socket) {
+ mutex_lock(&tconn->meta.mutex);
+ kernel_sock_shutdown(tconn->meta.socket, SHUT_RDWR);
+ sock_release(tconn->meta.socket);
+ tconn->meta.socket = NULL;
+ mutex_unlock(&tconn->meta.mutex);
}
}
+/* meta data management */
-void drbd_free_resources(struct drbd_conf *mdev)
+void conn_md_sync(struct drbd_tconn *tconn)
{
- crypto_free_hash(mdev->csums_tfm);
- mdev->csums_tfm = NULL;
- crypto_free_hash(mdev->verify_tfm);
- mdev->verify_tfm = NULL;
- crypto_free_hash(mdev->cram_hmac_tfm);
- mdev->cram_hmac_tfm = NULL;
- crypto_free_hash(mdev->integrity_w_tfm);
- mdev->integrity_w_tfm = NULL;
- crypto_free_hash(mdev->integrity_r_tfm);
- mdev->integrity_r_tfm = NULL;
-
- drbd_free_sock(mdev);
+ struct drbd_conf *mdev;
+ int vnr;
- __no_warn(local,
- drbd_free_bc(mdev->ldev);
- mdev->ldev = NULL;);
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ kref_get(&mdev->kref);
+ rcu_read_unlock();
+ drbd_md_sync(mdev);
+ kref_put(&mdev->kref, &drbd_minor_destroy);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
}
-/* meta data management */
-
struct meta_data_on_disk {
u64 la_size; /* last agreed size. */
u64 uuid[UI_SIZE]; /* UUIDs. */
@@ -3850,7 +2847,7 @@ struct meta_data_on_disk {
u32 md_size_sect;
u32 al_offset; /* offset to this block */
u32 al_nr_extents; /* important for restoring the AL */
- /* `-- act_log->nr_elements <-- sync_conf.al_extents */
+ /* `-- act_log->nr_elements <-- ldev->dc.al_extents */
u32 bm_offset; /* offset to the bitmap, from here */
u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
u32 la_peer_max_bio_size; /* last peer max_bio_size */
@@ -3870,7 +2867,7 @@ void drbd_md_sync(struct drbd_conf *mdev)
del_timer(&mdev->md_sync_timer);
/* timer may be rearmed by drbd_md_mark_dirty() now. */
- if (!drbd_test_and_clear_flag(mdev, MD_DIRTY))
+ if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
return;
/* We use here D_FAILED and not D_ATTACHING because we try to write
@@ -3888,7 +2885,7 @@ void drbd_md_sync(struct drbd_conf *mdev)
for (i = UI_CURRENT; i < UI_SIZE; i++)
buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
- buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
+ buffer->magic = cpu_to_be32(DRBD_MD_MAGIC_84_UNCLEAN);
buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
@@ -3902,7 +2899,7 @@ void drbd_md_sync(struct drbd_conf *mdev)
D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
sector = mdev->ldev->md.md_offset;
- if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
+ if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
/* this was a try anyways ... */
dev_err(DEV, "meta data update failed!\n");
drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
@@ -3923,11 +2920,12 @@ out:
* @bdev: Device from which the meta data should be read in.
*
* Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
- * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
+ * something goes wrong.
*/
int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
{
struct meta_data_on_disk *buffer;
+ u32 magic, flags;
int i, rv = NO_ERROR;
if (!get_ldev_if_state(mdev, D_ATTACHING))
@@ -3937,7 +2935,7 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
if (!buffer)
goto out;
- if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
+ if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
/* NOTE: can't do normal error processing here as this is
called BEFORE disk is attached */
dev_err(DEV, "Error while reading metadata.\n");
@@ -3945,8 +2943,20 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
goto err;
}
- if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
- dev_err(DEV, "Error while reading metadata, magic not found.\n");
+ magic = be32_to_cpu(buffer->magic);
+ flags = be32_to_cpu(buffer->flags);
+ if (magic == DRBD_MD_MAGIC_84_UNCLEAN ||
+ (magic == DRBD_MD_MAGIC_08 && !(flags & MDF_AL_CLEAN))) {
+ /* btw: that's Activity Log clean, not "all" clean. */
+ dev_err(DEV, "Found unclean meta data. Did you \"drbdadm apply-al\"?\n");
+ rv = ERR_MD_UNCLEAN;
+ goto err;
+ }
+ if (magic != DRBD_MD_MAGIC_08) {
+ if (magic == DRBD_MD_MAGIC_07)
+ dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n");
+ else
+ dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n");
rv = ERR_MD_INVALID;
goto err;
}
@@ -3980,20 +2990,16 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
for (i = UI_CURRENT; i < UI_SIZE; i++)
bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
bdev->md.flags = be32_to_cpu(buffer->flags);
- mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
if (mdev->state.conn < C_CONNECTED) {
unsigned int peer;
peer = be32_to_cpu(buffer->la_peer_max_bio_size);
peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE);
mdev->peer_max_bio_size = peer;
}
- spin_unlock_irq(&mdev->req_lock);
-
- if (mdev->sync_conf.al_extents < 7)
- mdev->sync_conf.al_extents = 127;
+ spin_unlock_irq(&mdev->tconn->req_lock);
err:
drbd_md_put_buffer(mdev);
@@ -4014,7 +3020,7 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
#ifdef DEBUG
void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
{
- if (!drbd_test_and_set_flag(mdev, MD_DIRTY)) {
+ if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
mod_timer(&mdev->md_sync_timer, jiffies + HZ);
mdev->last_md_mark_dirty.line = line;
mdev->last_md_mark_dirty.func = func;
@@ -4023,7 +3029,7 @@ void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *
#else
void drbd_md_mark_dirty(struct drbd_conf *mdev)
{
- if (!drbd_test_and_set_flag(mdev, MD_DIRTY))
+ if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
}
#endif
@@ -4171,9 +3177,10 @@ int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
return rv;
}
-static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+static int w_bitmap_io(struct drbd_work *w, int unused)
{
struct bm_io_work *work = container_of(w, struct bm_io_work, w);
+ struct drbd_conf *mdev = w->mdev;
int rv = -EIO;
D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
@@ -4185,18 +3192,17 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
put_ldev(mdev);
}
- drbd_clear_flag(mdev, BITMAP_IO);
- smp_mb__after_clear_bit();
+ clear_bit_unlock(BITMAP_IO, &mdev->flags);
wake_up(&mdev->misc_wait);
if (work->done)
work->done(mdev, rv);
- drbd_clear_flag(mdev, BITMAP_IO_QUEUED);
+ clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
work->why = NULL;
work->flags = 0;
- return 1;
+ return 0;
}
void drbd_ldev_destroy(struct drbd_conf *mdev)
@@ -4209,15 +3215,13 @@ void drbd_ldev_destroy(struct drbd_conf *mdev)
drbd_free_bc(mdev->ldev);
mdev->ldev = NULL;);
- if (mdev->md_io_tmpp) {
- __free_page(mdev->md_io_tmpp);
- mdev->md_io_tmpp = NULL;
- }
- drbd_clear_flag(mdev, GO_DISKLESS);
+ clear_bit(GO_DISKLESS, &mdev->flags);
}
-static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+static int w_go_diskless(struct drbd_work *w, int unused)
{
+ struct drbd_conf *mdev = w->mdev;
+
D_ASSERT(mdev->state.disk == D_FAILED);
/* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
* inc/dec it frequently. Once we are D_DISKLESS, no one will touch
@@ -4232,11 +3236,15 @@ static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused
* (Do we want a specific meta data flag for this?)
*
* If that does not make it to stable storage either,
- * we cannot do anything about that anymore. */
- if (mdev->bitmap) {
+ * we cannot do anything about that anymore.
+ *
+ * We still need to check if both bitmap and ldev are present, we may
+ * end up here after a failed attach, before ldev was even assigned.
+ */
+ if (mdev->bitmap && mdev->ldev) {
if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write,
"detach", BM_LOCKED_MASK)) {
- if (drbd_test_flag(mdev, WAS_READ_ERROR)) {
+ if (test_bit(WAS_READ_ERROR, &mdev->flags)) {
drbd_md_set_flag(mdev, MDF_FULL_SYNC);
drbd_md_sync(mdev);
}
@@ -4244,14 +3252,14 @@ static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused
}
drbd_force_state(mdev, NS(disk, D_DISKLESS));
- return 1;
+ return 0;
}
void drbd_go_diskless(struct drbd_conf *mdev)
{
D_ASSERT(mdev->state.disk == D_FAILED);
- if (!drbd_test_and_set_flag(mdev, GO_DISKLESS))
- drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
+ if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
+ drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless);
}
/**
@@ -4271,10 +3279,10 @@ void drbd_queue_bitmap_io(struct drbd_conf *mdev,
void (*done)(struct drbd_conf *, int),
char *why, enum bm_flag flags)
{
- D_ASSERT(current == mdev->worker.task);
+ D_ASSERT(current == mdev->tconn->worker.task);
- D_ASSERT(!drbd_test_flag(mdev, BITMAP_IO_QUEUED));
- D_ASSERT(!drbd_test_flag(mdev, BITMAP_IO));
+ D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
+ D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
if (mdev->bm_io_work.why)
dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
@@ -4285,13 +3293,13 @@ void drbd_queue_bitmap_io(struct drbd_conf *mdev,
mdev->bm_io_work.why = why;
mdev->bm_io_work.flags = flags;
- spin_lock_irq(&mdev->req_lock);
- drbd_set_flag(mdev, BITMAP_IO);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ set_bit(BITMAP_IO, &mdev->flags);
if (atomic_read(&mdev->ap_bio_cnt) == 0) {
- if (!drbd_test_and_set_flag(mdev, BITMAP_IO_QUEUED))
- drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
+ if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
+ drbd_queue_work(&mdev->tconn->sender_work, &mdev->bm_io_work.w);
}
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
}
/**
@@ -4308,7 +3316,7 @@ int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
{
int rv;
- D_ASSERT(current != mdev->worker.task);
+ D_ASSERT(current != mdev->tconn->worker.task);
if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
drbd_suspend_io(mdev);
@@ -4347,18 +3355,127 @@ static void md_sync_timer_fn(unsigned long data)
{
struct drbd_conf *mdev = (struct drbd_conf *) data;
- drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
+ /* must not double-queue! */
+ if (list_empty(&mdev->md_sync_work.list))
+ drbd_queue_work_front(&mdev->tconn->sender_work, &mdev->md_sync_work);
}
-static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+static int w_md_sync(struct drbd_work *w, int unused)
{
+ struct drbd_conf *mdev = w->mdev;
+
dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
#ifdef DEBUG
dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
#endif
drbd_md_sync(mdev);
- return 1;
+ return 0;
+}
+
+const char *cmdname(enum drbd_packet cmd)
+{
+ /* THINK may need to become several global tables
+ * when we want to support more than
+ * one PRO_VERSION */
+ static const char *cmdnames[] = {
+ [P_DATA] = "Data",
+ [P_DATA_REPLY] = "DataReply",
+ [P_RS_DATA_REPLY] = "RSDataReply",
+ [P_BARRIER] = "Barrier",
+ [P_BITMAP] = "ReportBitMap",
+ [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
+ [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
+ [P_UNPLUG_REMOTE] = "UnplugRemote",
+ [P_DATA_REQUEST] = "DataRequest",
+ [P_RS_DATA_REQUEST] = "RSDataRequest",
+ [P_SYNC_PARAM] = "SyncParam",
+ [P_SYNC_PARAM89] = "SyncParam89",
+ [P_PROTOCOL] = "ReportProtocol",
+ [P_UUIDS] = "ReportUUIDs",
+ [P_SIZES] = "ReportSizes",
+ [P_STATE] = "ReportState",
+ [P_SYNC_UUID] = "ReportSyncUUID",
+ [P_AUTH_CHALLENGE] = "AuthChallenge",
+ [P_AUTH_RESPONSE] = "AuthResponse",
+ [P_PING] = "Ping",
+ [P_PING_ACK] = "PingAck",
+ [P_RECV_ACK] = "RecvAck",
+ [P_WRITE_ACK] = "WriteAck",
+ [P_RS_WRITE_ACK] = "RSWriteAck",
+ [P_SUPERSEDED] = "Superseded",
+ [P_NEG_ACK] = "NegAck",
+ [P_NEG_DREPLY] = "NegDReply",
+ [P_NEG_RS_DREPLY] = "NegRSDReply",
+ [P_BARRIER_ACK] = "BarrierAck",
+ [P_STATE_CHG_REQ] = "StateChgRequest",
+ [P_STATE_CHG_REPLY] = "StateChgReply",
+ [P_OV_REQUEST] = "OVRequest",
+ [P_OV_REPLY] = "OVReply",
+ [P_OV_RESULT] = "OVResult",
+ [P_CSUM_RS_REQUEST] = "CsumRSRequest",
+ [P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
+ [P_COMPRESSED_BITMAP] = "CBitmap",
+ [P_DELAY_PROBE] = "DelayProbe",
+ [P_OUT_OF_SYNC] = "OutOfSync",
+ [P_RETRY_WRITE] = "RetryWrite",
+ [P_RS_CANCEL] = "RSCancel",
+ [P_CONN_ST_CHG_REQ] = "conn_st_chg_req",
+ [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply",
+ [P_RETRY_WRITE] = "retry_write",
+ [P_PROTOCOL_UPDATE] = "protocol_update",
+
+ /* enum drbd_packet, but not commands - obsoleted flags:
+ * P_MAY_IGNORE
+ * P_MAX_OPT_CMD
+ */
+ };
+
+ /* too big for the array: 0xfffX */
+ if (cmd == P_INITIAL_META)
+ return "InitialMeta";
+ if (cmd == P_INITIAL_DATA)
+ return "InitialData";
+ if (cmd == P_CONNECTION_FEATURES)
+ return "ConnectionFeatures";
+ if (cmd >= ARRAY_SIZE(cmdnames))
+ return "Unknown";
+ return cmdnames[cmd];
+}
+
+/**
+ * drbd_wait_misc - wait for a request to make progress
+ * @mdev: device associated with the request
+ * @i: the struct drbd_interval embedded in struct drbd_request or
+ * struct drbd_peer_request
+ */
+int drbd_wait_misc(struct drbd_conf *mdev, struct drbd_interval *i)
+{
+ struct net_conf *nc;
+ DEFINE_WAIT(wait);
+ long timeout;
+
+ rcu_read_lock();
+ nc = rcu_dereference(mdev->tconn->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
+ return -ETIMEDOUT;
+ }
+ timeout = nc->ko_count ? nc->timeout * HZ / 10 * nc->ko_count : MAX_SCHEDULE_TIMEOUT;
+ rcu_read_unlock();
+
+ /* Indicate to wake up mdev->misc_wait on progress. */
+ i->waiting = true;
+ prepare_to_wait(&mdev->misc_wait, &wait, TASK_INTERRUPTIBLE);
+ spin_unlock_irq(&mdev->tconn->req_lock);
+ timeout = schedule_timeout(timeout);
+ finish_wait(&mdev->misc_wait, &wait);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ if (!timeout || mdev->state.conn < C_CONNECTED)
+ return -ETIMEDOUT;
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+ return 0;
}
#ifdef CONFIG_DRBD_FAULT_INJECTION
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index c8dda4e8dfc..76bb3a684b8 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -29,159 +29,317 @@
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
-#include <linux/connector.h>
#include <linux/blkpg.h>
#include <linux/cpumask.h>
#include "drbd_int.h"
#include "drbd_req.h"
#include "drbd_wrappers.h"
#include <asm/unaligned.h>
-#include <linux/drbd_tag_magic.h>
#include <linux/drbd_limits.h>
-#include <linux/compiler.h>
#include <linux/kthread.h>
-static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
-static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
-static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *);
-
-/* see get_sb_bdev and bd_claim */
+#include <net/genetlink.h>
+
+/* .doit */
+// int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info);
+// int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info);
+
+int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info);
+
+int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_down(struct sk_buff *skb, struct genl_info *info);
+
+int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info);
+/* .dumpit */
+int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
+
+#include <linux/drbd_genl_api.h>
+#include "drbd_nla.h"
+#include <linux/genl_magic_func.h>
+
+/* used blkdev_get_by_path, to claim our meta data device(s) */
static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
-/* Generate the tag_list to struct functions */
-#define NL_PACKET(name, number, fields) \
-static int name ## _from_tags(struct drbd_conf *mdev, \
- unsigned short *tags, struct name *arg) __attribute__ ((unused)); \
-static int name ## _from_tags(struct drbd_conf *mdev, \
- unsigned short *tags, struct name *arg) \
-{ \
- int tag; \
- int dlen; \
- \
- while ((tag = get_unaligned(tags++)) != TT_END) { \
- dlen = get_unaligned(tags++); \
- switch (tag_number(tag)) { \
- fields \
- default: \
- if (tag & T_MANDATORY) { \
- dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \
- return 0; \
- } \
- } \
- tags = (unsigned short *)((char *)tags + dlen); \
- } \
- return 1; \
-}
-#define NL_INTEGER(pn, pr, member) \
- case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \
- arg->member = get_unaligned((int *)(tags)); \
- break;
-#define NL_INT64(pn, pr, member) \
- case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \
- arg->member = get_unaligned((u64 *)(tags)); \
+/* Configuration is strictly serialized, because generic netlink message
+ * processing is strictly serialized by the genl_lock().
+ * Which means we can use one static global drbd_config_context struct.
+ */
+static struct drbd_config_context {
+ /* assigned from drbd_genlmsghdr */
+ unsigned int minor;
+ /* assigned from request attributes, if present */
+ unsigned int volume;
+#define VOLUME_UNSPECIFIED (-1U)
+ /* pointer into the request skb,
+ * limited lifetime! */
+ char *resource_name;
+ struct nlattr *my_addr;
+ struct nlattr *peer_addr;
+
+ /* reply buffer */
+ struct sk_buff *reply_skb;
+ /* pointer into reply buffer */
+ struct drbd_genlmsghdr *reply_dh;
+ /* resolved from attributes, if possible */
+ struct drbd_conf *mdev;
+ struct drbd_tconn *tconn;
+} adm_ctx;
+
+static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
+{
+ genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
+ if (genlmsg_reply(skb, info))
+ printk(KERN_ERR "drbd: error sending genl reply\n");
+}
+
+/* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only
+ * reason it could fail was no space in skb, and there are 4k available. */
+int drbd_msg_put_info(const char *info)
+{
+ struct sk_buff *skb = adm_ctx.reply_skb;
+ struct nlattr *nla;
+ int err = -EMSGSIZE;
+
+ if (!info || !info[0])
+ return 0;
+
+ nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY);
+ if (!nla)
+ return err;
+
+ err = nla_put_string(skb, T_info_text, info);
+ if (err) {
+ nla_nest_cancel(skb, nla);
+ return err;
+ } else
+ nla_nest_end(skb, nla);
+ return 0;
+}
+
+/* This would be a good candidate for a "pre_doit" hook,
+ * and per-family private info->pointers.
+ * But we need to stay compatible with older kernels.
+ * If it returns successfully, adm_ctx members are valid.
+ */
+#define DRBD_ADM_NEED_MINOR 1
+#define DRBD_ADM_NEED_RESOURCE 2
+#define DRBD_ADM_NEED_CONNECTION 4
+static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info,
+ unsigned flags)
+{
+ struct drbd_genlmsghdr *d_in = info->userhdr;
+ const u8 cmd = info->genlhdr->cmd;
+ int err;
+
+ memset(&adm_ctx, 0, sizeof(adm_ctx));
+
+ /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */
+ if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!adm_ctx.reply_skb) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb,
+ info, &drbd_genl_family, 0, cmd);
+ /* put of a few bytes into a fresh skb of >= 4k will always succeed.
+ * but anyways */
+ if (!adm_ctx.reply_dh) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ adm_ctx.reply_dh->minor = d_in->minor;
+ adm_ctx.reply_dh->ret_code = NO_ERROR;
+
+ adm_ctx.volume = VOLUME_UNSPECIFIED;
+ if (info->attrs[DRBD_NLA_CFG_CONTEXT]) {
+ struct nlattr *nla;
+ /* parse and validate only */
+ err = drbd_cfg_context_from_attrs(NULL, info);
+ if (err)
+ goto fail;
+
+ /* It was present, and valid,
+ * copy it over to the reply skb. */
+ err = nla_put_nohdr(adm_ctx.reply_skb,
+ info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len,
+ info->attrs[DRBD_NLA_CFG_CONTEXT]);
+ if (err)
+ goto fail;
+
+ /* and assign stuff to the global adm_ctx */
+ nla = nested_attr_tb[__nla_type(T_ctx_volume)];
+ if (nla)
+ adm_ctx.volume = nla_get_u32(nla);
+ nla = nested_attr_tb[__nla_type(T_ctx_resource_name)];
+ if (nla)
+ adm_ctx.resource_name = nla_data(nla);
+ adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)];
+ adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)];
+ if ((adm_ctx.my_addr &&
+ nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.tconn->my_addr)) ||
+ (adm_ctx.peer_addr &&
+ nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.tconn->peer_addr))) {
+ err = -EINVAL;
+ goto fail;
+ }
+ }
+
+ adm_ctx.minor = d_in->minor;
+ adm_ctx.mdev = minor_to_mdev(d_in->minor);
+ adm_ctx.tconn = conn_get_by_name(adm_ctx.resource_name);
+
+ if (!adm_ctx.mdev && (flags & DRBD_ADM_NEED_MINOR)) {
+ drbd_msg_put_info("unknown minor");
+ return ERR_MINOR_INVALID;
+ }
+ if (!adm_ctx.tconn && (flags & DRBD_ADM_NEED_RESOURCE)) {
+ drbd_msg_put_info("unknown resource");
+ return ERR_INVALID_REQUEST;
+ }
+
+ if (flags & DRBD_ADM_NEED_CONNECTION) {
+ if (adm_ctx.tconn && !(flags & DRBD_ADM_NEED_RESOURCE)) {
+ drbd_msg_put_info("no resource name expected");
+ return ERR_INVALID_REQUEST;
+ }
+ if (adm_ctx.mdev) {
+ drbd_msg_put_info("no minor number expected");
+ return ERR_INVALID_REQUEST;
+ }
+ if (adm_ctx.my_addr && adm_ctx.peer_addr)
+ adm_ctx.tconn = conn_get_by_addrs(nla_data(adm_ctx.my_addr),
+ nla_len(adm_ctx.my_addr),
+ nla_data(adm_ctx.peer_addr),
+ nla_len(adm_ctx.peer_addr));
+ if (!adm_ctx.tconn) {
+ drbd_msg_put_info("unknown connection");
+ return ERR_INVALID_REQUEST;
+ }
+ }
+
+ /* some more paranoia, if the request was over-determined */
+ if (adm_ctx.mdev && adm_ctx.tconn &&
+ adm_ctx.mdev->tconn != adm_ctx.tconn) {
+ pr_warning("request: minor=%u, resource=%s; but that minor belongs to connection %s\n",
+ adm_ctx.minor, adm_ctx.resource_name,
+ adm_ctx.mdev->tconn->name);
+ drbd_msg_put_info("minor exists in different resource");
+ return ERR_INVALID_REQUEST;
+ }
+ if (adm_ctx.mdev &&
+ adm_ctx.volume != VOLUME_UNSPECIFIED &&
+ adm_ctx.volume != adm_ctx.mdev->vnr) {
+ pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n",
+ adm_ctx.minor, adm_ctx.volume,
+ adm_ctx.mdev->vnr, adm_ctx.mdev->tconn->name);
+ drbd_msg_put_info("minor exists as different volume");
+ return ERR_INVALID_REQUEST;
+ }
+
+ return NO_ERROR;
+
+fail:
+ nlmsg_free(adm_ctx.reply_skb);
+ adm_ctx.reply_skb = NULL;
+ return err;
+}
+
+static int drbd_adm_finish(struct genl_info *info, int retcode)
+{
+ if (adm_ctx.tconn) {
+ kref_put(&adm_ctx.tconn->kref, &conn_destroy);
+ adm_ctx.tconn = NULL;
+ }
+
+ if (!adm_ctx.reply_skb)
+ return -ENOMEM;
+
+ adm_ctx.reply_dh->ret_code = retcode;
+ drbd_adm_send_reply(adm_ctx.reply_skb, info);
+ return 0;
+}
+
+static void setup_khelper_env(struct drbd_tconn *tconn, char **envp)
+{
+ char *afs;
+
+ /* FIXME: A future version will not allow this case. */
+ if (tconn->my_addr_len == 0 || tconn->peer_addr_len == 0)
+ return;
+
+ switch (((struct sockaddr *)&tconn->peer_addr)->sa_family) {
+ case AF_INET6:
+ afs = "ipv6";
+ snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6",
+ &((struct sockaddr_in6 *)&tconn->peer_addr)->sin6_addr);
break;
-#define NL_BIT(pn, pr, member) \
- case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \
- arg->member = *(char *)(tags) ? 1 : 0; \
+ case AF_INET:
+ afs = "ipv4";
+ snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
+ &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr);
break;
-#define NL_STRING(pn, pr, member, len) \
- case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \
- if (dlen > len) { \
- dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \
- #member, dlen, (unsigned int)len); \
- return 0; \
- } \
- arg->member ## _len = dlen; \
- memcpy(arg->member, tags, min_t(size_t, dlen, len)); \
- break;
-#include <linux/drbd_nl.h>
-
-/* Generate the struct to tag_list functions */
-#define NL_PACKET(name, number, fields) \
-static unsigned short* \
-name ## _to_tags(struct drbd_conf *mdev, \
- struct name *arg, unsigned short *tags) __attribute__ ((unused)); \
-static unsigned short* \
-name ## _to_tags(struct drbd_conf *mdev, \
- struct name *arg, unsigned short *tags) \
-{ \
- fields \
- return tags; \
-}
-
-#define NL_INTEGER(pn, pr, member) \
- put_unaligned(pn | pr | TT_INTEGER, tags++); \
- put_unaligned(sizeof(int), tags++); \
- put_unaligned(arg->member, (int *)tags); \
- tags = (unsigned short *)((char *)tags+sizeof(int));
-#define NL_INT64(pn, pr, member) \
- put_unaligned(pn | pr | TT_INT64, tags++); \
- put_unaligned(sizeof(u64), tags++); \
- put_unaligned(arg->member, (u64 *)tags); \
- tags = (unsigned short *)((char *)tags+sizeof(u64));
-#define NL_BIT(pn, pr, member) \
- put_unaligned(pn | pr | TT_BIT, tags++); \
- put_unaligned(sizeof(char), tags++); \
- *(char *)tags = arg->member; \
- tags = (unsigned short *)((char *)tags+sizeof(char));
-#define NL_STRING(pn, pr, member, len) \
- put_unaligned(pn | pr | TT_STRING, tags++); \
- put_unaligned(arg->member ## _len, tags++); \
- memcpy(tags, arg->member, arg->member ## _len); \
- tags = (unsigned short *)((char *)tags + arg->member ## _len);
-#include <linux/drbd_nl.h>
-
-void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name);
-void drbd_nl_send_reply(struct cn_msg *, int);
+ default:
+ afs = "ssocks";
+ snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
+ &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr);
+ }
+ snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs);
+}
int drbd_khelper(struct drbd_conf *mdev, char *cmd)
{
char *envp[] = { "HOME=/",
"TERM=linux",
"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
- NULL, /* Will be set to address family */
- NULL, /* Will be set to address */
+ (char[20]) { }, /* address family */
+ (char[60]) { }, /* address */
NULL };
-
- char mb[12], af[20], ad[60], *afs;
+ char mb[12];
char *argv[] = {usermode_helper, cmd, mb, NULL };
+ struct drbd_tconn *tconn = mdev->tconn;
+ struct sib_info sib;
int ret;
- if (current == mdev->worker.task)
- drbd_set_flag(mdev, CALLBACK_PENDING);
+ if (current == tconn->worker.task)
+ set_bit(CALLBACK_PENDING, &tconn->flags);
snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev));
-
- if (get_net_conf(mdev)) {
- switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) {
- case AF_INET6:
- afs = "ipv6";
- snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6",
- &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr);
- break;
- case AF_INET:
- afs = "ipv4";
- snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
- &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
- break;
- default:
- afs = "ssocks";
- snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
- &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
- }
- snprintf(af, 20, "DRBD_PEER_AF=%s", afs);
- envp[3]=af;
- envp[4]=ad;
- put_net_conf(mdev);
- }
+ setup_khelper_env(tconn, envp);
/* The helper may take some time.
* write out any unsynced meta data changes now */
drbd_md_sync(mdev);
dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
-
- drbd_bcast_ev_helper(mdev, cmd);
+ sib.sib_reason = SIB_HELPER_PRE;
+ sib.helper_name = cmd;
+ drbd_bcast_event(mdev, &sib);
ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
if (ret)
dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
@@ -191,9 +349,46 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd)
dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
usermode_helper, cmd, mb,
(ret >> 8) & 0xff, ret);
+ sib.sib_reason = SIB_HELPER_POST;
+ sib.helper_exit_code = ret;
+ drbd_bcast_event(mdev, &sib);
+
+ if (current == tconn->worker.task)
+ clear_bit(CALLBACK_PENDING, &tconn->flags);
+
+ if (ret < 0) /* Ignore any ERRNOs we got. */
+ ret = 0;
+
+ return ret;
+}
+
+int conn_khelper(struct drbd_tconn *tconn, char *cmd)
+{
+ char *envp[] = { "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ (char[20]) { }, /* address family */
+ (char[60]) { }, /* address */
+ NULL };
+ char *argv[] = {usermode_helper, cmd, tconn->name, NULL };
+ int ret;
+
+ setup_khelper_env(tconn, envp);
+ conn_md_sync(tconn);
- if (current == mdev->worker.task)
- drbd_clear_flag(mdev, CALLBACK_PENDING);
+ conn_info(tconn, "helper command: %s %s %s\n", usermode_helper, cmd, tconn->name);
+ /* TODO: conn_bcast_event() ?? */
+
+ ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
+ if (ret)
+ conn_warn(tconn, "helper command: %s %s %s exit code %u (0x%x)\n",
+ usermode_helper, cmd, tconn->name,
+ (ret >> 8) & 0xff, ret);
+ else
+ conn_info(tconn, "helper command: %s %s %s exit code %u (0x%x)\n",
+ usermode_helper, cmd, tconn->name,
+ (ret >> 8) & 0xff, ret);
+ /* TODO: conn_bcast_event() ?? */
if (ret < 0) /* Ignore any ERRNOs we got. */
ret = 0;
@@ -201,116 +396,129 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd)
return ret;
}
-enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
+static enum drbd_fencing_p highest_fencing_policy(struct drbd_tconn *tconn)
{
+ enum drbd_fencing_p fp = FP_NOT_AVAIL;
+ struct drbd_conf *mdev;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ if (get_ldev_if_state(mdev, D_CONSISTENT)) {
+ fp = max_t(enum drbd_fencing_p, fp,
+ rcu_dereference(mdev->ldev->disk_conf)->fencing);
+ put_ldev(mdev);
+ }
+ }
+ rcu_read_unlock();
+
+ return fp;
+}
+
+bool conn_try_outdate_peer(struct drbd_tconn *tconn)
+{
+ union drbd_state mask = { };
+ union drbd_state val = { };
+ enum drbd_fencing_p fp;
char *ex_to_string;
int r;
- enum drbd_disk_state nps;
- enum drbd_fencing_p fp;
- D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
+ if (tconn->cstate >= C_WF_REPORT_PARAMS) {
+ conn_err(tconn, "Expected cstate < C_WF_REPORT_PARAMS\n");
+ return false;
+ }
- if (get_ldev_if_state(mdev, D_CONSISTENT)) {
- fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
- } else {
- dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n");
- nps = mdev->state.pdsk;
+ fp = highest_fencing_policy(tconn);
+ switch (fp) {
+ case FP_NOT_AVAIL:
+ conn_warn(tconn, "Not fencing peer, I'm not even Consistent myself.\n");
goto out;
+ case FP_DONT_CARE:
+ return true;
+ default: ;
}
- r = drbd_khelper(mdev, "fence-peer");
+ r = conn_khelper(tconn, "fence-peer");
switch ((r>>8) & 0xff) {
case 3: /* peer is inconsistent */
ex_to_string = "peer is inconsistent or worse";
- nps = D_INCONSISTENT;
+ mask.pdsk = D_MASK;
+ val.pdsk = D_INCONSISTENT;
break;
case 4: /* peer got outdated, or was already outdated */
ex_to_string = "peer was fenced";
- nps = D_OUTDATED;
+ mask.pdsk = D_MASK;
+ val.pdsk = D_OUTDATED;
break;
case 5: /* peer was down */
- if (mdev->state.disk == D_UP_TO_DATE) {
+ if (conn_highest_disk(tconn) == D_UP_TO_DATE) {
/* we will(have) create(d) a new UUID anyways... */
ex_to_string = "peer is unreachable, assumed to be dead";
- nps = D_OUTDATED;
+ mask.pdsk = D_MASK;
+ val.pdsk = D_OUTDATED;
} else {
ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
- nps = mdev->state.pdsk;
}
break;
case 6: /* Peer is primary, voluntarily outdate myself.
* This is useful when an unconnected R_SECONDARY is asked to
* become R_PRIMARY, but finds the other peer being active. */
ex_to_string = "peer is active";
- dev_warn(DEV, "Peer is primary, outdating myself.\n");
- nps = D_UNKNOWN;
- _drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE);
+ conn_warn(tconn, "Peer is primary, outdating myself.\n");
+ mask.disk = D_MASK;
+ val.disk = D_OUTDATED;
break;
case 7:
if (fp != FP_STONITH)
- dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n");
+ conn_err(tconn, "fence-peer() = 7 && fencing != Stonith !!!\n");
ex_to_string = "peer was stonithed";
- nps = D_OUTDATED;
+ mask.pdsk = D_MASK;
+ val.pdsk = D_OUTDATED;
break;
default:
/* The script is broken ... */
- nps = D_UNKNOWN;
- dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
- return nps;
+ conn_err(tconn, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
+ return false; /* Eventually leave IO frozen */
}
- dev_info(DEV, "fence-peer helper returned %d (%s)\n",
- (r>>8) & 0xff, ex_to_string);
+ conn_info(tconn, "fence-peer helper returned %d (%s)\n",
+ (r>>8) & 0xff, ex_to_string);
-out:
- if (mdev->state.susp_fen && nps >= D_UNKNOWN) {
- /* The handler was not successful... unfreeze here, the
- state engine can not unfreeze... */
- _drbd_request_state(mdev, NS(susp_fen, 0), CS_VERBOSE);
- }
+ out:
- return nps;
+ /* Not using
+ conn_request_state(tconn, mask, val, CS_VERBOSE);
+ here, because we might were able to re-establish the connection in the
+ meantime. */
+ spin_lock_irq(&tconn->req_lock);
+ if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags))
+ _conn_request_state(tconn, mask, val, CS_VERBOSE);
+ spin_unlock_irq(&tconn->req_lock);
+
+ return conn_highest_pdsk(tconn) <= D_OUTDATED;
}
static int _try_outdate_peer_async(void *data)
{
- struct drbd_conf *mdev = (struct drbd_conf *)data;
- enum drbd_disk_state nps;
- union drbd_state ns;
+ struct drbd_tconn *tconn = (struct drbd_tconn *)data;
- nps = drbd_try_outdate_peer(mdev);
-
- /* Not using
- drbd_request_state(mdev, NS(pdsk, nps));
- here, because we might were able to re-establish the connection
- in the meantime. This can only partially be solved in the state's
- engine is_valid_state() and is_valid_state_transition()
- functions.
-
- nps can be D_INCONSISTENT, D_OUTDATED or D_UNKNOWN.
- pdsk == D_INCONSISTENT while conn >= C_CONNECTED is valid,
- therefore we have to have the pre state change check here.
- */
- spin_lock_irq(&mdev->req_lock);
- ns = mdev->state;
- if (ns.conn < C_WF_REPORT_PARAMS && !drbd_test_flag(mdev, STATE_SENT)) {
- ns.pdsk = nps;
- _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
- }
- spin_unlock_irq(&mdev->req_lock);
+ conn_try_outdate_peer(tconn);
+ kref_put(&tconn->kref, &conn_destroy);
return 0;
}
-void drbd_try_outdate_peer_async(struct drbd_conf *mdev)
+void conn_try_outdate_peer_async(struct drbd_tconn *tconn)
{
struct task_struct *opa;
- opa = kthread_run(_try_outdate_peer_async, mdev, "drbd%d_a_helper", mdev_to_minor(mdev));
- if (IS_ERR(opa))
- dev_err(DEV, "out of mem, failed to invoke fence-peer helper\n");
+ kref_get(&tconn->kref);
+ opa = kthread_run(_try_outdate_peer_async, tconn, "drbd_async_h");
+ if (IS_ERR(opa)) {
+ conn_err(tconn, "out of mem, failed to invoke fence-peer helper\n");
+ kref_put(&tconn->kref, &conn_destroy);
+ }
}
enum drbd_state_rv
@@ -318,15 +526,15 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
{
const int max_tries = 4;
enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
+ struct net_conf *nc;
int try = 0;
int forced = 0;
union drbd_state mask, val;
- enum drbd_disk_state nps;
if (new_role == R_PRIMARY)
- request_ping(mdev); /* Detect a dead peer ASAP */
+ request_ping(mdev->tconn); /* Detect a dead peer ASAP */
- mutex_lock(&mdev->state_mutex);
+ mutex_lock(mdev->state_mutex);
mask.i = 0; mask.role = R_MASK;
val.i = 0; val.role = new_role;
@@ -354,38 +562,34 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
if (rv == SS_NO_UP_TO_DATE_DISK &&
mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) {
D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
- nps = drbd_try_outdate_peer(mdev);
- if (nps == D_OUTDATED || nps == D_INCONSISTENT) {
+ if (conn_try_outdate_peer(mdev->tconn)) {
val.disk = D_UP_TO_DATE;
mask.disk = D_MASK;
}
-
- val.pdsk = nps;
- mask.pdsk = D_MASK;
-
continue;
}
if (rv == SS_NOTHING_TO_DO)
- goto fail;
+ goto out;
if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
- nps = drbd_try_outdate_peer(mdev);
-
- if (force && nps > D_OUTDATED) {
+ if (!conn_try_outdate_peer(mdev->tconn) && force) {
dev_warn(DEV, "Forced into split brain situation!\n");
- nps = D_OUTDATED;
- }
-
- mask.pdsk = D_MASK;
- val.pdsk = nps;
+ mask.pdsk = D_MASK;
+ val.pdsk = D_OUTDATED;
+ }
continue;
}
if (rv == SS_TWO_PRIMARIES) {
/* Maybe the peer is detected as dead very soon...
retry at most once more in this case. */
- schedule_timeout_interruptible((mdev->net_conf->ping_timeo+1)*HZ/10);
+ int timeo;
+ rcu_read_lock();
+ nc = rcu_dereference(mdev->tconn->net_conf);
+ timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1;
+ rcu_read_unlock();
+ schedule_timeout_interruptible(timeo);
if (try < max_tries)
try = max_tries - 1;
continue;
@@ -394,13 +598,13 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
rv = _drbd_request_state(mdev, mask, val,
CS_VERBOSE + CS_WAIT_COMPLETE);
if (rv < SS_SUCCESS)
- goto fail;
+ goto out;
}
break;
}
if (rv < SS_SUCCESS)
- goto fail;
+ goto out;
if (forced)
dev_warn(DEV, "Forced to consider local data as UpToDate!\n");
@@ -408,6 +612,8 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
/* Wait until nothing is on the fly :) */
wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0);
+ /* FIXME also wait for all pending P_BARRIER_ACK? */
+
if (new_role == R_SECONDARY) {
set_disk_ro(mdev->vdisk, true);
if (get_ldev(mdev)) {
@@ -415,10 +621,12 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
put_ldev(mdev);
}
} else {
- if (get_net_conf(mdev)) {
- mdev->net_conf->want_lose = 0;
- put_net_conf(mdev);
- }
+ mutex_lock(&mdev->tconn->conf_update);
+ nc = mdev->tconn->net_conf;
+ if (nc)
+ nc->discard_my_data = 0; /* without copy; single bit op is atomic */
+ mutex_unlock(&mdev->tconn->conf_update);
+
set_disk_ro(mdev->vdisk, false);
if (get_ldev(mdev)) {
if (((mdev->state.conn < C_CONNECTED ||
@@ -444,67 +652,47 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
drbd_md_sync(mdev);
kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
- fail:
- mutex_unlock(&mdev->state_mutex);
+out:
+ mutex_unlock(mdev->state_mutex);
return rv;
}
-static struct drbd_conf *ensure_mdev(int minor, int create)
+static const char *from_attrs_err_to_txt(int err)
{
- struct drbd_conf *mdev;
-
- if (minor >= minor_count)
- return NULL;
-
- mdev = minor_to_mdev(minor);
-
- if (!mdev && create) {
- struct gendisk *disk = NULL;
- mdev = drbd_new_device(minor);
-
- spin_lock_irq(&drbd_pp_lock);
- if (minor_table[minor] == NULL) {
- minor_table[minor] = mdev;
- disk = mdev->vdisk;
- mdev = NULL;
- } /* else: we lost the race */
- spin_unlock_irq(&drbd_pp_lock);
-
- if (disk) /* we won the race above */
- /* in case we ever add a drbd_delete_device(),
- * don't forget the del_gendisk! */
- add_disk(disk);
- else /* we lost the race above */
- drbd_free_mdev(mdev);
-
- mdev = minor_to_mdev(minor);
- }
-
- return mdev;
+ return err == -ENOMSG ? "required attribute missing" :
+ err == -EOPNOTSUPP ? "unknown mandatory attribute" :
+ err == -EEXIST ? "can not change invariant setting" :
+ "invalid attribute value";
}
-static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info)
{
- struct primary primary_args;
-
- memset(&primary_args, 0, sizeof(struct primary));
- if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) {
- reply->ret_code = ERR_MANDATORY_TAG;
- return 0;
- }
-
- reply->ret_code =
- drbd_set_role(mdev, R_PRIMARY, primary_args.primary_force);
+ struct set_role_parms parms;
+ int err;
+ enum drbd_ret_code retcode;
- return 0;
-}
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
-static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
-{
- reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0);
+ memset(&parms, 0, sizeof(parms));
+ if (info->attrs[DRBD_NLA_SET_ROLE_PARMS]) {
+ err = set_role_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
+ goto out;
+ }
+ }
+ if (info->genlhdr->cmd == DRBD_ADM_PRIMARY)
+ retcode = drbd_set_role(adm_ctx.mdev, R_PRIMARY, parms.assume_uptodate);
+ else
+ retcode = drbd_set_role(adm_ctx.mdev, R_SECONDARY, 0);
+out:
+ drbd_adm_finish(info, retcode);
return 0;
}
@@ -514,7 +702,12 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
struct drbd_backing_dev *bdev)
{
sector_t md_size_sect = 0;
- switch (bdev->dc.meta_dev_idx) {
+ int meta_dev_idx;
+
+ rcu_read_lock();
+ meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
+
+ switch (meta_dev_idx) {
default:
/* v07 style fixed size indexed meta data */
bdev->md.md_size_sect = MD_RESERVED_SECT;
@@ -533,7 +726,7 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
case DRBD_MD_INDEX_FLEX_INT:
bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
/* al size is still fixed */
- bdev->md.al_offset = -MD_AL_MAX_SIZE;
+ bdev->md.al_offset = -MD_AL_SECTORS;
/* we need (slightly less than) ~ this much bitmap sectors: */
md_size_sect = drbd_get_capacity(bdev->backing_bdev);
md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
@@ -549,6 +742,7 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET;
break;
}
+ rcu_read_unlock();
}
/* input size is expected to be in KB */
@@ -581,17 +775,23 @@ char *ppsize(char *buf, unsigned long long size)
* R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
* peer may not initiate a resize.
*/
+/* Note these are not to be confused with
+ * drbd_adm_suspend_io/drbd_adm_resume_io,
+ * which are (sub) state changes triggered by admin (drbdsetup),
+ * and can be long lived.
+ * This changes an mdev->flag, is triggered by drbd internals,
+ * and should be short-lived. */
void drbd_suspend_io(struct drbd_conf *mdev)
{
- drbd_set_flag(mdev, SUSPEND_IO);
- if (is_susp(mdev->state))
+ set_bit(SUSPEND_IO, &mdev->flags);
+ if (drbd_suspended(mdev))
return;
wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
}
void drbd_resume_io(struct drbd_conf *mdev)
{
- drbd_clear_flag(mdev, SUSPEND_IO);
+ clear_bit(SUSPEND_IO, &mdev->flags);
wake_up(&mdev->misc_wait);
}
@@ -605,7 +805,7 @@ void drbd_resume_io(struct drbd_conf *mdev)
enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
{
sector_t prev_first_sect, prev_size; /* previous meta location */
- sector_t la_size;
+ sector_t la_size, u_size;
sector_t size;
char ppb[10];
@@ -633,7 +833,10 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
/* TODO: should only be some assert here, not (re)init... */
drbd_md_set_sector_offsets(mdev, mdev->ldev);
- size = drbd_new_dev_size(mdev, mdev->ldev, flags & DDSF_FORCED);
+ rcu_read_lock();
+ u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
+ rcu_read_unlock();
+ size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED);
if (drbd_get_capacity(mdev->this_bdev) != size ||
drbd_bm_capacity(mdev) != size) {
@@ -696,12 +899,12 @@ out:
}
sector_t
-drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int assume_peer_has_space)
+drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
+ sector_t u_size, int assume_peer_has_space)
{
sector_t p_size = mdev->p_size; /* partner's disk size. */
sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */
sector_t m_size; /* my size */
- sector_t u_size = bdev->dc.disk_size; /* size requested by user. */
sector_t size = 0;
m_size = drbd_get_max_capacity(bdev);
@@ -750,24 +953,21 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int ass
* failed, and 0 on success. You should call drbd_md_sync() after you called
* this function.
*/
-static int drbd_check_al_size(struct drbd_conf *mdev)
+static int drbd_check_al_size(struct drbd_conf *mdev, struct disk_conf *dc)
{
struct lru_cache *n, *t;
struct lc_element *e;
unsigned int in_use;
int i;
- ERR_IF(mdev->sync_conf.al_extents < 7)
- mdev->sync_conf.al_extents = 127;
-
if (mdev->act_log &&
- mdev->act_log->nr_elements == mdev->sync_conf.al_extents)
+ mdev->act_log->nr_elements == dc->al_extents)
return 0;
in_use = 0;
t = mdev->act_log;
- n = lc_create("act_log", drbd_al_ext_cache,
- mdev->sync_conf.al_extents, sizeof(struct lc_element), 0);
+ n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION,
+ dc->al_extents, sizeof(struct lc_element), 0);
if (n == NULL) {
dev_err(DEV, "Cannot allocate act_log lru!\n");
@@ -808,7 +1008,9 @@ static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_
struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
- max_segments = mdev->ldev->dc.max_bio_bvecs;
+ rcu_read_lock();
+ max_segments = rcu_dereference(mdev->ldev->disk_conf)->max_bio_bvecs;
+ rcu_read_unlock();
put_ldev(mdev);
}
@@ -852,12 +1054,14 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev)
Because new from 8.3.8 onwards the peer can use multiple
BIOs for a single peer_request */
if (mdev->state.conn >= C_CONNECTED) {
- if (mdev->agreed_pro_version < 94) {
- peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+ if (mdev->tconn->agreed_pro_version < 94)
+ peer = min( mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
/* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
- } else if (mdev->agreed_pro_version == 94)
+ else if (mdev->tconn->agreed_pro_version == 94)
peer = DRBD_MAX_SIZE_H80_PACKET;
- else /* drbd 8.3.8 onwards */
+ else if (mdev->tconn->agreed_pro_version < 100)
+ peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */
+ else
peer = DRBD_MAX_BIO_SIZE;
}
@@ -872,36 +1076,27 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev)
drbd_setup_queue_param(mdev, new);
}
-/* serialize deconfig (worker exiting, doing cleanup)
- * and reconfig (drbdsetup disk, drbdsetup net)
- *
- * Wait for a potentially exiting worker, then restart it,
- * or start a new one. Flush any pending work, there may still be an
- * after_state_change queued.
- */
-static void drbd_reconfig_start(struct drbd_conf *mdev)
+/* Starts the worker thread */
+static void conn_reconfig_start(struct drbd_tconn *tconn)
{
- wait_event(mdev->state_wait, !drbd_test_and_set_flag(mdev, CONFIG_PENDING));
- wait_event(mdev->state_wait, !drbd_test_flag(mdev, DEVICE_DYING));
- drbd_thread_start(&mdev->worker);
- drbd_flush_workqueue(mdev);
+ drbd_thread_start(&tconn->worker);
+ conn_flush_workqueue(tconn);
}
-/* if still unconfigured, stops worker again.
- * if configured now, clears CONFIG_PENDING.
- * wakes potential waiters */
-static void drbd_reconfig_done(struct drbd_conf *mdev)
+/* if still unconfigured, stops worker again. */
+static void conn_reconfig_done(struct drbd_tconn *tconn)
{
- spin_lock_irq(&mdev->req_lock);
- if (mdev->state.disk == D_DISKLESS &&
- mdev->state.conn == C_STANDALONE &&
- mdev->state.role == R_SECONDARY) {
- drbd_set_flag(mdev, DEVICE_DYING);
- drbd_thread_stop_nowait(&mdev->worker);
- } else
- drbd_clear_flag(mdev, CONFIG_PENDING);
- spin_unlock_irq(&mdev->req_lock);
- wake_up(&mdev->state_wait);
+ bool stop_threads;
+ spin_lock_irq(&tconn->req_lock);
+ stop_threads = conn_all_vols_unconf(tconn) &&
+ tconn->cstate == C_STANDALONE;
+ spin_unlock_irq(&tconn->req_lock);
+ if (stop_threads) {
+ /* asender is implicitly stopped by receiver
+ * in conn_disconnect() */
+ drbd_thread_stop(&tconn->receiver);
+ drbd_thread_stop(&tconn->worker);
+ }
}
/* Make sure IO is suspended before calling this function(). */
@@ -909,42 +1104,182 @@ static void drbd_suspend_al(struct drbd_conf *mdev)
{
int s = 0;
- if (lc_try_lock(mdev->act_log)) {
- drbd_al_shrink(mdev);
- lc_unlock(mdev->act_log);
- } else {
+ if (!lc_try_lock(mdev->act_log)) {
dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n");
return;
}
- spin_lock_irq(&mdev->req_lock);
+ drbd_al_shrink(mdev);
+ spin_lock_irq(&mdev->tconn->req_lock);
if (mdev->state.conn < C_CONNECTED)
- s = !drbd_test_and_set_flag(mdev, AL_SUSPENDED);
-
- spin_unlock_irq(&mdev->req_lock);
+ s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags);
+ spin_unlock_irq(&mdev->tconn->req_lock);
+ lc_unlock(mdev->act_log);
if (s)
dev_info(DEV, "Suspended AL updates\n");
}
-/* does always return 0;
- * interesting return code is in reply->ret_code */
-static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+
+static bool should_set_defaults(struct genl_info *info)
+{
+ unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags;
+ return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS);
+}
+
+static void enforce_disk_conf_limits(struct disk_conf *dc)
+{
+ if (dc->al_extents < DRBD_AL_EXTENTS_MIN)
+ dc->al_extents = DRBD_AL_EXTENTS_MIN;
+ if (dc->al_extents > DRBD_AL_EXTENTS_MAX)
+ dc->al_extents = DRBD_AL_EXTENTS_MAX;
+
+ if (dc->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
+ dc->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
+}
+
+int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
{
enum drbd_ret_code retcode;
+ struct drbd_conf *mdev;
+ struct disk_conf *new_disk_conf, *old_disk_conf;
+ struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
+ int err, fifo_size;
+
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mdev = adm_ctx.mdev;
+
+ /* we also need a disk
+ * to change the options on */
+ if (!get_ldev(mdev)) {
+ retcode = ERR_NO_DISK;
+ goto out;
+ }
+
+ new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+
+ mutex_lock(&mdev->tconn->conf_update);
+ old_disk_conf = mdev->ldev->disk_conf;
+ *new_disk_conf = *old_disk_conf;
+ if (should_set_defaults(info))
+ set_disk_conf_defaults(new_disk_conf);
+
+ err = disk_conf_from_attrs_for_change(new_disk_conf, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
+ }
+
+ if (!expect(new_disk_conf->resync_rate >= 1))
+ new_disk_conf->resync_rate = 1;
+
+ enforce_disk_conf_limits(new_disk_conf);
+
+ fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+ if (fifo_size != mdev->rs_plan_s->size) {
+ new_plan = fifo_alloc(fifo_size);
+ if (!new_plan) {
+ dev_err(DEV, "kmalloc of fifo_buffer failed");
+ retcode = ERR_NOMEM;
+ goto fail_unlock;
+ }
+ }
+
+ drbd_suspend_io(mdev);
+ wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+ drbd_al_shrink(mdev);
+ err = drbd_check_al_size(mdev, new_disk_conf);
+ lc_unlock(mdev->act_log);
+ wake_up(&mdev->al_wait);
+ drbd_resume_io(mdev);
+
+ if (err) {
+ retcode = ERR_NOMEM;
+ goto fail_unlock;
+ }
+
+ write_lock_irq(&global_state_lock);
+ retcode = drbd_resync_after_valid(mdev, new_disk_conf->resync_after);
+ if (retcode == NO_ERROR) {
+ rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
+ drbd_resync_after_changed(mdev);
+ }
+ write_unlock_irq(&global_state_lock);
+
+ if (retcode != NO_ERROR)
+ goto fail_unlock;
+
+ if (new_plan) {
+ old_plan = mdev->rs_plan_s;
+ rcu_assign_pointer(mdev->rs_plan_s, new_plan);
+ }
+
+ mutex_unlock(&mdev->tconn->conf_update);
+
+ if (new_disk_conf->al_updates)
+ mdev->ldev->md.flags &= ~MDF_AL_DISABLED;
+ else
+ mdev->ldev->md.flags |= MDF_AL_DISABLED;
+
+ drbd_bump_write_ordering(mdev->tconn, WO_bdev_flush);
+
+ drbd_md_sync(mdev);
+
+ if (mdev->state.conn >= C_CONNECTED)
+ drbd_send_sync_param(mdev);
+
+ synchronize_rcu();
+ kfree(old_disk_conf);
+ kfree(old_plan);
+ mod_timer(&mdev->request_timer, jiffies + HZ);
+ goto success;
+
+fail_unlock:
+ mutex_unlock(&mdev->tconn->conf_update);
+ fail:
+ kfree(new_disk_conf);
+ kfree(new_plan);
+success:
+ put_ldev(mdev);
+ out:
+ drbd_adm_finish(info, retcode);
+ return 0;
+}
+
+int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_conf *mdev;
+ int err;
+ enum drbd_ret_code retcode;
enum determine_dev_size dd;
sector_t max_possible_sectors;
sector_t min_md_device_sectors;
struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
+ struct disk_conf *new_disk_conf = NULL;
struct block_device *bdev;
struct lru_cache *resync_lru = NULL;
+ struct fifo_buffer *new_plan = NULL;
union drbd_state ns, os;
enum drbd_state_rv rv;
- int cp_discovered = 0;
- int logical_block_size;
+ struct net_conf *nc;
- drbd_reconfig_start(mdev);
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto finish;
+
+ mdev = adm_ctx.mdev;
+ conn_reconfig_start(mdev->tconn);
/* if you want to reconfigure, please tear down first */
if (mdev->state.disk > D_DISKLESS) {
@@ -958,52 +1293,66 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
/* make sure there is no leftover from previous force-detach attempts */
- drbd_clear_flag(mdev, FORCE_DETACH);
- drbd_clear_flag(mdev, WAS_IO_ERROR);
- drbd_clear_flag(mdev, WAS_READ_ERROR);
+ clear_bit(FORCE_DETACH, &mdev->flags);
+ clear_bit(WAS_IO_ERROR, &mdev->flags);
+ clear_bit(WAS_READ_ERROR, &mdev->flags);
/* and no leftover from previously aborted resync or verify, either */
mdev->rs_total = 0;
mdev->rs_failed = 0;
atomic_set(&mdev->rs_pending_cnt, 0);
- /* allocation not in the IO path, cqueue thread context */
+ /* allocation not in the IO path, drbdsetup context */
nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
if (!nbc) {
retcode = ERR_NOMEM;
goto fail;
}
-
- nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF;
- nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF;
- nbc->dc.fencing = DRBD_FENCING_DEF;
- nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF;
-
spin_lock_init(&nbc->md.uuid_lock);
- if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) {
+ new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+ nbc->disk_conf = new_disk_conf;
+
+ set_disk_conf_defaults(new_disk_conf);
+ err = disk_conf_from_attrs(new_disk_conf, info);
+ if (err) {
retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
goto fail;
}
- if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
+ enforce_disk_conf_limits(new_disk_conf);
+
+ new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ);
+ if (!new_plan) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+
+ if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
retcode = ERR_MD_IDX_INVALID;
goto fail;
}
- if (get_net_conf(mdev)) {
- int prot = mdev->net_conf->wire_protocol;
- put_net_conf(mdev);
- if (nbc->dc.fencing == FP_STONITH && prot == DRBD_PROT_A) {
+ rcu_read_lock();
+ nc = rcu_dereference(mdev->tconn->net_conf);
+ if (nc) {
+ if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
+ rcu_read_unlock();
retcode = ERR_STONITH_AND_PROT_A;
goto fail;
}
}
+ rcu_read_unlock();
- bdev = blkdev_get_by_path(nbc->dc.backing_dev,
+ bdev = blkdev_get_by_path(new_disk_conf->backing_dev,
FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev);
if (IS_ERR(bdev)) {
- dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
+ dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev,
PTR_ERR(bdev));
retcode = ERR_OPEN_DISK;
goto fail;
@@ -1018,12 +1367,12 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
* should check it for you already; but if you don't, or
* someone fooled it, we need to double check here)
*/
- bdev = blkdev_get_by_path(nbc->dc.meta_dev,
+ bdev = blkdev_get_by_path(new_disk_conf->meta_dev,
FMODE_READ | FMODE_WRITE | FMODE_EXCL,
- (nbc->dc.meta_dev_idx < 0) ?
+ (new_disk_conf->meta_dev_idx < 0) ?
(void *)mdev : (void *)drbd_m_holder);
if (IS_ERR(bdev)) {
- dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
+ dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev,
PTR_ERR(bdev));
retcode = ERR_OPEN_MD_DISK;
goto fail;
@@ -1031,14 +1380,14 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
nbc->md_bdev = bdev;
if ((nbc->backing_bdev == nbc->md_bdev) !=
- (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
- nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
+ (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+ new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
retcode = ERR_MD_IDX_INVALID;
goto fail;
}
resync_lru = lc_create("resync", drbd_bm_ext_cache,
- 61, sizeof(struct bm_extent),
+ 1, 61, sizeof(struct bm_extent),
offsetof(struct bm_extent, lce));
if (!resync_lru) {
retcode = ERR_NOMEM;
@@ -1048,21 +1397,21 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
/* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
drbd_md_set_sector_offsets(mdev, nbc);
- if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) {
+ if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
(unsigned long long) drbd_get_max_capacity(nbc),
- (unsigned long long) nbc->dc.disk_size);
+ (unsigned long long) new_disk_conf->disk_size);
retcode = ERR_DISK_TOO_SMALL;
goto fail;
}
- if (nbc->dc.meta_dev_idx < 0) {
+ if (new_disk_conf->meta_dev_idx < 0) {
max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
/* at least one MB, otherwise it does not make sense */
min_md_device_sectors = (2<<10);
} else {
max_possible_sectors = DRBD_MAX_SECTORS;
- min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1);
+ min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1);
}
if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
@@ -1087,14 +1436,20 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
dev_warn(DEV, "==> truncating very big lower level device "
"to currently maximum possible %llu sectors <==\n",
(unsigned long long) max_possible_sectors);
- if (nbc->dc.meta_dev_idx >= 0)
+ if (new_disk_conf->meta_dev_idx >= 0)
dev_warn(DEV, "==>> using internal or flexible "
"meta data may help <<==\n");
}
drbd_suspend_io(mdev);
/* also wait for the last barrier ack. */
- wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || is_susp(mdev->state));
+ /* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171
+ * We need a way to either ignore barrier acks for barriers sent before a device
+ * was attached, or a way to wait for all pending barrier acks to come in.
+ * As barriers are counted per resource,
+ * we'd need to suspend io on all devices of a resource.
+ */
+ wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || drbd_suspended(mdev));
/* and for any other previously queued work */
drbd_flush_workqueue(mdev);
@@ -1109,25 +1464,6 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
drbd_md_set_sector_offsets(mdev, nbc);
- /* allocate a second IO page if logical_block_size != 512 */
- logical_block_size = bdev_logical_block_size(nbc->md_bdev);
- if (logical_block_size == 0)
- logical_block_size = MD_SECTOR_SIZE;
-
- if (logical_block_size != MD_SECTOR_SIZE) {
- if (!mdev->md_io_tmpp) {
- struct page *page = alloc_page(GFP_NOIO);
- if (!page)
- goto force_diskless_dec;
-
- dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n",
- logical_block_size, MD_SECTOR_SIZE);
- dev_warn(DEV, "Workaround engaged (has performance impact).\n");
-
- mdev->md_io_tmpp = page;
- }
- }
-
if (!mdev->bitmap) {
if (drbd_bm_init(mdev)) {
retcode = ERR_NOMEM;
@@ -1149,30 +1485,25 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
}
/* Since we are diskless, fix the activity log first... */
- if (drbd_check_al_size(mdev)) {
+ if (drbd_check_al_size(mdev, new_disk_conf)) {
retcode = ERR_NOMEM;
goto force_diskless_dec;
}
/* Prevent shrinking of consistent devices ! */
if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
- drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) {
+ drbd_new_dev_size(mdev, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) {
dev_warn(DEV, "refusing to truncate a consistent device\n");
retcode = ERR_DISK_TOO_SMALL;
goto force_diskless_dec;
}
- if (!drbd_al_read_log(mdev, nbc)) {
- retcode = ERR_IO_MD_DISK;
- goto force_diskless_dec;
- }
-
/* Reset the "barriers don't work" bits here, then force meta data to
* be written, to ensure we determine if barriers are supported. */
- if (nbc->dc.no_md_flush)
- drbd_set_flag(mdev, MD_NO_FUA);
+ if (new_disk_conf->md_flushes)
+ clear_bit(MD_NO_FUA, &mdev->flags);
else
- drbd_clear_flag(mdev, MD_NO_FUA);
+ set_bit(MD_NO_FUA, &mdev->flags);
/* Point of no return reached.
* Devices and memory are no longer released by error cleanup below.
@@ -1181,22 +1512,22 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
D_ASSERT(mdev->ldev == NULL);
mdev->ldev = nbc;
mdev->resync = resync_lru;
+ mdev->rs_plan_s = new_plan;
nbc = NULL;
resync_lru = NULL;
+ new_disk_conf = NULL;
+ new_plan = NULL;
- mdev->write_ordering = WO_bdev_flush;
- drbd_bump_write_ordering(mdev, WO_bdev_flush);
+ drbd_bump_write_ordering(mdev->tconn, WO_bdev_flush);
if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
- drbd_set_flag(mdev, CRASHED_PRIMARY);
+ set_bit(CRASHED_PRIMARY, &mdev->flags);
else
- drbd_clear_flag(mdev, CRASHED_PRIMARY);
+ clear_bit(CRASHED_PRIMARY, &mdev->flags);
if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
- !(mdev->state.role == R_PRIMARY && mdev->state.susp_nod)) {
- drbd_set_flag(mdev, CRASHED_PRIMARY);
- cp_discovered = 1;
- }
+ !(mdev->state.role == R_PRIMARY && mdev->tconn->susp_nod))
+ set_bit(CRASHED_PRIMARY, &mdev->flags);
mdev->send_cnt = 0;
mdev->recv_cnt = 0;
@@ -1219,20 +1550,22 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
* so we can automatically recover from a crash of a
* degraded but active "cluster" after a certain timeout.
*/
- drbd_clear_flag(mdev, USE_DEGR_WFC_T);
+ clear_bit(USE_DEGR_WFC_T, &mdev->flags);
if (mdev->state.role != R_PRIMARY &&
drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
!drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
- drbd_set_flag(mdev, USE_DEGR_WFC_T);
+ set_bit(USE_DEGR_WFC_T, &mdev->flags);
dd = drbd_determine_dev_size(mdev, 0);
if (dd == dev_size_error) {
retcode = ERR_NOMEM_BITMAP;
goto force_diskless_dec;
} else if (dd == grew)
- drbd_set_flag(mdev, RESYNC_AFTER_NEG);
+ set_bit(RESYNC_AFTER_NEG, &mdev->flags);
- if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
+ if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC) ||
+ (test_bit(CRASHED_PRIMARY, &mdev->flags) &&
+ drbd_md_test_flag(mdev->ldev, MDF_AL_DISABLED))) {
dev_info(DEV, "Assuming that all blocks are out of sync "
"(aka FullSync)\n");
if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
@@ -1242,16 +1575,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
}
} else {
if (drbd_bitmap_io(mdev, &drbd_bm_read,
- "read from attaching", BM_LOCKED_MASK) < 0) {
- retcode = ERR_IO_MD_DISK;
- goto force_diskless_dec;
- }
- }
-
- if (cp_discovered) {
- drbd_al_apply_to_bm(mdev);
- if (drbd_bitmap_io(mdev, &drbd_bm_write,
- "crashed primary apply AL", BM_LOCKED_MASK)) {
+ "read from attaching", BM_LOCKED_MASK)) {
retcode = ERR_IO_MD_DISK;
goto force_diskless_dec;
}
@@ -1260,9 +1584,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev))
drbd_suspend_al(mdev); /* IO is still suspended here... */
- spin_lock_irq(&mdev->req_lock);
- os = mdev->state;
- ns.i = os.i;
+ spin_lock_irq(&mdev->tconn->req_lock);
+ os = drbd_read_state(mdev);
+ ns = os;
/* If MDF_CONSISTENT is not set go into inconsistent state,
otherwise investigate MDF_WasUpToDate...
If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
@@ -1280,8 +1604,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED))
ns.pdsk = D_OUTDATED;
- if ( ns.disk == D_CONSISTENT &&
- (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE))
+ rcu_read_lock();
+ if (ns.disk == D_CONSISTENT &&
+ (ns.pdsk == D_OUTDATED || rcu_dereference(mdev->ldev->disk_conf)->fencing == FP_DONT_CARE))
ns.disk = D_UP_TO_DATE;
/* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
@@ -1289,6 +1614,13 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
this point, because drbd_request_state() modifies these
flags. */
+ if (rcu_dereference(mdev->ldev->disk_conf)->al_updates)
+ mdev->ldev->md.flags &= ~MDF_AL_DISABLED;
+ else
+ mdev->ldev->md.flags |= MDF_AL_DISABLED;
+
+ rcu_read_unlock();
+
/* In case we are C_CONNECTED postpone any decision on the new disk
state after the negotiation phase. */
if (mdev->state.conn == C_CONNECTED) {
@@ -1304,12 +1636,13 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
}
rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
- ns = mdev->state;
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
if (rv < SS_SUCCESS)
goto force_diskless_dec;
+ mod_timer(&mdev->request_timer, jiffies + HZ);
+
if (mdev->state.role == R_PRIMARY)
mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1;
else
@@ -1320,16 +1653,17 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
put_ldev(mdev);
- reply->ret_code = retcode;
- drbd_reconfig_done(mdev);
+ conn_reconfig_done(mdev->tconn);
+ drbd_adm_finish(info, retcode);
return 0;
force_diskless_dec:
put_ldev(mdev);
force_diskless:
- drbd_force_state(mdev, NS(disk, D_FAILED));
+ drbd_force_state(mdev, NS(disk, D_DISKLESS));
drbd_md_sync(mdev);
fail:
+ conn_reconfig_done(mdev->tconn);
if (nbc) {
if (nbc->backing_bdev)
blkdev_put(nbc->backing_bdev,
@@ -1339,34 +1673,24 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
FMODE_READ | FMODE_WRITE | FMODE_EXCL);
kfree(nbc);
}
+ kfree(new_disk_conf);
lc_destroy(resync_lru);
+ kfree(new_plan);
- reply->ret_code = retcode;
- drbd_reconfig_done(mdev);
+ finish:
+ drbd_adm_finish(info, retcode);
return 0;
}
-/* Detaching the disk is a process in multiple stages. First we need to lock
- * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
- * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
- * internal references as well.
- * Only then we have finally detached. */
-static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+static int adm_detach(struct drbd_conf *mdev, int force)
{
- enum drbd_ret_code retcode;
+ enum drbd_state_rv retcode;
int ret;
- struct detach dt = {};
- if (!detach_from_tags(mdev, nlp->tag_list, &dt)) {
- reply->ret_code = ERR_MANDATORY_TAG;
- goto out;
- }
-
- if (dt.detach_force) {
- drbd_set_flag(mdev, FORCE_DETACH);
+ if (force) {
+ set_bit(FORCE_DETACH, &mdev->flags);
drbd_force_state(mdev, NS(disk, D_FAILED));
- reply->ret_code = SS_SUCCESS;
+ retcode = SS_SUCCESS;
goto out;
}
@@ -1378,326 +1702,529 @@ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
ret = wait_event_interruptible(mdev->misc_wait,
mdev->state.disk != D_FAILED);
drbd_resume_io(mdev);
-
if ((int)retcode == (int)SS_IS_DISKLESS)
retcode = SS_NOTHING_TO_DO;
if (ret)
retcode = ERR_INTR;
- reply->ret_code = retcode;
out:
- return 0;
+ return retcode;
}
-static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+/* Detaching the disk is a process in multiple stages. First we need to lock
+ * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
+ * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
+ * internal references as well.
+ * Only then we have finally detached. */
+int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info)
{
- int i, ns;
enum drbd_ret_code retcode;
- struct net_conf *new_conf = NULL;
- struct crypto_hash *tfm = NULL;
- struct crypto_hash *integrity_w_tfm = NULL;
- struct crypto_hash *integrity_r_tfm = NULL;
- struct hlist_head *new_tl_hash = NULL;
- struct hlist_head *new_ee_hash = NULL;
- struct drbd_conf *odev;
- char hmac_name[CRYPTO_MAX_ALG_NAME];
- void *int_dig_out = NULL;
- void *int_dig_in = NULL;
- void *int_dig_vv = NULL;
- struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr;
+ struct detach_parms parms = { };
+ int err;
- drbd_reconfig_start(mdev);
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- if (mdev->state.conn > C_STANDALONE) {
- retcode = ERR_NET_CONFIGURED;
- goto fail;
+ if (info->attrs[DRBD_NLA_DETACH_PARMS]) {
+ err = detach_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
+ goto out;
+ }
+ }
+
+ retcode = adm_detach(adm_ctx.mdev, parms.force_detach);
+out:
+ drbd_adm_finish(info, retcode);
+ return 0;
+}
+
+static bool conn_resync_running(struct drbd_tconn *tconn)
+{
+ struct drbd_conf *mdev;
+ bool rv = false;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ if (mdev->state.conn == C_SYNC_SOURCE ||
+ mdev->state.conn == C_SYNC_TARGET ||
+ mdev->state.conn == C_PAUSED_SYNC_S ||
+ mdev->state.conn == C_PAUSED_SYNC_T) {
+ rv = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return rv;
+}
+
+static bool conn_ov_running(struct drbd_tconn *tconn)
+{
+ struct drbd_conf *mdev;
+ bool rv = false;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ if (mdev->state.conn == C_VERIFY_S ||
+ mdev->state.conn == C_VERIFY_T) {
+ rv = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return rv;
+}
+
+static enum drbd_ret_code
+_check_net_options(struct drbd_tconn *tconn, struct net_conf *old_conf, struct net_conf *new_conf)
+{
+ struct drbd_conf *mdev;
+ int i;
+
+ if (old_conf && tconn->cstate == C_WF_REPORT_PARAMS && tconn->agreed_pro_version < 100) {
+ if (new_conf->wire_protocol != old_conf->wire_protocol)
+ return ERR_NEED_APV_100;
+
+ if (new_conf->two_primaries != old_conf->two_primaries)
+ return ERR_NEED_APV_100;
+
+ if (strcmp(new_conf->integrity_alg, old_conf->integrity_alg))
+ return ERR_NEED_APV_100;
+ }
+
+ if (!new_conf->two_primaries &&
+ conn_highest_role(tconn) == R_PRIMARY &&
+ conn_highest_peer(tconn) == R_PRIMARY)
+ return ERR_NEED_ALLOW_TWO_PRI;
+
+ if (new_conf->two_primaries &&
+ (new_conf->wire_protocol != DRBD_PROT_C))
+ return ERR_NOT_PROTO_C;
+
+ idr_for_each_entry(&tconn->volumes, mdev, i) {
+ if (get_ldev(mdev)) {
+ enum drbd_fencing_p fp = rcu_dereference(mdev->ldev->disk_conf)->fencing;
+ put_ldev(mdev);
+ if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH)
+ return ERR_STONITH_AND_PROT_A;
+ }
+ if (mdev->state.role == R_PRIMARY && new_conf->discard_my_data)
+ return ERR_DISCARD_IMPOSSIBLE;
+ }
+
+ if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A)
+ return ERR_CONG_NOT_PROTO_A;
+
+ return NO_ERROR;
+}
+
+static enum drbd_ret_code
+check_net_options(struct drbd_tconn *tconn, struct net_conf *new_conf)
+{
+ static enum drbd_ret_code rv;
+ struct drbd_conf *mdev;
+ int i;
+
+ rcu_read_lock();
+ rv = _check_net_options(tconn, rcu_dereference(tconn->net_conf), new_conf);
+ rcu_read_unlock();
+
+ /* tconn->volumes protected by genl_lock() here */
+ idr_for_each_entry(&tconn->volumes, mdev, i) {
+ if (!mdev->bitmap) {
+ if(drbd_bm_init(mdev))
+ return ERR_NOMEM;
+ }
+ }
+
+ return rv;
+}
+
+struct crypto {
+ struct crypto_hash *verify_tfm;
+ struct crypto_hash *csums_tfm;
+ struct crypto_hash *cram_hmac_tfm;
+ struct crypto_hash *integrity_tfm;
+};
+
+static int
+alloc_hash(struct crypto_hash **tfm, char *tfm_name, int err_alg)
+{
+ if (!tfm_name[0])
+ return NO_ERROR;
+
+ *tfm = crypto_alloc_hash(tfm_name, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(*tfm)) {
+ *tfm = NULL;
+ return err_alg;
}
- /* allocation not in the IO path, cqueue thread context */
+ return NO_ERROR;
+}
+
+static enum drbd_ret_code
+alloc_crypto(struct crypto *crypto, struct net_conf *new_conf)
+{
+ char hmac_name[CRYPTO_MAX_ALG_NAME];
+ enum drbd_ret_code rv;
+
+ rv = alloc_hash(&crypto->csums_tfm, new_conf->csums_alg,
+ ERR_CSUMS_ALG);
+ if (rv != NO_ERROR)
+ return rv;
+ rv = alloc_hash(&crypto->verify_tfm, new_conf->verify_alg,
+ ERR_VERIFY_ALG);
+ if (rv != NO_ERROR)
+ return rv;
+ rv = alloc_hash(&crypto->integrity_tfm, new_conf->integrity_alg,
+ ERR_INTEGRITY_ALG);
+ if (rv != NO_ERROR)
+ return rv;
+ if (new_conf->cram_hmac_alg[0] != 0) {
+ snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
+ new_conf->cram_hmac_alg);
+
+ rv = alloc_hash(&crypto->cram_hmac_tfm, hmac_name,
+ ERR_AUTH_ALG);
+ }
+
+ return rv;
+}
+
+static void free_crypto(struct crypto *crypto)
+{
+ crypto_free_hash(crypto->cram_hmac_tfm);
+ crypto_free_hash(crypto->integrity_tfm);
+ crypto_free_hash(crypto->csums_tfm);
+ crypto_free_hash(crypto->verify_tfm);
+}
+
+int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
+{
+ enum drbd_ret_code retcode;
+ struct drbd_tconn *tconn;
+ struct net_conf *old_conf, *new_conf = NULL;
+ int err;
+ int ovr; /* online verify running */
+ int rsr; /* re-sync running */
+ struct crypto crypto = { };
+
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ tconn = adm_ctx.tconn;
+
new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
if (!new_conf) {
retcode = ERR_NOMEM;
+ goto out;
+ }
+
+ conn_reconfig_start(tconn);
+
+ mutex_lock(&tconn->data.mutex);
+ mutex_lock(&tconn->conf_update);
+ old_conf = tconn->net_conf;
+
+ if (!old_conf) {
+ drbd_msg_put_info("net conf missing, try connect");
+ retcode = ERR_INVALID_REQUEST;
goto fail;
}
- new_conf->timeout = DRBD_TIMEOUT_DEF;
- new_conf->try_connect_int = DRBD_CONNECT_INT_DEF;
- new_conf->ping_int = DRBD_PING_INT_DEF;
- new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF;
- new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF;
- new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF;
- new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF;
- new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF;
- new_conf->ko_count = DRBD_KO_COUNT_DEF;
- new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF;
- new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF;
- new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF;
- new_conf->want_lose = 0;
- new_conf->two_primaries = 0;
- new_conf->wire_protocol = DRBD_PROT_C;
- new_conf->ping_timeo = DRBD_PING_TIMEO_DEF;
- new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF;
- new_conf->on_congestion = DRBD_ON_CONGESTION_DEF;
- new_conf->cong_extents = DRBD_CONG_EXTENTS_DEF;
-
- if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) {
+ *new_conf = *old_conf;
+ if (should_set_defaults(info))
+ set_net_conf_defaults(new_conf);
+
+ err = net_conf_from_attrs_for_change(new_conf, info);
+ if (err && err != -ENOMSG) {
retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
goto fail;
}
- if (new_conf->two_primaries
- && (new_conf->wire_protocol != DRBD_PROT_C)) {
- retcode = ERR_NOT_PROTO_C;
+ retcode = check_net_options(tconn, new_conf);
+ if (retcode != NO_ERROR)
goto fail;
- }
- if (get_ldev(mdev)) {
- enum drbd_fencing_p fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
- if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) {
- retcode = ERR_STONITH_AND_PROT_A;
- goto fail;
- }
+ /* re-sync running */
+ rsr = conn_resync_running(tconn);
+ if (rsr && strcmp(new_conf->csums_alg, old_conf->csums_alg)) {
+ retcode = ERR_CSUMS_RESYNC_RUNNING;
+ goto fail;
}
- if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) {
- retcode = ERR_CONG_NOT_PROTO_A;
+ /* online verify running */
+ ovr = conn_ov_running(tconn);
+ if (ovr && strcmp(new_conf->verify_alg, old_conf->verify_alg)) {
+ retcode = ERR_VERIFY_RUNNING;
goto fail;
}
- if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
- retcode = ERR_DISCARD;
+ retcode = alloc_crypto(&crypto, new_conf);
+ if (retcode != NO_ERROR)
goto fail;
- }
- retcode = NO_ERROR;
+ rcu_assign_pointer(tconn->net_conf, new_conf);
- new_my_addr = (struct sockaddr *)&new_conf->my_addr;
- new_peer_addr = (struct sockaddr *)&new_conf->peer_addr;
- for (i = 0; i < minor_count; i++) {
- odev = minor_to_mdev(i);
- if (!odev || odev == mdev)
- continue;
- if (get_net_conf(odev)) {
- taken_addr = (struct sockaddr *)&odev->net_conf->my_addr;
- if (new_conf->my_addr_len == odev->net_conf->my_addr_len &&
- !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len))
- retcode = ERR_LOCAL_ADDR;
-
- taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr;
- if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len &&
- !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len))
- retcode = ERR_PEER_ADDR;
-
- put_net_conf(odev);
- if (retcode != NO_ERROR)
- goto fail;
- }
+ if (!rsr) {
+ crypto_free_hash(tconn->csums_tfm);
+ tconn->csums_tfm = crypto.csums_tfm;
+ crypto.csums_tfm = NULL;
+ }
+ if (!ovr) {
+ crypto_free_hash(tconn->verify_tfm);
+ tconn->verify_tfm = crypto.verify_tfm;
+ crypto.verify_tfm = NULL;
}
- if (new_conf->cram_hmac_alg[0] != 0) {
- snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
- new_conf->cram_hmac_alg);
- tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm)) {
- tfm = NULL;
- retcode = ERR_AUTH_ALG;
- goto fail;
- }
+ crypto_free_hash(tconn->integrity_tfm);
+ tconn->integrity_tfm = crypto.integrity_tfm;
+ if (tconn->cstate >= C_WF_REPORT_PARAMS && tconn->agreed_pro_version >= 100)
+ /* Do this without trying to take tconn->data.mutex again. */
+ __drbd_send_protocol(tconn, P_PROTOCOL_UPDATE);
- if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
- retcode = ERR_AUTH_ALG_ND;
- goto fail;
- }
- }
+ crypto_free_hash(tconn->cram_hmac_tfm);
+ tconn->cram_hmac_tfm = crypto.cram_hmac_tfm;
- if (new_conf->integrity_alg[0]) {
- integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(integrity_w_tfm)) {
- integrity_w_tfm = NULL;
- retcode=ERR_INTEGRITY_ALG;
- goto fail;
- }
+ mutex_unlock(&tconn->conf_update);
+ mutex_unlock(&tconn->data.mutex);
+ synchronize_rcu();
+ kfree(old_conf);
- if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) {
- retcode=ERR_INTEGRITY_ALG_ND;
- goto fail;
- }
+ if (tconn->cstate >= C_WF_REPORT_PARAMS)
+ drbd_send_sync_param(minor_to_mdev(conn_lowest_minor(tconn)));
- integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(integrity_r_tfm)) {
- integrity_r_tfm = NULL;
- retcode=ERR_INTEGRITY_ALG;
- goto fail;
- }
+ goto done;
+
+ fail:
+ mutex_unlock(&tconn->conf_update);
+ mutex_unlock(&tconn->data.mutex);
+ free_crypto(&crypto);
+ kfree(new_conf);
+ done:
+ conn_reconfig_done(tconn);
+ out:
+ drbd_adm_finish(info, retcode);
+ return 0;
+}
+
+int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_conf *mdev;
+ struct net_conf *old_conf, *new_conf = NULL;
+ struct crypto crypto = { };
+ struct drbd_tconn *tconn;
+ enum drbd_ret_code retcode;
+ int i;
+ int err;
+
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
+
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+ if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) {
+ drbd_msg_put_info("connection endpoint(s) missing");
+ retcode = ERR_INVALID_REQUEST;
+ goto out;
}
- ns = new_conf->max_epoch_size/8;
- if (mdev->tl_hash_s != ns) {
- new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
- if (!new_tl_hash) {
- retcode = ERR_NOMEM;
- goto fail;
+ /* No need for _rcu here. All reconfiguration is
+ * strictly serialized on genl_lock(). We are protected against
+ * concurrent reconfiguration/addition/deletion */
+ list_for_each_entry(tconn, &drbd_tconns, all_tconn) {
+ if (nla_len(adm_ctx.my_addr) == tconn->my_addr_len &&
+ !memcmp(nla_data(adm_ctx.my_addr), &tconn->my_addr, tconn->my_addr_len)) {
+ retcode = ERR_LOCAL_ADDR;
+ goto out;
}
- }
- ns = new_conf->max_buffers/8;
- if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) {
- new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
- if (!new_ee_hash) {
- retcode = ERR_NOMEM;
- goto fail;
+ if (nla_len(adm_ctx.peer_addr) == tconn->peer_addr_len &&
+ !memcmp(nla_data(adm_ctx.peer_addr), &tconn->peer_addr, tconn->peer_addr_len)) {
+ retcode = ERR_PEER_ADDR;
+ goto out;
}
}
- ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;
+ tconn = adm_ctx.tconn;
+ conn_reconfig_start(tconn);
- if (integrity_w_tfm) {
- i = crypto_hash_digestsize(integrity_w_tfm);
- int_dig_out = kmalloc(i, GFP_KERNEL);
- if (!int_dig_out) {
- retcode = ERR_NOMEM;
- goto fail;
- }
- int_dig_in = kmalloc(i, GFP_KERNEL);
- if (!int_dig_in) {
- retcode = ERR_NOMEM;
- goto fail;
- }
- int_dig_vv = kmalloc(i, GFP_KERNEL);
- if (!int_dig_vv) {
- retcode = ERR_NOMEM;
- goto fail;
- }
+ if (tconn->cstate > C_STANDALONE) {
+ retcode = ERR_NET_CONFIGURED;
+ goto fail;
}
- if (!mdev->bitmap) {
- if(drbd_bm_init(mdev)) {
- retcode = ERR_NOMEM;
- goto fail;
- }
+ /* allocation not in the IO path, drbdsetup / netlink process context */
+ new_conf = kzalloc(sizeof(*new_conf), GFP_KERNEL);
+ if (!new_conf) {
+ retcode = ERR_NOMEM;
+ goto fail;
}
- drbd_flush_workqueue(mdev);
- spin_lock_irq(&mdev->req_lock);
- if (mdev->net_conf != NULL) {
- retcode = ERR_NET_CONFIGURED;
- spin_unlock_irq(&mdev->req_lock);
+ set_net_conf_defaults(new_conf);
+
+ err = net_conf_from_attrs(new_conf, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
goto fail;
}
- mdev->net_conf = new_conf;
- mdev->send_cnt = 0;
- mdev->recv_cnt = 0;
+ retcode = check_net_options(tconn, new_conf);
+ if (retcode != NO_ERROR)
+ goto fail;
- if (new_tl_hash) {
- kfree(mdev->tl_hash);
- mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8;
- mdev->tl_hash = new_tl_hash;
- }
+ retcode = alloc_crypto(&crypto, new_conf);
+ if (retcode != NO_ERROR)
+ goto fail;
- if (new_ee_hash) {
- kfree(mdev->ee_hash);
- mdev->ee_hash_s = mdev->net_conf->max_buffers/8;
- mdev->ee_hash = new_ee_hash;
+ ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;
+
+ conn_flush_workqueue(tconn);
+
+ mutex_lock(&tconn->conf_update);
+ old_conf = tconn->net_conf;
+ if (old_conf) {
+ retcode = ERR_NET_CONFIGURED;
+ mutex_unlock(&tconn->conf_update);
+ goto fail;
}
+ rcu_assign_pointer(tconn->net_conf, new_conf);
- crypto_free_hash(mdev->cram_hmac_tfm);
- mdev->cram_hmac_tfm = tfm;
+ conn_free_crypto(tconn);
+ tconn->cram_hmac_tfm = crypto.cram_hmac_tfm;
+ tconn->integrity_tfm = crypto.integrity_tfm;
+ tconn->csums_tfm = crypto.csums_tfm;
+ tconn->verify_tfm = crypto.verify_tfm;
- crypto_free_hash(mdev->integrity_w_tfm);
- mdev->integrity_w_tfm = integrity_w_tfm;
+ tconn->my_addr_len = nla_len(adm_ctx.my_addr);
+ memcpy(&tconn->my_addr, nla_data(adm_ctx.my_addr), tconn->my_addr_len);
+ tconn->peer_addr_len = nla_len(adm_ctx.peer_addr);
+ memcpy(&tconn->peer_addr, nla_data(adm_ctx.peer_addr), tconn->peer_addr_len);
- crypto_free_hash(mdev->integrity_r_tfm);
- mdev->integrity_r_tfm = integrity_r_tfm;
+ mutex_unlock(&tconn->conf_update);
- kfree(mdev->int_dig_out);
- kfree(mdev->int_dig_in);
- kfree(mdev->int_dig_vv);
- mdev->int_dig_out=int_dig_out;
- mdev->int_dig_in=int_dig_in;
- mdev->int_dig_vv=int_dig_vv;
- retcode = _drbd_set_state(_NS(mdev, conn, C_UNCONNECTED), CS_VERBOSE, NULL);
- spin_unlock_irq(&mdev->req_lock);
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, i) {
+ mdev->send_cnt = 0;
+ mdev->recv_cnt = 0;
+ }
+ rcu_read_unlock();
- kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
- reply->ret_code = retcode;
- drbd_reconfig_done(mdev);
+ retcode = conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE);
+
+ conn_reconfig_done(tconn);
+ drbd_adm_finish(info, retcode);
return 0;
fail:
- kfree(int_dig_out);
- kfree(int_dig_in);
- kfree(int_dig_vv);
- crypto_free_hash(tfm);
- crypto_free_hash(integrity_w_tfm);
- crypto_free_hash(integrity_r_tfm);
- kfree(new_tl_hash);
- kfree(new_ee_hash);
+ free_crypto(&crypto);
kfree(new_conf);
- reply->ret_code = retcode;
- drbd_reconfig_done(mdev);
+ conn_reconfig_done(tconn);
+out:
+ drbd_adm_finish(info, retcode);
return 0;
}
-static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool force)
{
- int retcode;
- struct disconnect dc;
-
- memset(&dc, 0, sizeof(struct disconnect));
- if (!disconnect_from_tags(mdev, nlp->tag_list, &dc)) {
- retcode = ERR_MANDATORY_TAG;
- goto fail;
- }
-
- if (dc.force) {
- spin_lock_irq(&mdev->req_lock);
- if (mdev->state.conn >= C_WF_CONNECTION)
- _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), CS_HARD, NULL);
- spin_unlock_irq(&mdev->req_lock);
- goto done;
- }
+ enum drbd_state_rv rv;
- retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED);
+ rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING),
+ force ? CS_HARD : 0);
- if (retcode == SS_NOTHING_TO_DO)
- goto done;
- else if (retcode == SS_ALREADY_STANDALONE)
- goto done;
- else if (retcode == SS_PRIMARY_NOP) {
- /* Our statche checking code wants to see the peer outdated. */
- retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
- pdsk, D_OUTDATED));
- } else if (retcode == SS_CW_FAILED_BY_PEER) {
+ switch (rv) {
+ case SS_NOTHING_TO_DO:
+ break;
+ case SS_ALREADY_STANDALONE:
+ return SS_SUCCESS;
+ case SS_PRIMARY_NOP:
+ /* Our state checking code wants to see the peer outdated. */
+ rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING,
+ pdsk, D_OUTDATED), CS_VERBOSE);
+ break;
+ case SS_CW_FAILED_BY_PEER:
/* The peer probably wants to see us outdated. */
- retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
- disk, D_OUTDATED),
- CS_ORDERED);
- if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) {
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- retcode = SS_SUCCESS;
+ rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING,
+ disk, D_OUTDATED), 0);
+ if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) {
+ rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING),
+ CS_HARD);
}
+ break;
+ default:;
+ /* no special handling necessary */
+ }
+
+ if (rv >= SS_SUCCESS) {
+ enum drbd_state_rv rv2;
+ /* No one else can reconfigure the network while I am here.
+ * The state handling only uses drbd_thread_stop_nowait(),
+ * we want to really wait here until the receiver is no more.
+ */
+ drbd_thread_stop(&adm_ctx.tconn->receiver);
+
+ /* Race breaker. This additional state change request may be
+ * necessary, if this was a forced disconnect during a receiver
+ * restart. We may have "killed" the receiver thread just
+ * after drbdd_init() returned. Typically, we should be
+ * C_STANDALONE already, now, and this becomes a no-op.
+ */
+ rv2 = conn_request_state(tconn, NS(conn, C_STANDALONE),
+ CS_VERBOSE | CS_HARD);
+ if (rv2 < SS_SUCCESS)
+ conn_err(tconn,
+ "unexpected rv2=%d in conn_try_disconnect()\n",
+ rv2);
}
+ return rv;
+}
- if (retcode < SS_SUCCESS)
- goto fail;
+int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info)
+{
+ struct disconnect_parms parms;
+ struct drbd_tconn *tconn;
+ enum drbd_state_rv rv;
+ enum drbd_ret_code retcode;
+ int err;
- if (wait_event_interruptible(mdev->state_wait,
- mdev->state.conn != C_DISCONNECTING)) {
- /* Do not test for mdev->state.conn == C_STANDALONE, since
- someone else might connect us in the mean time! */
- retcode = ERR_INTR;
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
goto fail;
+
+ tconn = adm_ctx.tconn;
+ memset(&parms, 0, sizeof(parms));
+ if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) {
+ err = disconnect_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
+ goto fail;
+ }
}
- done:
- retcode = NO_ERROR;
+ rv = conn_try_disconnect(tconn, parms.force_disconnect);
+ if (rv < SS_SUCCESS)
+ retcode = rv; /* FIXME: Type mismatch. */
+ else
+ retcode = NO_ERROR;
fail:
- drbd_md_sync(mdev);
- reply->ret_code = retcode;
+ drbd_adm_finish(info, retcode);
return 0;
}
@@ -1709,7 +2236,7 @@ void resync_after_online_grow(struct drbd_conf *mdev)
if (mdev->state.role != mdev->state.peer)
iass = (mdev->state.role == R_PRIMARY);
else
- iass = drbd_test_flag(mdev, DISCARD_CONCURRENT);
+ iass = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags);
if (iass)
drbd_start_resync(mdev, C_SYNC_SOURCE);
@@ -1717,20 +2244,34 @@ void resync_after_online_grow(struct drbd_conf *mdev)
_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
}
-static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
{
- struct resize rs;
- int retcode = NO_ERROR;
+ struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
+ struct resize_parms rs;
+ struct drbd_conf *mdev;
+ enum drbd_ret_code retcode;
enum determine_dev_size dd;
enum dds_flags ddsf;
+ sector_t u_size;
+ int err;
- memset(&rs, 0, sizeof(struct resize));
- if (!resize_from_tags(mdev, nlp->tag_list, &rs)) {
- retcode = ERR_MANDATORY_TAG;
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
goto fail;
+
+ memset(&rs, 0, sizeof(struct resize_parms));
+ if (info->attrs[DRBD_NLA_RESIZE_PARMS]) {
+ err = resize_parms_from_attrs(&rs, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
+ goto fail;
+ }
}
+ mdev = adm_ctx.mdev;
if (mdev->state.conn > C_CONNECTED) {
retcode = ERR_RESIZE_RESYNC;
goto fail;
@@ -1747,15 +2288,36 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
goto fail;
}
- if (rs.no_resync && mdev->agreed_pro_version < 93) {
+ if (rs.no_resync && mdev->tconn->agreed_pro_version < 93) {
retcode = ERR_NEED_APV_93;
goto fail_ldev;
}
+ rcu_read_lock();
+ u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
+ rcu_read_unlock();
+ if (u_size != (sector_t)rs.resize_size) {
+ new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ retcode = ERR_NOMEM;
+ goto fail_ldev;
+ }
+ }
+
if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
- mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
+ if (new_disk_conf) {
+ mutex_lock(&mdev->tconn->conf_update);
+ old_disk_conf = mdev->ldev->disk_conf;
+ *new_disk_conf = *old_disk_conf;
+ new_disk_conf->disk_size = (sector_t)rs.resize_size;
+ rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
+ mutex_unlock(&mdev->tconn->conf_update);
+ synchronize_rcu();
+ kfree(old_disk_conf);
+ }
+
ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
dd = drbd_determine_dev_size(mdev, ddsf);
drbd_md_sync(mdev);
@@ -1767,14 +2329,14 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
if (mdev->state.conn == C_CONNECTED) {
if (dd == grew)
- drbd_set_flag(mdev, RESIZE_PENDING);
+ set_bit(RESIZE_PENDING, &mdev->flags);
drbd_send_uuids(mdev);
drbd_send_sizes(mdev, 1, ddsf);
}
fail:
- reply->ret_code = retcode;
+ drbd_adm_finish(info, retcode);
return 0;
fail_ldev:
@@ -1782,210 +2344,61 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
goto fail;
}
-static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info)
{
- int retcode = NO_ERROR;
+ enum drbd_ret_code retcode;
+ struct drbd_tconn *tconn;
+ struct res_opts res_opts;
int err;
- int ovr; /* online verify running */
- int rsr; /* re-sync running */
- struct crypto_hash *verify_tfm = NULL;
- struct crypto_hash *csums_tfm = NULL;
- struct syncer_conf sc;
- cpumask_var_t new_cpu_mask;
- int *rs_plan_s = NULL;
- int fifo_size;
-
- if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) {
- retcode = ERR_NOMEM;
- goto fail;
- }
-
- if (nlp->flags & DRBD_NL_SET_DEFAULTS) {
- memset(&sc, 0, sizeof(struct syncer_conf));
- sc.rate = DRBD_RATE_DEF;
- sc.after = DRBD_AFTER_DEF;
- sc.al_extents = DRBD_AL_EXTENTS_DEF;
- sc.on_no_data = DRBD_ON_NO_DATA_DEF;
- sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF;
- sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF;
- sc.c_fill_target = DRBD_C_FILL_TARGET_DEF;
- sc.c_max_rate = DRBD_C_MAX_RATE_DEF;
- sc.c_min_rate = DRBD_C_MIN_RATE_DEF;
- } else
- memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
- if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) {
- retcode = ERR_MANDATORY_TAG;
- goto fail;
- }
-
- /* re-sync running */
- rsr = ( mdev->state.conn == C_SYNC_SOURCE ||
- mdev->state.conn == C_SYNC_TARGET ||
- mdev->state.conn == C_PAUSED_SYNC_S ||
- mdev->state.conn == C_PAUSED_SYNC_T );
-
- if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) {
- retcode = ERR_CSUMS_RESYNC_RUNNING;
- goto fail;
- }
-
- if (!rsr && sc.csums_alg[0]) {
- csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(csums_tfm)) {
- csums_tfm = NULL;
- retcode = ERR_CSUMS_ALG;
- goto fail;
- }
-
- if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) {
- retcode = ERR_CSUMS_ALG_ND;
- goto fail;
- }
- }
-
- /* online verify running */
- ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T);
-
- if (ovr) {
- if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) {
- retcode = ERR_VERIFY_RUNNING;
- goto fail;
- }
- }
-
- if (!ovr && sc.verify_alg[0]) {
- verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(verify_tfm)) {
- verify_tfm = NULL;
- retcode = ERR_VERIFY_ALG;
- goto fail;
- }
-
- if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) {
- retcode = ERR_VERIFY_ALG_ND;
- goto fail;
- }
- }
-
- /* silently ignore cpu mask on UP kernel */
- if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) {
- err = bitmap_parse(sc.cpu_mask, 32,
- cpumask_bits(new_cpu_mask), nr_cpu_ids);
- if (err) {
- dev_warn(DEV, "bitmap_parse() failed with %d\n", err);
- retcode = ERR_CPU_MASK_PARSE;
- goto fail;
- }
- }
-
- ERR_IF (sc.rate < 1) sc.rate = 1;
- ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */
-#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT)
- if (sc.al_extents > AL_MAX) {
- dev_err(DEV, "sc.al_extents > %d\n", AL_MAX);
- sc.al_extents = AL_MAX;
- }
-#undef AL_MAX
-
- /* to avoid spurious errors when configuring minors before configuring
- * the minors they depend on: if necessary, first create the minor we
- * depend on */
- if (sc.after >= 0)
- ensure_mdev(sc.after, 1);
-
- /* most sanity checks done, try to assign the new sync-after
- * dependency. need to hold the global lock in there,
- * to avoid a race in the dependency loop check. */
- retcode = drbd_alter_sa(mdev, sc.after);
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
+ if (!adm_ctx.reply_skb)
+ return retcode;
if (retcode != NO_ERROR)
goto fail;
+ tconn = adm_ctx.tconn;
- fifo_size = (sc.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
- if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
- rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
- if (!rs_plan_s) {
- dev_err(DEV, "kmalloc of fifo_buffer failed");
- retcode = ERR_NOMEM;
- goto fail;
- }
- }
+ res_opts = tconn->res_opts;
+ if (should_set_defaults(info))
+ set_res_opts_defaults(&res_opts);
- /* ok, assign the rest of it as well.
- * lock against receive_SyncParam() */
- spin_lock(&mdev->peer_seq_lock);
- mdev->sync_conf = sc;
-
- if (!rsr) {
- crypto_free_hash(mdev->csums_tfm);
- mdev->csums_tfm = csums_tfm;
- csums_tfm = NULL;
- }
-
- if (!ovr) {
- crypto_free_hash(mdev->verify_tfm);
- mdev->verify_tfm = verify_tfm;
- verify_tfm = NULL;
- }
-
- if (fifo_size != mdev->rs_plan_s.size) {
- kfree(mdev->rs_plan_s.values);
- mdev->rs_plan_s.values = rs_plan_s;
- mdev->rs_plan_s.size = fifo_size;
- mdev->rs_planed = 0;
- rs_plan_s = NULL;
+ err = res_opts_from_attrs(&res_opts, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
+ goto fail;
}
- spin_unlock(&mdev->peer_seq_lock);
-
- if (get_ldev(mdev)) {
- wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
- drbd_al_shrink(mdev);
- err = drbd_check_al_size(mdev);
- lc_unlock(mdev->act_log);
- wake_up(&mdev->al_wait);
-
- put_ldev(mdev);
- drbd_md_sync(mdev);
-
- if (err) {
+ err = set_resource_options(tconn, &res_opts);
+ if (err) {
+ retcode = ERR_INVALID_REQUEST;
+ if (err == -ENOMEM)
retcode = ERR_NOMEM;
- goto fail;
- }
- }
-
- if (mdev->state.conn >= C_CONNECTED)
- drbd_send_sync_param(mdev, &sc);
-
- if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) {
- cpumask_copy(mdev->cpu_mask, new_cpu_mask);
- drbd_calc_cpu_mask(mdev);
- mdev->receiver.reset_cpu_mask = 1;
- mdev->asender.reset_cpu_mask = 1;
- mdev->worker.reset_cpu_mask = 1;
}
- kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
fail:
- kfree(rs_plan_s);
- free_cpumask_var(new_cpu_mask);
- crypto_free_hash(csums_tfm);
- crypto_free_hash(verify_tfm);
- reply->ret_code = retcode;
+ drbd_adm_finish(info, retcode);
return 0;
}
-static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
{
- int retcode;
+ struct drbd_conf *mdev;
+ int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mdev = adm_ctx.mdev;
/* If there is still bitmap IO pending, probably because of a previous
* resync just being finished, wait for it before requesting a new resync.
* Also wait for it's after_state_ch(). */
drbd_suspend_io(mdev);
- wait_event(mdev->misc_wait, !drbd_test_flag(mdev, BITMAP_IO));
+ wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
drbd_flush_workqueue(mdev);
retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
@@ -1994,10 +2407,10 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
while (retcode == SS_NEED_CONNECTION) {
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
if (mdev->state.conn < C_CONNECTED)
retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL);
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
if (retcode != SS_NEED_CONNECTION)
break;
@@ -2006,7 +2419,25 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
}
drbd_resume_io(mdev);
- reply->ret_code = retcode;
+out:
+ drbd_adm_finish(info, retcode);
+ return 0;
+}
+
+static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info,
+ union drbd_state mask, union drbd_state val)
+{
+ enum drbd_ret_code retcode;
+
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ retcode = drbd_request_state(adm_ctx.mdev, mask, val);
+out:
+ drbd_adm_finish(info, retcode);
return 0;
}
@@ -2019,29 +2450,36 @@ static int drbd_bmio_set_susp_al(struct drbd_conf *mdev)
return rv;
}
-static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
{
- int retcode;
+ int retcode; /* drbd_ret_code, drbd_state_rv */
+ struct drbd_conf *mdev;
+
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mdev = adm_ctx.mdev;
/* If there is still bitmap IO pending, probably because of a previous
* resync just being finished, wait for it before requesting a new resync.
* Also wait for it's after_state_ch(). */
drbd_suspend_io(mdev);
- wait_event(mdev->misc_wait, !drbd_test_flag(mdev, BITMAP_IO));
+ wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
drbd_flush_workqueue(mdev);
retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
-
if (retcode < SS_SUCCESS) {
if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) {
- /* The peer will get a resync upon connect anyways. Just make that
- into a full resync. */
+ /* The peer will get a resync upon connect anyways.
+ * Just make that into a full resync. */
retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
if (retcode >= SS_SUCCESS) {
if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
- "set_n_write from invalidate_peer",
- BM_LOCKED_SET_ALLOWED))
+ "set_n_write from invalidate_peer",
+ BM_LOCKED_SET_ALLOWED))
retcode = ERR_IO_MD_DISK;
}
} else
@@ -2049,30 +2487,41 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re
}
drbd_resume_io(mdev);
- reply->ret_code = retcode;
+out:
+ drbd_adm_finish(info, retcode);
return 0;
}
-static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info)
{
- int retcode = NO_ERROR;
+ enum drbd_ret_code retcode;
- if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
- retcode = ERR_PAUSE_IS_SET;
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- reply->ret_code = retcode;
+ if (drbd_request_state(adm_ctx.mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
+ retcode = ERR_PAUSE_IS_SET;
+out:
+ drbd_adm_finish(info, retcode);
return 0;
}
-static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info)
{
- int retcode = NO_ERROR;
- union drbd_state s;
+ union drbd_dev_state s;
+ enum drbd_ret_code retcode;
+
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) {
- s = mdev->state;
+ if (drbd_request_state(adm_ctx.mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) {
+ s = adm_ctx.mdev->state;
if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) {
retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP :
s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR;
@@ -2081,178 +2530,482 @@ static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
}
}
- reply->ret_code = retcode;
+out:
+ drbd_adm_finish(info, retcode);
return 0;
}
-static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info)
{
- reply->ret_code = drbd_request_state(mdev, NS(susp, 1));
-
- return 0;
+ return drbd_adm_simple_request_state(skb, info, NS(susp, 1));
}
-static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
{
- if (drbd_test_flag(mdev, NEW_CUR_UUID)) {
+ struct drbd_conf *mdev;
+ int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mdev = adm_ctx.mdev;
+ if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
drbd_uuid_new_current(mdev);
- drbd_clear_flag(mdev, NEW_CUR_UUID);
+ clear_bit(NEW_CUR_UUID, &mdev->flags);
}
drbd_suspend_io(mdev);
- reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
- if (reply->ret_code == SS_SUCCESS) {
+ retcode = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
+ if (retcode == SS_SUCCESS) {
if (mdev->state.conn < C_CONNECTED)
- tl_clear(mdev);
+ tl_clear(mdev->tconn);
if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED)
- tl_restart(mdev, fail_frozen_disk_io);
+ tl_restart(mdev->tconn, FAIL_FROZEN_DISK_IO);
}
drbd_resume_io(mdev);
+out:
+ drbd_adm_finish(info, retcode);
return 0;
}
-static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info)
{
- reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED));
- return 0;
+ return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED));
}
-static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int nla_put_drbd_cfg_context(struct sk_buff *skb, struct drbd_tconn *tconn, unsigned vnr)
{
- unsigned short *tl;
+ struct nlattr *nla;
+ nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT);
+ if (!nla)
+ goto nla_put_failure;
+ if (vnr != VOLUME_UNSPECIFIED &&
+ nla_put_u32(skb, T_ctx_volume, vnr))
+ goto nla_put_failure;
+ if (nla_put_string(skb, T_ctx_resource_name, tconn->name))
+ goto nla_put_failure;
+ if (tconn->my_addr_len &&
+ nla_put(skb, T_ctx_my_addr, tconn->my_addr_len, &tconn->my_addr))
+ goto nla_put_failure;
+ if (tconn->peer_addr_len &&
+ nla_put(skb, T_ctx_peer_addr, tconn->peer_addr_len, &tconn->peer_addr))
+ goto nla_put_failure;
+ nla_nest_end(skb, nla);
+ return 0;
- tl = reply->tag_list;
+nla_put_failure:
+ if (nla)
+ nla_nest_cancel(skb, nla);
+ return -EMSGSIZE;
+}
- if (get_ldev(mdev)) {
- tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl);
- put_ldev(mdev);
- }
+int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev,
+ const struct sib_info *sib)
+{
+ struct state_info *si = NULL; /* for sizeof(si->member); */
+ struct net_conf *nc;
+ struct nlattr *nla;
+ int got_ldev;
+ int err = 0;
+ int exclude_sensitive;
+
+ /* If sib != NULL, this is drbd_bcast_event, which anyone can listen
+ * to. So we better exclude_sensitive information.
+ *
+ * If sib == NULL, this is drbd_adm_get_status, executed synchronously
+ * in the context of the requesting user process. Exclude sensitive
+ * information, unless current has superuser.
+ *
+ * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and
+ * relies on the current implementation of netlink_dump(), which
+ * executes the dump callback successively from netlink_recvmsg(),
+ * always in the context of the receiving process */
+ exclude_sensitive = sib || !capable(CAP_SYS_ADMIN);
+
+ got_ldev = get_ldev(mdev);
+
+ /* We need to add connection name and volume number information still.
+ * Minor number is in drbd_genlmsghdr. */
+ if (nla_put_drbd_cfg_context(skb, mdev->tconn, mdev->vnr))
+ goto nla_put_failure;
+
+ if (res_opts_to_skb(skb, &mdev->tconn->res_opts, exclude_sensitive))
+ goto nla_put_failure;
+
+ rcu_read_lock();
+ if (got_ldev)
+ if (disk_conf_to_skb(skb, rcu_dereference(mdev->ldev->disk_conf), exclude_sensitive))
+ goto nla_put_failure;
+
+ nc = rcu_dereference(mdev->tconn->net_conf);
+ if (nc)
+ err = net_conf_to_skb(skb, nc, exclude_sensitive);
+ rcu_read_unlock();
+ if (err)
+ goto nla_put_failure;
+
+ nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO);
+ if (!nla)
+ goto nla_put_failure;
+ if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) ||
+ nla_put_u32(skb, T_current_state, mdev->state.i) ||
+ nla_put_u64(skb, T_ed_uuid, mdev->ed_uuid) ||
+ nla_put_u64(skb, T_capacity, drbd_get_capacity(mdev->this_bdev)) ||
+ nla_put_u64(skb, T_send_cnt, mdev->send_cnt) ||
+ nla_put_u64(skb, T_recv_cnt, mdev->recv_cnt) ||
+ nla_put_u64(skb, T_read_cnt, mdev->read_cnt) ||
+ nla_put_u64(skb, T_writ_cnt, mdev->writ_cnt) ||
+ nla_put_u64(skb, T_al_writ_cnt, mdev->al_writ_cnt) ||
+ nla_put_u64(skb, T_bm_writ_cnt, mdev->bm_writ_cnt) ||
+ nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&mdev->ap_bio_cnt)) ||
+ nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&mdev->ap_pending_cnt)) ||
+ nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&mdev->rs_pending_cnt)))
+ goto nla_put_failure;
+
+ if (got_ldev) {
+ int err;
- if (get_net_conf(mdev)) {
- tl = net_conf_to_tags(mdev, mdev->net_conf, tl);
- put_net_conf(mdev);
+ spin_lock_irq(&mdev->ldev->md.uuid_lock);
+ err = nla_put(skb, T_uuids, sizeof(si->uuids), mdev->ldev->md.uuid);
+ spin_unlock_irq(&mdev->ldev->md.uuid_lock);
+
+ if (err)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, T_disk_flags, mdev->ldev->md.flags) ||
+ nla_put_u64(skb, T_bits_total, drbd_bm_bits(mdev)) ||
+ nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(mdev)))
+ goto nla_put_failure;
+ if (C_SYNC_SOURCE <= mdev->state.conn &&
+ C_PAUSED_SYNC_T >= mdev->state.conn) {
+ if (nla_put_u64(skb, T_bits_rs_total, mdev->rs_total) ||
+ nla_put_u64(skb, T_bits_rs_failed, mdev->rs_failed))
+ goto nla_put_failure;
+ }
}
- tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl);
- put_unaligned(TT_END, tl++); /* Close the tag list */
+ if (sib) {
+ switch(sib->sib_reason) {
+ case SIB_SYNC_PROGRESS:
+ case SIB_GET_STATUS_REPLY:
+ break;
+ case SIB_STATE_CHANGE:
+ if (nla_put_u32(skb, T_prev_state, sib->os.i) ||
+ nla_put_u32(skb, T_new_state, sib->ns.i))
+ goto nla_put_failure;
+ break;
+ case SIB_HELPER_POST:
+ if (nla_put_u32(skb, T_helper_exit_code,
+ sib->helper_exit_code))
+ goto nla_put_failure;
+ /* fall through */
+ case SIB_HELPER_PRE:
+ if (nla_put_string(skb, T_helper, sib->helper_name))
+ goto nla_put_failure;
+ break;
+ }
+ }
+ nla_nest_end(skb, nla);
- return (int)((char *)tl - (char *)reply->tag_list);
+ if (0)
+nla_put_failure:
+ err = -EMSGSIZE;
+ if (got_ldev)
+ put_ldev(mdev);
+ return err;
}
-static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info)
{
- unsigned short *tl = reply->tag_list;
- union drbd_state s = mdev->state;
- unsigned long rs_left;
- unsigned int res;
+ enum drbd_ret_code retcode;
+ int err;
- tl = get_state_to_tags(mdev, (struct get_state *)&s, tl);
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- /* no local ref, no bitmap, no syncer progress. */
- if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) {
- if (get_ldev(mdev)) {
- drbd_get_syncer_progress(mdev, &rs_left, &res);
- tl = tl_add_int(tl, T_sync_progress, &res);
- put_ldev(mdev);
- }
+ err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.mdev, NULL);
+ if (err) {
+ nlmsg_free(adm_ctx.reply_skb);
+ return err;
}
- put_unaligned(TT_END, tl++); /* Close the tag list */
-
- return (int)((char *)tl - (char *)reply->tag_list);
+out:
+ drbd_adm_finish(info, retcode);
+ return 0;
}
-static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int get_one_status(struct sk_buff *skb, struct netlink_callback *cb)
{
- unsigned short *tl;
-
- tl = reply->tag_list;
+ struct drbd_conf *mdev;
+ struct drbd_genlmsghdr *dh;
+ struct drbd_tconn *pos = (struct drbd_tconn*)cb->args[0];
+ struct drbd_tconn *tconn = NULL;
+ struct drbd_tconn *tmp;
+ unsigned volume = cb->args[1];
+
+ /* Open coded, deferred, iteration:
+ * list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) {
+ * idr_for_each_entry(&tconn->volumes, mdev, i) {
+ * ...
+ * }
+ * }
+ * where tconn is cb->args[0];
+ * and i is cb->args[1];
+ *
+ * cb->args[2] indicates if we shall loop over all resources,
+ * or just dump all volumes of a single resource.
+ *
+ * This may miss entries inserted after this dump started,
+ * or entries deleted before they are reached.
+ *
+ * We need to make sure the mdev won't disappear while
+ * we are looking at it, and revalidate our iterators
+ * on each iteration.
+ */
- if (get_ldev(mdev)) {
- unsigned long flags;
- spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags);
- tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64));
- tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags);
- spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags);
- put_ldev(mdev);
+ /* synchronize with conn_create()/conn_destroy() */
+ rcu_read_lock();
+ /* revalidate iterator position */
+ list_for_each_entry_rcu(tmp, &drbd_tconns, all_tconn) {
+ if (pos == NULL) {
+ /* first iteration */
+ pos = tmp;
+ tconn = pos;
+ break;
+ }
+ if (tmp == pos) {
+ tconn = pos;
+ break;
+ }
}
- put_unaligned(TT_END, tl++); /* Close the tag list */
+ if (tconn) {
+next_tconn:
+ mdev = idr_get_next(&tconn->volumes, &volume);
+ if (!mdev) {
+ /* No more volumes to dump on this tconn.
+ * Advance tconn iterator. */
+ pos = list_entry_rcu(tconn->all_tconn.next,
+ struct drbd_tconn, all_tconn);
+ /* Did we dump any volume on this tconn yet? */
+ if (volume != 0) {
+ /* If we reached the end of the list,
+ * or only a single resource dump was requested,
+ * we are done. */
+ if (&pos->all_tconn == &drbd_tconns || cb->args[2])
+ goto out;
+ volume = 0;
+ tconn = pos;
+ goto next_tconn;
+ }
+ }
- return (int)((char *)tl - (char *)reply->tag_list);
+ dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, &drbd_genl_family,
+ NLM_F_MULTI, DRBD_ADM_GET_STATUS);
+ if (!dh)
+ goto out;
+
+ if (!mdev) {
+ /* This is a tconn without a single volume.
+ * Suprisingly enough, it may have a network
+ * configuration. */
+ struct net_conf *nc;
+ dh->minor = -1U;
+ dh->ret_code = NO_ERROR;
+ if (nla_put_drbd_cfg_context(skb, tconn, VOLUME_UNSPECIFIED))
+ goto cancel;
+ nc = rcu_dereference(tconn->net_conf);
+ if (nc && net_conf_to_skb(skb, nc, 1) != 0)
+ goto cancel;
+ goto done;
+ }
+
+ D_ASSERT(mdev->vnr == volume);
+ D_ASSERT(mdev->tconn == tconn);
+
+ dh->minor = mdev_to_minor(mdev);
+ dh->ret_code = NO_ERROR;
+
+ if (nla_put_status_info(skb, mdev, NULL)) {
+cancel:
+ genlmsg_cancel(skb, dh);
+ goto out;
+ }
+done:
+ genlmsg_end(skb, dh);
+ }
+
+out:
+ rcu_read_unlock();
+ /* where to start the next iteration */
+ cb->args[0] = (long)pos;
+ cb->args[1] = (pos == tconn) ? volume + 1 : 0;
+
+ /* No more tconns/volumes/minors found results in an empty skb.
+ * Which will terminate the dump. */
+ return skb->len;
}
-/**
- * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use
- * @mdev: DRBD device.
- * @nlp: Netlink/connector packet from drbdsetup
- * @reply: Reply packet for drbdsetup
+/*
+ * Request status of all resources, or of all volumes within a single resource.
+ *
+ * This is a dump, as the answer may not fit in a single reply skb otherwise.
+ * Which means we cannot use the family->attrbuf or other such members, because
+ * dump is NOT protected by the genl_lock(). During dump, we only have access
+ * to the incoming skb, and need to opencode "parsing" of the nlattr payload.
+ *
+ * Once things are setup properly, we call into get_one_status().
*/
-static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb)
{
- unsigned short *tl;
- char rv;
+ const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ;
+ struct nlattr *nla;
+ const char *resource_name;
+ struct drbd_tconn *tconn;
+ int maxtype;
+
+ /* Is this a followup call? */
+ if (cb->args[0]) {
+ /* ... of a single resource dump,
+ * and the resource iterator has been advanced already? */
+ if (cb->args[2] && cb->args[2] != cb->args[0])
+ return 0; /* DONE. */
+ goto dump;
+ }
+
+ /* First call (from netlink_dump_start). We need to figure out
+ * which resource(s) the user wants us to dump. */
+ nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen),
+ nlmsg_attrlen(cb->nlh, hdrlen),
+ DRBD_NLA_CFG_CONTEXT);
+
+ /* No explicit context given. Dump all. */
+ if (!nla)
+ goto dump;
+ maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
+ nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name));
+ if (IS_ERR(nla))
+ return PTR_ERR(nla);
+ /* context given, but no name present? */
+ if (!nla)
+ return -EINVAL;
+ resource_name = nla_data(nla);
+ tconn = conn_get_by_name(resource_name);
+
+ if (!tconn)
+ return -ENODEV;
+
+ kref_put(&tconn->kref, &conn_destroy); /* get_one_status() (re)validates tconn by itself */
+
+ /* prime iterators, and set "filter" mode mark:
+ * only dump this tconn. */
+ cb->args[0] = (long)tconn;
+ /* cb->args[1] = 0; passed in this way. */
+ cb->args[2] = (long)tconn;
+
+dump:
+ return get_one_status(skb, cb);
+}
- tl = reply->tag_list;
+int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info)
+{
+ enum drbd_ret_code retcode;
+ struct timeout_parms tp;
+ int err;
- rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
- drbd_test_flag(mdev, USE_DEGR_WFC_T) ? UT_DEGRADED : UT_DEFAULT;
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv));
- put_unaligned(TT_END, tl++); /* Close the tag list */
+ tp.timeout_type =
+ adm_ctx.mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
+ test_bit(USE_DEGR_WFC_T, &adm_ctx.mdev->flags) ? UT_DEGRADED :
+ UT_DEFAULT;
- return (int)((char *)tl - (char *)reply->tag_list);
+ err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp);
+ if (err) {
+ nlmsg_free(adm_ctx.reply_skb);
+ return err;
+ }
+out:
+ drbd_adm_finish(info, retcode);
+ return 0;
}
-static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
{
- /* default to resume from last known position, if possible */
- struct start_ov args = {
- .start_sector = mdev->ov_start_sector,
- .stop_sector = ULLONG_MAX,
- };
+ struct drbd_conf *mdev;
+ enum drbd_ret_code retcode;
+ struct start_ov_parms parms;
- if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) {
- reply->ret_code = ERR_MANDATORY_TAG;
- return 0;
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mdev = adm_ctx.mdev;
+
+ /* resume from last known position, if possible */
+ parms.ov_start_sector = mdev->ov_start_sector;
+ parms.ov_stop_sector = ULLONG_MAX;
+ if (info->attrs[DRBD_NLA_START_OV_PARMS]) {
+ int err = start_ov_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
+ goto out;
+ }
}
+ /* w_make_ov_request expects position to be aligned */
+ mdev->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1);
+ mdev->ov_stop_sector = parms.ov_stop_sector;
/* If there is still bitmap IO pending, e.g. previous resync or verify
* just being finished, wait for it before requesting a new resync. */
drbd_suspend_io(mdev);
- wait_event(mdev->misc_wait, !drbd_test_flag(mdev, BITMAP_IO));
-
- /* w_make_ov_request expects start position to be aligned */
- mdev->ov_start_sector = args.start_sector & ~(BM_SECT_PER_BIT-1);
- mdev->ov_stop_sector = args.stop_sector;
- reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
+ wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
+ retcode = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
drbd_resume_io(mdev);
+out:
+ drbd_adm_finish(info, retcode);
return 0;
}
-static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info)
{
- int retcode = NO_ERROR;
+ struct drbd_conf *mdev;
+ enum drbd_ret_code retcode;
int skip_initial_sync = 0;
int err;
+ struct new_c_uuid_parms args;
- struct new_c_uuid args;
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out_nolock;
- memset(&args, 0, sizeof(struct new_c_uuid));
- if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) {
- reply->ret_code = ERR_MANDATORY_TAG;
- return 0;
+ mdev = adm_ctx.mdev;
+ memset(&args, 0, sizeof(args));
+ if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) {
+ err = new_c_uuid_parms_from_attrs(&args, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
+ goto out_nolock;
+ }
}
- mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */
+ mutex_lock(mdev->state_mutex); /* Protects us against serialized state changes. */
if (!get_ldev(mdev)) {
retcode = ERR_NO_DISK;
@@ -2260,7 +3013,7 @@ static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
}
/* this is "skip initial sync", assume to be clean */
- if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 &&
+ if (mdev->state.conn == C_CONNECTED && mdev->tconn->agreed_pro_version >= 90 &&
mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
dev_info(DEV, "Preparing to skip initial sync\n");
skip_initial_sync = 1;
@@ -2283,10 +3036,10 @@ static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
drbd_send_uuids_skip_initial_sync(mdev);
_drbd_uuid_set(mdev, UI_BITMAP, 0);
drbd_print_uuids(mdev, "cleared bitmap UUID");
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
_drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
CS_VERBOSE, NULL);
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
}
}
@@ -2294,416 +3047,283 @@ static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
out_dec:
put_ldev(mdev);
out:
- mutex_unlock(&mdev->state_mutex);
-
- reply->ret_code = retcode;
+ mutex_unlock(mdev->state_mutex);
+out_nolock:
+ drbd_adm_finish(info, retcode);
return 0;
}
-struct cn_handler_struct {
- int (*function)(struct drbd_conf *,
- struct drbd_nl_cfg_req *,
- struct drbd_nl_cfg_reply *);
- int reply_body_size;
-};
-
-static struct cn_handler_struct cnd_table[] = {
- [ P_primary ] = { &drbd_nl_primary, 0 },
- [ P_secondary ] = { &drbd_nl_secondary, 0 },
- [ P_disk_conf ] = { &drbd_nl_disk_conf, 0 },
- [ P_detach ] = { &drbd_nl_detach, 0 },
- [ P_net_conf ] = { &drbd_nl_net_conf, 0 },
- [ P_disconnect ] = { &drbd_nl_disconnect, 0 },
- [ P_resize ] = { &drbd_nl_resize, 0 },
- [ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 },
- [ P_invalidate ] = { &drbd_nl_invalidate, 0 },
- [ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 },
- [ P_pause_sync ] = { &drbd_nl_pause_sync, 0 },
- [ P_resume_sync ] = { &drbd_nl_resume_sync, 0 },
- [ P_suspend_io ] = { &drbd_nl_suspend_io, 0 },
- [ P_resume_io ] = { &drbd_nl_resume_io, 0 },
- [ P_outdate ] = { &drbd_nl_outdate, 0 },
- [ P_get_config ] = { &drbd_nl_get_config,
- sizeof(struct syncer_conf_tag_len_struct) +
- sizeof(struct disk_conf_tag_len_struct) +
- sizeof(struct net_conf_tag_len_struct) },
- [ P_get_state ] = { &drbd_nl_get_state,
- sizeof(struct get_state_tag_len_struct) +
- sizeof(struct sync_progress_tag_len_struct) },
- [ P_get_uuids ] = { &drbd_nl_get_uuids,
- sizeof(struct get_uuids_tag_len_struct) },
- [ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag,
- sizeof(struct get_timeout_flag_tag_len_struct)},
- [ P_start_ov ] = { &drbd_nl_start_ov, 0 },
- [ P_new_c_uuid ] = { &drbd_nl_new_c_uuid, 0 },
-};
-
-static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp)
+static enum drbd_ret_code
+drbd_check_resource_name(const char *name)
{
- struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data;
- struct cn_handler_struct *cm;
- struct cn_msg *cn_reply;
- struct drbd_nl_cfg_reply *reply;
- struct drbd_conf *mdev;
- int retcode, rr;
- int reply_size = sizeof(struct cn_msg)
- + sizeof(struct drbd_nl_cfg_reply)
- + sizeof(short int);
-
- if (!try_module_get(THIS_MODULE)) {
- printk(KERN_ERR "drbd: try_module_get() failed!\n");
- return;
- }
-
- if (!capable(CAP_SYS_ADMIN)) {
- retcode = ERR_PERM;
- goto fail;
+ if (!name || !name[0]) {
+ drbd_msg_put_info("resource name missing");
+ return ERR_MANDATORY_TAG;
}
-
- mdev = ensure_mdev(nlp->drbd_minor,
- (nlp->flags & DRBD_NL_CREATE_DEVICE));
- if (!mdev) {
- retcode = ERR_MINOR_INVALID;
- goto fail;
+ /* if we want to use these in sysfs/configfs/debugfs some day,
+ * we must not allow slashes */
+ if (strchr(name, '/')) {
+ drbd_msg_put_info("invalid resource name");
+ return ERR_INVALID_REQUEST;
}
+ return NO_ERROR;
+}
- if (nlp->packet_type >= P_nl_after_last_packet ||
- nlp->packet_type == P_return_code_only) {
- retcode = ERR_PACKET_NR;
- goto fail;
- }
+int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
+{
+ enum drbd_ret_code retcode;
+ struct res_opts res_opts;
+ int err;
- cm = cnd_table + nlp->packet_type;
+ retcode = drbd_adm_prepare(skb, info, 0);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- /* This may happen if packet number is 0: */
- if (cm->function == NULL) {
- retcode = ERR_PACKET_NR;
- goto fail;
+ set_res_opts_defaults(&res_opts);
+ err = res_opts_from_attrs(&res_opts, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
+ goto out;
}
- reply_size += cm->reply_body_size;
+ retcode = drbd_check_resource_name(adm_ctx.resource_name);
+ if (retcode != NO_ERROR)
+ goto out;
- /* allocation not in the IO path, cqueue thread context */
- cn_reply = kzalloc(reply_size, GFP_KERNEL);
- if (!cn_reply) {
- retcode = ERR_NOMEM;
- goto fail;
+ if (adm_ctx.tconn) {
+ if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) {
+ retcode = ERR_INVALID_REQUEST;
+ drbd_msg_put_info("resource exists");
+ }
+ /* else: still NO_ERROR */
+ goto out;
}
- reply = (struct drbd_nl_cfg_reply *) cn_reply->data;
-
- reply->packet_type =
- cm->reply_body_size ? nlp->packet_type : P_return_code_only;
- reply->minor = nlp->drbd_minor;
- reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */
- /* reply->tag_list; might be modified by cm->function. */
-
- rr = cm->function(mdev, nlp, reply);
-
- cn_reply->id = req->id;
- cn_reply->seq = req->seq;
- cn_reply->ack = req->ack + 1;
- cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr;
- cn_reply->flags = 0;
- rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL);
- if (rr && rr != -ESRCH)
- printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
-
- kfree(cn_reply);
- module_put(THIS_MODULE);
- return;
- fail:
- drbd_nl_send_reply(req, retcode);
- module_put(THIS_MODULE);
+ if (!conn_create(adm_ctx.resource_name, &res_opts))
+ retcode = ERR_NOMEM;
+out:
+ drbd_adm_finish(info, retcode);
+ return 0;
}
-static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */
-
-static unsigned short *
-__tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data,
- unsigned short len, int nul_terminated)
+int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info)
{
- unsigned short l = tag_descriptions[tag_number(tag)].max_len;
- len = (len < l) ? len : l;
- put_unaligned(tag, tl++);
- put_unaligned(len, tl++);
- memcpy(tl, data, len);
- tl = (unsigned short*)((char*)tl + len);
- if (nul_terminated)
- *((char*)tl - 1) = 0;
- return tl;
-}
+ struct drbd_genlmsghdr *dh = info->userhdr;
+ enum drbd_ret_code retcode;
-static unsigned short *
-tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len)
-{
- return __tl_add_blob(tl, tag, data, len, 0);
-}
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
-static unsigned short *
-tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str)
-{
- return __tl_add_blob(tl, tag, str, strlen(str)+1, 0);
-}
+ if (dh->minor > MINORMASK) {
+ drbd_msg_put_info("requested minor out of range");
+ retcode = ERR_INVALID_REQUEST;
+ goto out;
+ }
+ if (adm_ctx.volume > DRBD_VOLUME_MAX) {
+ drbd_msg_put_info("requested volume id out of range");
+ retcode = ERR_INVALID_REQUEST;
+ goto out;
+ }
-static unsigned short *
-tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val)
-{
- put_unaligned(tag, tl++);
- switch(tag_type(tag)) {
- case TT_INTEGER:
- put_unaligned(sizeof(int), tl++);
- put_unaligned(*(int *)val, (int *)tl);
- tl = (unsigned short*)((char*)tl+sizeof(int));
- break;
- case TT_INT64:
- put_unaligned(sizeof(u64), tl++);
- put_unaligned(*(u64 *)val, (u64 *)tl);
- tl = (unsigned short*)((char*)tl+sizeof(u64));
- break;
- default:
- /* someone did something stupid. */
- ;
+ /* drbd_adm_prepare made sure already
+ * that mdev->tconn and mdev->vnr match the request. */
+ if (adm_ctx.mdev) {
+ if (info->nlhdr->nlmsg_flags & NLM_F_EXCL)
+ retcode = ERR_MINOR_EXISTS;
+ /* else: still NO_ERROR */
+ goto out;
}
- return tl;
+
+ retcode = conn_new_minor(adm_ctx.tconn, dh->minor, adm_ctx.volume);
+out:
+ drbd_adm_finish(info, retcode);
+ return 0;
}
-void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state)
+static enum drbd_ret_code adm_delete_minor(struct drbd_conf *mdev)
{
- char buffer[sizeof(struct cn_msg)+
- sizeof(struct drbd_nl_cfg_reply)+
- sizeof(struct get_state_tag_len_struct)+
- sizeof(short int)];
- struct cn_msg *cn_reply = (struct cn_msg *) buffer;
- struct drbd_nl_cfg_reply *reply =
- (struct drbd_nl_cfg_reply *)cn_reply->data;
- unsigned short *tl = reply->tag_list;
-
- /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
-
- tl = get_state_to_tags(mdev, (struct get_state *)&state, tl);
-
- put_unaligned(TT_END, tl++); /* Close the tag list */
-
- cn_reply->id.idx = CN_IDX_DRBD;
- cn_reply->id.val = CN_VAL_DRBD;
-
- cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
- cn_reply->ack = 0; /* not used here. */
- cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
- (int)((char *)tl - (char *)reply->tag_list);
- cn_reply->flags = 0;
-
- reply->packet_type = P_get_state;
- reply->minor = mdev_to_minor(mdev);
- reply->ret_code = NO_ERROR;
-
- cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+ if (mdev->state.disk == D_DISKLESS &&
+ /* no need to be mdev->state.conn == C_STANDALONE &&
+ * we may want to delete a minor from a live replication group.
+ */
+ mdev->state.role == R_SECONDARY) {
+ _drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS),
+ CS_VERBOSE + CS_WAIT_COMPLETE);
+ idr_remove(&mdev->tconn->volumes, mdev->vnr);
+ idr_remove(&minors, mdev_to_minor(mdev));
+ del_gendisk(mdev->vdisk);
+ synchronize_rcu();
+ kref_put(&mdev->kref, &drbd_minor_destroy);
+ return NO_ERROR;
+ } else
+ return ERR_MINOR_CONFIGURED;
}
-void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name)
+int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info)
{
- char buffer[sizeof(struct cn_msg)+
- sizeof(struct drbd_nl_cfg_reply)+
- sizeof(struct call_helper_tag_len_struct)+
- sizeof(short int)];
- struct cn_msg *cn_reply = (struct cn_msg *) buffer;
- struct drbd_nl_cfg_reply *reply =
- (struct drbd_nl_cfg_reply *)cn_reply->data;
- unsigned short *tl = reply->tag_list;
-
- /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
-
- tl = tl_add_str(tl, T_helper, helper_name);
- put_unaligned(TT_END, tl++); /* Close the tag list */
-
- cn_reply->id.idx = CN_IDX_DRBD;
- cn_reply->id.val = CN_VAL_DRBD;
-
- cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
- cn_reply->ack = 0; /* not used here. */
- cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
- (int)((char *)tl - (char *)reply->tag_list);
- cn_reply->flags = 0;
+ enum drbd_ret_code retcode;
- reply->packet_type = P_call_helper;
- reply->minor = mdev_to_minor(mdev);
- reply->ret_code = NO_ERROR;
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+ retcode = adm_delete_minor(adm_ctx.mdev);
+out:
+ drbd_adm_finish(info, retcode);
+ return 0;
}
-void drbd_bcast_ee(struct drbd_conf *mdev,
- const char *reason, const int dgs,
- const char* seen_hash, const char* calc_hash,
- const struct drbd_epoch_entry* e)
+int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
{
- struct cn_msg *cn_reply;
- struct drbd_nl_cfg_reply *reply;
- unsigned short *tl;
- struct page *page;
- unsigned len;
+ int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+ struct drbd_conf *mdev;
+ unsigned i;
- if (!e)
- return;
- if (!reason || !reason[0])
- return;
+ retcode = drbd_adm_prepare(skb, info, 0);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- /* apparently we have to memcpy twice, first to prepare the data for the
- * struct cn_msg, then within cn_netlink_send from the cn_msg to the
- * netlink skb. */
- /* receiver thread context, which is not in the writeout path (of this node),
- * but may be in the writeout path of the _other_ node.
- * GFP_NOIO to avoid potential "distributed deadlock". */
- cn_reply = kzalloc(
- sizeof(struct cn_msg)+
- sizeof(struct drbd_nl_cfg_reply)+
- sizeof(struct dump_ee_tag_len_struct)+
- sizeof(short int),
- GFP_NOIO);
-
- if (!cn_reply) {
- dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n",
- (unsigned long long)e->sector, e->size);
- return;
+ if (!adm_ctx.tconn) {
+ retcode = ERR_RES_NOT_KNOWN;
+ goto out;
}
- reply = (struct drbd_nl_cfg_reply*)cn_reply->data;
- tl = reply->tag_list;
-
- tl = tl_add_str(tl, T_dump_ee_reason, reason);
- tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs);
- tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs);
- tl = tl_add_int(tl, T_ee_sector, &e->sector);
- tl = tl_add_int(tl, T_ee_block_id, &e->block_id);
-
- /* dump the first 32k */
- len = min_t(unsigned, e->size, 32 << 10);
- put_unaligned(T_ee_data, tl++);
- put_unaligned(len, tl++);
-
- page = e->pages;
- page_chain_for_each(page) {
- void *d = kmap_atomic(page);
- unsigned l = min_t(unsigned, len, PAGE_SIZE);
- memcpy(tl, d, l);
- kunmap_atomic(d);
- tl = (unsigned short*)((char*)tl + l);
- len -= l;
- if (len == 0)
- break;
+ /* demote */
+ idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) {
+ retcode = drbd_set_role(mdev, R_SECONDARY, 0);
+ if (retcode < SS_SUCCESS) {
+ drbd_msg_put_info("failed to demote");
+ goto out;
+ }
}
- put_unaligned(TT_END, tl++); /* Close the tag list */
-
- cn_reply->id.idx = CN_IDX_DRBD;
- cn_reply->id.val = CN_VAL_DRBD;
- cn_reply->seq = atomic_add_return(1,&drbd_nl_seq);
- cn_reply->ack = 0; // not used here.
- cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
- (int)((char*)tl - (char*)reply->tag_list);
- cn_reply->flags = 0;
-
- reply->packet_type = P_dump_ee;
- reply->minor = mdev_to_minor(mdev);
- reply->ret_code = NO_ERROR;
-
- cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
- kfree(cn_reply);
-}
-
-void drbd_bcast_sync_progress(struct drbd_conf *mdev)
-{
- char buffer[sizeof(struct cn_msg)+
- sizeof(struct drbd_nl_cfg_reply)+
- sizeof(struct sync_progress_tag_len_struct)+
- sizeof(short int)];
- struct cn_msg *cn_reply = (struct cn_msg *) buffer;
- struct drbd_nl_cfg_reply *reply =
- (struct drbd_nl_cfg_reply *)cn_reply->data;
- unsigned short *tl = reply->tag_list;
- unsigned long rs_left;
- unsigned int res;
+ retcode = conn_try_disconnect(adm_ctx.tconn, 0);
+ if (retcode < SS_SUCCESS) {
+ drbd_msg_put_info("failed to disconnect");
+ goto out;
+ }
- /* no local ref, no bitmap, no syncer progress, no broadcast. */
- if (!get_ldev(mdev))
- return;
- drbd_get_syncer_progress(mdev, &rs_left, &res);
- put_ldev(mdev);
+ /* detach */
+ idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) {
+ retcode = adm_detach(mdev, 0);
+ if (retcode < SS_SUCCESS || retcode > NO_ERROR) {
+ drbd_msg_put_info("failed to detach");
+ goto out;
+ }
+ }
- tl = tl_add_int(tl, T_sync_progress, &res);
- put_unaligned(TT_END, tl++); /* Close the tag list */
+ /* If we reach this, all volumes (of this tconn) are Secondary,
+ * Disconnected, Diskless, aka Unconfigured. Make sure all threads have
+ * actually stopped, state handling only does drbd_thread_stop_nowait(). */
+ drbd_thread_stop(&adm_ctx.tconn->worker);
- cn_reply->id.idx = CN_IDX_DRBD;
- cn_reply->id.val = CN_VAL_DRBD;
+ /* Now, nothing can fail anymore */
- cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
- cn_reply->ack = 0; /* not used here. */
- cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
- (int)((char *)tl - (char *)reply->tag_list);
- cn_reply->flags = 0;
+ /* delete volumes */
+ idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) {
+ retcode = adm_delete_minor(mdev);
+ if (retcode != NO_ERROR) {
+ /* "can not happen" */
+ drbd_msg_put_info("failed to delete volume");
+ goto out;
+ }
+ }
- reply->packet_type = P_sync_progress;
- reply->minor = mdev_to_minor(mdev);
- reply->ret_code = NO_ERROR;
+ /* delete connection */
+ if (conn_lowest_minor(adm_ctx.tconn) < 0) {
+ list_del_rcu(&adm_ctx.tconn->all_tconn);
+ synchronize_rcu();
+ kref_put(&adm_ctx.tconn->kref, &conn_destroy);
- cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+ retcode = NO_ERROR;
+ } else {
+ /* "can not happen" */
+ retcode = ERR_RES_IN_USE;
+ drbd_msg_put_info("failed to delete connection");
+ }
+ goto out;
+out:
+ drbd_adm_finish(info, retcode);
+ return 0;
}
-int __init drbd_nl_init(void)
+int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info)
{
- static struct cb_id cn_id_drbd;
- int err, try=10;
+ enum drbd_ret_code retcode;
- cn_id_drbd.val = CN_VAL_DRBD;
- do {
- cn_id_drbd.idx = cn_idx;
- err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback);
- if (!err)
- break;
- cn_idx = (cn_idx + CN_IDX_STEP);
- } while (try--);
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- if (err) {
- printk(KERN_ERR "drbd: cn_drbd failed to register\n");
- return err;
+ if (conn_lowest_minor(adm_ctx.tconn) < 0) {
+ list_del_rcu(&adm_ctx.tconn->all_tconn);
+ synchronize_rcu();
+ kref_put(&adm_ctx.tconn->kref, &conn_destroy);
+
+ retcode = NO_ERROR;
+ } else {
+ retcode = ERR_RES_IN_USE;
}
+ if (retcode == NO_ERROR)
+ drbd_thread_stop(&adm_ctx.tconn->worker);
+out:
+ drbd_adm_finish(info, retcode);
return 0;
}
-void drbd_nl_cleanup(void)
+void drbd_bcast_event(struct drbd_conf *mdev, const struct sib_info *sib)
{
- static struct cb_id cn_id_drbd;
-
- cn_id_drbd.idx = cn_idx;
- cn_id_drbd.val = CN_VAL_DRBD;
-
- cn_del_callback(&cn_id_drbd);
-}
-
-void drbd_nl_send_reply(struct cn_msg *req, int ret_code)
-{
- char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)];
- struct cn_msg *cn_reply = (struct cn_msg *) buffer;
- struct drbd_nl_cfg_reply *reply =
- (struct drbd_nl_cfg_reply *)cn_reply->data;
- int rr;
-
- memset(buffer, 0, sizeof(buffer));
- cn_reply->id = req->id;
+ static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
+ struct sk_buff *msg;
+ struct drbd_genlmsghdr *d_out;
+ unsigned seq;
+ int err = -ENOMEM;
+
+ if (sib->sib_reason == SIB_SYNC_PROGRESS &&
+ time_after(jiffies, mdev->rs_last_bcast + HZ))
+ mdev->rs_last_bcast = jiffies;
+ else
+ return;
- cn_reply->seq = req->seq;
- cn_reply->ack = req->ack + 1;
- cn_reply->len = sizeof(struct drbd_nl_cfg_reply);
- cn_reply->flags = 0;
+ seq = atomic_inc_return(&drbd_genl_seq);
+ msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+ if (!msg)
+ goto failed;
+
+ err = -EMSGSIZE;
+ d_out = genlmsg_put(msg, 0, seq, &drbd_genl_family, 0, DRBD_EVENT);
+ if (!d_out) /* cannot happen, but anyways. */
+ goto nla_put_failure;
+ d_out->minor = mdev_to_minor(mdev);
+ d_out->ret_code = NO_ERROR;
+
+ if (nla_put_status_info(msg, mdev, sib))
+ goto nla_put_failure;
+ genlmsg_end(msg, d_out);
+ err = drbd_genl_multicast_events(msg, 0);
+ /* msg has been consumed or freed in netlink_broadcast() */
+ if (err && err != -ESRCH)
+ goto failed;
- reply->packet_type = P_return_code_only;
- reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor;
- reply->ret_code = ret_code;
+ return;
- rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
- if (rr && rr != -ESRCH)
- printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
+nla_put_failure:
+ nlmsg_free(msg);
+failed:
+ dev_err(DEV, "Error %d while broadcasting event. "
+ "Event seq:%u sib_reason:%u\n",
+ err, seq, sib->sib_reason);
}
-
diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c
new file mode 100644
index 00000000000..fa672b6df8d
--- /dev/null
+++ b/drivers/block/drbd/drbd_nla.c
@@ -0,0 +1,55 @@
+#include "drbd_wrappers.h"
+#include <linux/kernel.h>
+#include <net/netlink.h>
+#include <linux/drbd_genl_api.h>
+#include "drbd_nla.h"
+
+static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla)
+{
+ struct nlattr *head = nla_data(nla);
+ int len = nla_len(nla);
+ int rem;
+
+ /*
+ * validate_nla (called from nla_parse_nested) ignores attributes
+ * beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag.
+ * In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY
+ * flag set also, check and remove that flag before calling
+ * nla_parse_nested.
+ */
+
+ nla_for_each_attr(nla, head, len, rem) {
+ if (nla->nla_type & DRBD_GENLA_F_MANDATORY) {
+ nla->nla_type &= ~DRBD_GENLA_F_MANDATORY;
+ if (nla_type(nla) > maxtype)
+ return -EOPNOTSUPP;
+ }
+ }
+ return 0;
+}
+
+int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+ const struct nla_policy *policy)
+{
+ int err;
+
+ err = drbd_nla_check_mandatory(maxtype, nla);
+ if (!err)
+ err = nla_parse_nested(tb, maxtype, nla, policy);
+
+ return err;
+}
+
+struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype)
+{
+ int err;
+ /*
+ * If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and
+ * we don't know about that attribute, reject all the nested
+ * attributes.
+ */
+ err = drbd_nla_check_mandatory(maxtype, nla);
+ if (err)
+ return ERR_PTR(err);
+ return nla_find_nested(nla, attrtype);
+}
diff --git a/drivers/block/drbd/drbd_nla.h b/drivers/block/drbd/drbd_nla.h
new file mode 100644
index 00000000000..679c2d5b453
--- /dev/null
+++ b/drivers/block/drbd/drbd_nla.h
@@ -0,0 +1,8 @@
+#ifndef __DRBD_NLA_H
+#define __DRBD_NLA_H
+
+extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+ const struct nla_policy *policy);
+extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype);
+
+#endif /* __DRBD_NLA_H */
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 662bc8ef830..56672a61eb9 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -171,7 +171,7 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
if (mdev->state.conn == C_VERIFY_S ||
mdev->state.conn == C_VERIFY_T) {
bit_pos = bm_bits - mdev->ov_left;
- if (mdev->agreed_pro_version >= 97)
+ if (verify_can_do_stop_sector(mdev))
stop_sector = mdev->ov_stop_sector;
} else
bit_pos = mdev->bm_resync_fo;
@@ -200,9 +200,11 @@ static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
static int drbd_seq_show(struct seq_file *seq, void *v)
{
- int i, hole = 0;
+ int i, prev_i = -1;
const char *sn;
struct drbd_conf *mdev;
+ struct net_conf *nc;
+ char wp;
static char write_ordering_chars[] = {
[WO_none] = 'n',
@@ -233,16 +235,11 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
oos .. known out-of-sync kB
*/
- for (i = 0; i < minor_count; i++) {
- mdev = minor_to_mdev(i);
- if (!mdev) {
- hole = 1;
- continue;
- }
- if (hole) {
- hole = 0;
+ rcu_read_lock();
+ idr_for_each_entry(&minors, mdev, i) {
+ if (prev_i != i - 1)
seq_printf(seq, "\n");
- }
+ prev_i = i;
sn = drbd_conn_str(mdev->state.conn);
@@ -254,6 +251,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
/* reset mdev->congestion_reason */
bdi_rw_congested(&mdev->rq_queue->backing_dev_info);
+ nc = rcu_dereference(mdev->tconn->net_conf);
+ wp = nc ? nc->wire_protocol - DRBD_PROT_A + 'A' : ' ';
seq_printf(seq,
"%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n"
" ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
@@ -263,14 +262,13 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
drbd_role_str(mdev->state.peer),
drbd_disk_str(mdev->state.disk),
drbd_disk_str(mdev->state.pdsk),
- (mdev->net_conf == NULL ? ' ' :
- (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
- is_susp(mdev->state) ? 's' : 'r',
+ wp,
+ drbd_suspended(mdev) ? 's' : 'r',
mdev->state.aftr_isp ? 'a' : '-',
mdev->state.peer_isp ? 'p' : '-',
mdev->state.user_isp ? 'u' : '-',
mdev->congestion_reason ?: '-',
- drbd_test_flag(mdev, AL_SUSPENDED) ? 's' : '-',
+ test_bit(AL_SUSPENDED, &mdev->flags) ? 's' : '-',
mdev->send_cnt/2,
mdev->recv_cnt/2,
mdev->writ_cnt/2,
@@ -282,8 +280,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
atomic_read(&mdev->rs_pending_cnt),
atomic_read(&mdev->unacked_cnt),
atomic_read(&mdev->ap_bio_cnt),
- mdev->epochs,
- write_ordering_chars[mdev->write_ordering]
+ mdev->tconn->epochs,
+ write_ordering_chars[mdev->tconn->write_ordering]
);
seq_printf(seq, " oos:%llu\n",
Bit2KB((unsigned long long)
@@ -308,6 +306,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
}
}
}
+ rcu_read_unlock();
return 0;
}
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index eb0cafea142..0331ad0b61e 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -48,17 +48,25 @@
#include "drbd_vli.h"
+struct packet_info {
+ enum drbd_packet cmd;
+ unsigned int size;
+ unsigned int vnr;
+ void *data;
+};
+
enum finish_epoch {
FE_STILL_LIVE,
FE_DESTROYED,
FE_RECYCLED,
};
-static int drbd_do_handshake(struct drbd_conf *mdev);
-static int drbd_do_auth(struct drbd_conf *mdev);
+static int drbd_do_features(struct drbd_tconn *tconn);
+static int drbd_do_auth(struct drbd_tconn *tconn);
+static int drbd_disconnected(struct drbd_conf *mdev);
-static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
-static int e_end_block(struct drbd_conf *, struct drbd_work *, int);
+static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *, struct drbd_epoch *, enum epoch_event);
+static int e_end_block(struct drbd_work *, int);
#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
@@ -142,11 +150,12 @@ static void page_chain_add(struct page **head,
*head = chain_first;
}
-static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number)
+static struct page *__drbd_alloc_pages(struct drbd_conf *mdev,
+ unsigned int number)
{
struct page *page = NULL;
struct page *tmp = NULL;
- int i = 0;
+ unsigned int i = 0;
/* Yes, testing drbd_pp_vacant outside the lock is racy.
* So what. It saves a spin_lock. */
@@ -175,7 +184,7 @@ static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int
return page;
/* Not enough pages immediately available this time.
- * No need to jump around here, drbd_pp_alloc will retry this
+ * No need to jump around here, drbd_alloc_pages will retry this
* function "soon". */
if (page) {
tmp = page_chain_tail(page, NULL);
@@ -187,9 +196,10 @@ static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int
return NULL;
}
-static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
+static void reclaim_finished_net_peer_reqs(struct drbd_conf *mdev,
+ struct list_head *to_be_freed)
{
- struct drbd_epoch_entry *e;
+ struct drbd_peer_request *peer_req;
struct list_head *le, *tle;
/* The EEs are always appended to the end of the list. Since
@@ -198,8 +208,8 @@ static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed
stop to examine the list... */
list_for_each_safe(le, tle, &mdev->net_ee) {
- e = list_entry(le, struct drbd_epoch_entry, w.list);
- if (drbd_ee_has_active_page(e))
+ peer_req = list_entry(le, struct drbd_peer_request, w.list);
+ if (drbd_peer_req_has_active_page(peer_req))
break;
list_move(le, to_be_freed);
}
@@ -208,18 +218,18 @@ static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed
static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
{
LIST_HEAD(reclaimed);
- struct drbd_epoch_entry *e, *t;
+ struct drbd_peer_request *peer_req, *t;
- spin_lock_irq(&mdev->req_lock);
- reclaim_net_ee(mdev, &reclaimed);
- spin_unlock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ reclaim_finished_net_peer_reqs(mdev, &reclaimed);
+ spin_unlock_irq(&mdev->tconn->req_lock);
- list_for_each_entry_safe(e, t, &reclaimed, w.list)
- drbd_free_net_ee(mdev, e);
+ list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
+ drbd_free_net_peer_req(mdev, peer_req);
}
/**
- * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled)
+ * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
* @mdev: DRBD device.
* @number: number of pages requested
* @retry: whether to retry, if not enough pages are available right now
@@ -230,23 +240,31 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
*
* Returns a page chain linked via page->private.
*/
-static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry)
+struct page *drbd_alloc_pages(struct drbd_conf *mdev, unsigned int number,
+ bool retry)
{
struct page *page = NULL;
+ struct net_conf *nc;
DEFINE_WAIT(wait);
+ int mxb;
/* Yes, we may run up to @number over max_buffers. If we
* follow it strictly, the admin will get it wrong anyways. */
- if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers)
- page = drbd_pp_first_pages_or_try_alloc(mdev, number);
+ rcu_read_lock();
+ nc = rcu_dereference(mdev->tconn->net_conf);
+ mxb = nc ? nc->max_buffers : 1000000;
+ rcu_read_unlock();
+
+ if (atomic_read(&mdev->pp_in_use) < mxb)
+ page = __drbd_alloc_pages(mdev, number);
while (page == NULL) {
prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
drbd_kick_lo_and_reclaim_net(mdev);
- if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
- page = drbd_pp_first_pages_or_try_alloc(mdev, number);
+ if (atomic_read(&mdev->pp_in_use) < mxb) {
+ page = __drbd_alloc_pages(mdev, number);
if (page)
break;
}
@@ -255,7 +273,7 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool
break;
if (signal_pending(current)) {
- dev_warn(DEV, "drbd_pp_alloc interrupted!\n");
+ dev_warn(DEV, "drbd_alloc_pages interrupted!\n");
break;
}
@@ -268,11 +286,11 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool
return page;
}
-/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
- * Is also used from inside an other spin_lock_irq(&mdev->req_lock);
+/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages.
+ * Is also used from inside an other spin_lock_irq(&mdev->tconn->req_lock);
* Either links the page chain back to the global pool,
* or returns all pages to the system. */
-static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
+static void drbd_free_pages(struct drbd_conf *mdev, struct page *page, int is_net)
{
atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
int i;
@@ -280,7 +298,7 @@ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
if (page == NULL)
return;
- if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE)*minor_count)
+ if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count)
i = page_chain_free(page);
else {
struct page *tmp;
@@ -302,127 +320,130 @@ You need to hold the req_lock:
_drbd_wait_ee_list_empty()
You must not have the req_lock:
- drbd_free_ee()
- drbd_alloc_ee()
- drbd_init_ee()
- drbd_release_ee()
+ drbd_free_peer_req()
+ drbd_alloc_peer_req()
+ drbd_free_peer_reqs()
drbd_ee_fix_bhs()
- drbd_process_done_ee()
+ drbd_finish_peer_reqs()
drbd_clear_done_ee()
drbd_wait_ee_list_empty()
*/
-struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
- u64 id,
- sector_t sector,
- unsigned int data_size,
- gfp_t gfp_mask) __must_hold(local)
+struct drbd_peer_request *
+drbd_alloc_peer_req(struct drbd_conf *mdev, u64 id, sector_t sector,
+ unsigned int data_size, gfp_t gfp_mask) __must_hold(local)
{
- struct drbd_epoch_entry *e;
+ struct drbd_peer_request *peer_req;
struct page *page = NULL;
unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE))
return NULL;
- e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
- if (!e) {
+ peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
+ if (!peer_req) {
if (!(gfp_mask & __GFP_NOWARN))
- dev_err(DEV, "alloc_ee: Allocation of an EE failed\n");
+ dev_err(DEV, "%s: allocation failed\n", __func__);
return NULL;
}
if (data_size) {
- page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
+ page = drbd_alloc_pages(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
if (!page)
goto fail;
}
- INIT_HLIST_NODE(&e->collision);
- e->epoch = NULL;
- e->mdev = mdev;
- e->pages = page;
- atomic_set(&e->pending_bios, 0);
- e->size = data_size;
- e->flags = 0;
- e->sector = sector;
- e->block_id = id;
+ drbd_clear_interval(&peer_req->i);
+ peer_req->i.size = data_size;
+ peer_req->i.sector = sector;
+ peer_req->i.local = false;
+ peer_req->i.waiting = false;
+
+ peer_req->epoch = NULL;
+ peer_req->w.mdev = mdev;
+ peer_req->pages = page;
+ atomic_set(&peer_req->pending_bios, 0);
+ peer_req->flags = 0;
+ /*
+ * The block_id is opaque to the receiver. It is not endianness
+ * converted, and sent back to the sender unchanged.
+ */
+ peer_req->block_id = id;
- return e;
+ return peer_req;
fail:
- mempool_free(e, drbd_ee_mempool);
+ mempool_free(peer_req, drbd_ee_mempool);
return NULL;
}
-void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net)
+void __drbd_free_peer_req(struct drbd_conf *mdev, struct drbd_peer_request *peer_req,
+ int is_net)
{
- if (e->flags & EE_HAS_DIGEST)
- kfree(e->digest);
- drbd_pp_free(mdev, e->pages, is_net);
- D_ASSERT(atomic_read(&e->pending_bios) == 0);
- D_ASSERT(hlist_unhashed(&e->collision));
- mempool_free(e, drbd_ee_mempool);
+ if (peer_req->flags & EE_HAS_DIGEST)
+ kfree(peer_req->digest);
+ drbd_free_pages(mdev, peer_req->pages, is_net);
+ D_ASSERT(atomic_read(&peer_req->pending_bios) == 0);
+ D_ASSERT(drbd_interval_empty(&peer_req->i));
+ mempool_free(peer_req, drbd_ee_mempool);
}
-int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
+int drbd_free_peer_reqs(struct drbd_conf *mdev, struct list_head *list)
{
LIST_HEAD(work_list);
- struct drbd_epoch_entry *e, *t;
+ struct drbd_peer_request *peer_req, *t;
int count = 0;
int is_net = list == &mdev->net_ee;
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
list_splice_init(list, &work_list);
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
- list_for_each_entry_safe(e, t, &work_list, w.list) {
- drbd_free_some_ee(mdev, e, is_net);
+ list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
+ __drbd_free_peer_req(mdev, peer_req, is_net);
count++;
}
return count;
}
-
/*
- * This function is called from _asender only_
- * but see also comments in _req_mod(,barrier_acked)
- * and receive_Barrier.
- *
- * Move entries from net_ee to done_ee, if ready.
- * Grab done_ee, call all callbacks, free the entries.
- * The callbacks typically send out ACKs.
+ * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier.
*/
-static int drbd_process_done_ee(struct drbd_conf *mdev)
+static int drbd_finish_peer_reqs(struct drbd_conf *mdev)
{
LIST_HEAD(work_list);
LIST_HEAD(reclaimed);
- struct drbd_epoch_entry *e, *t;
- int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS);
+ struct drbd_peer_request *peer_req, *t;
+ int err = 0;
- spin_lock_irq(&mdev->req_lock);
- reclaim_net_ee(mdev, &reclaimed);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ reclaim_finished_net_peer_reqs(mdev, &reclaimed);
list_splice_init(&mdev->done_ee, &work_list);
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
- list_for_each_entry_safe(e, t, &reclaimed, w.list)
- drbd_free_net_ee(mdev, e);
+ list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
+ drbd_free_net_peer_req(mdev, peer_req);
/* possible callbacks here:
- * e_end_block, and e_end_resync_block, e_send_discard_ack.
+ * e_end_block, and e_end_resync_block, e_send_superseded.
* all ignore the last argument.
*/
- list_for_each_entry_safe(e, t, &work_list, w.list) {
+ list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
+ int err2;
+
/* list_del not necessary, next/prev members not touched */
- ok = e->w.cb(mdev, &e->w, !ok) && ok;
- drbd_free_ee(mdev, e);
+ err2 = peer_req->w.cb(&peer_req->w, !!err);
+ if (!err)
+ err = err2;
+ drbd_free_peer_req(mdev, peer_req);
}
wake_up(&mdev->ee_wait);
- return ok;
+ return err;
}
-void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
+static void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
+ struct list_head *head)
{
DEFINE_WAIT(wait);
@@ -430,55 +451,22 @@ void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
* and calling prepare_to_wait in the fast path */
while (!list_empty(head)) {
prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
io_schedule();
finish_wait(&mdev->ee_wait, &wait);
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
}
}
-void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
+static void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
+ struct list_head *head)
{
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
_drbd_wait_ee_list_empty(mdev, head);
- spin_unlock_irq(&mdev->req_lock);
-}
-
-/* see also kernel_accept; which is only present since 2.6.18.
- * also we want to log which part of it failed, exactly */
-static int drbd_accept(struct drbd_conf *mdev, const char **what,
- struct socket *sock, struct socket **newsock)
-{
- struct sock *sk = sock->sk;
- int err = 0;
-
- *what = "listen";
- err = sock->ops->listen(sock, 5);
- if (err < 0)
- goto out;
-
- *what = "sock_create_lite";
- err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
- newsock);
- if (err < 0)
- goto out;
-
- *what = "accept";
- err = sock->ops->accept(sock, *newsock, 0);
- if (err < 0) {
- sock_release(*newsock);
- *newsock = NULL;
- goto out;
- }
- (*newsock)->ops = sock->ops;
- __module_get((*newsock)->ops->owner);
-
-out:
- return err;
+ spin_unlock_irq(&mdev->tconn->req_lock);
}
-static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock,
- void *buf, size_t size, int flags)
+static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
{
mm_segment_t oldfs;
struct kvec iov = {
@@ -500,48 +488,62 @@ static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock,
return rv;
}
-static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size)
+static int drbd_recv(struct drbd_tconn *tconn, void *buf, size_t size)
{
- mm_segment_t oldfs;
- struct kvec iov = {
- .iov_base = buf,
- .iov_len = size,
- };
- struct msghdr msg = {
- .msg_iovlen = 1,
- .msg_iov = (struct iovec *)&iov,
- .msg_flags = MSG_WAITALL | MSG_NOSIGNAL
- };
int rv;
- oldfs = get_fs();
- set_fs(KERNEL_DS);
- rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags);
- set_fs(oldfs);
+ rv = drbd_recv_short(tconn->data.socket, buf, size, 0);
if (rv < 0) {
if (rv == -ECONNRESET)
- dev_info(DEV, "sock was reset by peer\n");
+ conn_info(tconn, "sock was reset by peer\n");
else if (rv != -ERESTARTSYS)
- dev_err(DEV, "sock_recvmsg returned %d\n", rv);
+ conn_err(tconn, "sock_recvmsg returned %d\n", rv);
} else if (rv == 0) {
- if (drbd_test_flag(mdev, DISCONNECT_SENT)) {
- long t; /* time_left */
- t = wait_event_timeout(mdev->state_wait, mdev->state.conn < C_CONNECTED,
- mdev->net_conf->ping_timeo * HZ/10);
+ if (test_bit(DISCONNECT_SENT, &tconn->flags)) {
+ long t;
+ rcu_read_lock();
+ t = rcu_dereference(tconn->net_conf)->ping_timeo * HZ/10;
+ rcu_read_unlock();
+
+ t = wait_event_timeout(tconn->ping_wait, tconn->cstate < C_WF_REPORT_PARAMS, t);
+
if (t)
goto out;
}
- dev_info(DEV, "sock was shut down by peer\n");
+ conn_info(tconn, "sock was shut down by peer\n");
}
if (rv != size)
- drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
+ conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD);
out:
return rv;
}
+static int drbd_recv_all(struct drbd_tconn *tconn, void *buf, size_t size)
+{
+ int err;
+
+ err = drbd_recv(tconn, buf, size);
+ if (err != size) {
+ if (err >= 0)
+ err = -EIO;
+ } else
+ err = 0;
+ return err;
+}
+
+static int drbd_recv_all_warn(struct drbd_tconn *tconn, void *buf, size_t size)
+{
+ int err;
+
+ err = drbd_recv_all(tconn, buf, size);
+ if (err && !signal_pending(current))
+ conn_warn(tconn, "short read (expected size %d)\n", (int)size);
+ return err;
+}
+
/* quoting tcp(7):
* On individual connections, the socket buffer size must be set prior to the
* listen(2) or connect(2) calls in order to have it take effect.
@@ -561,29 +563,50 @@ static void drbd_setbufsize(struct socket *sock, unsigned int snd,
}
}
-static struct socket *drbd_try_connect(struct drbd_conf *mdev)
+static struct socket *drbd_try_connect(struct drbd_tconn *tconn)
{
const char *what;
struct socket *sock;
struct sockaddr_in6 src_in6;
- int err;
+ struct sockaddr_in6 peer_in6;
+ struct net_conf *nc;
+ int err, peer_addr_len, my_addr_len;
+ int sndbuf_size, rcvbuf_size, connect_int;
int disconnect_on_error = 1;
- if (!get_net_conf(mdev))
+ rcu_read_lock();
+ nc = rcu_dereference(tconn->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
return NULL;
+ }
+ sndbuf_size = nc->sndbuf_size;
+ rcvbuf_size = nc->rcvbuf_size;
+ connect_int = nc->connect_int;
+ rcu_read_unlock();
+
+ my_addr_len = min_t(int, tconn->my_addr_len, sizeof(src_in6));
+ memcpy(&src_in6, &tconn->my_addr, my_addr_len);
+
+ if (((struct sockaddr *)&tconn->my_addr)->sa_family == AF_INET6)
+ src_in6.sin6_port = 0;
+ else
+ ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
+
+ peer_addr_len = min_t(int, tconn->peer_addr_len, sizeof(src_in6));
+ memcpy(&peer_in6, &tconn->peer_addr, peer_addr_len);
what = "sock_create_kern";
- err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
- SOCK_STREAM, IPPROTO_TCP, &sock);
+ err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family,
+ SOCK_STREAM, IPPROTO_TCP, &sock);
if (err < 0) {
sock = NULL;
goto out;
}
sock->sk->sk_rcvtimeo =
- sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ;
- drbd_setbufsize(sock, mdev->net_conf->sndbuf_size,
- mdev->net_conf->rcvbuf_size);
+ sock->sk->sk_sndtimeo = connect_int * HZ;
+ drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);
/* explicitly bind to the configured IP as source IP
* for the outgoing connections.
@@ -592,17 +615,8 @@ static struct socket *drbd_try_connect(struct drbd_conf *mdev)
* Make sure to use 0 as port number, so linux selects
* a free one dynamically.
*/
- memcpy(&src_in6, mdev->net_conf->my_addr,
- min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6)));
- if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6)
- src_in6.sin6_port = 0;
- else
- ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
-
what = "bind before connect";
- err = sock->ops->bind(sock,
- (struct sockaddr *) &src_in6,
- mdev->net_conf->my_addr_len);
+ err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
if (err < 0)
goto out;
@@ -610,9 +624,7 @@ static struct socket *drbd_try_connect(struct drbd_conf *mdev)
* stay C_WF_CONNECTION, don't go Disconnecting! */
disconnect_on_error = 0;
what = "connect";
- err = sock->ops->connect(sock,
- (struct sockaddr *)mdev->net_conf->peer_addr,
- mdev->net_conf->peer_addr_len, 0);
+ err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);
out:
if (err < 0) {
@@ -630,91 +642,174 @@ out:
disconnect_on_error = 0;
break;
default:
- dev_err(DEV, "%s failed, err = %d\n", what, err);
+ conn_err(tconn, "%s failed, err = %d\n", what, err);
}
if (disconnect_on_error)
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
}
- put_net_conf(mdev);
+
return sock;
}
-static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
+struct accept_wait_data {
+ struct drbd_tconn *tconn;
+ struct socket *s_listen;
+ struct completion door_bell;
+ void (*original_sk_state_change)(struct sock *sk);
+
+};
+
+static void drbd_incoming_connection(struct sock *sk)
+{
+ struct accept_wait_data *ad = sk->sk_user_data;
+ void (*state_change)(struct sock *sk);
+
+ state_change = ad->original_sk_state_change;
+ if (sk->sk_state == TCP_ESTABLISHED)
+ complete(&ad->door_bell);
+ state_change(sk);
+}
+
+static int prepare_listen_socket(struct drbd_tconn *tconn, struct accept_wait_data *ad)
{
- int timeo, err;
- struct socket *s_estab = NULL, *s_listen;
+ int err, sndbuf_size, rcvbuf_size, my_addr_len;
+ struct sockaddr_in6 my_addr;
+ struct socket *s_listen;
+ struct net_conf *nc;
const char *what;
- if (!get_net_conf(mdev))
- return NULL;
+ rcu_read_lock();
+ nc = rcu_dereference(tconn->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
+ return -EIO;
+ }
+ sndbuf_size = nc->sndbuf_size;
+ rcvbuf_size = nc->rcvbuf_size;
+ rcu_read_unlock();
+
+ my_addr_len = min_t(int, tconn->my_addr_len, sizeof(struct sockaddr_in6));
+ memcpy(&my_addr, &tconn->my_addr, my_addr_len);
what = "sock_create_kern";
- err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
- SOCK_STREAM, IPPROTO_TCP, &s_listen);
+ err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family,
+ SOCK_STREAM, IPPROTO_TCP, &s_listen);
if (err) {
s_listen = NULL;
goto out;
}
- timeo = mdev->net_conf->try_connect_int * HZ;
- timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
-
- s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
- s_listen->sk->sk_rcvtimeo = timeo;
- s_listen->sk->sk_sndtimeo = timeo;
- drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size,
- mdev->net_conf->rcvbuf_size);
+ s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+ drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
what = "bind before listen";
- err = s_listen->ops->bind(s_listen,
- (struct sockaddr *) mdev->net_conf->my_addr,
- mdev->net_conf->my_addr_len);
+ err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
if (err < 0)
goto out;
- err = drbd_accept(mdev, &what, s_listen, &s_estab);
+ ad->s_listen = s_listen;
+ write_lock_bh(&s_listen->sk->sk_callback_lock);
+ ad->original_sk_state_change = s_listen->sk->sk_state_change;
+ s_listen->sk->sk_state_change = drbd_incoming_connection;
+ s_listen->sk->sk_user_data = ad;
+ write_unlock_bh(&s_listen->sk->sk_callback_lock);
+
+ what = "listen";
+ err = s_listen->ops->listen(s_listen, 5);
+ if (err < 0)
+ goto out;
+ return 0;
out:
if (s_listen)
sock_release(s_listen);
if (err < 0) {
if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
- dev_err(DEV, "%s failed, err = %d\n", what, err);
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ conn_err(tconn, "%s failed, err = %d\n", what, err);
+ conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
}
}
- put_net_conf(mdev);
- return s_estab;
+ return -EIO;
}
-static int drbd_send_fp(struct drbd_conf *mdev,
- struct socket *sock, enum drbd_packets cmd)
+static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad)
{
- struct p_header80 *h = &mdev->data.sbuf.header.h80;
-
- return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
+ write_lock_bh(&sk->sk_callback_lock);
+ sk->sk_state_change = ad->original_sk_state_change;
+ sk->sk_user_data = NULL;
+ write_unlock_bh(&sk->sk_callback_lock);
}
-static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
+static struct socket *drbd_wait_for_connect(struct drbd_tconn *tconn, struct accept_wait_data *ad)
{
- struct p_header80 *h = &mdev->data.rbuf.header.h80;
- int rr;
+ int timeo, connect_int, err = 0;
+ struct socket *s_estab = NULL;
+ struct net_conf *nc;
+
+ rcu_read_lock();
+ nc = rcu_dereference(tconn->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ connect_int = nc->connect_int;
+ rcu_read_unlock();
+
+ timeo = connect_int * HZ;
+ timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
- rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
+ err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
+ if (err <= 0)
+ return NULL;
- if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC)
- return be16_to_cpu(h->command);
+ err = kernel_accept(ad->s_listen, &s_estab, 0);
+ if (err < 0) {
+ if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
+ conn_err(tconn, "accept failed, err = %d\n", err);
+ conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
+ }
+ }
- return 0xffff;
+ if (s_estab)
+ unregister_state_change(s_estab->sk, ad);
+
+ return s_estab;
+}
+
+static int decode_header(struct drbd_tconn *, void *, struct packet_info *);
+
+static int send_first_packet(struct drbd_tconn *tconn, struct drbd_socket *sock,
+ enum drbd_packet cmd)
+{
+ if (!conn_prepare_command(tconn, sock))
+ return -EIO;
+ return conn_send_command(tconn, sock, cmd, 0, NULL, 0);
+}
+
+static int receive_first_packet(struct drbd_tconn *tconn, struct socket *sock)
+{
+ unsigned int header_size = drbd_header_size(tconn);
+ struct packet_info pi;
+ int err;
+
+ err = drbd_recv_short(sock, tconn->data.rbuf, header_size, 0);
+ if (err != header_size) {
+ if (err >= 0)
+ err = -EIO;
+ return err;
+ }
+ err = decode_header(tconn, tconn->data.rbuf, &pi);
+ if (err)
+ return err;
+ return pi.cmd;
}
/**
* drbd_socket_okay() - Free the socket if its connection is not okay
- * @mdev: DRBD device.
* @sock: pointer to the pointer to the socket.
*/
-static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
+static int drbd_socket_okay(struct socket **sock)
{
int rr;
char tb[4];
@@ -722,7 +817,7 @@ static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
if (!*sock)
return false;
- rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
+ rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
if (rr > 0 || rr == -EAGAIN) {
return true;
@@ -732,6 +827,31 @@ static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
return false;
}
}
+/* Gets called if a connection is established, or if a new minor gets created
+ in a connection */
+int drbd_connected(struct drbd_conf *mdev)
+{
+ int err;
+
+ atomic_set(&mdev->packet_seq, 0);
+ mdev->peer_seq = 0;
+
+ mdev->state_mutex = mdev->tconn->agreed_pro_version < 100 ?
+ &mdev->tconn->cstate_mutex :
+ &mdev->own_state_mutex;
+
+ err = drbd_send_sync_param(mdev);
+ if (!err)
+ err = drbd_send_sizes(mdev, 0, 0);
+ if (!err)
+ err = drbd_send_uuids(mdev);
+ if (!err)
+ err = drbd_send_current_state(mdev);
+ clear_bit(USE_DEGR_WFC_T, &mdev->flags);
+ clear_bit(RESIZE_PENDING, &mdev->flags);
+ mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
+ return err;
+}
/*
* return values:
@@ -741,232 +861,305 @@ static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
* no point in trying again, please go standalone.
* -2 We do not have a network config...
*/
-static int drbd_connect(struct drbd_conf *mdev)
+static int conn_connect(struct drbd_tconn *tconn)
{
- struct socket *s, *sock, *msock;
- int try, h, ok;
+ struct drbd_socket sock, msock;
+ struct drbd_conf *mdev;
+ struct net_conf *nc;
+ int vnr, timeout, h, ok;
+ bool discard_my_data;
enum drbd_state_rv rv;
+ struct accept_wait_data ad = {
+ .tconn = tconn,
+ .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
+ };
- D_ASSERT(!mdev->data.socket);
-
- drbd_clear_flag(mdev, DISCONNECT_SENT);
- if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
+ clear_bit(DISCONNECT_SENT, &tconn->flags);
+ if (conn_request_state(tconn, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
return -2;
- sock = NULL;
- msock = NULL;
+ mutex_init(&sock.mutex);
+ sock.sbuf = tconn->data.sbuf;
+ sock.rbuf = tconn->data.rbuf;
+ sock.socket = NULL;
+ mutex_init(&msock.mutex);
+ msock.sbuf = tconn->meta.sbuf;
+ msock.rbuf = tconn->meta.rbuf;
+ msock.socket = NULL;
+
+ /* Assume that the peer only understands protocol 80 until we know better. */
+ tconn->agreed_pro_version = 80;
+
+ if (prepare_listen_socket(tconn, &ad))
+ return 0;
do {
- for (try = 0;;) {
- /* 3 tries, this should take less than a second! */
- s = drbd_try_connect(mdev);
- if (s || ++try >= 3)
- break;
- /* give the other side time to call bind() & listen() */
- schedule_timeout_interruptible(HZ / 10);
- }
+ struct socket *s;
+ s = drbd_try_connect(tconn);
if (s) {
- if (!sock) {
- drbd_send_fp(mdev, s, P_HAND_SHAKE_S);
- sock = s;
- s = NULL;
- } else if (!msock) {
- drbd_clear_flag(mdev, DISCARD_CONCURRENT);
- drbd_send_fp(mdev, s, P_HAND_SHAKE_M);
- msock = s;
- s = NULL;
+ if (!sock.socket) {
+ sock.socket = s;
+ send_first_packet(tconn, &sock, P_INITIAL_DATA);
+ } else if (!msock.socket) {
+ clear_bit(RESOLVE_CONFLICTS, &tconn->flags);
+ msock.socket = s;
+ send_first_packet(tconn, &msock, P_INITIAL_META);
} else {
- dev_err(DEV, "Logic error in drbd_connect()\n");
+ conn_err(tconn, "Logic error in conn_connect()\n");
goto out_release_sockets;
}
}
- if (sock && msock) {
- schedule_timeout_interruptible(mdev->net_conf->ping_timeo*HZ/10);
- ok = drbd_socket_okay(mdev, &sock);
- ok = drbd_socket_okay(mdev, &msock) && ok;
+ if (sock.socket && msock.socket) {
+ rcu_read_lock();
+ nc = rcu_dereference(tconn->net_conf);
+ timeout = nc->ping_timeo * HZ / 10;
+ rcu_read_unlock();
+ schedule_timeout_interruptible(timeout);
+ ok = drbd_socket_okay(&sock.socket);
+ ok = drbd_socket_okay(&msock.socket) && ok;
if (ok)
break;
}
retry:
- s = drbd_wait_for_connect(mdev);
+ s = drbd_wait_for_connect(tconn, &ad);
if (s) {
- try = drbd_recv_fp(mdev, s);
- drbd_socket_okay(mdev, &sock);
- drbd_socket_okay(mdev, &msock);
- switch (try) {
- case P_HAND_SHAKE_S:
- if (sock) {
- dev_warn(DEV, "initial packet S crossed\n");
- sock_release(sock);
+ int fp = receive_first_packet(tconn, s);
+ drbd_socket_okay(&sock.socket);
+ drbd_socket_okay(&msock.socket);
+ switch (fp) {
+ case P_INITIAL_DATA:
+ if (sock.socket) {
+ conn_warn(tconn, "initial packet S crossed\n");
+ sock_release(sock.socket);
+ sock.socket = s;
+ goto randomize;
}
- sock = s;
+ sock.socket = s;
break;
- case P_HAND_SHAKE_M:
- if (msock) {
- dev_warn(DEV, "initial packet M crossed\n");
- sock_release(msock);
+ case P_INITIAL_META:
+ set_bit(RESOLVE_CONFLICTS, &tconn->flags);
+ if (msock.socket) {
+ conn_warn(tconn, "initial packet M crossed\n");
+ sock_release(msock.socket);
+ msock.socket = s;
+ goto randomize;
}
- msock = s;
- drbd_set_flag(mdev, DISCARD_CONCURRENT);
+ msock.socket = s;
break;
default:
- dev_warn(DEV, "Error receiving initial packet\n");
+ conn_warn(tconn, "Error receiving initial packet\n");
sock_release(s);
+randomize:
if (random32() & 1)
goto retry;
}
}
- if (mdev->state.conn <= C_DISCONNECTING)
+ if (tconn->cstate <= C_DISCONNECTING)
goto out_release_sockets;
if (signal_pending(current)) {
flush_signals(current);
smp_rmb();
- if (get_t_state(&mdev->receiver) == Exiting)
+ if (get_t_state(&tconn->receiver) == EXITING)
goto out_release_sockets;
}
- if (sock && msock) {
- ok = drbd_socket_okay(mdev, &sock);
- ok = drbd_socket_okay(mdev, &msock) && ok;
- if (ok)
- break;
- }
- } while (1);
+ ok = drbd_socket_okay(&sock.socket);
+ ok = drbd_socket_okay(&msock.socket) && ok;
+ } while (!ok);
- msock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
- sock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+ if (ad.s_listen)
+ sock_release(ad.s_listen);
- sock->sk->sk_allocation = GFP_NOIO;
- msock->sk->sk_allocation = GFP_NOIO;
+ sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+ msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
- sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
- msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
+ sock.socket->sk->sk_allocation = GFP_NOIO;
+ msock.socket->sk->sk_allocation = GFP_NOIO;
+
+ sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
+ msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
/* NOT YET ...
- * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
- * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
- * first set it to the P_HAND_SHAKE timeout,
+ * sock.socket->sk->sk_sndtimeo = tconn->net_conf->timeout*HZ/10;
+ * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+ * first set it to the P_CONNECTION_FEATURES timeout,
* which we set to 4x the configured ping_timeout. */
- sock->sk->sk_sndtimeo =
- sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10;
+ rcu_read_lock();
+ nc = rcu_dereference(tconn->net_conf);
+
+ sock.socket->sk->sk_sndtimeo =
+ sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;
+
+ msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
+ timeout = nc->timeout * HZ / 10;
+ discard_my_data = nc->discard_my_data;
+ rcu_read_unlock();
- msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
- msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
+ msock.socket->sk->sk_sndtimeo = timeout;
/* we don't want delays.
* we use TCP_CORK where appropriate, though */
- drbd_tcp_nodelay(sock);
- drbd_tcp_nodelay(msock);
+ drbd_tcp_nodelay(sock.socket);
+ drbd_tcp_nodelay(msock.socket);
- mdev->data.socket = sock;
- mdev->meta.socket = msock;
- mdev->last_received = jiffies;
+ tconn->data.socket = sock.socket;
+ tconn->meta.socket = msock.socket;
+ tconn->last_received = jiffies;
- D_ASSERT(mdev->asender.task == NULL);
-
- h = drbd_do_handshake(mdev);
+ h = drbd_do_features(tconn);
if (h <= 0)
return h;
- if (mdev->cram_hmac_tfm) {
+ if (tconn->cram_hmac_tfm) {
/* drbd_request_state(mdev, NS(conn, WFAuth)); */
- switch (drbd_do_auth(mdev)) {
+ switch (drbd_do_auth(tconn)) {
case -1:
- dev_err(DEV, "Authentication of peer failed\n");
+ conn_err(tconn, "Authentication of peer failed\n");
return -1;
case 0:
- dev_err(DEV, "Authentication of peer failed, trying again.\n");
+ conn_err(tconn, "Authentication of peer failed, trying again.\n");
return 0;
}
}
- sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
- sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
-
- atomic_set(&mdev->packet_seq, 0);
- mdev->peer_seq = 0;
+ tconn->data.socket->sk->sk_sndtimeo = timeout;
+ tconn->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
- if (drbd_send_protocol(mdev) == -1)
+ if (drbd_send_protocol(tconn) == -EOPNOTSUPP)
return -1;
- drbd_set_flag(mdev, STATE_SENT);
- drbd_send_sync_param(mdev, &mdev->sync_conf);
- drbd_send_sizes(mdev, 0, 0);
- drbd_send_uuids(mdev);
- drbd_send_current_state(mdev);
- drbd_clear_flag(mdev, USE_DEGR_WFC_T);
- drbd_clear_flag(mdev, RESIZE_PENDING);
-
- spin_lock_irq(&mdev->req_lock);
- rv = _drbd_set_state(_NS(mdev, conn, C_WF_REPORT_PARAMS), CS_VERBOSE, NULL);
- if (mdev->state.conn != C_WF_REPORT_PARAMS)
- drbd_clear_flag(mdev, STATE_SENT);
- spin_unlock_irq(&mdev->req_lock);
-
- if (rv < SS_SUCCESS)
+
+ set_bit(STATE_SENT, &tconn->flags);
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ kref_get(&mdev->kref);
+ rcu_read_unlock();
+
+ if (discard_my_data)
+ set_bit(DISCARD_MY_DATA, &mdev->flags);
+ else
+ clear_bit(DISCARD_MY_DATA, &mdev->flags);
+
+ drbd_connected(mdev);
+ kref_put(&mdev->kref, &drbd_minor_destroy);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+
+ rv = conn_request_state(tconn, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE);
+ if (rv < SS_SUCCESS) {
+ clear_bit(STATE_SENT, &tconn->flags);
return 0;
+ }
- drbd_thread_start(&mdev->asender);
- mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
+ drbd_thread_start(&tconn->asender);
- return 1;
+ mutex_lock(&tconn->conf_update);
+ /* The discard_my_data flag is a single-shot modifier to the next
+ * connection attempt, the handshake of which is now well underway.
+ * No need for rcu style copying of the whole struct
+ * just to clear a single value. */
+ tconn->net_conf->discard_my_data = 0;
+ mutex_unlock(&tconn->conf_update);
+
+ return h;
out_release_sockets:
- if (sock)
- sock_release(sock);
- if (msock)
- sock_release(msock);
+ if (ad.s_listen)
+ sock_release(ad.s_listen);
+ if (sock.socket)
+ sock_release(sock.socket);
+ if (msock.socket)
+ sock_release(msock.socket);
return -1;
}
-static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size)
+static int decode_header(struct drbd_tconn *tconn, void *header, struct packet_info *pi)
{
- union p_header *h = &mdev->data.rbuf.header;
- int r;
-
- r = drbd_recv(mdev, h, sizeof(*h));
- if (unlikely(r != sizeof(*h))) {
- if (!signal_pending(current))
- dev_warn(DEV, "short read expecting header on sock: r=%d\n", r);
- return false;
- }
-
- if (likely(h->h80.magic == BE_DRBD_MAGIC)) {
- *cmd = be16_to_cpu(h->h80.command);
- *packet_size = be16_to_cpu(h->h80.length);
- } else if (h->h95.magic == BE_DRBD_MAGIC_BIG) {
- *cmd = be16_to_cpu(h->h95.command);
- *packet_size = be32_to_cpu(h->h95.length);
+ unsigned int header_size = drbd_header_size(tconn);
+
+ if (header_size == sizeof(struct p_header100) &&
+ *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) {
+ struct p_header100 *h = header;
+ if (h->pad != 0) {
+ conn_err(tconn, "Header padding is not zero\n");
+ return -EINVAL;
+ }
+ pi->vnr = be16_to_cpu(h->volume);
+ pi->cmd = be16_to_cpu(h->command);
+ pi->size = be32_to_cpu(h->length);
+ } else if (header_size == sizeof(struct p_header95) &&
+ *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) {
+ struct p_header95 *h = header;
+ pi->cmd = be16_to_cpu(h->command);
+ pi->size = be32_to_cpu(h->length);
+ pi->vnr = 0;
+ } else if (header_size == sizeof(struct p_header80) &&
+ *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) {
+ struct p_header80 *h = header;
+ pi->cmd = be16_to_cpu(h->command);
+ pi->size = be16_to_cpu(h->length);
+ pi->vnr = 0;
} else {
- dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n",
- be32_to_cpu(h->h80.magic),
- be16_to_cpu(h->h80.command),
- be16_to_cpu(h->h80.length));
- return false;
+ conn_err(tconn, "Wrong magic value 0x%08x in protocol version %d\n",
+ be32_to_cpu(*(__be32 *)header),
+ tconn->agreed_pro_version);
+ return -EINVAL;
}
- mdev->last_received = jiffies;
+ pi->data = header + header_size;
+ return 0;
+}
+
+static int drbd_recv_header(struct drbd_tconn *tconn, struct packet_info *pi)
+{
+ void *buffer = tconn->data.rbuf;
+ int err;
+
+ err = drbd_recv_all_warn(tconn, buffer, drbd_header_size(tconn));
+ if (err)
+ return err;
+
+ err = decode_header(tconn, buffer, pi);
+ tconn->last_received = jiffies;
- return true;
+ return err;
}
-static void drbd_flush(struct drbd_conf *mdev)
+static void drbd_flush(struct drbd_tconn *tconn)
{
int rv;
+ struct drbd_conf *mdev;
+ int vnr;
- if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
- rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_NOIO,
- NULL);
- if (rv) {
- dev_info(DEV, "local disk flush failed with status %d\n", rv);
- /* would rather check on EOPNOTSUPP, but that is not reliable.
- * don't try again for ANY return value != 0
- * if (rv == -EOPNOTSUPP) */
- drbd_bump_write_ordering(mdev, WO_drain_io);
+ if (tconn->write_ordering >= WO_bdev_flush) {
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ if (!get_ldev(mdev))
+ continue;
+ kref_get(&mdev->kref);
+ rcu_read_unlock();
+
+ rv = blkdev_issue_flush(mdev->ldev->backing_bdev,
+ GFP_NOIO, NULL);
+ if (rv) {
+ dev_info(DEV, "local disk flush failed with status %d\n", rv);
+ /* would rather check on EOPNOTSUPP, but that is not reliable.
+ * don't try again for ANY return value != 0
+ * if (rv == -EOPNOTSUPP) */
+ drbd_bump_write_ordering(tconn, WO_drain_io);
+ }
+ put_ldev(mdev);
+ kref_put(&mdev->kref, &drbd_minor_destroy);
+
+ rcu_read_lock();
+ if (rv)
+ break;
}
- put_ldev(mdev);
+ rcu_read_unlock();
}
}
@@ -976,7 +1169,7 @@ static void drbd_flush(struct drbd_conf *mdev)
* @epoch: Epoch object.
* @ev: Epoch event.
*/
-static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
+static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *tconn,
struct drbd_epoch *epoch,
enum epoch_event ev)
{
@@ -984,7 +1177,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
struct drbd_epoch *next_epoch;
enum finish_epoch rv = FE_STILL_LIVE;
- spin_lock(&mdev->epoch_lock);
+ spin_lock(&tconn->epoch_lock);
do {
next_epoch = NULL;
@@ -1006,18 +1199,22 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
atomic_read(&epoch->active) == 0 &&
(test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
if (!(ev & EV_CLEANUP)) {
- spin_unlock(&mdev->epoch_lock);
- drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
- spin_lock(&mdev->epoch_lock);
+ spin_unlock(&tconn->epoch_lock);
+ drbd_send_b_ack(epoch->tconn, epoch->barrier_nr, epoch_size);
+ spin_lock(&tconn->epoch_lock);
}
+#if 0
+ /* FIXME: dec unacked on connection, once we have
+ * something to count pending connection packets in. */
if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
- dec_unacked(mdev);
+ dec_unacked(epoch->tconn);
+#endif
- if (mdev->current_epoch != epoch) {
+ if (tconn->current_epoch != epoch) {
next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
list_del(&epoch->list);
ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
- mdev->epochs--;
+ tconn->epochs--;
kfree(epoch);
if (rv == FE_STILL_LIVE)
@@ -1028,7 +1225,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
/* atomic_set(&epoch->active, 0); is already zero */
if (rv == FE_STILL_LIVE)
rv = FE_RECYCLED;
- wake_up(&mdev->ee_wait);
}
}
@@ -1038,40 +1234,52 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
epoch = next_epoch;
} while (1);
- spin_unlock(&mdev->epoch_lock);
+ spin_unlock(&tconn->epoch_lock);
return rv;
}
/**
* drbd_bump_write_ordering() - Fall back to an other write ordering method
- * @mdev: DRBD device.
+ * @tconn: DRBD connection.
* @wo: Write ordering method to try.
*/
-void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local)
+void drbd_bump_write_ordering(struct drbd_tconn *tconn, enum write_ordering_e wo)
{
+ struct disk_conf *dc;
+ struct drbd_conf *mdev;
enum write_ordering_e pwo;
+ int vnr;
static char *write_ordering_str[] = {
[WO_none] = "none",
[WO_drain_io] = "drain",
[WO_bdev_flush] = "flush",
};
- pwo = mdev->write_ordering;
+ pwo = tconn->write_ordering;
wo = min(pwo, wo);
- if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
- wo = WO_drain_io;
- if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
- wo = WO_none;
- mdev->write_ordering = wo;
- if (pwo != mdev->write_ordering || wo == WO_bdev_flush)
- dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ if (!get_ldev_if_state(mdev, D_ATTACHING))
+ continue;
+ dc = rcu_dereference(mdev->ldev->disk_conf);
+
+ if (wo == WO_bdev_flush && !dc->disk_flushes)
+ wo = WO_drain_io;
+ if (wo == WO_drain_io && !dc->disk_drain)
+ wo = WO_none;
+ put_ldev(mdev);
+ }
+ rcu_read_unlock();
+ tconn->write_ordering = wo;
+ if (pwo != tconn->write_ordering || wo == WO_bdev_flush)
+ conn_info(tconn, "Method to ensure write ordering: %s\n", write_ordering_str[tconn->write_ordering]);
}
/**
- * drbd_submit_ee()
+ * drbd_submit_peer_request()
* @mdev: DRBD device.
- * @e: epoch entry
+ * @peer_req: peer request
* @rw: flag field, see bio->bi_rw
*
* May spread the pages to multiple bios,
@@ -1085,14 +1293,15 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo)
* on certain Xen deployments.
*/
/* TODO allocate from our own bio_set. */
-int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
- const unsigned rw, const int fault_type)
+int drbd_submit_peer_request(struct drbd_conf *mdev,
+ struct drbd_peer_request *peer_req,
+ const unsigned rw, const int fault_type)
{
struct bio *bios = NULL;
struct bio *bio;
- struct page *page = e->pages;
- sector_t sector = e->sector;
- unsigned ds = e->size;
+ struct page *page = peer_req->pages;
+ sector_t sector = peer_req->i.sector;
+ unsigned ds = peer_req->i.size;
unsigned n_bios = 0;
unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
int err = -ENOMEM;
@@ -1111,12 +1320,12 @@ next_bio:
dev_err(DEV, "submit_ee: Allocation of a bio failed\n");
goto fail;
}
- /* > e->sector, unless this is the first bio */
+ /* > peer_req->i.sector, unless this is the first bio */
bio->bi_sector = sector;
bio->bi_bdev = mdev->ldev->backing_bdev;
bio->bi_rw = rw;
- bio->bi_private = e;
- bio->bi_end_io = drbd_endio_sec;
+ bio->bi_private = peer_req;
+ bio->bi_end_io = drbd_peer_request_endio;
bio->bi_next = bios;
bios = bio;
@@ -1145,7 +1354,7 @@ next_bio:
D_ASSERT(page == NULL);
D_ASSERT(ds == 0);
- atomic_set(&e->pending_bios, n_bios);
+ atomic_set(&peer_req->pending_bios, n_bios);
do {
bio = bios;
bios = bios->bi_next;
@@ -1164,26 +1373,57 @@ fail:
return err;
}
-static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static void drbd_remove_epoch_entry_interval(struct drbd_conf *mdev,
+ struct drbd_peer_request *peer_req)
+{
+ struct drbd_interval *i = &peer_req->i;
+
+ drbd_remove_interval(&mdev->write_requests, i);
+ drbd_clear_interval(i);
+
+ /* Wake up any processes waiting for this peer request to complete. */
+ if (i->waiting)
+ wake_up(&mdev->misc_wait);
+}
+
+void conn_wait_active_ee_empty(struct drbd_tconn *tconn)
+{
+ struct drbd_conf *mdev;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ kref_get(&mdev->kref);
+ rcu_read_unlock();
+ drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
+ kref_put(&mdev->kref, &drbd_minor_destroy);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+}
+
+static int receive_Barrier(struct drbd_tconn *tconn, struct packet_info *pi)
{
int rv;
- struct p_barrier *p = &mdev->data.rbuf.barrier;
+ struct p_barrier *p = pi->data;
struct drbd_epoch *epoch;
- inc_unacked(mdev);
-
- mdev->current_epoch->barrier_nr = p->barrier;
- rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);
+ /* FIXME these are unacked on connection,
+ * not a specific (peer)device.
+ */
+ tconn->current_epoch->barrier_nr = p->barrier;
+ tconn->current_epoch->tconn = tconn;
+ rv = drbd_may_finish_epoch(tconn, tconn->current_epoch, EV_GOT_BARRIER_NR);
/* P_BARRIER_ACK may imply that the corresponding extent is dropped from
* the activity log, which means it would not be resynced in case the
* R_PRIMARY crashes now.
* Therefore we must send the barrier_ack after the barrier request was
* completed. */
- switch (mdev->write_ordering) {
+ switch (tconn->write_ordering) {
case WO_none:
if (rv == FE_RECYCLED)
- return true;
+ return 0;
/* receiver context, in the writeout path of the other node.
* avoid potential distributed deadlock */
@@ -1191,81 +1431,75 @@ static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsign
if (epoch)
break;
else
- dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
+ conn_warn(tconn, "Allocation of an epoch failed, slowing down\n");
/* Fall through */
case WO_bdev_flush:
case WO_drain_io:
- drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
- drbd_flush(mdev);
+ conn_wait_active_ee_empty(tconn);
+ drbd_flush(tconn);
- if (atomic_read(&mdev->current_epoch->epoch_size)) {
+ if (atomic_read(&tconn->current_epoch->epoch_size)) {
epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
if (epoch)
break;
}
- epoch = mdev->current_epoch;
- wait_event(mdev->ee_wait, atomic_read(&epoch->epoch_size) == 0);
-
- D_ASSERT(atomic_read(&epoch->active) == 0);
- D_ASSERT(epoch->flags == 0);
-
- return true;
+ return 0;
default:
- dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering);
- return false;
+ conn_err(tconn, "Strangeness in tconn->write_ordering %d\n", tconn->write_ordering);
+ return -EIO;
}
epoch->flags = 0;
atomic_set(&epoch->epoch_size, 0);
atomic_set(&epoch->active, 0);
- spin_lock(&mdev->epoch_lock);
- if (atomic_read(&mdev->current_epoch->epoch_size)) {
- list_add(&epoch->list, &mdev->current_epoch->list);
- mdev->current_epoch = epoch;
- mdev->epochs++;
+ spin_lock(&tconn->epoch_lock);
+ if (atomic_read(&tconn->current_epoch->epoch_size)) {
+ list_add(&epoch->list, &tconn->current_epoch->list);
+ tconn->current_epoch = epoch;
+ tconn->epochs++;
} else {
/* The current_epoch got recycled while we allocated this one... */
kfree(epoch);
}
- spin_unlock(&mdev->epoch_lock);
+ spin_unlock(&tconn->epoch_lock);
- return true;
+ return 0;
}
/* used from receive_RSDataReply (recv_resync_read)
* and from receive_Data */
-static struct drbd_epoch_entry *
-read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
+static struct drbd_peer_request *
+read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector,
+ int data_size) __must_hold(local)
{
const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
- struct drbd_epoch_entry *e;
+ struct drbd_peer_request *peer_req;
struct page *page;
- int dgs, ds, rr;
- void *dig_in = mdev->int_dig_in;
- void *dig_vv = mdev->int_dig_vv;
+ int dgs, ds, err;
+ void *dig_in = mdev->tconn->int_dig_in;
+ void *dig_vv = mdev->tconn->int_dig_vv;
unsigned long *data;
- dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
- crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
-
- if (dgs) {
- rr = drbd_recv(mdev, dig_in, dgs);
- if (rr != dgs) {
- if (!signal_pending(current))
- dev_warn(DEV,
- "short read receiving data digest: read %d expected %d\n",
- rr, dgs);
+ dgs = 0;
+ if (mdev->tconn->peer_integrity_tfm) {
+ dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm);
+ /*
+ * FIXME: Receive the incoming digest into the receive buffer
+ * here, together with its struct p_data?
+ */
+ err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs);
+ if (err)
return NULL;
- }
+ data_size -= dgs;
}
- data_size -= dgs;
-
- ERR_IF(data_size & 0x1ff) return NULL;
- ERR_IF(data_size > DRBD_MAX_BIO_SIZE) return NULL;
+ if (!expect(IS_ALIGNED(data_size, 512)))
+ return NULL;
+ if (!expect(data_size <= DRBD_MAX_BIO_SIZE))
+ return NULL;
/* even though we trust out peer,
* we sometimes have to double check. */
@@ -1280,47 +1514,42 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __
/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
- e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
- if (!e)
+ peer_req = drbd_alloc_peer_req(mdev, id, sector, data_size, GFP_NOIO);
+ if (!peer_req)
return NULL;
if (!data_size)
- return e;
+ return peer_req;
ds = data_size;
- page = e->pages;
+ page = peer_req->pages;
page_chain_for_each(page) {
unsigned len = min_t(int, ds, PAGE_SIZE);
data = kmap(page);
- rr = drbd_recv(mdev, data, len);
+ err = drbd_recv_all_warn(mdev->tconn, data, len);
if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) {
dev_err(DEV, "Fault injection: Corrupting data on receive\n");
data[0] = data[0] ^ (unsigned long)-1;
}
kunmap(page);
- if (rr != len) {
- drbd_free_ee(mdev, e);
- if (!signal_pending(current))
- dev_warn(DEV, "short read receiving data: read %d expected %d\n",
- rr, len);
+ if (err) {
+ drbd_free_peer_req(mdev, peer_req);
return NULL;
}
- ds -= rr;
+ ds -= len;
}
if (dgs) {
- drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv);
+ drbd_csum_ee(mdev, mdev->tconn->peer_integrity_tfm, peer_req, dig_vv);
if (memcmp(dig_in, dig_vv, dgs)) {
dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n",
(unsigned long long)sector, data_size);
- drbd_bcast_ee(mdev, "digest failed",
- dgs, dig_in, dig_vv, e);
- drbd_free_ee(mdev, e);
+ drbd_free_peer_req(mdev, peer_req);
return NULL;
}
}
mdev->recv_cnt += data_size>>9;
- return e;
+ return peer_req;
}
/* drbd_drain_block() just takes a data block
@@ -1329,30 +1558,26 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __
static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
{
struct page *page;
- int rr, rv = 1;
+ int err = 0;
void *data;
if (!data_size)
- return true;
+ return 0;
- page = drbd_pp_alloc(mdev, 1, 1);
+ page = drbd_alloc_pages(mdev, 1, 1);
data = kmap(page);
while (data_size) {
- rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE));
- if (rr != min_t(int, data_size, PAGE_SIZE)) {
- rv = 0;
- if (!signal_pending(current))
- dev_warn(DEV,
- "short read receiving data: read %d expected %d\n",
- rr, min_t(int, data_size, PAGE_SIZE));
+ unsigned int len = min_t(int, data_size, PAGE_SIZE);
+
+ err = drbd_recv_all_warn(mdev->tconn, data, len);
+ if (err)
break;
- }
- data_size -= rr;
+ data_size -= len;
}
kunmap(page);
- drbd_pp_free(mdev, page, 0);
- return rv;
+ drbd_free_pages(mdev, page, 0);
+ return err;
}
static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
@@ -1360,26 +1585,19 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
{
struct bio_vec *bvec;
struct bio *bio;
- int dgs, rr, i, expect;
- void *dig_in = mdev->int_dig_in;
- void *dig_vv = mdev->int_dig_vv;
-
- dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
- crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
+ int dgs, err, i, expect;
+ void *dig_in = mdev->tconn->int_dig_in;
+ void *dig_vv = mdev->tconn->int_dig_vv;
- if (dgs) {
- rr = drbd_recv(mdev, dig_in, dgs);
- if (rr != dgs) {
- if (!signal_pending(current))
- dev_warn(DEV,
- "short read receiving data reply digest: read %d expected %d\n",
- rr, dgs);
- return 0;
- }
+ dgs = 0;
+ if (mdev->tconn->peer_integrity_tfm) {
+ dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm);
+ err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs);
+ if (err)
+ return err;
+ data_size -= dgs;
}
- data_size -= dgs;
-
/* optimistically update recv_cnt. if receiving fails below,
* we disconnect anyways, and counters will be reset. */
mdev->recv_cnt += data_size>>9;
@@ -1388,63 +1606,61 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
D_ASSERT(sector == bio->bi_sector);
bio_for_each_segment(bvec, bio, i) {
+ void *mapped = kmap(bvec->bv_page) + bvec->bv_offset;
expect = min_t(int, data_size, bvec->bv_len);
- rr = drbd_recv(mdev,
- kmap(bvec->bv_page)+bvec->bv_offset,
- expect);
+ err = drbd_recv_all_warn(mdev->tconn, mapped, expect);
kunmap(bvec->bv_page);
- if (rr != expect) {
- if (!signal_pending(current))
- dev_warn(DEV, "short read receiving data reply: "
- "read %d expected %d\n",
- rr, expect);
- return 0;
- }
- data_size -= rr;
+ if (err)
+ return err;
+ data_size -= expect;
}
if (dgs) {
- drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv);
+ drbd_csum_bio(mdev, mdev->tconn->peer_integrity_tfm, bio, dig_vv);
if (memcmp(dig_in, dig_vv, dgs)) {
dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
- return 0;
+ return -EINVAL;
}
}
D_ASSERT(data_size == 0);
- return 1;
+ return 0;
}
-/* e_end_resync_block() is called via
- * drbd_process_done_ee() by asender only */
-static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+/*
+ * e_end_resync_block() is called in asender context via
+ * drbd_finish_peer_reqs().
+ */
+static int e_end_resync_block(struct drbd_work *w, int unused)
{
- struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
- sector_t sector = e->sector;
- int ok;
+ struct drbd_peer_request *peer_req =
+ container_of(w, struct drbd_peer_request, w);
+ struct drbd_conf *mdev = w->mdev;
+ sector_t sector = peer_req->i.sector;
+ int err;
- D_ASSERT(hlist_unhashed(&e->collision));
+ D_ASSERT(drbd_interval_empty(&peer_req->i));
- if (likely((e->flags & EE_WAS_ERROR) == 0)) {
- drbd_set_in_sync(mdev, sector, e->size);
- ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ drbd_set_in_sync(mdev, sector, peer_req->i.size);
+ err = drbd_send_ack(mdev, P_RS_WRITE_ACK, peer_req);
} else {
/* Record failure to sync */
- drbd_rs_failed_io(mdev, sector, e->size);
+ drbd_rs_failed_io(mdev, sector, peer_req->i.size);
- ok = drbd_send_ack(mdev, P_NEG_ACK, e);
+ err = drbd_send_ack(mdev, P_NEG_ACK, peer_req);
}
dec_unacked(mdev);
- return ok;
+ return err;
}
static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local)
{
- struct drbd_epoch_entry *e;
+ struct drbd_peer_request *peer_req;
- e = read_in_block(mdev, ID_SYNCER, sector, data_size);
- if (!e)
+ peer_req = read_in_block(mdev, ID_SYNCER, sector, data_size);
+ if (!peer_req)
goto fail;
dec_rs_pending(mdev);
@@ -1453,64 +1669,88 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si
/* corresponding dec_unacked() in e_end_resync_block()
* respective _drbd_clear_done_ee */
- e->w.cb = e_end_resync_block;
+ peer_req->w.cb = e_end_resync_block;
- spin_lock_irq(&mdev->req_lock);
- list_add(&e->w.list, &mdev->sync_ee);
- spin_unlock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ list_add(&peer_req->w.list, &mdev->sync_ee);
+ spin_unlock_irq(&mdev->tconn->req_lock);
atomic_add(data_size >> 9, &mdev->rs_sect_ev);
- if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
- return true;
+ if (drbd_submit_peer_request(mdev, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0)
+ return 0;
/* don't care for the reason here */
dev_err(DEV, "submit failed, triggering re-connect\n");
- spin_lock_irq(&mdev->req_lock);
- list_del(&e->w.list);
- spin_unlock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ list_del(&peer_req->w.list);
+ spin_unlock_irq(&mdev->tconn->req_lock);
- drbd_free_ee(mdev, e);
+ drbd_free_peer_req(mdev, peer_req);
fail:
put_ldev(mdev);
- return false;
+ return -EIO;
+}
+
+static struct drbd_request *
+find_request(struct drbd_conf *mdev, struct rb_root *root, u64 id,
+ sector_t sector, bool missing_ok, const char *func)
+{
+ struct drbd_request *req;
+
+ /* Request object according to our peer */
+ req = (struct drbd_request *)(unsigned long)id;
+ if (drbd_contains_interval(root, sector, &req->i) && req->i.local)
+ return req;
+ if (!missing_ok) {
+ dev_err(DEV, "%s: failed to find request 0x%lx, sector %llus\n", func,
+ (unsigned long)id, (unsigned long long)sector);
+ }
+ return NULL;
}
-static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_DataReply(struct drbd_tconn *tconn, struct packet_info *pi)
{
+ struct drbd_conf *mdev;
struct drbd_request *req;
sector_t sector;
- int ok;
- struct p_data *p = &mdev->data.rbuf.data;
+ int err;
+ struct p_data *p = pi->data;
+
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
sector = be64_to_cpu(p->sector);
- spin_lock_irq(&mdev->req_lock);
- req = _ar_id_to_req(mdev, p->block_id, sector);
- spin_unlock_irq(&mdev->req_lock);
- if (unlikely(!req)) {
- dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n");
- return false;
- }
+ spin_lock_irq(&mdev->tconn->req_lock);
+ req = find_request(mdev, &mdev->read_requests, p->block_id, sector, false, __func__);
+ spin_unlock_irq(&mdev->tconn->req_lock);
+ if (unlikely(!req))
+ return -EIO;
/* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
* special casing it there for the various failure cases.
* still no race with drbd_fail_pending_reads */
- ok = recv_dless_read(mdev, req, sector, data_size);
-
- if (ok)
- req_mod(req, data_received);
+ err = recv_dless_read(mdev, req, sector, pi->size);
+ if (!err)
+ req_mod(req, DATA_RECEIVED);
/* else: nothing. handled from drbd_disconnect...
* I don't think we may complete this just yet
* in case we are "on-disconnect: freeze" */
- return ok;
+ return err;
}
-static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_RSDataReply(struct drbd_tconn *tconn, struct packet_info *pi)
{
+ struct drbd_conf *mdev;
sector_t sector;
- int ok;
- struct p_data *p = &mdev->data.rbuf.data;
+ int err;
+ struct p_data *p = pi->data;
+
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
sector = be64_to_cpu(p->sector);
D_ASSERT(p->block_id == ID_SYNCER);
@@ -1518,42 +1758,63 @@ static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, un
if (get_ldev(mdev)) {
/* data is submitted to disk within recv_resync_read.
* corresponding put_ldev done below on error,
- * or in drbd_endio_write_sec. */
- ok = recv_resync_read(mdev, sector, data_size);
+ * or in drbd_peer_request_endio. */
+ err = recv_resync_read(mdev, sector, pi->size);
} else {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Can not write resync data to local disk.\n");
- ok = drbd_drain_block(mdev, data_size);
+ err = drbd_drain_block(mdev, pi->size);
- drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
+ drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size);
}
- atomic_add(data_size >> 9, &mdev->rs_sect_in);
+ atomic_add(pi->size >> 9, &mdev->rs_sect_in);
- return ok;
+ return err;
}
-/* e_end_block() is called via drbd_process_done_ee().
- * this means this function only runs in the asender thread
- */
-static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+static void restart_conflicting_writes(struct drbd_conf *mdev,
+ sector_t sector, int size)
{
- struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
- sector_t sector = e->sector;
- int ok = 1, pcmd;
+ struct drbd_interval *i;
+ struct drbd_request *req;
- if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
- if (likely((e->flags & EE_WAS_ERROR) == 0)) {
+ drbd_for_each_overlap(i, &mdev->write_requests, sector, size) {
+ if (!i->local)
+ continue;
+ req = container_of(i, struct drbd_request, i);
+ if (req->rq_state & RQ_LOCAL_PENDING ||
+ !(req->rq_state & RQ_POSTPONED))
+ continue;
+ /* as it is RQ_POSTPONED, this will cause it to
+ * be queued on the retry workqueue. */
+ __req_mod(req, CONFLICT_RESOLVED, NULL);
+ }
+}
+
+/*
+ * e_end_block() is called in asender context via drbd_finish_peer_reqs().
+ */
+static int e_end_block(struct drbd_work *w, int cancel)
+{
+ struct drbd_peer_request *peer_req =
+ container_of(w, struct drbd_peer_request, w);
+ struct drbd_conf *mdev = w->mdev;
+ sector_t sector = peer_req->i.sector;
+ int err = 0, pcmd;
+
+ if (peer_req->flags & EE_SEND_WRITE_ACK) {
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
mdev->state.conn <= C_PAUSED_SYNC_T &&
- e->flags & EE_MAY_SET_IN_SYNC) ?
+ peer_req->flags & EE_MAY_SET_IN_SYNC) ?
P_RS_WRITE_ACK : P_WRITE_ACK;
- ok &= drbd_send_ack(mdev, pcmd, e);
+ err = drbd_send_ack(mdev, pcmd, peer_req);
if (pcmd == P_RS_WRITE_ACK)
- drbd_set_in_sync(mdev, sector, e->size);
+ drbd_set_in_sync(mdev, sector, peer_req->i.size);
} else {
- ok = drbd_send_ack(mdev, P_NEG_ACK, e);
+ err = drbd_send_ack(mdev, P_NEG_ACK, peer_req);
/* we expect it to be marked out of sync anyways...
* maybe assert this? */
}
@@ -1561,52 +1822,115 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
}
/* we delete from the conflict detection hash _after_ we sent out the
* P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
- if (mdev->net_conf->two_primaries) {
- spin_lock_irq(&mdev->req_lock);
- D_ASSERT(!hlist_unhashed(&e->collision));
- hlist_del_init(&e->collision);
- spin_unlock_irq(&mdev->req_lock);
- } else {
- D_ASSERT(hlist_unhashed(&e->collision));
- }
+ if (peer_req->flags & EE_IN_INTERVAL_TREE) {
+ spin_lock_irq(&mdev->tconn->req_lock);
+ D_ASSERT(!drbd_interval_empty(&peer_req->i));
+ drbd_remove_epoch_entry_interval(mdev, peer_req);
+ if (peer_req->flags & EE_RESTART_REQUESTS)
+ restart_conflicting_writes(mdev, sector, peer_req->i.size);
+ spin_unlock_irq(&mdev->tconn->req_lock);
+ } else
+ D_ASSERT(drbd_interval_empty(&peer_req->i));
- drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
+ drbd_may_finish_epoch(mdev->tconn, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
- return ok;
+ return err;
}
-static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
{
- struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
- int ok = 1;
+ struct drbd_conf *mdev = w->mdev;
+ struct drbd_peer_request *peer_req =
+ container_of(w, struct drbd_peer_request, w);
+ int err;
- D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
- ok = drbd_send_ack(mdev, P_DISCARD_ACK, e);
+ err = drbd_send_ack(mdev, ack, peer_req);
+ dec_unacked(mdev);
- spin_lock_irq(&mdev->req_lock);
- D_ASSERT(!hlist_unhashed(&e->collision));
- hlist_del_init(&e->collision);
- spin_unlock_irq(&mdev->req_lock);
+ return err;
+}
- dec_unacked(mdev);
+static int e_send_superseded(struct drbd_work *w, int unused)
+{
+ return e_send_ack(w, P_SUPERSEDED);
+}
+
+static int e_send_retry_write(struct drbd_work *w, int unused)
+{
+ struct drbd_tconn *tconn = w->mdev->tconn;
+
+ return e_send_ack(w, tconn->agreed_pro_version >= 100 ?
+ P_RETRY_WRITE : P_SUPERSEDED);
+}
+
+static bool seq_greater(u32 a, u32 b)
+{
+ /*
+ * We assume 32-bit wrap-around here.
+ * For 24-bit wrap-around, we would have to shift:
+ * a <<= 8; b <<= 8;
+ */
+ return (s32)a - (s32)b > 0;
+}
+
+static u32 seq_max(u32 a, u32 b)
+{
+ return seq_greater(a, b) ? a : b;
+}
+
+static bool need_peer_seq(struct drbd_conf *mdev)
+{
+ struct drbd_tconn *tconn = mdev->tconn;
+ int tp;
+
+ /*
+ * We only need to keep track of the last packet_seq number of our peer
+ * if we are in dual-primary mode and we have the resolve-conflicts flag set; see
+ * handle_write_conflicts().
+ */
+
+ rcu_read_lock();
+ tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries;
+ rcu_read_unlock();
+
+ return tp && test_bit(RESOLVE_CONFLICTS, &tconn->flags);
+}
+
+static void update_peer_seq(struct drbd_conf *mdev, unsigned int peer_seq)
+{
+ unsigned int newest_peer_seq;
- return ok;
+ if (need_peer_seq(mdev)) {
+ spin_lock(&mdev->peer_seq_lock);
+ newest_peer_seq = seq_max(mdev->peer_seq, peer_seq);
+ mdev->peer_seq = newest_peer_seq;
+ spin_unlock(&mdev->peer_seq_lock);
+ /* wake up only if we actually changed mdev->peer_seq */
+ if (peer_seq == newest_peer_seq)
+ wake_up(&mdev->seq_wait);
+ }
}
-static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_entry *data_e)
+static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
{
+ return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
+}
- struct drbd_epoch_entry *rs_e;
+/* maybe change sync_ee into interval trees as well? */
+static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_peer_request *peer_req)
+{
+ struct drbd_peer_request *rs_req;
bool rv = 0;
- spin_lock_irq(&mdev->req_lock);
- list_for_each_entry(rs_e, &mdev->sync_ee, w.list) {
- if (overlaps(data_e->sector, data_e->size, rs_e->sector, rs_e->size)) {
+ spin_lock_irq(&mdev->tconn->req_lock);
+ list_for_each_entry(rs_req, &mdev->sync_ee, w.list) {
+ if (overlaps(peer_req->i.sector, peer_req->i.size,
+ rs_req->i.sector, rs_req->i.size)) {
rv = 1;
break;
}
}
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
return rv;
}
@@ -1632,35 +1956,41 @@ static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_e
*
* returns 0 if we may process the packet,
* -ERESTARTSYS if we were interrupted (by disconnect signal). */
-static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
+static int wait_for_and_update_peer_seq(struct drbd_conf *mdev, const u32 peer_seq)
{
DEFINE_WAIT(wait);
- unsigned int p_seq;
long timeout;
- int ret = 0;
+ int ret;
+
+ if (!need_peer_seq(mdev))
+ return 0;
+
spin_lock(&mdev->peer_seq_lock);
for (;;) {
- prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
- if (seq_le(packet_seq, mdev->peer_seq+1))
+ if (!seq_greater(peer_seq - 1, mdev->peer_seq)) {
+ mdev->peer_seq = seq_max(mdev->peer_seq, peer_seq);
+ ret = 0;
break;
+ }
if (signal_pending(current)) {
ret = -ERESTARTSYS;
break;
}
- p_seq = mdev->peer_seq;
+ prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
spin_unlock(&mdev->peer_seq_lock);
- timeout = schedule_timeout(30*HZ);
+ rcu_read_lock();
+ timeout = rcu_dereference(mdev->tconn->net_conf)->ping_timeo*HZ/10;
+ rcu_read_unlock();
+ timeout = schedule_timeout(timeout);
spin_lock(&mdev->peer_seq_lock);
- if (timeout == 0 && p_seq == mdev->peer_seq) {
+ if (!timeout) {
ret = -ETIMEDOUT;
- dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n");
+ dev_err(DEV, "Timed out waiting for missing ack packets; disconnecting\n");
break;
}
}
- finish_wait(&mdev->seq_wait, &wait);
- if (mdev->peer_seq+1 == packet_seq)
- mdev->peer_seq++;
spin_unlock(&mdev->peer_seq_lock);
+ finish_wait(&mdev->seq_wait, &wait);
return ret;
}
@@ -1675,233 +2005,277 @@ static unsigned long wire_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
(dpf & DP_DISCARD ? REQ_DISCARD : 0);
}
+static void fail_postponed_requests(struct drbd_conf *mdev, sector_t sector,
+ unsigned int size)
+{
+ struct drbd_interval *i;
+
+ repeat:
+ drbd_for_each_overlap(i, &mdev->write_requests, sector, size) {
+ struct drbd_request *req;
+ struct bio_and_error m;
+
+ if (!i->local)
+ continue;
+ req = container_of(i, struct drbd_request, i);
+ if (!(req->rq_state & RQ_POSTPONED))
+ continue;
+ req->rq_state &= ~RQ_POSTPONED;
+ __req_mod(req, NEG_ACKED, &m);
+ spin_unlock_irq(&mdev->tconn->req_lock);
+ if (m.bio)
+ complete_master_bio(mdev, &m);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ goto repeat;
+ }
+}
+
+static int handle_write_conflicts(struct drbd_conf *mdev,
+ struct drbd_peer_request *peer_req)
+{
+ struct drbd_tconn *tconn = mdev->tconn;
+ bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &tconn->flags);
+ sector_t sector = peer_req->i.sector;
+ const unsigned int size = peer_req->i.size;
+ struct drbd_interval *i;
+ bool equal;
+ int err;
+
+ /*
+ * Inserting the peer request into the write_requests tree will prevent
+ * new conflicting local requests from being added.
+ */
+ drbd_insert_interval(&mdev->write_requests, &peer_req->i);
+
+ repeat:
+ drbd_for_each_overlap(i, &mdev->write_requests, sector, size) {
+ if (i == &peer_req->i)
+ continue;
+
+ if (!i->local) {
+ /*
+ * Our peer has sent a conflicting remote request; this
+ * should not happen in a two-node setup. Wait for the
+ * earlier peer request to complete.
+ */
+ err = drbd_wait_misc(mdev, i);
+ if (err)
+ goto out;
+ goto repeat;
+ }
+
+ equal = i->sector == sector && i->size == size;
+ if (resolve_conflicts) {
+ /*
+ * If the peer request is fully contained within the
+ * overlapping request, it can be considered overwritten
+ * and thus superseded; otherwise, it will be retried
+ * once all overlapping requests have completed.
+ */
+ bool superseded = i->sector <= sector && i->sector +
+ (i->size >> 9) >= sector + (size >> 9);
+
+ if (!equal)
+ dev_alert(DEV, "Concurrent writes detected: "
+ "local=%llus +%u, remote=%llus +%u, "
+ "assuming %s came first\n",
+ (unsigned long long)i->sector, i->size,
+ (unsigned long long)sector, size,
+ superseded ? "local" : "remote");
+
+ inc_unacked(mdev);
+ peer_req->w.cb = superseded ? e_send_superseded :
+ e_send_retry_write;
+ list_add_tail(&peer_req->w.list, &mdev->done_ee);
+ wake_asender(mdev->tconn);
+
+ err = -ENOENT;
+ goto out;
+ } else {
+ struct drbd_request *req =
+ container_of(i, struct drbd_request, i);
+
+ if (!equal)
+ dev_alert(DEV, "Concurrent writes detected: "
+ "local=%llus +%u, remote=%llus +%u\n",
+ (unsigned long long)i->sector, i->size,
+ (unsigned long long)sector, size);
+
+ if (req->rq_state & RQ_LOCAL_PENDING ||
+ !(req->rq_state & RQ_POSTPONED)) {
+ /*
+ * Wait for the node with the discard flag to
+ * decide if this request has been superseded
+ * or needs to be retried.
+ * Requests that have been superseded will
+ * disappear from the write_requests tree.
+ *
+ * In addition, wait for the conflicting
+ * request to finish locally before submitting
+ * the conflicting peer request.
+ */
+ err = drbd_wait_misc(mdev, &req->i);
+ if (err) {
+ _conn_request_state(mdev->tconn,
+ NS(conn, C_TIMEOUT),
+ CS_HARD);
+ fail_postponed_requests(mdev, sector, size);
+ goto out;
+ }
+ goto repeat;
+ }
+ /*
+ * Remember to restart the conflicting requests after
+ * the new peer request has completed.
+ */
+ peer_req->flags |= EE_RESTART_REQUESTS;
+ }
+ }
+ err = 0;
+
+ out:
+ if (err)
+ drbd_remove_epoch_entry_interval(mdev, peer_req);
+ return err;
+}
+
/* mirrored write */
-static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi)
{
+ struct drbd_conf *mdev;
sector_t sector;
- struct drbd_epoch_entry *e;
- struct p_data *p = &mdev->data.rbuf.data;
+ struct drbd_peer_request *peer_req;
+ struct p_data *p = pi->data;
+ u32 peer_seq = be32_to_cpu(p->seq_num);
int rw = WRITE;
u32 dp_flags;
+ int err, tp;
- if (!get_ldev(mdev)) {
- spin_lock(&mdev->peer_seq_lock);
- if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num))
- mdev->peer_seq++;
- spin_unlock(&mdev->peer_seq_lock);
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
- drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
- atomic_inc(&mdev->current_epoch->epoch_size);
- return drbd_drain_block(mdev, data_size);
+ if (!get_ldev(mdev)) {
+ int err2;
+
+ err = wait_for_and_update_peer_seq(mdev, peer_seq);
+ drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size);
+ atomic_inc(&tconn->current_epoch->epoch_size);
+ err2 = drbd_drain_block(mdev, pi->size);
+ if (!err)
+ err = err2;
+ return err;
}
- /* get_ldev(mdev) successful.
- * Corresponding put_ldev done either below (on various errors),
- * or in drbd_endio_write_sec, if we successfully submit the data at
- * the end of this function. */
+ /*
+ * Corresponding put_ldev done either below (on various errors), or in
+ * drbd_peer_request_endio, if we successfully submit the data at the
+ * end of this function.
+ */
sector = be64_to_cpu(p->sector);
- e = read_in_block(mdev, p->block_id, sector, data_size);
- if (!e) {
+ peer_req = read_in_block(mdev, p->block_id, sector, pi->size);
+ if (!peer_req) {
put_ldev(mdev);
- return false;
+ return -EIO;
}
- e->w.cb = e_end_block;
+ peer_req->w.cb = e_end_block;
dp_flags = be32_to_cpu(p->dp_flags);
rw |= wire_flags_to_bio(mdev, dp_flags);
- if (e->pages == NULL) {
- D_ASSERT(e->size == 0);
+ if (peer_req->pages == NULL) {
+ D_ASSERT(peer_req->i.size == 0);
D_ASSERT(dp_flags & DP_FLUSH);
}
if (dp_flags & DP_MAY_SET_IN_SYNC)
- e->flags |= EE_MAY_SET_IN_SYNC;
-
- spin_lock(&mdev->epoch_lock);
- e->epoch = mdev->current_epoch;
- atomic_inc(&e->epoch->epoch_size);
- atomic_inc(&e->epoch->active);
- spin_unlock(&mdev->epoch_lock);
-
- /* I'm the receiver, I do hold a net_cnt reference. */
- if (!mdev->net_conf->two_primaries) {
- spin_lock_irq(&mdev->req_lock);
- } else {
- /* don't get the req_lock yet,
- * we may sleep in drbd_wait_peer_seq */
- const int size = e->size;
- const int discard = drbd_test_flag(mdev, DISCARD_CONCURRENT);
- DEFINE_WAIT(wait);
- struct drbd_request *i;
- struct hlist_node *n;
- struct hlist_head *slot;
- int first;
-
- D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
- BUG_ON(mdev->ee_hash == NULL);
- BUG_ON(mdev->tl_hash == NULL);
-
- /* conflict detection and handling:
- * 1. wait on the sequence number,
- * in case this data packet overtook ACK packets.
- * 2. check our hash tables for conflicting requests.
- * we only need to walk the tl_hash, since an ee can not
- * have a conflict with an other ee: on the submitting
- * node, the corresponding req had already been conflicting,
- * and a conflicting req is never sent.
- *
- * Note: for two_primaries, we are protocol C,
- * so there cannot be any request that is DONE
- * but still on the transfer log.
- *
- * unconditionally add to the ee_hash.
- *
- * if no conflicting request is found:
- * submit.
- *
- * if any conflicting request is found
- * that has not yet been acked,
- * AND I have the "discard concurrent writes" flag:
- * queue (via done_ee) the P_DISCARD_ACK; OUT.
- *
- * if any conflicting request is found:
- * block the receiver, waiting on misc_wait
- * until no more conflicting requests are there,
- * or we get interrupted (disconnect).
- *
- * we do not just write after local io completion of those
- * requests, but only after req is done completely, i.e.
- * we wait for the P_DISCARD_ACK to arrive!
- *
- * then proceed normally, i.e. submit.
- */
- if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num)))
+ peer_req->flags |= EE_MAY_SET_IN_SYNC;
+
+ spin_lock(&tconn->epoch_lock);
+ peer_req->epoch = tconn->current_epoch;
+ atomic_inc(&peer_req->epoch->epoch_size);
+ atomic_inc(&peer_req->epoch->active);
+ spin_unlock(&tconn->epoch_lock);
+
+ rcu_read_lock();
+ tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries;
+ rcu_read_unlock();
+ if (tp) {
+ peer_req->flags |= EE_IN_INTERVAL_TREE;
+ err = wait_for_and_update_peer_seq(mdev, peer_seq);
+ if (err)
goto out_interrupted;
-
- spin_lock_irq(&mdev->req_lock);
-
- hlist_add_head(&e->collision, ee_hash_slot(mdev, sector));
-
-#define OVERLAPS overlaps(i->sector, i->size, sector, size)
- slot = tl_hash_slot(mdev, sector);
- first = 1;
- for (;;) {
- int have_unacked = 0;
- int have_conflict = 0;
- prepare_to_wait(&mdev->misc_wait, &wait,
- TASK_INTERRUPTIBLE);
- hlist_for_each_entry(i, n, slot, collision) {
- if (OVERLAPS) {
- /* only ALERT on first iteration,
- * we may be woken up early... */
- if (first)
- dev_alert(DEV, "%s[%u] Concurrent local write detected!"
- " new: %llus +%u; pending: %llus +%u\n",
- current->comm, current->pid,
- (unsigned long long)sector, size,
- (unsigned long long)i->sector, i->size);
- if (i->rq_state & RQ_NET_PENDING)
- ++have_unacked;
- ++have_conflict;
- }
- }
-#undef OVERLAPS
- if (!have_conflict)
- break;
-
- /* Discard Ack only for the _first_ iteration */
- if (first && discard && have_unacked) {
- dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n",
- (unsigned long long)sector);
- inc_unacked(mdev);
- e->w.cb = e_send_discard_ack;
- list_add_tail(&e->w.list, &mdev->done_ee);
-
- spin_unlock_irq(&mdev->req_lock);
-
- /* we could probably send that P_DISCARD_ACK ourselves,
- * but I don't like the receiver using the msock */
-
+ spin_lock_irq(&mdev->tconn->req_lock);
+ err = handle_write_conflicts(mdev, peer_req);
+ if (err) {
+ spin_unlock_irq(&mdev->tconn->req_lock);
+ if (err == -ENOENT) {
put_ldev(mdev);
- wake_asender(mdev);
- finish_wait(&mdev->misc_wait, &wait);
- return true;
+ return 0;
}
+ goto out_interrupted;
+ }
+ } else
+ spin_lock_irq(&mdev->tconn->req_lock);
+ list_add(&peer_req->w.list, &mdev->active_ee);
+ spin_unlock_irq(&mdev->tconn->req_lock);
- if (signal_pending(current)) {
- hlist_del_init(&e->collision);
-
- spin_unlock_irq(&mdev->req_lock);
-
- finish_wait(&mdev->misc_wait, &wait);
- goto out_interrupted;
- }
+ if (mdev->state.conn == C_SYNC_TARGET)
+ wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, peer_req));
- spin_unlock_irq(&mdev->req_lock);
- if (first) {
- first = 0;
- dev_alert(DEV, "Concurrent write! [W AFTERWARDS] "
- "sec=%llus\n", (unsigned long long)sector);
- } else if (discard) {
- /* we had none on the first iteration.
- * there must be none now. */
- D_ASSERT(have_unacked == 0);
- }
- schedule();
- spin_lock_irq(&mdev->req_lock);
+ if (mdev->tconn->agreed_pro_version < 100) {
+ rcu_read_lock();
+ switch (rcu_dereference(mdev->tconn->net_conf)->wire_protocol) {
+ case DRBD_PROT_C:
+ dp_flags |= DP_SEND_WRITE_ACK;
+ break;
+ case DRBD_PROT_B:
+ dp_flags |= DP_SEND_RECEIVE_ACK;
+ break;
}
- finish_wait(&mdev->misc_wait, &wait);
+ rcu_read_unlock();
}
- list_add(&e->w.list, &mdev->active_ee);
- spin_unlock_irq(&mdev->req_lock);
-
- if (mdev->state.conn == C_SYNC_TARGET)
- wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, e));
-
- switch (mdev->net_conf->wire_protocol) {
- case DRBD_PROT_C:
+ if (dp_flags & DP_SEND_WRITE_ACK) {
+ peer_req->flags |= EE_SEND_WRITE_ACK;
inc_unacked(mdev);
/* corresponding dec_unacked() in e_end_block()
* respective _drbd_clear_done_ee */
- break;
- case DRBD_PROT_B:
+ }
+
+ if (dp_flags & DP_SEND_RECEIVE_ACK) {
/* I really don't like it that the receiver thread
* sends on the msock, but anyways */
- drbd_send_ack(mdev, P_RECV_ACK, e);
- break;
- case DRBD_PROT_A:
- /* nothing to do */
- break;
+ drbd_send_ack(mdev, P_RECV_ACK, peer_req);
}
if (mdev->state.pdsk < D_INCONSISTENT) {
/* In case we have the only disk of the cluster, */
- drbd_set_out_of_sync(mdev, e->sector, e->size);
- e->flags |= EE_CALL_AL_COMPLETE_IO;
- e->flags &= ~EE_MAY_SET_IN_SYNC;
- drbd_al_begin_io(mdev, e->sector);
+ drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size);
+ peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
+ peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
+ drbd_al_begin_io(mdev, &peer_req->i);
}
- if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
- return true;
+ err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR);
+ if (!err)
+ return 0;
/* don't care for the reason here */
dev_err(DEV, "submit failed, triggering re-connect\n");
- spin_lock_irq(&mdev->req_lock);
- list_del(&e->w.list);
- hlist_del_init(&e->collision);
- spin_unlock_irq(&mdev->req_lock);
- if (e->flags & EE_CALL_AL_COMPLETE_IO)
- drbd_al_complete_io(mdev, e->sector);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ list_del(&peer_req->w.list);
+ drbd_remove_epoch_entry_interval(mdev, peer_req);
+ spin_unlock_irq(&mdev->tconn->req_lock);
+ if (peer_req->flags & EE_CALL_AL_COMPLETE_IO)
+ drbd_al_complete_io(mdev, &peer_req->i);
out_interrupted:
- drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + EV_CLEANUP);
+ drbd_may_finish_epoch(tconn, peer_req->epoch, EV_PUT + EV_CLEANUP);
put_ldev(mdev);
- drbd_free_ee(mdev, e);
- return false;
+ drbd_free_peer_req(mdev, peer_req);
+ return err;
}
/* We may throttle resync, if the lower device seems to be busy,
@@ -1922,9 +2296,14 @@ int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector)
struct lc_element *tmp;
int curr_events;
int throttle = 0;
+ unsigned int c_min_rate;
+
+ rcu_read_lock();
+ c_min_rate = rcu_dereference(mdev->ldev->disk_conf)->c_min_rate;
+ rcu_read_unlock();
/* feature disabled? */
- if (mdev->sync_conf.c_min_rate == 0)
+ if (c_min_rate == 0)
return 0;
spin_lock_irq(&mdev->al_lock);
@@ -1964,40 +2343,46 @@ int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector)
db = mdev->rs_mark_left[i] - rs_left;
dbdt = Bit2KB(db/dt);
- if (dbdt > mdev->sync_conf.c_min_rate)
+ if (dbdt > c_min_rate)
throttle = 1;
}
return throttle;
}
-static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size)
+static int receive_DataRequest(struct drbd_tconn *tconn, struct packet_info *pi)
{
+ struct drbd_conf *mdev;
sector_t sector;
- const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
- struct drbd_epoch_entry *e;
+ sector_t capacity;
+ struct drbd_peer_request *peer_req;
struct digest_info *di = NULL;
int size, verb;
unsigned int fault_type;
- struct p_block_req *p = &mdev->data.rbuf.block_req;
+ struct p_block_req *p = pi->data;
+
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
+ capacity = drbd_get_capacity(mdev->this_bdev);
sector = be64_to_cpu(p->sector);
size = be32_to_cpu(p->blksize);
- if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
+ if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
(unsigned long long)sector, size);
- return false;
+ return -EINVAL;
}
if (sector + (size>>9) > capacity) {
dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
(unsigned long long)sector, size);
- return false;
+ return -EINVAL;
}
if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
verb = 1;
- switch (cmd) {
+ switch (pi->cmd) {
case P_DATA_REQUEST:
drbd_send_ack_rp(mdev, P_NEG_DREPLY, p);
break;
@@ -2012,35 +2397,34 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un
drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC);
break;
default:
- dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
- cmdname(cmd));
+ BUG();
}
if (verb && __ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Can not satisfy peer's read request, "
"no local data.\n");
/* drain possibly payload */
- return drbd_drain_block(mdev, digest_size);
+ return drbd_drain_block(mdev, pi->size);
}
/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
- e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
- if (!e) {
+ peer_req = drbd_alloc_peer_req(mdev, p->block_id, sector, size, GFP_NOIO);
+ if (!peer_req) {
put_ldev(mdev);
- return false;
+ return -ENOMEM;
}
- switch (cmd) {
+ switch (pi->cmd) {
case P_DATA_REQUEST:
- e->w.cb = w_e_end_data_req;
+ peer_req->w.cb = w_e_end_data_req;
fault_type = DRBD_FAULT_DT_RD;
/* application IO, don't drbd_rs_begin_io */
goto submit;
case P_RS_DATA_REQUEST:
- e->w.cb = w_e_end_rsdata_req;
+ peer_req->w.cb = w_e_end_rsdata_req;
fault_type = DRBD_FAULT_RS_RD;
/* used in the sector offset progress display */
mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
@@ -2049,28 +2433,28 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un
case P_OV_REPLY:
case P_CSUM_RS_REQUEST:
fault_type = DRBD_FAULT_RS_RD;
- di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
+ di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
if (!di)
goto out_free_e;
- di->digest_size = digest_size;
+ di->digest_size = pi->size;
di->digest = (((char *)di)+sizeof(struct digest_info));
- e->digest = di;
- e->flags |= EE_HAS_DIGEST;
+ peer_req->digest = di;
+ peer_req->flags |= EE_HAS_DIGEST;
- if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
+ if (drbd_recv_all(mdev->tconn, di->digest, pi->size))
goto out_free_e;
- if (cmd == P_CSUM_RS_REQUEST) {
- D_ASSERT(mdev->agreed_pro_version >= 89);
- e->w.cb = w_e_end_csum_rs_req;
+ if (pi->cmd == P_CSUM_RS_REQUEST) {
+ D_ASSERT(mdev->tconn->agreed_pro_version >= 89);
+ peer_req->w.cb = w_e_end_csum_rs_req;
/* used in the sector offset progress display */
mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
- } else if (cmd == P_OV_REPLY) {
+ } else if (pi->cmd == P_OV_REPLY) {
/* track progress, we may need to throttle */
atomic_add(size >> 9, &mdev->rs_sect_in);
- e->w.cb = w_e_end_ov_reply;
+ peer_req->w.cb = w_e_end_ov_reply;
dec_rs_pending(mdev);
/* drbd_rs_begin_io done when we sent this request,
* but accounting still needs to be done. */
@@ -2080,7 +2464,7 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un
case P_OV_REQUEST:
if (mdev->ov_start_sector == ~(sector_t)0 &&
- mdev->agreed_pro_version >= 90) {
+ mdev->tconn->agreed_pro_version >= 90) {
unsigned long now = jiffies;
int i;
mdev->ov_start_sector = sector;
@@ -2094,15 +2478,12 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un
dev_info(DEV, "Online Verify start sector: %llu\n",
(unsigned long long)sector);
}
- e->w.cb = w_e_end_ov_req;
+ peer_req->w.cb = w_e_end_ov_req;
fault_type = DRBD_FAULT_RS_RD;
break;
default:
- dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
- cmdname(cmd));
- fault_type = DRBD_FAULT_MAX;
- goto out_free_e;
+ BUG();
}
/* Throttle, drbd_rs_begin_io and submit should become asynchronous
@@ -2137,30 +2518,31 @@ submit_for_resync:
submit:
inc_unacked(mdev);
- spin_lock_irq(&mdev->req_lock);
- list_add_tail(&e->w.list, &mdev->read_ee);
- spin_unlock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ list_add_tail(&peer_req->w.list, &mdev->read_ee);
+ spin_unlock_irq(&mdev->tconn->req_lock);
- if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
- return true;
+ if (drbd_submit_peer_request(mdev, peer_req, READ, fault_type) == 0)
+ return 0;
/* don't care for the reason here */
dev_err(DEV, "submit failed, triggering re-connect\n");
- spin_lock_irq(&mdev->req_lock);
- list_del(&e->w.list);
- spin_unlock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ list_del(&peer_req->w.list);
+ spin_unlock_irq(&mdev->tconn->req_lock);
/* no drbd_rs_complete_io(), we are dropping the connection anyways */
out_free_e:
put_ldev(mdev);
- drbd_free_ee(mdev, e);
- return false;
+ drbd_free_peer_req(mdev, peer_req);
+ return -EIO;
}
static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
{
int self, peer, rv = -100;
unsigned long ch_self, ch_peer;
+ enum drbd_after_sb_p after_sb_0p;
self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
peer = mdev->p_uuid[UI_BITMAP] & 1;
@@ -2168,10 +2550,14 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
ch_peer = mdev->p_uuid[UI_SIZE];
ch_self = mdev->comm_bm_set;
- switch (mdev->net_conf->after_sb_0p) {
+ rcu_read_lock();
+ after_sb_0p = rcu_dereference(mdev->tconn->net_conf)->after_sb_0p;
+ rcu_read_unlock();
+ switch (after_sb_0p) {
case ASB_CONSENSUS:
case ASB_DISCARD_SECONDARY:
case ASB_CALL_HELPER:
+ case ASB_VIOLENTLY:
dev_err(DEV, "Configuration error.\n");
break;
case ASB_DISCONNECT:
@@ -2200,14 +2586,14 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
"Using discard-least-changes instead\n");
case ASB_DISCARD_ZERO_CHG:
if (ch_peer == 0 && ch_self == 0) {
- rv = drbd_test_flag(mdev, DISCARD_CONCURRENT)
+ rv = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags)
? -1 : 1;
break;
} else {
if (ch_peer == 0) { rv = 1; break; }
if (ch_self == 0) { rv = -1; break; }
}
- if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG)
+ if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
break;
case ASB_DISCARD_LEAST_CHG:
if (ch_self < ch_peer)
@@ -2216,7 +2602,7 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
rv = 1;
else /* ( ch_self == ch_peer ) */
/* Well, then use something else. */
- rv = drbd_test_flag(mdev, DISCARD_CONCURRENT)
+ rv = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags)
? -1 : 1;
break;
case ASB_DISCARD_LOCAL:
@@ -2232,13 +2618,18 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
{
int hg, rv = -100;
+ enum drbd_after_sb_p after_sb_1p;
- switch (mdev->net_conf->after_sb_1p) {
+ rcu_read_lock();
+ after_sb_1p = rcu_dereference(mdev->tconn->net_conf)->after_sb_1p;
+ rcu_read_unlock();
+ switch (after_sb_1p) {
case ASB_DISCARD_YOUNGER_PRI:
case ASB_DISCARD_OLDER_PRI:
case ASB_DISCARD_LEAST_CHG:
case ASB_DISCARD_LOCAL:
case ASB_DISCARD_REMOTE:
+ case ASB_DISCARD_ZERO_CHG:
dev_err(DEV, "Configuration error.\n");
break;
case ASB_DISCONNECT:
@@ -2281,8 +2672,12 @@ static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
{
int hg, rv = -100;
+ enum drbd_after_sb_p after_sb_2p;
- switch (mdev->net_conf->after_sb_2p) {
+ rcu_read_lock();
+ after_sb_2p = rcu_dereference(mdev->tconn->net_conf)->after_sb_2p;
+ rcu_read_unlock();
+ switch (after_sb_2p) {
case ASB_DISCARD_YOUNGER_PRI:
case ASB_DISCARD_OLDER_PRI:
case ASB_DISCARD_LEAST_CHG:
@@ -2290,6 +2685,7 @@ static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
case ASB_DISCARD_REMOTE:
case ASB_CONSENSUS:
case ASB_DISCARD_SECONDARY:
+ case ASB_DISCARD_ZERO_CHG:
dev_err(DEV, "Configuration error.\n");
break;
case ASB_VIOLENTLY:
@@ -2375,7 +2771,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
- if (mdev->agreed_pro_version < 91)
+ if (mdev->tconn->agreed_pro_version < 91)
return -1091;
if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
@@ -2398,7 +2794,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
- if (mdev->agreed_pro_version < 91)
+ if (mdev->tconn->agreed_pro_version < 91)
return -1091;
if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
@@ -2420,7 +2816,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
}
/* Common power [off|failure] */
- rct = (drbd_test_flag(mdev, CRASHED_PRIMARY) ? 1 : 0) +
+ rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
(mdev->p_uuid[UI_FLAGS] & 2);
/* lowest bit is set when we were primary,
* next bit (weight 2) is set when peer was primary */
@@ -2431,7 +2827,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
case 1: /* self_pri && !peer_pri */ return 1;
case 2: /* !self_pri && peer_pri */ return -1;
case 3: /* self_pri && peer_pri */
- dc = drbd_test_flag(mdev, DISCARD_CONCURRENT);
+ dc = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags);
return dc ? -1 : 1;
}
}
@@ -2444,14 +2840,14 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
*rule_nr = 51;
peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
if (self == peer) {
- if (mdev->agreed_pro_version < 96 ?
+ if (mdev->tconn->agreed_pro_version < 96 ?
(mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
(mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
peer + UUID_NEW_BM_OFFSET == (mdev->p_uuid[UI_BITMAP] & ~((u64)1))) {
/* The last P_SYNC_UUID did not get though. Undo the last start of
resync as sync source modifications of the peer's UUIDs. */
- if (mdev->agreed_pro_version < 91)
+ if (mdev->tconn->agreed_pro_version < 91)
return -1091;
mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
@@ -2481,14 +2877,14 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
*rule_nr = 71;
self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
if (self == peer) {
- if (mdev->agreed_pro_version < 96 ?
+ if (mdev->tconn->agreed_pro_version < 96 ?
(mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
(mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
self + UUID_NEW_BM_OFFSET == (mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
/* The last P_SYNC_UUID did not get though. Undo the last start of
resync as sync source modifications of our UUIDs. */
- if (mdev->agreed_pro_version < 91)
+ if (mdev->tconn->agreed_pro_version < 91)
return -1091;
__drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
@@ -2536,9 +2932,10 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
enum drbd_disk_state peer_disk) __must_hold(local)
{
- int hg, rule_nr;
enum drbd_conns rv = C_MASK;
enum drbd_disk_state mydisk;
+ struct net_conf *nc;
+ int hg, rule_nr, rr_conflict, tentative;
mydisk = mdev->state.disk;
if (mydisk == D_NEGOTIATING)
@@ -2578,7 +2975,10 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
if (abs(hg) == 100)
drbd_khelper(mdev, "initial-split-brain");
- if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
+ rcu_read_lock();
+ nc = rcu_dereference(mdev->tconn->net_conf);
+
+ if (hg == 100 || (hg == -100 && nc->always_asbp)) {
int pcount = (mdev->state.role == R_PRIMARY)
+ (peer_role == R_PRIMARY);
int forced = (hg == -100);
@@ -2607,9 +3007,9 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
}
if (hg == -100) {
- if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1))
+ if (test_bit(DISCARD_MY_DATA, &mdev->flags) && !(mdev->p_uuid[UI_FLAGS]&1))
hg = -1;
- if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1))
+ if (!test_bit(DISCARD_MY_DATA, &mdev->flags) && (mdev->p_uuid[UI_FLAGS]&1))
hg = 1;
if (abs(hg) < 100)
@@ -2617,6 +3017,9 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
"Sync from %s node\n",
(hg < 0) ? "peer" : "this");
}
+ rr_conflict = nc->rr_conflict;
+ tentative = nc->tentative;
+ rcu_read_unlock();
if (hg == -100) {
/* FIXME this log message is not correct if we end up here
@@ -2635,7 +3038,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
if (hg < 0 && /* by intention we do not use mydisk here. */
mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
- switch (mdev->net_conf->rr_conflict) {
+ switch (rr_conflict) {
case ASB_CALL_HELPER:
drbd_khelper(mdev, "pri-lost");
/* fall through */
@@ -2648,7 +3051,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
}
}
- if (mdev->net_conf->dry_run || drbd_test_flag(mdev, CONN_DRY_RUN)) {
+ if (tentative || test_bit(CONN_DRY_RUN, &mdev->tconn->flags)) {
if (hg == 0)
dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n");
else
@@ -2680,33 +3083,29 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
return rv;
}
-/* returns 1 if invalid */
-static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
+static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer)
{
/* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
- if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) ||
- (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL))
- return 0;
+ if (peer == ASB_DISCARD_REMOTE)
+ return ASB_DISCARD_LOCAL;
/* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
- if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL ||
- self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL)
- return 1;
+ if (peer == ASB_DISCARD_LOCAL)
+ return ASB_DISCARD_REMOTE;
/* everything else is valid if they are equal on both sides. */
- if (peer == self)
- return 0;
-
- /* everything es is invalid. */
- return 1;
+ return peer;
}
-static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_protocol(struct drbd_tconn *tconn, struct packet_info *pi)
{
- struct p_protocol *p = &mdev->data.rbuf.protocol;
- int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
- int p_want_lose, p_two_primaries, cf;
- char p_integrity_alg[SHARED_SECRET_MAX] = "";
+ struct p_protocol *p = pi->data;
+ enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
+ int p_proto, p_discard_my_data, p_two_primaries, cf;
+ struct net_conf *nc, *old_net_conf, *new_net_conf = NULL;
+ char integrity_alg[SHARED_SECRET_MAX] = "";
+ struct crypto_hash *peer_integrity_tfm = NULL;
+ void *int_dig_in = NULL, *int_dig_vv = NULL;
p_proto = be32_to_cpu(p->protocol);
p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
@@ -2714,63 +3113,138 @@ static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsig
p_after_sb_2p = be32_to_cpu(p->after_sb_2p);
p_two_primaries = be32_to_cpu(p->two_primaries);
cf = be32_to_cpu(p->conn_flags);
- p_want_lose = cf & CF_WANT_LOSE;
-
- drbd_clear_flag(mdev, CONN_DRY_RUN);
+ p_discard_my_data = cf & CF_DISCARD_MY_DATA;
- if (cf & CF_DRY_RUN)
- drbd_set_flag(mdev, CONN_DRY_RUN);
+ if (tconn->agreed_pro_version >= 87) {
+ int err;
- if (p_proto != mdev->net_conf->wire_protocol) {
- dev_err(DEV, "incompatible communication protocols\n");
- goto disconnect;
+ if (pi->size > sizeof(integrity_alg))
+ return -EIO;
+ err = drbd_recv_all(tconn, integrity_alg, pi->size);
+ if (err)
+ return err;
+ integrity_alg[SHARED_SECRET_MAX - 1] = 0;
}
- if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) {
- dev_err(DEV, "incompatible after-sb-0pri settings\n");
- goto disconnect;
- }
+ if (pi->cmd != P_PROTOCOL_UPDATE) {
+ clear_bit(CONN_DRY_RUN, &tconn->flags);
- if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) {
- dev_err(DEV, "incompatible after-sb-1pri settings\n");
- goto disconnect;
- }
+ if (cf & CF_DRY_RUN)
+ set_bit(CONN_DRY_RUN, &tconn->flags);
- if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) {
- dev_err(DEV, "incompatible after-sb-2pri settings\n");
- goto disconnect;
- }
+ rcu_read_lock();
+ nc = rcu_dereference(tconn->net_conf);
- if (p_want_lose && mdev->net_conf->want_lose) {
- dev_err(DEV, "both sides have the 'want_lose' flag set\n");
- goto disconnect;
- }
+ if (p_proto != nc->wire_protocol) {
+ conn_err(tconn, "incompatible %s settings\n", "protocol");
+ goto disconnect_rcu_unlock;
+ }
- if (p_two_primaries != mdev->net_conf->two_primaries) {
- dev_err(DEV, "incompatible setting of the two-primaries options\n");
- goto disconnect;
+ if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) {
+ conn_err(tconn, "incompatible %s settings\n", "after-sb-0pri");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) {
+ conn_err(tconn, "incompatible %s settings\n", "after-sb-1pri");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) {
+ conn_err(tconn, "incompatible %s settings\n", "after-sb-2pri");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (p_discard_my_data && nc->discard_my_data) {
+ conn_err(tconn, "incompatible %s settings\n", "discard-my-data");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (p_two_primaries != nc->two_primaries) {
+ conn_err(tconn, "incompatible %s settings\n", "allow-two-primaries");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (strcmp(integrity_alg, nc->integrity_alg)) {
+ conn_err(tconn, "incompatible %s settings\n", "data-integrity-alg");
+ goto disconnect_rcu_unlock;
+ }
+
+ rcu_read_unlock();
}
- if (mdev->agreed_pro_version >= 87) {
- unsigned char *my_alg = mdev->net_conf->integrity_alg;
+ if (integrity_alg[0]) {
+ int hash_size;
+
+ /*
+ * We can only change the peer data integrity algorithm
+ * here. Changing our own data integrity algorithm
+ * requires that we send a P_PROTOCOL_UPDATE packet at
+ * the same time; otherwise, the peer has no way to
+ * tell between which packets the algorithm should
+ * change.
+ */
- if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size)
- return false;
+ peer_integrity_tfm = crypto_alloc_hash(integrity_alg, 0, CRYPTO_ALG_ASYNC);
+ if (!peer_integrity_tfm) {
+ conn_err(tconn, "peer data-integrity-alg %s not supported\n",
+ integrity_alg);
+ goto disconnect;
+ }
- p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
- if (strcmp(p_integrity_alg, my_alg)) {
- dev_err(DEV, "incompatible setting of the data-integrity-alg\n");
+ hash_size = crypto_hash_digestsize(peer_integrity_tfm);
+ int_dig_in = kmalloc(hash_size, GFP_KERNEL);
+ int_dig_vv = kmalloc(hash_size, GFP_KERNEL);
+ if (!(int_dig_in && int_dig_vv)) {
+ conn_err(tconn, "Allocation of buffers for data integrity checking failed\n");
goto disconnect;
}
- dev_info(DEV, "data-integrity-alg: %s\n",
- my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
}
- return true;
+ new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
+ if (!new_net_conf) {
+ conn_err(tconn, "Allocation of new net_conf failed\n");
+ goto disconnect;
+ }
+
+ mutex_lock(&tconn->data.mutex);
+ mutex_lock(&tconn->conf_update);
+ old_net_conf = tconn->net_conf;
+ *new_net_conf = *old_net_conf;
+
+ new_net_conf->wire_protocol = p_proto;
+ new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p);
+ new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p);
+ new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p);
+ new_net_conf->two_primaries = p_two_primaries;
+
+ rcu_assign_pointer(tconn->net_conf, new_net_conf);
+ mutex_unlock(&tconn->conf_update);
+ mutex_unlock(&tconn->data.mutex);
+
+ crypto_free_hash(tconn->peer_integrity_tfm);
+ kfree(tconn->int_dig_in);
+ kfree(tconn->int_dig_vv);
+ tconn->peer_integrity_tfm = peer_integrity_tfm;
+ tconn->int_dig_in = int_dig_in;
+ tconn->int_dig_vv = int_dig_vv;
+
+ if (strcmp(old_net_conf->integrity_alg, integrity_alg))
+ conn_info(tconn, "peer data-integrity-alg: %s\n",
+ integrity_alg[0] ? integrity_alg : "(none)");
+ synchronize_rcu();
+ kfree(old_net_conf);
+ return 0;
+
+disconnect_rcu_unlock:
+ rcu_read_unlock();
disconnect:
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return false;
+ crypto_free_hash(peer_integrity_tfm);
+ kfree(int_dig_in);
+ kfree(int_dig_vv);
+ conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
}
/* helper function
@@ -2792,24 +3266,64 @@ struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
alg, name, PTR_ERR(tfm));
return tfm;
}
- if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
- crypto_free_hash(tfm);
- dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name);
- return ERR_PTR(-EINVAL);
- }
return tfm;
}
-static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size)
+static int ignore_remaining_packet(struct drbd_tconn *tconn, struct packet_info *pi)
{
- int ok = true;
- struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95;
+ void *buffer = tconn->data.rbuf;
+ int size = pi->size;
+
+ while (size) {
+ int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
+ s = drbd_recv(tconn, buffer, s);
+ if (s <= 0) {
+ if (s < 0)
+ return s;
+ break;
+ }
+ size -= s;
+ }
+ if (size)
+ return -EIO;
+ return 0;
+}
+
+/*
+ * config_unknown_volume - device configuration command for unknown volume
+ *
+ * When a device is added to an existing connection, the node on which the
+ * device is added first will send configuration commands to its peer but the
+ * peer will not know about the device yet. It will warn and ignore these
+ * commands. Once the device is added on the second node, the second node will
+ * send the same device configuration commands, but in the other direction.
+ *
+ * (We can also end up here if drbd is misconfigured.)
+ */
+static int config_unknown_volume(struct drbd_tconn *tconn, struct packet_info *pi)
+{
+ conn_warn(tconn, "%s packet received for volume %u, which is not configured locally\n",
+ cmdname(pi->cmd), pi->vnr);
+ return ignore_remaining_packet(tconn, pi);
+}
+
+static int receive_SyncParam(struct drbd_tconn *tconn, struct packet_info *pi)
+{
+ struct drbd_conf *mdev;
+ struct p_rs_param_95 *p;
unsigned int header_size, data_size, exp_max_sz;
struct crypto_hash *verify_tfm = NULL;
struct crypto_hash *csums_tfm = NULL;
- const int apv = mdev->agreed_pro_version;
- int *rs_plan_s = NULL;
+ struct net_conf *old_net_conf, *new_net_conf = NULL;
+ struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL;
+ const int apv = tconn->agreed_pro_version;
+ struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
int fifo_size = 0;
+ int err;
+
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return config_unknown_volume(tconn, pi);
exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
: apv == 88 ? sizeof(struct p_rs_param)
@@ -2817,32 +3331,49 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
: apv <= 94 ? sizeof(struct p_rs_param_89)
: /* apv >= 95 */ sizeof(struct p_rs_param_95);
- if (packet_size > exp_max_sz) {
+ if (pi->size > exp_max_sz) {
dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
- packet_size, exp_max_sz);
- return false;
+ pi->size, exp_max_sz);
+ return -EIO;
}
if (apv <= 88) {
- header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80);
- data_size = packet_size - header_size;
+ header_size = sizeof(struct p_rs_param);
+ data_size = pi->size - header_size;
} else if (apv <= 94) {
- header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80);
- data_size = packet_size - header_size;
+ header_size = sizeof(struct p_rs_param_89);
+ data_size = pi->size - header_size;
D_ASSERT(data_size == 0);
} else {
- header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80);
- data_size = packet_size - header_size;
+ header_size = sizeof(struct p_rs_param_95);
+ data_size = pi->size - header_size;
D_ASSERT(data_size == 0);
}
/* initialize verify_alg and csums_alg */
+ p = pi->data;
memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
- if (drbd_recv(mdev, &p->head.payload, header_size) != header_size)
- return false;
+ err = drbd_recv_all(mdev->tconn, p, header_size);
+ if (err)
+ return err;
- mdev->sync_conf.rate = be32_to_cpu(p->rate);
+ mutex_lock(&mdev->tconn->conf_update);
+ old_net_conf = mdev->tconn->net_conf;
+ if (get_ldev(mdev)) {
+ new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ put_ldev(mdev);
+ mutex_unlock(&mdev->tconn->conf_update);
+ dev_err(DEV, "Allocation of new disk_conf failed\n");
+ return -ENOMEM;
+ }
+
+ old_disk_conf = mdev->ldev->disk_conf;
+ *new_disk_conf = *old_disk_conf;
+
+ new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate);
+ }
if (apv >= 88) {
if (apv == 88) {
@@ -2850,12 +3381,13 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
dev_err(DEV, "verify-alg of wrong size, "
"peer wants %u, accepting only up to %u byte\n",
data_size, SHARED_SECRET_MAX);
- return false;
+ err = -EIO;
+ goto reconnect;
}
- if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
- return false;
-
+ err = drbd_recv_all(mdev->tconn, p->verify_alg, data_size);
+ if (err)
+ goto reconnect;
/* we expect NUL terminated string */
/* but just in case someone tries to be evil */
D_ASSERT(p->verify_alg[data_size-1] == 0);
@@ -2870,10 +3402,10 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
p->csums_alg[SHARED_SECRET_MAX-1] = 0;
}
- if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) {
+ if (strcmp(old_net_conf->verify_alg, p->verify_alg)) {
if (mdev->state.conn == C_WF_REPORT_PARAMS) {
dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
- mdev->sync_conf.verify_alg, p->verify_alg);
+ old_net_conf->verify_alg, p->verify_alg);
goto disconnect;
}
verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
@@ -2884,10 +3416,10 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
}
}
- if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) {
+ if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) {
if (mdev->state.conn == C_WF_REPORT_PARAMS) {
dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
- mdev->sync_conf.csums_alg, p->csums_alg);
+ old_net_conf->csums_alg, p->csums_alg);
goto disconnect;
}
csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
@@ -2898,57 +3430,91 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
}
}
- if (apv > 94) {
- mdev->sync_conf.rate = be32_to_cpu(p->rate);
- mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
- mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target);
- mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target);
- mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate);
-
- fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
- if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
- rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_NOIO);
- if (!rs_plan_s) {
+ if (apv > 94 && new_disk_conf) {
+ new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
+ new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target);
+ new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target);
+ new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate);
+
+ fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+ if (fifo_size != mdev->rs_plan_s->size) {
+ new_plan = fifo_alloc(fifo_size);
+ if (!new_plan) {
dev_err(DEV, "kmalloc of fifo_buffer failed");
+ put_ldev(mdev);
goto disconnect;
}
}
}
- spin_lock(&mdev->peer_seq_lock);
- /* lock against drbd_nl_syncer_conf() */
- if (verify_tfm) {
- strcpy(mdev->sync_conf.verify_alg, p->verify_alg);
- mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1;
- crypto_free_hash(mdev->verify_tfm);
- mdev->verify_tfm = verify_tfm;
- dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
- }
- if (csums_tfm) {
- strcpy(mdev->sync_conf.csums_alg, p->csums_alg);
- mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1;
- crypto_free_hash(mdev->csums_tfm);
- mdev->csums_tfm = csums_tfm;
- dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
- }
- if (fifo_size != mdev->rs_plan_s.size) {
- kfree(mdev->rs_plan_s.values);
- mdev->rs_plan_s.values = rs_plan_s;
- mdev->rs_plan_s.size = fifo_size;
- mdev->rs_planed = 0;
+ if (verify_tfm || csums_tfm) {
+ new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
+ if (!new_net_conf) {
+ dev_err(DEV, "Allocation of new net_conf failed\n");
+ goto disconnect;
+ }
+
+ *new_net_conf = *old_net_conf;
+
+ if (verify_tfm) {
+ strcpy(new_net_conf->verify_alg, p->verify_alg);
+ new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
+ crypto_free_hash(mdev->tconn->verify_tfm);
+ mdev->tconn->verify_tfm = verify_tfm;
+ dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
+ }
+ if (csums_tfm) {
+ strcpy(new_net_conf->csums_alg, p->csums_alg);
+ new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
+ crypto_free_hash(mdev->tconn->csums_tfm);
+ mdev->tconn->csums_tfm = csums_tfm;
+ dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
+ }
+ rcu_assign_pointer(tconn->net_conf, new_net_conf);
}
- spin_unlock(&mdev->peer_seq_lock);
}
- return ok;
+ if (new_disk_conf) {
+ rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
+ put_ldev(mdev);
+ }
+
+ if (new_plan) {
+ old_plan = mdev->rs_plan_s;
+ rcu_assign_pointer(mdev->rs_plan_s, new_plan);
+ }
+
+ mutex_unlock(&mdev->tconn->conf_update);
+ synchronize_rcu();
+ if (new_net_conf)
+ kfree(old_net_conf);
+ kfree(old_disk_conf);
+ kfree(old_plan);
+
+ return 0;
+
+reconnect:
+ if (new_disk_conf) {
+ put_ldev(mdev);
+ kfree(new_disk_conf);
+ }
+ mutex_unlock(&mdev->tconn->conf_update);
+ return -EIO;
+
disconnect:
+ kfree(new_plan);
+ if (new_disk_conf) {
+ put_ldev(mdev);
+ kfree(new_disk_conf);
+ }
+ mutex_unlock(&mdev->tconn->conf_update);
/* just for completeness: actually not needed,
* as this is not reached if csums_tfm was ok. */
crypto_free_hash(csums_tfm);
/* but free the verify_tfm again, if csums_tfm did not work out */
crypto_free_hash(verify_tfm);
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return false;
+ conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
}
/* warn if the arguments differ by more than 12.5% */
@@ -2964,59 +3530,77 @@ static void warn_if_differ_considerably(struct drbd_conf *mdev,
(unsigned long long)a, (unsigned long long)b);
}
-static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi)
{
- struct p_sizes *p = &mdev->data.rbuf.sizes;
+ struct drbd_conf *mdev;
+ struct p_sizes *p = pi->data;
enum determine_dev_size dd = unchanged;
sector_t p_size, p_usize, my_usize;
int ldsc = 0; /* local disk size changed */
enum dds_flags ddsf;
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return config_unknown_volume(tconn, pi);
+
p_size = be64_to_cpu(p->d_size);
p_usize = be64_to_cpu(p->u_size);
- if (p_size == 0 && mdev->state.disk == D_DISKLESS) {
- dev_err(DEV, "some backing storage is needed\n");
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return false;
- }
-
/* just store the peer's disk size for now.
* we still need to figure out whether we accept that. */
mdev->p_size = p_size;
if (get_ldev(mdev)) {
+ rcu_read_lock();
+ my_usize = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
+ rcu_read_unlock();
+
warn_if_differ_considerably(mdev, "lower level device sizes",
p_size, drbd_get_max_capacity(mdev->ldev));
warn_if_differ_considerably(mdev, "user requested size",
- p_usize, mdev->ldev->dc.disk_size);
+ p_usize, my_usize);
/* if this is the first connect, or an otherwise expected
* param exchange, choose the minimum */
if (mdev->state.conn == C_WF_REPORT_PARAMS)
- p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size,
- p_usize);
-
- my_usize = mdev->ldev->dc.disk_size;
-
- if (mdev->ldev->dc.disk_size != p_usize) {
- mdev->ldev->dc.disk_size = p_usize;
- dev_info(DEV, "Peer sets u_size to %lu sectors\n",
- (unsigned long)mdev->ldev->dc.disk_size);
- }
+ p_usize = min_not_zero(my_usize, p_usize);
/* Never shrink a device with usable data during connect.
But allow online shrinking if we are connected. */
- if (drbd_new_dev_size(mdev, mdev->ldev, 0) <
- drbd_get_capacity(mdev->this_bdev) &&
- mdev->state.disk >= D_OUTDATED &&
- mdev->state.conn < C_CONNECTED) {
+ if (drbd_new_dev_size(mdev, mdev->ldev, p_usize, 0) <
+ drbd_get_capacity(mdev->this_bdev) &&
+ mdev->state.disk >= D_OUTDATED &&
+ mdev->state.conn < C_CONNECTED) {
dev_err(DEV, "The peer's disk size is too small!\n");
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- mdev->ldev->dc.disk_size = my_usize;
+ conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
put_ldev(mdev);
- return false;
+ return -EIO;
+ }
+
+ if (my_usize != p_usize) {
+ struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
+
+ new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ dev_err(DEV, "Allocation of new disk_conf failed\n");
+ put_ldev(mdev);
+ return -ENOMEM;
+ }
+
+ mutex_lock(&mdev->tconn->conf_update);
+ old_disk_conf = mdev->ldev->disk_conf;
+ *new_disk_conf = *old_disk_conf;
+ new_disk_conf->disk_size = p_usize;
+
+ rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
+ mutex_unlock(&mdev->tconn->conf_update);
+ synchronize_rcu();
+ kfree(old_disk_conf);
+
+ dev_info(DEV, "Peer sets u_size to %lu sectors\n",
+ (unsigned long)my_usize);
}
+
put_ldev(mdev);
}
@@ -3025,7 +3609,7 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
dd = drbd_determine_dev_size(mdev, ddsf);
put_ldev(mdev);
if (dd == dev_size_error)
- return false;
+ return -EIO;
drbd_md_sync(mdev);
} else {
/* I am diskless, need to accept the peer's size. */
@@ -3051,7 +3635,7 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
* needs to know my new size... */
drbd_send_sizes(mdev, 0, ddsf);
}
- if (drbd_test_and_clear_flag(mdev, RESIZE_PENDING) ||
+ if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
(dd == grew && mdev->state.conn == C_CONNECTED)) {
if (mdev->state.pdsk >= D_INCONSISTENT &&
mdev->state.disk >= D_INCONSISTENT) {
@@ -3060,19 +3644,24 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
else
resync_after_online_grow(mdev);
} else
- drbd_set_flag(mdev, RESYNC_AFTER_NEG);
+ set_bit(RESYNC_AFTER_NEG, &mdev->flags);
}
}
- return true;
+ return 0;
}
-static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_uuids(struct drbd_tconn *tconn, struct packet_info *pi)
{
- struct p_uuids *p = &mdev->data.rbuf.uuids;
+ struct drbd_conf *mdev;
+ struct p_uuids *p = pi->data;
u64 *p_uuid;
int i, updated_uuids = 0;
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return config_unknown_volume(tconn, pi);
+
p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
@@ -3087,14 +3676,14 @@ static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
(mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
(unsigned long long)mdev->ed_uuid);
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return false;
+ conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
}
if (get_ldev(mdev)) {
int skip_initial_sync =
mdev->state.conn == C_CONNECTED &&
- mdev->agreed_pro_version >= 90 &&
+ mdev->tconn->agreed_pro_version >= 90 &&
mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
(p_uuid[UI_FLAGS] & 8);
if (skip_initial_sync) {
@@ -3121,14 +3710,15 @@ static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
ongoing cluster wide state change is finished. That is important if
we are primary and are detaching from our disk. We need to see the
new disk state... */
- wait_event(mdev->misc_wait, !drbd_test_flag(mdev, CLUSTER_ST_CHANGE));
+ mutex_lock(mdev->state_mutex);
+ mutex_unlock(mdev->state_mutex);
if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
updated_uuids |= drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
if (updated_uuids)
drbd_print_uuids(mdev, "receiver updated UUIDs to");
- return true;
+ return 0;
}
/**
@@ -3140,6 +3730,7 @@ static union drbd_state convert_state(union drbd_state ps)
union drbd_state ms;
static enum drbd_conns c_tab[] = {
+ [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS,
[C_CONNECTED] = C_CONNECTED,
[C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
@@ -3161,40 +3752,74 @@ static union drbd_state convert_state(union drbd_state ps)
return ms;
}
-static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_req_state(struct drbd_tconn *tconn, struct packet_info *pi)
{
- struct p_req_state *p = &mdev->data.rbuf.req_state;
+ struct drbd_conf *mdev;
+ struct p_req_state *p = pi->data;
union drbd_state mask, val;
enum drbd_state_rv rv;
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
+
mask.i = be32_to_cpu(p->mask);
val.i = be32_to_cpu(p->val);
- if (drbd_test_flag(mdev, DISCARD_CONCURRENT) &&
- drbd_test_flag(mdev, CLUSTER_ST_CHANGE)) {
+ if (test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags) &&
+ mutex_is_locked(mdev->state_mutex)) {
drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
- return true;
+ return 0;
}
mask = convert_state(mask);
val = convert_state(val);
rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
-
drbd_send_sr_reply(mdev, rv);
+
drbd_md_sync(mdev);
- return true;
+ return 0;
}
-static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_req_conn_state(struct drbd_tconn *tconn, struct packet_info *pi)
{
- struct p_state *p = &mdev->data.rbuf.state;
+ struct p_req_state *p = pi->data;
+ union drbd_state mask, val;
+ enum drbd_state_rv rv;
+
+ mask.i = be32_to_cpu(p->mask);
+ val.i = be32_to_cpu(p->val);
+
+ if (test_bit(RESOLVE_CONFLICTS, &tconn->flags) &&
+ mutex_is_locked(&tconn->cstate_mutex)) {
+ conn_send_sr_reply(tconn, SS_CONCURRENT_ST_CHG);
+ return 0;
+ }
+
+ mask = convert_state(mask);
+ val = convert_state(val);
+
+ rv = conn_request_state(tconn, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
+ conn_send_sr_reply(tconn, rv);
+
+ return 0;
+}
+
+static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi)
+{
+ struct drbd_conf *mdev;
+ struct p_state *p = pi->data;
union drbd_state os, ns, peer_state;
enum drbd_disk_state real_peer_disk;
enum chg_state_flags cs_flags;
int rv;
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return config_unknown_volume(tconn, pi);
+
peer_state.i = be32_to_cpu(p->state);
real_peer_disk = peer_state.disk;
@@ -3203,16 +3828,16 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
}
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
retry:
- os = ns = mdev->state;
- spin_unlock_irq(&mdev->req_lock);
+ os = ns = drbd_read_state(mdev);
+ spin_unlock_irq(&mdev->tconn->req_lock);
/* If some other part of the code (asender thread, timeout)
* already decided to close the connection again,
* we must not "re-establish" it here. */
if (os.conn <= C_TEAR_DOWN)
- return false;
+ return -ECONNRESET;
/* If this is the "end of sync" confirmation, usually the peer disk
* transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
@@ -3240,16 +3865,16 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
peer_state.conn == C_CONNECTED) {
if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
drbd_resync_finished(mdev);
- return true;
+ return 0;
}
}
/* explicit verify finished notification, stop sector reached. */
if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
- ov_oos_print(mdev);
+ ov_out_of_sync_print(mdev);
drbd_resync_finished(mdev);
- return true;
+ return 0;
}
/* peer says his disk is inconsistent, while we think it is uptodate,
@@ -3280,7 +3905,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
os.disk == D_NEGOTIATING));
/* if we have both been inconsistent, and the peer has been
* forced to be UpToDate with --overwrite-data */
- cr |= drbd_test_flag(mdev, CONSIDER_RESYNC);
+ cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
/* if we had been plain connected, and the admin requested to
* start a sync by "invalidate" or "invalidate-remote" */
cr |= (os.conn == C_CONNECTED &&
@@ -3300,44 +3925,44 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
peer_state.disk = D_DISKLESS;
real_peer_disk = D_DISKLESS;
} else {
- if (drbd_test_and_clear_flag(mdev, CONN_DRY_RUN))
- return false;
+ if (test_and_clear_bit(CONN_DRY_RUN, &mdev->tconn->flags))
+ return -EIO;
D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return false;
+ conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
}
}
}
- spin_lock_irq(&mdev->req_lock);
- if (mdev->state.i != os.i)
+ spin_lock_irq(&mdev->tconn->req_lock);
+ if (os.i != drbd_read_state(mdev).i)
goto retry;
- drbd_clear_flag(mdev, CONSIDER_RESYNC);
+ clear_bit(CONSIDER_RESYNC, &mdev->flags);
ns.peer = peer_state.role;
ns.pdsk = real_peer_disk;
ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
ns.disk = mdev->new_state_tmp.disk;
cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
- if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
- drbd_test_flag(mdev, NEW_CUR_UUID)) {
- /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
+ if (ns.pdsk == D_CONSISTENT && drbd_suspended(mdev) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
+ test_bit(NEW_CUR_UUID, &mdev->flags)) {
+ /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
for temporal network outages! */
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
- tl_clear(mdev);
+ tl_clear(mdev->tconn);
drbd_uuid_new_current(mdev);
- drbd_clear_flag(mdev, NEW_CUR_UUID);
- drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
- return false;
+ clear_bit(NEW_CUR_UUID, &mdev->flags);
+ conn_request_state(mdev->tconn, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
+ return -EIO;
}
rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
- ns = mdev->state;
- spin_unlock_irq(&mdev->req_lock);
+ ns = drbd_read_state(mdev);
+ spin_unlock_irq(&mdev->tconn->req_lock);
if (rv < SS_SUCCESS) {
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return false;
+ conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
}
if (os.conn > C_WF_REPORT_PARAMS) {
@@ -3351,16 +3976,21 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
}
}
- mdev->net_conf->want_lose = 0;
+ clear_bit(DISCARD_MY_DATA, &mdev->flags);
drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
- return true;
+ return 0;
}
-static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_sync_uuid(struct drbd_tconn *tconn, struct packet_info *pi)
{
- struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid;
+ struct drbd_conf *mdev;
+ struct p_rs_uuid *p = pi->data;
+
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
wait_event(mdev->misc_wait,
mdev->state.conn == C_WF_SYNC_UUID ||
@@ -3383,7 +4013,7 @@ static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
} else
dev_err(DEV, "Ignoring SyncUUID packet!\n");
- return true;
+ return 0;
}
/**
@@ -3393,27 +4023,27 @@ static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
* code upon failure.
*/
static int
-receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
- unsigned long *buffer, struct bm_xfer_ctx *c)
+receive_bitmap_plain(struct drbd_conf *mdev, unsigned int size,
+ unsigned long *p, struct bm_xfer_ctx *c)
{
- unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
- unsigned want = num_words * sizeof(long);
+ unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE -
+ drbd_header_size(mdev->tconn);
+ unsigned int num_words = min_t(size_t, data_size / sizeof(*p),
+ c->bm_words - c->word_offset);
+ unsigned int want = num_words * sizeof(*p);
int err;
- if (want != data_size) {
- dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size);
+ if (want != size) {
+ dev_err(DEV, "%s:want (%u) != size (%u)\n", __func__, want, size);
return -EIO;
}
if (want == 0)
return 0;
- err = drbd_recv(mdev, buffer, want);
- if (err != want) {
- if (err >= 0)
- err = -EIO;
+ err = drbd_recv_all(mdev->tconn, p, want);
+ if (err)
return err;
- }
- drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);
+ drbd_bm_merge_lel(mdev, c->word_offset, num_words, p);
c->word_offset += num_words;
c->bit_offset = c->word_offset * BITS_PER_LONG;
@@ -3423,6 +4053,21 @@ receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
return 1;
}
+static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
+{
+ return (enum drbd_bitmap_code)(p->encoding & 0x0f);
+}
+
+static int dcbp_get_start(struct p_compressed_bm *p)
+{
+ return (p->encoding & 0x80) != 0;
+}
+
+static int dcbp_get_pad_bits(struct p_compressed_bm *p)
+{
+ return (p->encoding >> 4) & 0x7;
+}
+
/**
* recv_bm_rle_bits
*
@@ -3432,7 +4077,8 @@ receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
static int
recv_bm_rle_bits(struct drbd_conf *mdev,
struct p_compressed_bm *p,
- struct bm_xfer_ctx *c)
+ struct bm_xfer_ctx *c,
+ unsigned int len)
{
struct bitstream bs;
u64 look_ahead;
@@ -3440,12 +4086,11 @@ recv_bm_rle_bits(struct drbd_conf *mdev,
u64 tmp;
unsigned long s = c->bit_offset;
unsigned long e;
- int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head));
- int toggle = DCBP_get_start(p);
+ int toggle = dcbp_get_start(p);
int have;
int bits;
- bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p));
+ bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));
bits = bitstream_get_bits(&bs, &look_ahead, 64);
if (bits < 0)
@@ -3497,17 +4142,18 @@ recv_bm_rle_bits(struct drbd_conf *mdev,
static int
decode_bitmap_c(struct drbd_conf *mdev,
struct p_compressed_bm *p,
- struct bm_xfer_ctx *c)
+ struct bm_xfer_ctx *c,
+ unsigned int len)
{
- if (DCBP_get_code(p) == RLE_VLI_Bits)
- return recv_bm_rle_bits(mdev, p, c);
+ if (dcbp_get_code(p) == RLE_VLI_Bits)
+ return recv_bm_rle_bits(mdev, p, c, len - sizeof(*p));
/* other variants had been implemented for evaluation,
* but have been dropped as this one turned out to be "best"
* during all our tests. */
dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
- drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+ conn_request_state(mdev->tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
return -EIO;
}
@@ -3515,11 +4161,13 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev,
const char *direction, struct bm_xfer_ctx *c)
{
/* what would it take to transfer it "plaintext" */
- unsigned plain = sizeof(struct p_header80) *
- ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
- + c->bm_words * sizeof(long);
- unsigned total = c->bytes[0] + c->bytes[1];
- unsigned r;
+ unsigned int header_size = drbd_header_size(mdev->tconn);
+ unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
+ unsigned int plain =
+ header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
+ c->bm_words * sizeof(unsigned long);
+ unsigned int total = c->bytes[0] + c->bytes[1];
+ unsigned int r;
/* total can not be zero. but just in case: */
if (total == 0)
@@ -3553,67 +4201,63 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev,
in order to be agnostic to the 32 vs 64 bits issue.
returns 0 on failure, 1 if we successfully received it. */
-static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_bitmap(struct drbd_tconn *tconn, struct packet_info *pi)
{
+ struct drbd_conf *mdev;
struct bm_xfer_ctx c;
- void *buffer;
int err;
- int ok = false;
- struct p_header80 *h = &mdev->data.rbuf.header.h80;
+
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
drbd_bm_lock(mdev, "receive bitmap", BM_LOCKED_SET_ALLOWED);
/* you are supposed to send additional out-of-sync information
* if you actually set bits during this phase */
- /* maybe we should use some per thread scratch page,
- * and allocate that during initial device creation? */
- buffer = (unsigned long *) __get_free_page(GFP_NOIO);
- if (!buffer) {
- dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
- goto out;
- }
-
c = (struct bm_xfer_ctx) {
.bm_bits = drbd_bm_bits(mdev),
.bm_words = drbd_bm_words(mdev),
};
for(;;) {
- if (cmd == P_BITMAP) {
- err = receive_bitmap_plain(mdev, data_size, buffer, &c);
- } else if (cmd == P_COMPRESSED_BITMAP) {
+ if (pi->cmd == P_BITMAP)
+ err = receive_bitmap_plain(mdev, pi->size, pi->data, &c);
+ else if (pi->cmd == P_COMPRESSED_BITMAP) {
/* MAYBE: sanity check that we speak proto >= 90,
* and the feature is enabled! */
- struct p_compressed_bm *p;
+ struct p_compressed_bm *p = pi->data;
- if (data_size > BM_PACKET_PAYLOAD_BYTES) {
+ if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(tconn)) {
dev_err(DEV, "ReportCBitmap packet too large\n");
+ err = -EIO;
goto out;
}
- /* use the page buff */
- p = buffer;
- memcpy(p, h, sizeof(*h));
- if (drbd_recv(mdev, p->head.payload, data_size) != data_size)
- goto out;
- if (data_size <= (sizeof(*p) - sizeof(p->head))) {
- dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size);
+ if (pi->size <= sizeof(*p)) {
+ dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", pi->size);
+ err = -EIO;
goto out;
}
- err = decode_bitmap_c(mdev, p, &c);
+ err = drbd_recv_all(mdev->tconn, p, pi->size);
+ if (err)
+ goto out;
+ err = decode_bitmap_c(mdev, p, &c, pi->size);
} else {
- dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd);
+ dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd);
+ err = -EIO;
goto out;
}
- c.packets[cmd == P_BITMAP]++;
- c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size;
+ c.packets[pi->cmd == P_BITMAP]++;
+ c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(tconn) + pi->size;
if (err <= 0) {
if (err < 0)
goto out;
break;
}
- if (!drbd_recv_header(mdev, &cmd, &data_size))
+ err = drbd_recv_header(mdev->tconn, pi);
+ if (err)
goto out;
}
@@ -3622,8 +4266,8 @@ static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigne
if (mdev->state.conn == C_WF_BITMAP_T) {
enum drbd_state_rv rv;
- ok = !drbd_send_bitmap(mdev);
- if (!ok)
+ err = drbd_send_bitmap(mdev);
+ if (err)
goto out;
/* Omit CS_ORDERED with this state transition to avoid deadlocks. */
rv = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
@@ -3634,47 +4278,40 @@ static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigne
dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
drbd_conn_str(mdev->state.conn));
}
+ err = 0;
- ok = true;
out:
drbd_bm_unlock(mdev);
- if (ok && mdev->state.conn == C_WF_BITMAP_S)
+ if (!err && mdev->state.conn == C_WF_BITMAP_S)
drbd_start_resync(mdev, C_SYNC_SOURCE);
- free_page((unsigned long) buffer);
- return ok;
+ return err;
}
-static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_skip(struct drbd_tconn *tconn, struct packet_info *pi)
{
- /* TODO zero copy sink :) */
- static char sink[128];
- int size, want, r;
+ conn_warn(tconn, "skipping unknown optional packet type %d, l: %d!\n",
+ pi->cmd, pi->size);
- dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
- cmd, data_size);
-
- size = data_size;
- while (size > 0) {
- want = min_t(int, size, sizeof(sink));
- r = drbd_recv(mdev, sink, want);
- ERR_IF(r <= 0) break;
- size -= r;
- }
- return size == 0;
+ return ignore_remaining_packet(tconn, pi);
}
-static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_UnplugRemote(struct drbd_tconn *tconn, struct packet_info *pi)
{
/* Make sure we've acked all the TCP data associated
* with the data requests being unplugged */
- drbd_tcp_quickack(mdev->data.socket);
+ drbd_tcp_quickack(tconn->data.socket);
- return true;
+ return 0;
}
-static int receive_out_of_sync(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_out_of_sync(struct drbd_tconn *tconn, struct packet_info *pi)
{
- struct p_block_desc *p = &mdev->data.rbuf.block_desc;
+ struct drbd_conf *mdev;
+ struct p_block_desc *p = pi->data;
+
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
switch (mdev->state.conn) {
case C_WF_SYNC_UUID:
@@ -3688,15 +4325,13 @@ static int receive_out_of_sync(struct drbd_conf *mdev, enum drbd_packets cmd, un
drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
- return true;
+ return 0;
}
-typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive);
-
struct data_cmd {
int expect_payload;
size_t pkt_size;
- drbd_cmd_handler_f function;
+ int (*fn)(struct drbd_tconn *, struct packet_info *);
};
static struct data_cmd drbd_cmd_handler[] = {
@@ -3704,13 +4339,13 @@ static struct data_cmd drbd_cmd_handler[] = {
[P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply },
[P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } ,
[P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } ,
- [P_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
- [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
- [P_UNPLUG_REMOTE] = { 0, sizeof(struct p_header80), receive_UnplugRemote },
+ [P_BITMAP] = { 1, 0, receive_bitmap } ,
+ [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } ,
+ [P_UNPLUG_REMOTE] = { 0, 0, receive_UnplugRemote },
[P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
[P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
- [P_SYNC_PARAM] = { 1, sizeof(struct p_header80), receive_SyncParam },
- [P_SYNC_PARAM89] = { 1, sizeof(struct p_header80), receive_SyncParam },
+ [P_SYNC_PARAM] = { 1, 0, receive_SyncParam },
+ [P_SYNC_PARAM89] = { 1, 0, receive_SyncParam },
[P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol },
[P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids },
[P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes },
@@ -3722,124 +4357,75 @@ static struct data_cmd drbd_cmd_handler[] = {
[P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
[P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
[P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
- /* anything missing from this table is in
- * the asender_tbl, see get_asender_cmd */
- [P_MAX_CMD] = { 0, 0, NULL },
+ [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
+ [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
};
-/* All handler functions that expect a sub-header get that sub-heder in
- mdev->data.rbuf.header.head.payload.
-
- Usually in mdev->data.rbuf.header.head the callback can find the usual
- p_header, but they may not rely on that. Since there is also p_header95 !
- */
-
-static void drbdd(struct drbd_conf *mdev)
+static void drbdd(struct drbd_tconn *tconn)
{
- union p_header *header = &mdev->data.rbuf.header;
- unsigned int packet_size;
- enum drbd_packets cmd;
+ struct packet_info pi;
size_t shs; /* sub header size */
- int rv;
+ int err;
+
+ while (get_t_state(&tconn->receiver) == RUNNING) {
+ struct data_cmd *cmd;
- while (get_t_state(&mdev->receiver) == Running) {
- drbd_thread_current_set_cpu(mdev);
- if (!drbd_recv_header(mdev, &cmd, &packet_size))
+ drbd_thread_current_set_cpu(&tconn->receiver);
+ if (drbd_recv_header(tconn, &pi))
goto err_out;
- if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) {
- dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size);
+ cmd = &drbd_cmd_handler[pi.cmd];
+ if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) {
+ conn_err(tconn, "Unexpected data packet %s (0x%04x)",
+ cmdname(pi.cmd), pi.cmd);
goto err_out;
}
- shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header);
- if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) {
- dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size);
+ shs = cmd->pkt_size;
+ if (pi.size > shs && !cmd->expect_payload) {
+ conn_err(tconn, "No payload expected %s l:%d\n",
+ cmdname(pi.cmd), pi.size);
goto err_out;
}
if (shs) {
- rv = drbd_recv(mdev, &header->h80.payload, shs);
- if (unlikely(rv != shs)) {
- if (!signal_pending(current))
- dev_warn(DEV, "short read while reading sub header: rv=%d\n", rv);
+ err = drbd_recv_all_warn(tconn, pi.data, shs);
+ if (err)
goto err_out;
- }
+ pi.size -= shs;
}
- rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs);
-
- if (unlikely(!rv)) {
- dev_err(DEV, "error receiving %s, l: %d!\n",
- cmdname(cmd), packet_size);
+ err = cmd->fn(tconn, &pi);
+ if (err) {
+ conn_err(tconn, "error receiving %s, e: %d l: %d!\n",
+ cmdname(pi.cmd), err, pi.size);
goto err_out;
}
}
+ return;
- if (0) {
- err_out:
- drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
- }
- /* If we leave here, we probably want to update at least the
- * "Connected" indicator on stable storage. Do so explicitly here. */
- drbd_md_sync(mdev);
+ err_out:
+ conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
}
-void drbd_flush_workqueue(struct drbd_conf *mdev)
+void conn_flush_workqueue(struct drbd_tconn *tconn)
{
struct drbd_wq_barrier barr;
barr.w.cb = w_prev_work_done;
+ barr.w.tconn = tconn;
init_completion(&barr.done);
- drbd_queue_work(&mdev->data.work, &barr.w);
+ drbd_queue_work(&tconn->sender_work, &barr.w);
wait_for_completion(&barr.done);
}
-void drbd_free_tl_hash(struct drbd_conf *mdev)
+static void conn_disconnect(struct drbd_tconn *tconn)
{
- struct hlist_head *h;
-
- spin_lock_irq(&mdev->req_lock);
+ struct drbd_conf *mdev;
+ enum drbd_conns oc;
+ int vnr;
- if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) {
- spin_unlock_irq(&mdev->req_lock);
- return;
- }
- /* paranoia code */
- for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
- if (h->first)
- dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
- (int)(h - mdev->ee_hash), h->first);
- kfree(mdev->ee_hash);
- mdev->ee_hash = NULL;
- mdev->ee_hash_s = 0;
-
- /* We may not have had the chance to wait for all locally pending
- * application requests. The hlist_add_fake() prevents access after
- * free on master bio completion. */
- for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) {
- struct drbd_request *req;
- struct hlist_node *pos, *n;
- hlist_for_each_entry_safe(req, pos, n, h, collision) {
- hlist_del_init(&req->collision);
- hlist_add_fake(&req->collision);
- }
- }
-
- kfree(mdev->tl_hash);
- mdev->tl_hash = NULL;
- mdev->tl_hash_s = 0;
- spin_unlock_irq(&mdev->req_lock);
-}
-
-static void drbd_disconnect(struct drbd_conf *mdev)
-{
- enum drbd_fencing_p fp;
- union drbd_state os, ns;
- int rv = SS_UNKNOWN_ERROR;
- unsigned int i;
-
- if (mdev->state.conn == C_STANDALONE)
+ if (tconn->cstate == C_STANDALONE)
return;
/* We are about to start the cleanup after connection loss.
@@ -3847,18 +4433,54 @@ static void drbd_disconnect(struct drbd_conf *mdev)
* Usually we should be in some network failure state already,
* but just in case we are not, we fix it up here.
*/
- drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
+ conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD);
/* asender does not clean up anything. it must not interfere, either */
- drbd_thread_stop(&mdev->asender);
- drbd_free_sock(mdev);
+ drbd_thread_stop(&tconn->asender);
+ drbd_free_sock(tconn);
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ kref_get(&mdev->kref);
+ rcu_read_unlock();
+ drbd_disconnected(mdev);
+ kref_put(&mdev->kref, &drbd_minor_destroy);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+
+ if (!list_empty(&tconn->current_epoch->list))
+ conn_err(tconn, "ASSERTION FAILED: tconn->current_epoch->list not empty\n");
+ /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
+ atomic_set(&tconn->current_epoch->epoch_size, 0);
+ tconn->send.seen_any_write_yet = false;
+
+ conn_info(tconn, "Connection closed\n");
+
+ if (conn_highest_role(tconn) == R_PRIMARY && conn_highest_pdsk(tconn) >= D_UNKNOWN)
+ conn_try_outdate_peer_async(tconn);
+
+ spin_lock_irq(&tconn->req_lock);
+ oc = tconn->cstate;
+ if (oc >= C_UNCONNECTED)
+ _conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE);
+
+ spin_unlock_irq(&tconn->req_lock);
+
+ if (oc == C_DISCONNECTING)
+ conn_request_state(tconn, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
+}
+
+static int drbd_disconnected(struct drbd_conf *mdev)
+{
+ unsigned int i;
/* wait for current activity to cease. */
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
_drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
_drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
_drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
/* We do not have data structures that would allow us to
* get the rs_pending_cnt down to 0 again.
@@ -3876,7 +4498,6 @@ static void drbd_disconnect(struct drbd_conf *mdev)
atomic_set(&mdev->rs_pending_cnt, 0);
wake_up(&mdev->misc_wait);
- /* make sure syncer is stopped and w_resume_next_sg queued */
del_timer_sync(&mdev->resync_timer);
resync_timer_fn((unsigned long)mdev);
@@ -3885,53 +4506,28 @@ static void drbd_disconnect(struct drbd_conf *mdev)
* to be "canceled" */
drbd_flush_workqueue(mdev);
- /* This also does reclaim_net_ee(). If we do this too early, we might
- * miss some resync ee and pages.*/
- drbd_process_done_ee(mdev);
+ drbd_finish_peer_reqs(mdev);
+
+ /* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
+ might have issued a work again. The one before drbd_finish_peer_reqs() is
+ necessary to reclain net_ee in drbd_finish_peer_reqs(). */
+ drbd_flush_workqueue(mdev);
+
+ /* need to do it again, drbd_finish_peer_reqs() may have populated it
+ * again via drbd_try_clear_on_disk_bm(). */
+ drbd_rs_cancel_all(mdev);
kfree(mdev->p_uuid);
mdev->p_uuid = NULL;
- if (!is_susp(mdev->state))
- tl_clear(mdev);
-
- dev_info(DEV, "Connection closed\n");
+ if (!drbd_suspended(mdev))
+ tl_clear(mdev->tconn);
drbd_md_sync(mdev);
- fp = FP_DONT_CARE;
- if (get_ldev(mdev)) {
- fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
- }
-
- if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN)
- drbd_try_outdate_peer_async(mdev);
-
- spin_lock_irq(&mdev->req_lock);
- os = mdev->state;
- if (os.conn >= C_UNCONNECTED) {
- /* Do not restart in case we are C_DISCONNECTING */
- ns = os;
- ns.conn = C_UNCONNECTED;
- rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
- }
- spin_unlock_irq(&mdev->req_lock);
-
- if (os.conn == C_DISCONNECTING) {
- wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
-
- crypto_free_hash(mdev->cram_hmac_tfm);
- mdev->cram_hmac_tfm = NULL;
-
- kfree(mdev->net_conf);
- mdev->net_conf = NULL;
- drbd_request_state(mdev, NS(conn, C_STANDALONE));
- }
-
/* serialize with bitmap writeout triggered by the state change,
* if any. */
- wait_event(mdev->misc_wait, !drbd_test_flag(mdev, BITMAP_IO));
+ wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
/* tcp_close and release of sendpage pages can be deferred. I don't
* want to use SO_LINGER, because apparently it can be deferred for
@@ -3940,7 +4536,7 @@ static void drbd_disconnect(struct drbd_conf *mdev)
* Actually we don't care for exactly when the network stack does its
* put_page(), but release our reference on these pages right here.
*/
- i = drbd_release_ee(mdev, &mdev->net_ee);
+ i = drbd_free_peer_reqs(mdev, &mdev->net_ee);
if (i)
dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
i = atomic_read(&mdev->pp_in_use_by_net);
@@ -3955,9 +4551,7 @@ static void drbd_disconnect(struct drbd_conf *mdev)
D_ASSERT(list_empty(&mdev->sync_ee));
D_ASSERT(list_empty(&mdev->done_ee));
- /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
- atomic_set(&mdev->current_epoch->epoch_size, 0);
- D_ASSERT(list_empty(&mdev->current_epoch->list));
+ return 0;
}
/*
@@ -3969,29 +4563,19 @@ static void drbd_disconnect(struct drbd_conf *mdev)
*
* for now, they are expected to be zero, but ignored.
*/
-static int drbd_send_handshake(struct drbd_conf *mdev)
+static int drbd_send_features(struct drbd_tconn *tconn)
{
- /* ASSERT current == mdev->receiver ... */
- struct p_handshake *p = &mdev->data.sbuf.handshake;
- int ok;
-
- if (mutex_lock_interruptible(&mdev->data.mutex)) {
- dev_err(DEV, "interrupted during initial handshake\n");
- return 0; /* interrupted. not ok. */
- }
-
- if (mdev->data.socket == NULL) {
- mutex_unlock(&mdev->data.mutex);
- return 0;
- }
+ struct drbd_socket *sock;
+ struct p_connection_features *p;
+ sock = &tconn->data;
+ p = conn_prepare_command(tconn, sock);
+ if (!p)
+ return -EIO;
memset(p, 0, sizeof(*p));
p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
- ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
- (struct p_header80 *)p, sizeof(*p), 0 );
- mutex_unlock(&mdev->data.mutex);
- return ok;
+ return conn_send_command(tconn, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
}
/*
@@ -4001,42 +4585,38 @@ static int drbd_send_handshake(struct drbd_conf *mdev)
* -1 peer talks different language,
* no point in trying again, please go standalone.
*/
-static int drbd_do_handshake(struct drbd_conf *mdev)
+static int drbd_do_features(struct drbd_tconn *tconn)
{
- /* ASSERT current == mdev->receiver ... */
- struct p_handshake *p = &mdev->data.rbuf.handshake;
- const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80);
- unsigned int length;
- enum drbd_packets cmd;
- int rv;
+ /* ASSERT current == tconn->receiver ... */
+ struct p_connection_features *p;
+ const int expect = sizeof(struct p_connection_features);
+ struct packet_info pi;
+ int err;
- rv = drbd_send_handshake(mdev);
- if (!rv)
+ err = drbd_send_features(tconn);
+ if (err)
return 0;
- rv = drbd_recv_header(mdev, &cmd, &length);
- if (!rv)
+ err = drbd_recv_header(tconn, &pi);
+ if (err)
return 0;
- if (cmd != P_HAND_SHAKE) {
- dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
- cmdname(cmd), cmd);
+ if (pi.cmd != P_CONNECTION_FEATURES) {
+ conn_err(tconn, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
+ cmdname(pi.cmd), pi.cmd);
return -1;
}
- if (length != expect) {
- dev_err(DEV, "expected HandShake length: %u, received: %u\n",
- expect, length);
+ if (pi.size != expect) {
+ conn_err(tconn, "expected ConnectionFeatures length: %u, received: %u\n",
+ expect, pi.size);
return -1;
}
- rv = drbd_recv(mdev, &p->head.payload, expect);
-
- if (rv != expect) {
- if (!signal_pending(current))
- dev_warn(DEV, "short read receiving handshake packet: l=%u\n", rv);
+ p = pi.data;
+ err = drbd_recv_all_warn(tconn, p, expect);
+ if (err)
return 0;
- }
p->protocol_min = be32_to_cpu(p->protocol_min);
p->protocol_max = be32_to_cpu(p->protocol_max);
@@ -4047,15 +4627,15 @@ static int drbd_do_handshake(struct drbd_conf *mdev)
PRO_VERSION_MIN > p->protocol_max)
goto incompat;
- mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
+ tconn->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
- dev_info(DEV, "Handshake successful: "
- "Agreed network protocol version %d\n", mdev->agreed_pro_version);
+ conn_info(tconn, "Handshake successful: "
+ "Agreed network protocol version %d\n", tconn->agreed_pro_version);
return 1;
incompat:
- dev_err(DEV, "incompatible DRBD dialects: "
+ conn_err(tconn, "incompatible DRBD dialects: "
"I support %d-%d, peer supports %d-%d\n",
PRO_VERSION_MIN, PRO_VERSION_MAX,
p->protocol_min, p->protocol_max);
@@ -4063,7 +4643,7 @@ static int drbd_do_handshake(struct drbd_conf *mdev)
}
#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
-static int drbd_do_auth(struct drbd_conf *mdev)
+static int drbd_do_auth(struct drbd_tconn *tconn)
{
dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
@@ -4078,121 +4658,139 @@ static int drbd_do_auth(struct drbd_conf *mdev)
-1 - auth failed, don't try again.
*/
-static int drbd_do_auth(struct drbd_conf *mdev)
+static int drbd_do_auth(struct drbd_tconn *tconn)
{
+ struct drbd_socket *sock;