aboutsummaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
Diffstat (limited to 'drivers')
-rw-r--r--drivers/acpi/numa.c23
-rw-r--r--drivers/amba/tegra-ahb.c2
-rw-r--r--drivers/clocksource/Kconfig5
-rw-r--r--drivers/clocksource/Makefile1
-rw-r--r--drivers/clocksource/metag_generic.c198
-rw-r--r--drivers/dma/dw_dmac.c145
-rw-r--r--drivers/dma/dw_dmac_regs.h7
-rw-r--r--drivers/hsi/hsi.c2
-rw-r--r--drivers/hv/channel_mgmt.c2
-rw-r--r--drivers/hv/hv.c5
-rw-r--r--drivers/hv/vmbus_drv.c11
-rw-r--r--drivers/input/misc/hp_sdc_rtc.c15
-rw-r--r--drivers/input/serio/Kconfig2
-rw-r--r--drivers/irqchip/Makefile2
-rw-r--r--drivers/irqchip/irq-metag-ext.c868
-rw-r--r--drivers/irqchip/irq-metag.c343
-rw-r--r--drivers/md/Kconfig66
-rw-r--r--drivers/md/Makefile6
-rw-r--r--drivers/md/dm-bio-prison.c155
-rw-r--r--drivers/md/dm-bio-prison.h58
-rw-r--r--drivers/md/dm-bufio.c2
-rw-r--r--drivers/md/dm-cache-block-types.h54
-rw-r--r--drivers/md/dm-cache-metadata.c1146
-rw-r--r--drivers/md/dm-cache-metadata.h142
-rw-r--r--drivers/md/dm-cache-policy-cleaner.c464
-rw-r--r--drivers/md/dm-cache-policy-internal.h124
-rw-r--r--drivers/md/dm-cache-policy-mq.c1195
-rw-r--r--drivers/md/dm-cache-policy.c161
-rw-r--r--drivers/md/dm-cache-policy.h228
-rw-r--r--drivers/md/dm-cache-target.c2584
-rw-r--r--drivers/md/dm-crypt.c45
-rw-r--r--drivers/md/dm-delay.c12
-rw-r--r--drivers/md/dm-flakey.c11
-rw-r--r--drivers/md/dm-ioctl.c166
-rw-r--r--drivers/md/dm-kcopyd.c121
-rw-r--r--drivers/md/dm-linear.c13
-rw-r--r--drivers/md/dm-mpath.c12
-rw-r--r--drivers/md/dm-raid.c133
-rw-r--r--drivers/md/dm-raid1.c17
-rw-r--r--drivers/md/dm-snap.c33
-rw-r--r--drivers/md/dm-stripe.c27
-rw-r--r--drivers/md/dm-table.c11
-rw-r--r--drivers/md/dm-target.c2
-rw-r--r--drivers/md/dm-thin-metadata.c12
-rw-r--r--drivers/md/dm-thin.c277
-rw-r--r--drivers/md/dm-verity.c8
-rw-r--r--drivers/md/dm-zero.c2
-rw-r--r--drivers/md/dm.c452
-rw-r--r--drivers/md/md.c19
-rw-r--r--drivers/md/persistent-data/Kconfig2
-rw-r--r--drivers/md/persistent-data/Makefile2
-rw-r--r--drivers/md/persistent-data/dm-array.c808
-rw-r--r--drivers/md/persistent-data/dm-array.h166
-rw-r--r--drivers/md/persistent-data/dm-bitset.c163
-rw-r--r--drivers/md/persistent-data/dm-bitset.h165
-rw-r--r--drivers/md/persistent-data/dm-block-manager.c1
-rw-r--r--drivers/md/persistent-data/dm-btree-internal.h1
-rw-r--r--drivers/md/persistent-data/dm-btree-spine.c7
-rw-r--r--drivers/md/persistent-data/dm-btree.c52
-rw-r--r--drivers/md/persistent-data/dm-btree.h15
-rw-r--r--drivers/md/raid0.c13
-rw-r--r--drivers/md/raid1.c8
-rw-r--r--drivers/md/raid10.c97
-rw-r--r--drivers/md/raid10.h5
-rw-r--r--drivers/md/raid5.c38
-rw-r--r--drivers/misc/kgdbts.c2
-rw-r--r--drivers/mtd/Kconfig4
-rw-r--r--drivers/mtd/ar7part.c6
-rw-r--r--drivers/mtd/bcm47xxpart.c51
-rw-r--r--drivers/mtd/chips/cfi_cmdset_0002.c217
-rw-r--r--drivers/mtd/cmdlinepart.c47
-rw-r--r--drivers/mtd/devices/Makefile4
-rw-r--r--drivers/mtd/devices/bcm47xxsflash.c55
-rw-r--r--drivers/mtd/devices/bcm47xxsflash.h15
-rw-r--r--drivers/mtd/devices/elm.c404
-rw-r--r--drivers/mtd/devices/m25p80.c100
-rw-r--r--drivers/mtd/maps/Kconfig2
-rw-r--r--drivers/mtd/maps/physmap_of.c9
-rw-r--r--drivers/mtd/maps/uclinux.c30
-rw-r--r--drivers/mtd/nand/atmel_nand.c141
-rw-r--r--drivers/mtd/nand/bcm47xxnflash/bcm47xxnflash.h4
-rw-r--r--drivers/mtd/nand/bcm47xxnflash/main.c14
-rw-r--r--drivers/mtd/nand/bcm47xxnflash/ops_bcm4706.c4
-rw-r--r--drivers/mtd/nand/davinci_nand.c24
-rw-r--r--drivers/mtd/nand/fsl_ifc_nand.c233
-rw-r--r--drivers/mtd/nand/gpmi-nand/bch-regs.h22
-rw-r--r--drivers/mtd/nand/gpmi-nand/gpmi-lib.c9
-rw-r--r--drivers/mtd/nand/gpmi-nand/gpmi-nand.c63
-rw-r--r--drivers/mtd/nand/gpmi-nand/gpmi-nand.h4
-rw-r--r--drivers/mtd/nand/mxc_nand.c11
-rw-r--r--drivers/mtd/nand/nand_base.c8
-rw-r--r--drivers/mtd/nand/nand_ecc.c5
-rw-r--r--drivers/mtd/nand/nandsim.c6
-rw-r--r--drivers/mtd/nand/omap2.c583
-rw-r--r--drivers/mtd/ofpart.c7
-rw-r--r--drivers/mtd/tests/mtd_nandecctest.c10
-rw-r--r--drivers/mtd/tests/mtd_stresstest.c8
-rw-r--r--drivers/mtd/tests/mtd_torturetest.c25
-rw-r--r--drivers/mtd/ubi/debug.h6
-rw-r--r--drivers/net/ethernet/broadcom/b44.c4
-rw-r--r--drivers/net/ethernet/broadcom/bgmac.c6
-rw-r--r--drivers/platform/x86/Kconfig15
-rw-r--r--drivers/platform/x86/Makefile1
-rw-r--r--drivers/platform/x86/acer-wmi.c21
-rw-r--r--drivers/platform/x86/asus-laptop.c85
-rw-r--r--drivers/platform/x86/asus-nb-wmi.c76
-rw-r--r--drivers/platform/x86/asus-wmi.c108
-rw-r--r--drivers/platform/x86/asus-wmi.h9
-rw-r--r--drivers/platform/x86/chromeos_laptop.c371
-rw-r--r--drivers/platform/x86/eeepc-wmi.c2
-rw-r--r--drivers/platform/x86/hp-wmi.c117
-rw-r--r--drivers/platform/x86/msi-laptop.c374
-rw-r--r--drivers/platform/x86/msi-wmi.c224
-rw-r--r--drivers/platform/x86/sony-laptop.c145
-rw-r--r--drivers/platform/x86/thinkpad_acpi.c17
-rw-r--r--drivers/rtc/rtc-stmp3xxx.c64
-rw-r--r--drivers/s390/block/dasd_eckd.c10
-rw-r--r--drivers/s390/cio/qdio_debug.c9
-rw-r--r--drivers/s390/cio/qdio_debug.h3
-rw-r--r--drivers/s390/cio/qdio_main.c2
-rw-r--r--drivers/scsi/aacraid/src.c4
-rw-r--r--drivers/scsi/bnx2fc/bnx2fc_fcoe.c256
-rw-r--r--drivers/scsi/dc395x.c2
-rw-r--r--drivers/scsi/fcoe/fcoe.c266
-rw-r--r--drivers/scsi/fcoe/fcoe.h6
-rw-r--r--drivers/scsi/fcoe/fcoe_ctlr.c45
-rw-r--r--drivers/scsi/fcoe/fcoe_sysfs.c186
-rw-r--r--drivers/scsi/fcoe/fcoe_transport.c199
-rw-r--r--drivers/scsi/fcoe/libfcoe.h20
-rw-r--r--drivers/scsi/fnic/Makefile2
-rw-r--r--drivers/scsi/fnic/fnic.h60
-rw-r--r--drivers/scsi/fnic/fnic_debugfs.c314
-rw-r--r--drivers/scsi/fnic/fnic_io.h6
-rw-r--r--drivers/scsi/fnic/fnic_main.c17
-rw-r--r--drivers/scsi/fnic/fnic_scsi.c721
-rw-r--r--drivers/scsi/fnic/fnic_trace.c273
-rw-r--r--drivers/scsi/fnic/fnic_trace.h90
-rw-r--r--drivers/scsi/hpsa.c117
-rw-r--r--drivers/scsi/ipr.c33
-rw-r--r--drivers/scsi/ipr.h1
-rw-r--r--drivers/scsi/libfc/fc_fcp.c6
-rw-r--r--drivers/scsi/libfc/fc_libfc.h38
-rw-r--r--drivers/scsi/libfc/fc_rport.c2
-rw-r--r--drivers/scsi/lpfc/lpfc_sli.c56
-rw-r--r--drivers/scsi/megaraid/megaraid_sas.h6
-rw-r--r--drivers/scsi/megaraid/megaraid_sas_base.c2
-rw-r--r--drivers/scsi/megaraid/megaraid_sas_fusion.c5
-rw-r--r--drivers/scsi/megaraid/megaraid_sas_fusion.h1
-rw-r--r--drivers/scsi/mpt2sas/mpt2sas_base.c8
-rw-r--r--drivers/scsi/mpt2sas/mpt2sas_base.h6
-rw-r--r--drivers/scsi/mvsas/mv_sas.c10
-rw-r--r--drivers/scsi/mvsas/mv_sas.h1
-rw-r--r--drivers/scsi/osd/osd_initiator.c4
-rw-r--r--drivers/scsi/pm8001/pm8001_init.c3
-rw-r--r--drivers/scsi/qla2xxx/qla_attr.c35
-rw-r--r--drivers/scsi/qla2xxx/qla_bsg.c199
-rw-r--r--drivers/scsi/qla2xxx/qla_bsg.h2
-rw-r--r--drivers/scsi/qla2xxx/qla_dbg.c17
-rw-r--r--drivers/scsi/qla2xxx/qla_dbg.h2
-rw-r--r--drivers/scsi/qla2xxx/qla_def.h54
-rw-r--r--drivers/scsi/qla2xxx/qla_dfs.c2
-rw-r--r--drivers/scsi/qla2xxx/qla_fw.h7
-rw-r--r--drivers/scsi/qla2xxx/qla_gbl.h15
-rw-r--r--drivers/scsi/qla2xxx/qla_gs.c10
-rw-r--r--drivers/scsi/qla2xxx/qla_init.c200
-rw-r--r--drivers/scsi/qla2xxx/qla_inline.h28
-rw-r--r--drivers/scsi/qla2xxx/qla_iocb.c39
-rw-r--r--drivers/scsi/qla2xxx/qla_isr.c82
-rw-r--r--drivers/scsi/qla2xxx/qla_mbx.c207
-rw-r--r--drivers/scsi/qla2xxx/qla_mid.c9
-rw-r--r--drivers/scsi/qla2xxx/qla_nx.c13
-rw-r--r--drivers/scsi/qla2xxx/qla_nx.h4
-rw-r--r--drivers/scsi/qla2xxx/qla_os.c147
-rw-r--r--drivers/scsi/qla2xxx/qla_settings.h2
-rw-r--r--drivers/scsi/qla2xxx/qla_sup.c18
-rw-r--r--drivers/scsi/qla2xxx/qla_target.c169
-rw-r--r--drivers/scsi/qla2xxx/qla_target.h19
-rw-r--r--drivers/scsi/qla2xxx/qla_version.h2
-rw-r--r--drivers/scsi/qla4xxx/ql4_mbx.c6
-rw-r--r--drivers/scsi/storvsc_drv.c137
-rw-r--r--drivers/scsi/ufs/Kconfig74
-rw-r--r--drivers/scsi/ufs/Makefile1
-rw-r--r--drivers/scsi/ufs/ufs.h44
-rw-r--r--drivers/scsi/ufs/ufshcd-pci.c211
-rw-r--r--drivers/scsi/ufs/ufshcd.c423
-rw-r--r--drivers/scsi/ufs/ufshcd.h202
-rw-r--r--drivers/scsi/ufs/ufshci.h44
-rw-r--r--drivers/ssb/driver_chipcommon_pmu.c4
-rw-r--r--drivers/staging/comedi/comedi_fops.c14
-rw-r--r--drivers/target/iscsi/iscsi_target.c11
-rw-r--r--drivers/target/sbp/sbp_target.c2
-rw-r--r--drivers/target/target_core_file.c2
-rw-r--r--drivers/target/target_core_iblock.c2
-rw-r--r--drivers/target/target_core_pscsi.c9
-rw-r--r--drivers/tty/serial/Kconfig10
-rw-r--r--drivers/tty/tty_io.c16
-rw-r--r--drivers/usb/host/ehci-timer.c29
-rw-r--r--drivers/watchdog/Kconfig35
-rw-r--r--drivers/watchdog/Makefile3
-rw-r--r--drivers/watchdog/at91rm9200_wdt.c9
-rw-r--r--drivers/watchdog/at91sam9_wdt.c168
-rw-r--r--drivers/watchdog/ath79_wdt.c66
-rw-r--r--drivers/watchdog/bcm47xx_wdt.c339
-rw-r--r--drivers/watchdog/booke_wdt.c185
-rw-r--r--drivers/watchdog/davinci_wdt.c29
-rw-r--r--drivers/watchdog/gef_wdt.c1
-rw-r--r--drivers/watchdog/omap_wdt.c6
-rw-r--r--drivers/watchdog/orion_wdt.c11
-rw-r--r--drivers/watchdog/pnx4008_wdt.c7
-rw-r--r--drivers/watchdog/retu_wdt.c178
-rw-r--r--drivers/watchdog/s3c2410_wdt.c48
-rw-r--r--drivers/watchdog/sp5100_tco.c27
-rw-r--r--drivers/watchdog/stmp3xxx_rtc_wdt.c111
-rw-r--r--drivers/watchdog/stmp3xxx_wdt.c288
-rw-r--r--drivers/watchdog/watchdog_core.c66
-rw-r--r--drivers/watchdog/watchdog_dev.c3
-rw-r--r--drivers/xen/xen-acpi-cpuhotplug.c34
-rw-r--r--drivers/xen/xen-acpi-memhotplug.c52
-rw-r--r--drivers/xen/xenbus/xenbus_client.c1
219 files changed, 18151 insertions, 3533 deletions
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 59844ee149b..33e609f6358 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -282,10 +282,10 @@ acpi_table_parse_srat(enum acpi_srat_type id,
handler, max_entries);
}
-static int srat_mem_cnt;
-
-void __init early_parse_srat(void)
+int __init acpi_numa_init(void)
{
+ int cnt = 0;
+
/*
* Should not limit number with cpu num that is from NR_CPUS or nr_cpus=
* SRAT cpu entries could have different order with that in MADT.
@@ -295,24 +295,21 @@ void __init early_parse_srat(void)
/* SRAT: Static Resource Affinity Table */
if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
- acpi_parse_x2apic_affinity, 0);
+ acpi_parse_x2apic_affinity, 0);
acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
- acpi_parse_processor_affinity, 0);
- srat_mem_cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
- acpi_parse_memory_affinity,
- NR_NODE_MEMBLKS);
+ acpi_parse_processor_affinity, 0);
+ cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
+ acpi_parse_memory_affinity,
+ NR_NODE_MEMBLKS);
}
-}
-int __init acpi_numa_init(void)
-{
/* SLIT: System Locality Information Table */
acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit);
acpi_numa_arch_fixup();
- if (srat_mem_cnt < 0)
- return srat_mem_cnt;
+ if (cnt < 0)
+ return cnt;
else if (!parsed_numa_memblks)
return -ENOENT;
return 0;
diff --git a/drivers/amba/tegra-ahb.c b/drivers/amba/tegra-ahb.c
index ab92785f54d..093c4355496 100644
--- a/drivers/amba/tegra-ahb.c
+++ b/drivers/amba/tegra-ahb.c
@@ -130,7 +130,7 @@ static inline void gizmo_writel(struct tegra_ahb *ahb, u32 value, u32 offset)
writel(value, ahb->regs + offset);
}
-#ifdef CONFIG_ARCH_TEGRA_3x_SOC
+#ifdef CONFIG_TEGRA_IOMMU_SMMU
static int tegra_ahb_match_by_smmu(struct device *dev, void *data)
{
struct tegra_ahb *ahb = dev_get_drvdata(dev);
diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index e920cbe519f..e507ab7df60 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -62,3 +62,8 @@ config CLKSRC_DBX500_PRCMU_SCHED_CLOCK
config ARM_ARCH_TIMER
bool
+
+config CLKSRC_METAG_GENERIC
+ def_bool y if METAG
+ help
+ This option enables support for the Meta per-thread timers.
diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
index 7d671b85a98..4d8283aec5b 100644
--- a/drivers/clocksource/Makefile
+++ b/drivers/clocksource/Makefile
@@ -21,3 +21,4 @@ obj-$(CONFIG_ARCH_TEGRA) += tegra20_timer.o
obj-$(CONFIG_VT8500_TIMER) += vt8500_timer.o
obj-$(CONFIG_ARM_ARCH_TIMER) += arm_arch_timer.o
+obj-$(CONFIG_CLKSRC_METAG_GENERIC) += metag_generic.o
diff --git a/drivers/clocksource/metag_generic.c b/drivers/clocksource/metag_generic.c
new file mode 100644
index 00000000000..ade7513a11d
--- /dev/null
+++ b/drivers/clocksource/metag_generic.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (C) 2005-2013 Imagination Technologies Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Support for Meta per-thread timers.
+ *
+ * Meta hardware threads have 2 timers. The background timer (TXTIMER) is used
+ * as a free-running time base (hz clocksource), and the interrupt timer
+ * (TXTIMERI) is used for the timer interrupt (clock event). Both counters
+ * traditionally count at approximately 1MHz.
+ */
+
+#include <clocksource/metag_generic.h>
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+
+#include <asm/clock.h>
+#include <asm/hwthread.h>
+#include <asm/core_reg.h>
+#include <asm/metag_mem.h>
+#include <asm/tbx.h>
+
+#define HARDWARE_FREQ 1000000 /* 1MHz */
+#define HARDWARE_DIV 1 /* divide by 1 = 1MHz clock */
+#define HARDWARE_TO_NS_SHIFT 10 /* convert ticks to ns */
+
+static unsigned int hwtimer_freq = HARDWARE_FREQ;
+static DEFINE_PER_CPU(struct clock_event_device, local_clockevent);
+static DEFINE_PER_CPU(char [11], local_clockevent_name);
+
+static int metag_timer_set_next_event(unsigned long delta,
+ struct clock_event_device *dev)
+{
+ __core_reg_set(TXTIMERI, -delta);
+ return 0;
+}
+
+static void metag_timer_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ switch (mode) {
+ case CLOCK_EVT_MODE_ONESHOT:
+ case CLOCK_EVT_MODE_RESUME:
+ break;
+
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ /* We should disable the IRQ here */
+ break;
+
+ case CLOCK_EVT_MODE_PERIODIC:
+ case CLOCK_EVT_MODE_UNUSED:
+ WARN_ON(1);
+ break;
+ };
+}
+
+static cycle_t metag_clocksource_read(struct clocksource *cs)
+{
+ return __core_reg_get(TXTIMER);
+}
+
+static struct clocksource clocksource_metag = {
+ .name = "META",
+ .rating = 200,
+ .mask = CLOCKSOURCE_MASK(32),
+ .read = metag_clocksource_read,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static irqreturn_t metag_timer_interrupt(int irq, void *dummy)
+{
+ struct clock_event_device *evt = &__get_cpu_var(local_clockevent);
+
+ evt->event_handler(evt);
+
+ return IRQ_HANDLED;
+}
+
+static struct irqaction metag_timer_irq = {
+ .name = "META core timer",
+ .handler = metag_timer_interrupt,
+ .flags = IRQF_TIMER | IRQF_IRQPOLL | IRQF_PERCPU,
+};
+
+unsigned long long sched_clock(void)
+{
+ unsigned long long ticks = __core_reg_get(TXTIMER);
+ return ticks << HARDWARE_TO_NS_SHIFT;
+}
+
+static void __cpuinit arch_timer_setup(unsigned int cpu)
+{
+ unsigned int txdivtime;
+ struct clock_event_device *clk = &per_cpu(local_clockevent, cpu);
+ char *name = per_cpu(local_clockevent_name, cpu);
+
+ txdivtime = __core_reg_get(TXDIVTIME);
+
+ txdivtime &= ~TXDIVTIME_DIV_BITS;
+ txdivtime |= (HARDWARE_DIV & TXDIVTIME_DIV_BITS);
+
+ __core_reg_set(TXDIVTIME, txdivtime);
+
+ sprintf(name, "META %d", cpu);
+ clk->name = name;
+ clk->features = CLOCK_EVT_FEAT_ONESHOT,
+
+ clk->rating = 200,
+ clk->shift = 12,
+ clk->irq = tbisig_map(TBID_SIGNUM_TRT),
+ clk->set_mode = metag_timer_set_mode,
+ clk->set_next_event = metag_timer_set_next_event,
+
+ clk->mult = div_sc(hwtimer_freq, NSEC_PER_SEC, clk->shift);
+ clk->max_delta_ns = clockevent_delta2ns(0x7fffffff, clk);
+ clk->min_delta_ns = clockevent_delta2ns(0xf, clk);
+ clk->cpumask = cpumask_of(cpu);
+
+ clockevents_register_device(clk);
+
+ /*
+ * For all non-boot CPUs we need to synchronize our free
+ * running clock (TXTIMER) with the boot CPU's clock.
+ *
+ * While this won't be accurate, it should be close enough.
+ */
+ if (cpu) {
+ unsigned int thread0 = cpu_2_hwthread_id[0];
+ unsigned long val;
+
+ val = core_reg_read(TXUCT_ID, TXTIMER_REGNUM, thread0);
+ __core_reg_set(TXTIMER, val);
+ }
+}
+
+static int __cpuinit arch_timer_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ int cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_STARTING:
+ case CPU_STARTING_FROZEN:
+ arch_timer_setup(cpu);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata arch_timer_cpu_nb = {
+ .notifier_call = arch_timer_cpu_notify,
+};
+
+int __init metag_generic_timer_init(void)
+{
+ /*
+ * On Meta 2 SoCs, the actual frequency of the timer is based on the
+ * Meta core clock speed divided by an integer, so it is only
+ * approximately 1MHz. Calculating the real frequency here drastically
+ * reduces clock skew on these SoCs.
+ */
+#ifdef CONFIG_METAG_META21
+ hwtimer_freq = get_coreclock() / (metag_in32(EXPAND_TIMER_DIV) + 1);
+#endif
+ clocksource_register_hz(&clocksource_metag, hwtimer_freq);
+
+ setup_irq(tbisig_map(TBID_SIGNUM_TRT), &metag_timer_irq);
+
+ /* Configure timer on boot CPU */
+ arch_timer_setup(smp_processor_id());
+
+ /* Hook cpu boot to configure other CPU's timers */
+ register_cpu_notifier(&arch_timer_cpu_nb);
+
+ return 0;
+}
diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c
index 51c3ea2ed41..c599558faed 100644
--- a/drivers/dma/dw_dmac.c
+++ b/drivers/dma/dw_dmac.c
@@ -20,6 +20,7 @@
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/of.h>
+#include <linux/of_dma.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/platform_device.h>
@@ -171,7 +172,13 @@ static void dwc_initialize(struct dw_dma_chan *dwc)
if (dwc->initialized == true)
return;
- if (dws) {
+ if (dws && dws->cfg_hi == ~0 && dws->cfg_lo == ~0) {
+ /* autoconfigure based on request line from DT */
+ if (dwc->direction == DMA_MEM_TO_DEV)
+ cfghi = DWC_CFGH_DST_PER(dwc->request_line);
+ else if (dwc->direction == DMA_DEV_TO_MEM)
+ cfghi = DWC_CFGH_SRC_PER(dwc->request_line);
+ } else if (dws) {
/*
* We need controller-specific data to set up slave
* transfers.
@@ -1226,49 +1233,64 @@ static void dwc_free_chan_resources(struct dma_chan *chan)
dev_vdbg(chan2dev(chan), "%s: done\n", __func__);
}
-bool dw_dma_generic_filter(struct dma_chan *chan, void *param)
+struct dw_dma_filter_args {
+ struct dw_dma *dw;
+ unsigned int req;
+ unsigned int src;
+ unsigned int dst;
+};
+
+static bool dw_dma_generic_filter(struct dma_chan *chan, void *param)
{
+ struct dw_dma_chan *dwc = to_dw_dma_chan(chan);
struct dw_dma *dw = to_dw_dma(chan->device);
- static struct dw_dma *last_dw;
- static char *last_bus_id;
- int i = -1;
+ struct dw_dma_filter_args *fargs = param;
+ struct dw_dma_slave *dws = &dwc->slave;
- /*
- * dmaengine framework calls this routine for all channels of all dma
- * controller, until true is returned. If 'param' bus_id is not
- * registered with a dma controller (dw), then there is no need of
- * running below function for all channels of dw.
- *
- * This block of code does this by saving the parameters of last
- * failure. If dw and param are same, i.e. trying on same dw with
- * different channel, return false.
- */
- if ((last_dw == dw) && (last_bus_id == param))
- return false;
- /*
- * Return true:
- * - If dw_dma's platform data is not filled with slave info, then all
- * dma controllers are fine for transfer.
- * - Or if param is NULL
- */
- if (!dw->sd || !param)
- return true;
+ /* ensure the device matches our channel */
+ if (chan->device != &fargs->dw->dma)
+ return false;
- while (++i < dw->sd_count) {
- if (!strcmp(dw->sd[i].bus_id, param)) {
- chan->private = &dw->sd[i];
- last_dw = NULL;
- last_bus_id = NULL;
+ dws->dma_dev = dw->dma.dev;
+ dws->cfg_hi = ~0;
+ dws->cfg_lo = ~0;
+ dws->src_master = fargs->src;
+ dws->dst_master = fargs->dst;
- return true;
- }
- }
+ dwc->request_line = fargs->req;
- last_dw = dw;
- last_bus_id = param;
- return false;
+ chan->private = dws;
+
+ return true;
+}
+
+static struct dma_chan *dw_dma_xlate(struct of_phandle_args *dma_spec,
+ struct of_dma *ofdma)
+{
+ struct dw_dma *dw = ofdma->of_dma_data;
+ struct dw_dma_filter_args fargs = {
+ .dw = dw,
+ };
+ dma_cap_mask_t cap;
+
+ if (dma_spec->args_count != 3)
+ return NULL;
+
+ fargs.req = be32_to_cpup(dma_spec->args+0);
+ fargs.src = be32_to_cpup(dma_spec->args+1);
+ fargs.dst = be32_to_cpup(dma_spec->args+2);
+
+ if (WARN_ON(fargs.req >= DW_DMA_MAX_NR_REQUESTS ||
+ fargs.src >= dw->nr_masters ||
+ fargs.dst >= dw->nr_masters))
+ return NULL;
+
+ dma_cap_zero(cap);
+ dma_cap_set(DMA_SLAVE, cap);
+
+ /* TODO: there should be a simpler way to do this */
+ return dma_request_channel(cap, dw_dma_generic_filter, &fargs);
}
-EXPORT_SYMBOL(dw_dma_generic_filter);
/* --------------------- Cyclic DMA API extensions -------------------- */
@@ -1554,9 +1576,8 @@ static void dw_dma_off(struct dw_dma *dw)
static struct dw_dma_platform_data *
dw_dma_parse_dt(struct platform_device *pdev)
{
- struct device_node *sn, *cn, *np = pdev->dev.of_node;
+ struct device_node *np = pdev->dev.of_node;
struct dw_dma_platform_data *pdata;
- struct dw_dma_slave *sd;
u32 tmp, arr[4];
if (!np) {
@@ -1568,7 +1589,7 @@ dw_dma_parse_dt(struct platform_device *pdev)
if (!pdata)
return NULL;
- if (of_property_read_u32(np, "nr_channels", &pdata->nr_channels))
+ if (of_property_read_u32(np, "dma-channels", &pdata->nr_channels))
return NULL;
if (of_property_read_bool(np, "is_private"))
@@ -1583,7 +1604,7 @@ dw_dma_parse_dt(struct platform_device *pdev)
if (!of_property_read_u32(np, "block_size", &tmp))
pdata->block_size = tmp;
- if (!of_property_read_u32(np, "nr_masters", &tmp)) {
+ if (!of_property_read_u32(np, "dma-masters", &tmp)) {
if (tmp > 4)
return NULL;
@@ -1595,36 +1616,6 @@ dw_dma_parse_dt(struct platform_device *pdev)
for (tmp = 0; tmp < pdata->nr_masters; tmp++)
pdata->data_width[tmp] = arr[tmp];
- /* parse slave data */
- sn = of_find_node_by_name(np, "slave_info");
- if (!sn)
- return pdata;
-
- /* calculate number of slaves */
- tmp = of_get_child_count(sn);
- if (!tmp)
- return NULL;
-
- sd = devm_kzalloc(&pdev->dev, sizeof(*sd) * tmp, GFP_KERNEL);
- if (!sd)
- return NULL;
-
- pdata->sd = sd;
- pdata->sd_count = tmp;
-
- for_each_child_of_node(sn, cn) {
- sd->dma_dev = &pdev->dev;
- of_property_read_string(cn, "bus_id", &sd->bus_id);
- of_property_read_u32(cn, "cfg_hi", &sd->cfg_hi);
- of_property_read_u32(cn, "cfg_lo", &sd->cfg_lo);
- if (!of_property_read_u32(cn, "src_master", &tmp))
- sd->src_master = tmp;
-
- if (!of_property_read_u32(cn, "dst_master", &tmp))
- sd->dst_master = tmp;
- sd++;
- }
-
return pdata;
}
#else
@@ -1705,8 +1696,6 @@ static int dw_probe(struct platform_device *pdev)
clk_prepare_enable(dw->clk);
dw->regs = regs;
- dw->sd = pdata->sd;
- dw->sd_count = pdata->sd_count;
/* get hardware configuration parameters */
if (autocfg) {
@@ -1837,6 +1826,14 @@ static int dw_probe(struct platform_device *pdev)
dma_async_device_register(&dw->dma);
+ if (pdev->dev.of_node) {
+ err = of_dma_controller_register(pdev->dev.of_node,
+ dw_dma_xlate, dw);
+ if (err && err != -ENODEV)
+ dev_err(&pdev->dev,
+ "could not register of_dma_controller\n");
+ }
+
return 0;
}
@@ -1845,6 +1842,8 @@ static int dw_remove(struct platform_device *pdev)
struct dw_dma *dw = platform_get_drvdata(pdev);
struct dw_dma_chan *dwc, *_dwc;
+ if (pdev->dev.of_node)
+ of_dma_controller_free(pdev->dev.of_node);
dw_dma_off(dw);
dma_async_device_unregister(&dw->dma);
diff --git a/drivers/dma/dw_dmac_regs.h b/drivers/dma/dw_dmac_regs.h
index 88dd8eb3195..cf0ce5c77d6 100644
--- a/drivers/dma/dw_dmac_regs.h
+++ b/drivers/dma/dw_dmac_regs.h
@@ -13,6 +13,7 @@
#include <linux/dw_dmac.h>
#define DW_DMA_MAX_NR_CHANNELS 8
+#define DW_DMA_MAX_NR_REQUESTS 16
/* flow controller */
enum dw_dma_fc {
@@ -211,6 +212,8 @@ struct dw_dma_chan {
/* hardware configuration */
unsigned int block_size;
bool nollp;
+ unsigned int request_line;
+ struct dw_dma_slave slave;
/* configuration passed via DMA_SLAVE_CONFIG */
struct dma_slave_config dma_sconfig;
@@ -239,10 +242,6 @@ struct dw_dma {
struct tasklet_struct tasklet;
struct clk *clk;
- /* slave information */
- struct dw_dma_slave *sd;
- unsigned int sd_count;
-
u8 all_chan_mask;
/* hardware configuration */
diff --git a/drivers/hsi/hsi.c b/drivers/hsi/hsi.c
index 2d58f939d27..833dd1afbf4 100644
--- a/drivers/hsi/hsi.c
+++ b/drivers/hsi/hsi.c
@@ -420,7 +420,7 @@ static int hsi_event_notifier_call(struct notifier_block *nb,
/**
* hsi_register_port_event - Register a client to receive port events
* @cl: HSI client that wants to receive port events
- * @cb: Event handler callback
+ * @handler: Event handler callback
*
* Clients should register a callback to be able to receive
* events from the ports. Registration should happen after
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 53a8600162a..ff1be167eb0 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -318,7 +318,7 @@ static u32 get_vp_index(uuid_le *type_guid)
return 0;
}
cur_cpu = (++next_vp % max_cpus);
- return 0;
+ return cur_cpu;
}
/*
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 1c5481da6e4..731158910c1 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -272,7 +272,7 @@ u16 hv_signal_event(void *con_id)
* retrieve the initialized message and event pages. Otherwise, we create and
* initialize the message and event pages.
*/
-void hv_synic_init(void *irqarg)
+void hv_synic_init(void *arg)
{
u64 version;
union hv_synic_simp simp;
@@ -281,7 +281,6 @@ void hv_synic_init(void *irqarg)
union hv_synic_scontrol sctrl;
u64 vp_index;
- u32 irq_vector = *((u32 *)(irqarg));
int cpu = smp_processor_id();
if (!hv_context.hypercall_page)
@@ -335,7 +334,7 @@ void hv_synic_init(void *irqarg)
rdmsrl(HV_X64_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
shared_sint.as_uint64 = 0;
- shared_sint.vector = irq_vector; /* HV_SHARED_SINT_IDT_VECTOR + 0x20; */
+ shared_sint.vector = HYPERVISOR_CALLBACK_VECTOR;
shared_sint.masked = false;
shared_sint.auto_eoi = true;
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index cf19dfa5ead..bf421e0efa1 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -36,6 +36,7 @@
#include <linux/kernel_stat.h>
#include <asm/hyperv.h>
#include <asm/hypervisor.h>
+#include <asm/mshyperv.h>
#include "hyperv_vmbus.h"
@@ -528,7 +529,6 @@ static void vmbus_flow_handler(unsigned int irq, struct irq_desc *desc)
static int vmbus_bus_init(int irq)
{
int ret;
- unsigned int vector;
/* Hypervisor initialization...setup hypercall page..etc */
ret = hv_init();
@@ -558,13 +558,16 @@ static int vmbus_bus_init(int irq)
*/
irq_set_handler(irq, vmbus_flow_handler);
- vector = IRQ0_VECTOR + irq;
+ /*
+ * Register our interrupt handler.
+ */
+ hv_register_vmbus_handler(irq, vmbus_isr);
/*
- * Notify the hypervisor of our irq and
+ * Initialize the per-cpu interrupt state and
* connect to the host.
*/
- on_each_cpu(hv_synic_init, (void *)&vector, 1);
+ on_each_cpu(hv_synic_init, NULL, 1);
ret = vmbus_connect();
if (ret)
goto err_irq;
diff --git a/drivers/input/misc/hp_sdc_rtc.c b/drivers/input/misc/hp_sdc_rtc.c
index 0b4f54265f6..2e3334b8f82 100644
--- a/drivers/input/misc/hp_sdc_rtc.c
+++ b/drivers/input/misc/hp_sdc_rtc.c
@@ -109,7 +109,9 @@ static int hp_sdc_rtc_do_read_bbrtc (struct rtc_time *rtctm)
if (hp_sdc_enqueue_transaction(&t)) return -1;
- down_interruptible(&tsem); /* Put ourselves to sleep for results. */
+ /* Put ourselves to sleep for results. */
+ if (WARN_ON(down_interruptible(&tsem)))
+ return -1;
/* Check for nonpresence of BBRTC */
if (!((tseq[83] | tseq[90] | tseq[69] | tseq[76] |
@@ -176,11 +178,16 @@ static int64_t hp_sdc_rtc_read_i8042timer (uint8_t loadcmd, int numreg)
t.seq = tseq;
t.act.semaphore = &i8042tregs;
- down_interruptible(&i8042tregs); /* Sleep if output regs in use. */
+ /* Sleep if output regs in use. */
+ if (WARN_ON(down_interruptible(&i8042tregs)))
+ return -1;
if (hp_sdc_enqueue_transaction(&t)) return -1;
- down_interruptible(&i8042tregs); /* Sleep until results come back. */
+ /* Sleep until results come back. */
+ if (WARN_ON(down_interruptible(&i8042tregs)))
+ return -1;
+
up(&i8042tregs);
return (tseq[5] |
@@ -276,6 +283,7 @@ static inline int hp_sdc_rtc_read_ct(struct timeval *res) {
}
+#if 0 /* not used yet */
/* Set the i8042 real-time clock */
static int hp_sdc_rtc_set_rt (struct timeval *setto)
{
@@ -386,6 +394,7 @@ static int hp_sdc_rtc_set_i8042timer (struct timeval *setto, uint8_t setcmd)
}
return 0;
}
+#endif
static ssize_t hp_sdc_rtc_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos) {
diff --git a/drivers/input/serio/Kconfig b/drivers/input/serio/Kconfig
index 6e9cc765e0d..3ec5ef2dd44 100644
--- a/drivers/input/serio/Kconfig
+++ b/drivers/input/serio/Kconfig
@@ -22,7 +22,7 @@ config SERIO_I8042
tristate "i8042 PC Keyboard controller" if EXPERT || !X86
default y
depends on !PARISC && (!ARM || ARCH_SHARK || FOOTBRIDGE_HOST) && \
- (!SUPERH || SH_CAYMAN) && !M68K && !BLACKFIN
+ (!SUPERH || SH_CAYMAN) && !M68K && !BLACKFIN && !S390
help
i8042 is the chip over which the standard AT keyboard and PS/2
mouse are connected to the computer. If you use these devices,
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index e65fbf2cdf7..98e3b87bdf1 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -2,6 +2,8 @@ obj-$(CONFIG_IRQCHIP) += irqchip.o
obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2835.o
obj-$(CONFIG_ARCH_EXYNOS) += exynos-combiner.o
+obj-$(CONFIG_METAG) += irq-metag-ext.o
+obj-$(CONFIG_METAG_PERFCOUNTER_IRQS) += irq-metag.o
obj-$(CONFIG_ARCH_SUNXI) += irq-sunxi.o
obj-$(CONFIG_ARCH_SPEAR3XX) += spear-shirq.o
obj-$(CONFIG_ARM_GIC) += irq-gic.o
diff --git a/drivers/irqchip/irq-metag-ext.c b/drivers/irqchip/irq-metag-ext.c
new file mode 100644
index 00000000000..92c41ab4dbf
--- /dev/null
+++ b/drivers/irqchip/irq-metag-ext.c
@@ -0,0 +1,868 @@
+/*
+ * Meta External interrupt code.
+ *
+ * Copyright (C) 2005-2012 Imagination Technologies Ltd.
+ *
+ * External interrupts on Meta are configured at two-levels, in the CPU core and
+ * in the external trigger block. Interrupts from SoC peripherals are
+ * multiplexed onto a single Meta CPU "trigger" - traditionally it has always
+ * been trigger 2 (TR2). For info on how de-multiplexing happens check out
+ * meta_intc_irq_demux().
+ */
+
+#include <linux/interrupt.h>
+#include <linux/irqchip/metag-ext.h>
+#include <linux/irqdomain.h>
+#include <linux/io.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/syscore_ops.h>
+
+#include <asm/irq.h>
+#include <asm/hwthread.h>
+
+#define HWSTAT_STRIDE 8
+#define HWVEC_BLK_STRIDE 0x1000
+
+/**
+ * struct meta_intc_priv - private meta external interrupt data
+ * @nr_banks: Number of interrupt banks
+ * @domain: IRQ domain for all banks of external IRQs
+ * @unmasked: Record of unmasked IRQs
+ * @levels_altered: Record of altered level bits
+ */
+struct meta_intc_priv {
+ unsigned int nr_banks;
+ struct irq_domain *domain;
+
+ unsigned long unmasked[4];
+
+#ifdef CONFIG_METAG_SUSPEND_MEM
+ unsigned long levels_altered[4];
+#endif
+};
+
+/* Private data for the one and only external interrupt controller */
+static struct meta_intc_priv meta_intc_priv;
+
+/**
+ * meta_intc_offset() - Get the offset into the bank of a hardware IRQ number
+ * @hw: Hardware IRQ number (within external trigger block)
+ *
+ * Returns: Bit offset into the IRQ's bank registers
+ */
+static unsigned int meta_intc_offset(irq_hw_number_t hw)
+{
+ return hw & 0x1f;
+}
+
+/**
+ * meta_intc_bank() - Get the bank number of a hardware IRQ number
+ * @hw: Hardware IRQ number (within external trigger block)
+ *
+ * Returns: Bank number indicating which register the IRQ's bits are
+ */
+static unsigned int meta_intc_bank(irq_hw_number_t hw)
+{
+ return hw >> 5;
+}
+
+/**
+ * meta_intc_stat_addr() - Get the address of a HWSTATEXT register
+ * @hw: Hardware IRQ number (within external trigger block)
+ *
+ * Returns: Address of a HWSTATEXT register containing the status bit for
+ * the specified hardware IRQ number
+ */
+static void __iomem *meta_intc_stat_addr(irq_hw_number_t hw)
+{
+ return (void __iomem *)(HWSTATEXT +
+ HWSTAT_STRIDE * meta_intc_bank(hw));
+}
+
+/**
+ * meta_intc_level_addr() - Get the address of a HWLEVELEXT register
+ * @hw: Hardware IRQ number (within external trigger block)
+ *
+ * Returns: Address of a HWLEVELEXT register containing the sense bit for
+ * the specified hardware IRQ number
+ */
+static void __iomem *meta_intc_level_addr(irq_hw_number_t hw)
+{
+ return (void __iomem *)(HWLEVELEXT +
+ HWSTAT_STRIDE * meta_intc_bank(hw));
+}
+
+/**
+ * meta_intc_mask_addr() - Get the address of a HWMASKEXT register
+ * @hw: Hardware IRQ number (within external trigger block)
+ *
+ * Returns: Address of a HWMASKEXT register containing the mask bit for the
+ * specified hardware IRQ number
+ */
+static void __iomem *meta_intc_mask_addr(irq_hw_number_t hw)
+{
+ return (void __iomem *)(HWMASKEXT +
+ HWSTAT_STRIDE * meta_intc_bank(hw));
+}
+
+/**
+ * meta_intc_vec_addr() - Get the vector address of a hardware interrupt
+ * @hw: Hardware IRQ number (within external trigger block)
+ *
+ * Returns: Address of a HWVECEXT register controlling the core trigger to
+ * vector the IRQ onto
+ */
+static inline void __iomem *meta_intc_vec_addr(irq_hw_number_t hw)
+{
+ return (void __iomem *)(HWVEC0EXT +
+ HWVEC_BLK_STRIDE * meta_intc_bank(hw) +
+ HWVECnEXT_STRIDE * meta_intc_offset(hw));
+}
+
+/**
+ * meta_intc_startup_irq() - set up an external irq
+ * @data: data for the external irq to start up
+ *
+ * Multiplex interrupts for irq onto TR2. Clear any pending interrupts and
+ * unmask irq, both using the appropriate callbacks.
+ */
+static unsigned int meta_intc_startup_irq(struct irq_data *data)
+{
+ irq_hw_number_t hw = data->hwirq;
+ void __iomem *vec_addr = meta_intc_vec_addr(hw);
+ int thread = hard_processor_id();
+
+ /* Perform any necessary acking. */
+ if (data->chip->irq_ack)
+ data->chip->irq_ack(data);
+
+ /* Wire up this interrupt to the core with HWVECxEXT. */
+ metag_out32(TBI_TRIG_VEC(TBID_SIGNUM_TR2(thread)), vec_addr);
+
+ /* Perform any necessary unmasking. */
+ data->chip->irq_unmask(data);
+
+ return 0;
+}
+
+/**
+ * meta_intc_shutdown_irq() - turn off an external irq
+ * @data: data for the external irq to turn off
+ *
+ * Mask irq using the appropriate callback and stop muxing it onto TR2.
+ */
+static void meta_intc_shutdown_irq(struct irq_data *data)
+{
+ irq_hw_number_t hw = data->hwirq;
+ void __iomem *vec_addr = meta_intc_vec_addr(hw);
+
+ /* Mask the IRQ */
+ data->chip->irq_mask(data);
+
+ /*
+ * Disable the IRQ at the core by removing the interrupt from
+ * the HW vector mapping.
+ */
+ metag_out32(0, vec_addr);
+}
+
+/**
+ * meta_intc_ack_irq() - acknowledge an external irq
+ * @data: data for the external irq to ack
+ *
+ * Clear down an edge interrupt in the status register.
+ */
+static void meta_intc_ack_irq(struct irq_data *data)
+{
+ irq_hw_number_t hw = data->hwirq;
+ unsigned int bit = 1 << meta_intc_offset(hw);
+ void __iomem *stat_addr = meta_intc_stat_addr(hw);
+
+ /* Ack the int, if it is still 'on'.
+ * NOTE - this only works for edge triggered interrupts.
+ */
+ if (metag_in32(stat_addr) & bit)
+ metag_out32(bit, stat_addr);
+}
+
+/**
+ * record_irq_is_masked() - record the IRQ masked so it doesn't get handled
+ * @data: data for the external irq to record
+ *
+ * This should get called whenever an external IRQ is masked (by whichever
+ * callback is used). It records the IRQ masked so that it doesn't get handled
+ * if it still shows up in the status register.
+ */
+static void record_irq_is_masked(struct irq_data *data)
+{
+ struct meta_intc_priv *priv = &meta_intc_priv;
+ irq_hw_number_t hw = data->hwirq;
+
+ clear_bit(meta_intc_offset(hw), &priv->unmasked[meta_intc_bank(hw)]);
+}
+
+/**
+ * record_irq_is_unmasked() - record the IRQ unmasked so it can be handled
+ * @data: data for the external irq to record
+ *
+ * This should get called whenever an external IRQ is unmasked (by whichever
+ * callback is used). It records the IRQ unmasked so that it gets handled if it
+ * shows up in the status register.
+ */
+static void record_irq_is_unmasked(struct irq_data *data)
+{
+ struct meta_intc_priv *priv = &meta_intc_priv;
+ irq_hw_number_t hw = data->hwirq;
+
+ set_bit(meta_intc_offset(hw), &priv->unmasked[meta_intc_bank(hw)]);
+}
+
+/*
+ * For use by wrapper IRQ drivers
+ */
+
+/**
+ * meta_intc_mask_irq_simple() - minimal mask used by wrapper IRQ drivers
+ * @data: data for the external irq being masked
+ *
+ * This should be called by any wrapper IRQ driver mask functions. it doesn't do
+ * any masking but records the IRQ as masked so that the core code knows the
+ * mask has taken place. It is the callers responsibility to ensure that the IRQ
+ * won't trigger an interrupt to the core.
+ */
+void meta_intc_mask_irq_simple(struct irq_data *data)
+{
+ record_irq_is_masked(data);
+}
+
+/**
+ * meta_intc_unmask_irq_simple() - minimal unmask used by wrapper IRQ drivers
+ * @data: data for the external irq being unmasked
+ *
+ * This should be called by any wrapper IRQ driver unmask functions. it doesn't
+ * do any unmasking but records the IRQ as unmasked so that the core code knows
+ * the unmask has taken place. It is the callers responsibility to ensure that
+ * the IRQ can now trigger an interrupt to the core.
+ */
+void meta_intc_unmask_irq_simple(struct irq_data *data)
+{
+ record_irq_is_unmasked(data);
+}
+
+
+/**
+ * meta_intc_mask_irq() - mask an external irq using HWMASKEXT
+ * @data: data for the external irq to mask
+ *
+ * This is a default implementation of a mask function which makes use of the
+ * HWMASKEXT registers available in newer versions.
+ *
+ * Earlier versions without these registers should use SoC level IRQ masking
+ * which call the meta_intc_*_simple() functions above, or if that isn't
+ * available should use the fallback meta_intc_*_nomask() functions below.
+ */
+static void meta_intc_mask_irq(struct irq_data *data)
+{
+ irq_hw_number_t hw = data->hwirq;
+ unsigned int bit = 1 << meta_intc_offset(hw);
+ void __iomem *mask_addr = meta_intc_mask_addr(hw);
+ unsigned long flags;
+
+ record_irq_is_masked(data);
+
+ /* update the interrupt mask */
+ __global_lock2(flags);
+ metag_out32(metag_in32(mask_addr) & ~bit, mask_addr);
+ __global_unlock2(flags);
+}
+
+/**
+ * meta_intc_unmask_irq() - unmask an external irq using HWMASKEXT
+ * @data: data for the external irq to unmask
+ *
+ * This is a default implementation of an unmask function which makes use of the
+ * HWMASKEXT registers available on new versions. It should be paired with
+ * meta_intc_mask_irq() above.
+ */
+static void meta_intc_unmask_irq(struct irq_data *data)
+{
+ irq_hw_number_t hw = data->hwirq;
+ unsigned int bit = 1 << meta_intc_offset(hw);
+ void __iomem *mask_addr = meta_intc_mask_addr(hw);
+ unsigned long flags;
+
+ record_irq_is_unmasked(data);
+
+ /* update the interrupt mask */
+ __global_lock2(flags);
+ metag_out32(metag_in32(mask_addr) | bit, mask_addr);
+ __global_unlock2(flags);
+}
+
+/**
+ * meta_intc_mask_irq_nomask() - mask an external irq by unvectoring
+ * @data: data for the external irq to mask
+ *
+ * This is the version of the mask function for older versions which don't have
+ * HWMASKEXT registers, or a SoC level means of masking IRQs. Instead the IRQ is
+ * unvectored from the core and retriggered if necessary later.
+ */
+static void meta_intc_mask_irq_nomask(struct irq_data *data)
+{
+ irq_hw_number_t hw = data->hwirq;
+ void __iomem *vec_addr = meta_intc_vec_addr(hw);
+
+ record_irq_is_masked(data);
+
+ /* there is no interrupt mask, so unvector the interrupt */
+ metag_out32(0, vec_addr);
+}
+
+/**
+ * meta_intc_unmask_edge_irq_nomask() - unmask an edge irq by revectoring
+ * @data: data for the external irq to unmask
+ *
+ * This is the version of the unmask function for older versions which don't
+ * have HWMASKEXT registers, or a SoC level means of masking IRQs. Instead the
+ * IRQ is revectored back to the core and retriggered if necessary.
+ *
+ * The retriggering done by this function is specific to edge interrupts.
+ */
+static void meta_intc_unmask_edge_irq_nomask(struct irq_data *data)
+{
+ irq_hw_number_t hw = data->hwirq;
+ unsigned int bit = 1 << meta_intc_offset(hw);
+ void __iomem *stat_addr = meta_intc_stat_addr(hw);
+ void __iomem *vec_addr = meta_intc_vec_addr(hw);
+ unsigned int thread = hard_processor_id();
+
+ record_irq_is_unmasked(data);
+
+ /* there is no interrupt mask, so revector the interrupt */
+ metag_out32(TBI_TRIG_VEC(TBID_SIGNUM_TR2(thread)), vec_addr);
+
+ /*
+ * Re-trigger interrupt
+ *
+ * Writing a 1 toggles, and a 0->1 transition triggers. We only
+ * retrigger if the status bit is already set, which means we
+ * need to clear it first. Retriggering is fundamentally racy
+ * because if the interrupt fires again after we clear it we
+ * could end up clearing it again and the interrupt handler
+ * thinking it hasn't fired. Therefore we need to keep trying to
+ * retrigger until the bit is set.
+ */
+ if (metag_in32(stat_addr) & bit) {
+ metag_out32(bit, stat_addr);
+ while (!(metag_in32(stat_addr) & bit))
+ metag_out32(bit, stat_addr);
+ }
+}
+
+/**
+ * meta_intc_unmask_level_irq_nomask() - unmask a level irq by revectoring
+ * @data: data for the external irq to unmask
+ *
+ * This is the version of the unmask function for older versions which don't
+ * have HWMASKEXT registers, or a SoC level means of masking IRQs. Instead the
+ * IRQ is revectored back to the core and retriggered if necessary.
+ *
+ * The retriggering done by this function is specific to level interrupts.
+ */
+static void meta_intc_unmask_level_irq_nomask(struct irq_data *data)
+{
+ irq_hw_number_t hw = data->hwirq;
+ unsigned int bit = 1 << meta_intc_offset(hw);
+ void __iomem *stat_addr = meta_intc_stat_addr(hw);
+ void __iomem *vec_addr = meta_intc_vec_addr(hw);
+ unsigned int thread = hard_processor_id();
+
+ record_irq_is_unmasked(data);
+
+ /* there is no interrupt mask, so revector the interrupt */
+ metag_out32(TBI_TRIG_VEC(TBID_SIGNUM_TR2(thread)), vec_addr);
+
+ /* Re-trigger interrupt */
+ /* Writing a 1 triggers interrupt */
+ if (metag_in32(stat_addr) & bit)
+ metag_out32(bit, stat_addr);
+}
+
+/**
+ * meta_intc_irq_set_type() - set the type of an external irq
+ * @data: data for the external irq to set the type of
+ * @flow_type: new irq flow type
+ *
+ * Set the flow type of an external interrupt. This updates the irq chip and irq
+ * handler depending on whether the irq is edge or level sensitive (the polarity
+ * is ignored), and also sets up the bit in HWLEVELEXT so the hardware knows
+ * when to trigger.
+ */
+static int meta_intc_irq_set_type(struct irq_data *data, unsigned int flow_type)
+{
+#ifdef CONFIG_METAG_SUSPEND_MEM
+ struct meta_intc_priv *priv = &meta_intc_priv;
+#endif
+ unsigned int irq = data->irq;
+ irq_hw_number_t hw = data->hwirq;
+ unsigned int bit = 1 << meta_intc_offset(hw);
+ void __iomem *level_addr = meta_intc_level_addr(hw);
+ unsigned long flags;
+ unsigned int level;
+
+ /* update the chip/handler */
+ if (flow_type & IRQ_TYPE_LEVEL_MASK)
+ __irq_set_chip_handler_name_locked(irq, &meta_intc_level_chip,
+ handle_level_irq, NULL);
+ else
+ __irq_set_chip_handler_name_locked(irq, &meta_intc_edge_chip,
+ handle_edge_irq, NULL);
+
+ /* and clear/set the bit in HWLEVELEXT */
+ __global_lock2(flags);
+ level = metag_in32(level_addr);
+ if (flow_type & IRQ_TYPE_LEVEL_MASK)
+ level |= bit;
+ else
+ level &= ~bit;
+ metag_out32(level, level_addr);
+#ifdef CONFIG_METAG_SUSPEND_MEM
+ priv->levels_altered[meta_intc_bank(hw)] |= bit;
+#endif
+ __global_unlock2(flags);
+
+ return 0;
+}
+
+/**
+ * meta_intc_irq_demux() - external irq de-multiplexer
+ * @irq: the virtual interrupt number
+ * @desc: the interrupt description structure for this irq
+ *
+ * The cpu receives an interrupt on TR2 when a SoC interrupt has occurred. It is
+ * this function's job to demux this irq and figure out exactly which external
+ * irq needs servicing.
+ *
+ * Whilst using TR2 to detect external interrupts is a software convention it is
+ * (hopefully) unlikely to change.
+ */
+static void meta_intc_irq_demux(unsigned int irq, struct irq_desc *desc)
+{
+ struct meta_intc_priv *priv = &meta_intc_priv;
+ irq_hw_number_t hw;
+ unsigned int bank, irq_no, status;
+ void __iomem *stat_addr = meta_intc_stat_addr(0);
+
+ /*
+ * Locate which interrupt has caused our handler to run.
+ */
+ for (bank = 0; bank < priv->nr_banks; ++bank) {
+ /* Which interrupts are currently pending in this bank? */
+recalculate:
+ status = metag_in32(stat_addr) & priv->unmasked[bank];
+
+ for (hw = bank*32; status; status >>= 1, ++hw) {
+ if (status & 0x1) {
+ /*
+ * Map the hardware IRQ number to a virtual
+ * Linux IRQ number.
+ */
+ irq_no = irq_linear_revmap(priv->domain, hw);
+
+ /*
+ * Only fire off external interrupts that are
+ * registered to be handled by the kernel.
+ * Other external interrupts are probably being
+ * handled by other Meta hardware threads.
+ */
+ generic_handle_irq(irq_no);
+
+ /*
+ * The handler may have re-enabled interrupts
+ * which could have caused a nested invocation
+ * of this code and make the copy of the
+ * status register we are using invalid.
+ */
+ goto recalculate;
+ }
+ }
+ stat_addr += HWSTAT_STRIDE;
+ }
+}
+
+#ifdef CONFIG_SMP
+/**
+ * meta_intc_set_affinity() - set the affinity for an interrupt
+ * @data: data for the external irq to set the affinity of
+ * @cpumask: cpu mask representing cpus which can handle the interrupt
+ * @force: whether to force (ignored)
+ *
+ * Revector the specified external irq onto a specific cpu's TR2 trigger, so
+ * that that cpu tends to be the one who handles it.
+ */
+static int meta_intc_set_affinity(struct irq_data *data,
+ const struct cpumask *cpumask, bool force)
+{
+ irq_hw_number_t hw = data->hwirq;
+ void __iomem *vec_addr = meta_intc_vec_addr(hw);
+ unsigned int cpu, thread;
+
+ /*
+ * Wire up this interrupt from HWVECxEXT to the Meta core.
+ *
+ * Note that we can't wire up HWVECxEXT to interrupt more than
+ * one cpu (the interrupt code doesn't support it), so we just
+ * pick the first cpu we find in 'cpumask'.
+ */
+ cpu = cpumask_any(cpumask);
+ thread = cpu_2_hwthread_id[cpu];
+
+ metag_out32(TBI_TRIG_VEC(TBID_SIGNUM_TR2(thread)), vec_addr);
+
+ return 0;
+}
+#else
+#define meta_intc_set_affinity NULL
+#endif
+
+#ifdef CONFIG_PM_SLEEP
+#define META_INTC_CHIP_FLAGS (IRQCHIP_MASK_ON_SUSPEND \
+ | IRQCHIP_SKIP_SET_WAKE)
+#else
+#define META_INTC_CHIP_FLAGS 0
+#endif
+
+/* public edge/level irq chips which SoCs can override */
+
+struct irq_chip meta_intc_edge_chip = {
+ .irq_startup = meta_intc_startup_irq,
+ .irq_shutdown = meta_intc_shutdown_irq,
+ .irq_ack = meta_intc_ack_irq,
+ .irq_mask = meta_intc_mask_irq,
+ .irq_unmask = meta_intc_unmask_irq,
+ .irq_set_type = meta_intc_irq_set_type,
+ .irq_set_affinity = meta_intc_set_affinity,
+ .flags = META_INTC_CHIP_FLAGS,
+};
+
+struct irq_chip meta_intc_level_chip = {
+ .irq_startup = meta_intc_startup_irq,
+ .irq_shutdown = meta_intc_shutdown_irq,
+ .irq_set_type = meta_intc_irq_set_type,
+ .irq_mask = meta_intc_mask_irq,
+ .irq_unmask = meta_intc_unmask_irq,
+ .irq_set_affinity = meta_intc_set_affinity,
+ .flags = META_INTC_CHIP_FLAGS,
+};
+
+/**
+ * meta_intc_map() - map an external irq
+ * @d: irq domain of external trigger block
+ * @irq: virtual irq number
+ * @hw: hardware irq number within external trigger block
+ *
+ * This sets up a virtual irq for a specified hardware interrupt. The irq chip
+ * and handler is configured, using the HWLEVELEXT registers to determine
+ * edge/level flow type. These registers will have been set when the irq type is
+ * set (or set to a default at init time).
+ */
+static int meta_intc_map(struct irq_domain *d, unsigned int irq,
+ irq_hw_number_t hw)
+{
+ unsigned int bit = 1 << meta_intc_offset(hw);
+ void __iomem *level_addr = meta_intc_level_addr(hw);
+
+ /* Go by the current sense in the HWLEVELEXT register */
+ if (metag_in32(level_addr) & bit)
+ irq_set_chip_and_handler(irq, &meta_intc_level_chip,
+ handle_level_irq);
+ else
+ irq_set_chip_and_handler(irq, &meta_intc_edge_chip,
+ handle_edge_irq);
+ return 0;
+}
+
+static const struct irq_domain_ops meta_intc_domain_ops = {
+ .map = meta_intc_map,
+ .xlate = irq_domain_xlate_twocell,
+};
+
+#ifdef CONFIG_METAG_SUSPEND_MEM
+
+/**
+ * struct meta_intc_context - suspend context
+ * @levels: State of HWLEVELEXT registers
+ * @masks: State of HWMASKEXT registers
+ * @vectors: State of HWVECEXT registers
+ * @txvecint: State of TxVECINT registers
+ *
+ * This structure stores the IRQ state across suspend.
+ */
+struct meta_intc_context {
+ u32 levels[4];
+ u32 masks[4];
+ u8 vectors[4*32];
+
+ u8 txvecint[4][4];
+};
+
+/* suspend context */
+static struct meta_intc_context *meta_intc_context;
+
+/**
+ * meta_intc_suspend() - store irq state
+ *
+ * To avoid interfering with other threads we only save the IRQ state of IRQs in
+ * use by Linux.
+ */
+static int meta_intc_suspend(void)
+{
+ struct meta_intc_priv *priv = &meta_intc_priv;
+ int i, j;
+ irq_hw_number_t hw;
+ unsigned int bank;
+ unsigned long flags;
+ struct meta_intc_context *context;
+ void __iomem *level_addr, *mask_addr, *vec_addr;
+ u32 mask, bit;
+
+ context = kzalloc(sizeof(*context), GFP_ATOMIC);
+ if (!context)
+ return -ENOMEM;
+
+ hw = 0;
+ level_addr = meta_intc_level_addr(0);
+ mask_addr = meta_intc_mask_addr(0);
+ for (bank = 0; bank < priv->nr_banks; ++bank) {
+ vec_addr = meta_intc_vec_addr(hw);
+
+ /* create mask of interrupts in use */
+ mask = 0;
+ for (bit = 1; bit; bit <<= 1) {
+ i = irq_linear_revmap(priv->domain, hw);
+ /* save mapped irqs which are enabled or have actions */
+ if (i && (!irqd_irq_disabled(irq_get_irq_data(i)) ||
+ irq_has_action(i))) {
+ mask |= bit;
+
+ /* save trigger vector */
+ context->vectors[hw] = metag_in32(vec_addr);
+ }
+
+ ++hw;
+ vec_addr += HWVECnEXT_STRIDE;
+ }
+
+ /* save level state if any IRQ levels altered */
+ if (priv->levels_altered[bank])
+ context->levels[bank] = metag_in32(level_addr);
+ /* save mask state if any IRQs in use */
+ if (mask)
+ context->masks[bank] = metag_in32(mask_addr);
+
+ level_addr += HWSTAT_STRIDE;
+ mask_addr += HWSTAT_STRIDE;
+ }
+
+ /* save trigger matrixing */
+ __global_lock2(flags);
+ for (i = 0; i < 4; ++i)
+ for (j = 0; j < 4; ++j)
+ context->txvecint[i][j] = metag_in32(T0VECINT_BHALT +
+ TnVECINT_STRIDE*i +
+ 8*j);
+ __global_unlock2(flags);
+
+ meta_intc_context = context;
+ return 0;
+}
+
+/**
+ * meta_intc_resume() - restore saved irq state
+ *
+ * Restore the saved IRQ state and drop it.
+ */
+static void meta_intc_resume(void)
+{
+ struct meta_intc_priv *priv = &meta_intc_priv;
+ int i, j;
+ irq_hw_number_t hw;
+ unsigned int bank;
+ unsigned long flags;
+ struct meta_intc_context *context = meta_intc_context;
+ void __iomem *level_addr, *mask_addr, *vec_addr;
+ u32 mask, bit, tmp;
+
+ meta_intc_context = NULL;
+
+ hw = 0;
+ level_addr = meta_intc_level_addr(0);
+ mask_addr = meta_intc_mask_addr(0);
+ for (bank = 0; bank < priv->nr_banks; ++bank) {
+ vec_addr = meta_intc_vec_addr(hw);
+
+ /* create mask of interrupts in use */
+ mask = 0;
+ for (bit = 1; bit; bit <<= 1) {
+ i = irq_linear_revmap(priv->domain, hw);
+ /* restore mapped irqs, enabled or with actions */
+ if (i && (!irqd_irq_disabled(irq_get_irq_data(i)) ||
+ irq_has_action(i))) {
+ mask |= bit;
+
+ /* restore trigger vector */
+ metag_out32(context->vectors[hw], vec_addr);
+ }
+
+ ++hw;
+ vec_addr += HWVECnEXT_STRIDE;
+ }
+
+ if (mask) {
+ /* restore mask state */
+ __global_lock2(flags);
+ tmp = metag_in32(mask_addr);
+ tmp = (tmp & ~mask) | (context->masks[bank] & mask);
+ metag_out32(tmp, mask_addr);
+ __global_unlock2(flags);
+ }
+
+ mask = priv->levels_altered[bank];
+ if (mask) {
+ /* restore level state */
+ __global_lock2(flags);
+ tmp = metag_in32(level_addr);
+ tmp = (tmp & ~mask) | (context->levels[bank] & mask);
+ metag_out32(tmp, level_addr);
+ __global_unlock2(flags);
+ }
+
+ level_addr += HWSTAT_STRIDE;
+ mask_addr += HWSTAT_STRIDE;
+ }
+
+ /* restore trigger matrixing */
+ __global_lock2(flags);
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) {
+ metag_out32(context->txvecint[i][j],
+ T0VECINT_BHALT +
+ TnVECINT_STRIDE*i +
+ 8*j);
+ }
+ }
+ __global_unlock2(flags);
+
+ kfree(context);
+}
+
+static struct syscore_ops meta_intc_syscore_ops = {
+ .suspend = meta_intc_suspend,
+ .resume = meta_intc_resume,
+};
+
+static void __init meta_intc_init_syscore_ops(struct meta_intc_priv *priv)
+{
+ register_syscore_ops(&meta_intc_syscore_ops);
+}
+#else
+#define meta_intc_init_syscore_ops(priv) do {} while (0)
+#endif
+
+/**
+ * meta_intc_init_cpu() - register with a Meta cpu
+ * @priv: private interrupt controller data
+ * @cpu: the CPU to register on
+ *
+ * Configure @cpu's TR2 irq so that we can demux external irqs.
+ */
+static void __init meta_intc_init_cpu(struct meta_intc_priv *priv, int cpu)
+{
+ unsigned int thread = cpu_2_hwthread_id[cpu];
+ unsigned int signum = TBID_SIGNUM_TR2(thread);
+ int irq = tbisig_map(signum);
+
+ /* Register the multiplexed IRQ handler */
+ irq_set_chained_handler(irq, meta_intc_irq_demux);
+ irq_set_irq_type(irq, IRQ_TYPE_LEVEL_LOW);
+}
+
+/**
+ * meta_intc_no_mask() - indicate lack of HWMASKEXT registers
+ *
+ * Called from SoC code (or init code below) to dynamically indicate the lack of
+ * HWMASKEXT registers (for example depending on some SoC revision register).
+ * This alters the irq mask and unmask callbacks to use the fallback
+ * unvectoring/retriggering technique instead of using HWMASKEXT registers.
+ */
+void __init meta_intc_no_mask(void)
+{
+ meta_intc_edge_chip.irq_mask = meta_intc_mask_irq_nomask;
+ meta_intc_edge_chip.irq_unmask = meta_intc_unmask_edge_irq_nomask;
+ meta_intc_level_chip.irq_mask = meta_intc_mask_irq_nomask;
+ meta_intc_level_chip.irq_unmask = meta_intc_unmask_level_irq_nomask;
+}
+
+/**
+ * init_external_IRQ() - initialise the external irq controller
+ *
+ * Set up the external irq controller using device tree properties. This is
+ * called from init_IRQ().
+ */
+int __init init_external_IRQ(void)
+{
+ struct meta_intc_priv *priv = &meta_intc_priv;
+ struct device_node *node;
+ int ret, cpu;
+ u32 val;
+ bool no_masks = false;
+
+ node = of_find_compatible_node(NULL, NULL, "img,meta-intc");
+ if (!node)
+ return -ENOENT;
+
+ /* Get number of banks */
+ ret = of_property_read_u32(node, "num-banks", &val);
+ if (ret) {
+ pr_err("meta-intc: No num-banks property found\n");
+ return ret;
+ }
+ if (val < 1 || val > 4) {
+ pr_err("meta-intc: num-banks (%u) out of range\n", val);
+ return -EINVAL;
+ }
+ priv->nr_banks = val;
+
+ /* Are any mask registers present? */
+ if (of_get_property(node, "no-mask", NULL))
+ no_masks = true;
+
+ /* No HWMASKEXT registers present? */
+ if (no_masks)
+ meta_intc_no_mask();
+
+ /* Set up an IRQ domain */
+ /*
+ * This is a legacy IRQ domain for now until all the platform setup code
+ * has been converted to devicetree.
+ */
+ priv->domain = irq_domain_add_linear(node, priv->nr_banks*32,
+ &meta_intc_domain_ops, priv);
+ if (unlikely(!priv->domain)) {
+ pr_err("meta-intc: cannot add IRQ domain\n");
+ return -ENOMEM;
+ }
+
+ /* Setup TR2 for all cpus. */
+ for_each_possible_cpu(cpu)
+ meta_intc_init_cpu(priv, cpu);
+
+ /* Set up system suspend/resume callbacks */
+ meta_intc_init_syscore_ops(priv);
+
+ pr_info("meta-intc: External IRQ controller initialised (%u IRQs)\n",
+ priv->nr_banks*32);
+
+ return 0;
+}
diff --git a/drivers/irqchip/irq-metag.c b/drivers/irqchip/irq-metag.c
new file mode 100644
index 00000000000..8e94d7a3b20
--- /dev/null
+++ b/drivers/irqchip/irq-metag.c
@@ -0,0 +1,343 @@
+/*
+ * Meta internal (HWSTATMETA) interrupt code.
+ *
+ * Copyright (C) 2011-2012 Imagination Technologies Ltd.
+ *
+ * This code is based on the code in SoC/common/irq.c and SoC/comet/irq.c
+ * The code base could be generalised/merged as a lot of the functionality is
+ * similar. Until this is done, we try to keep the code simple here.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/irqdomain.h>
+
+#include <asm/irq.h>
+#include <asm/hwthread.h>
+
+#define PERF0VECINT 0x04820580
+#define PERF1VECINT 0x04820588
+#define PERF0TRIG_OFFSET 16
+#define PERF1TRIG_OFFSET 17
+
+/**
+ * struct metag_internal_irq_priv - private meta internal interrupt data
+ * @domain: IRQ domain for all internal Meta IRQs (HWSTATMETA)
+ * @unmasked: Record of unmasked IRQs
+ */
+struct metag_internal_irq_priv {
+ struct irq_domain *domain;
+
+ unsigned long unmasked;
+};
+
+/* Private data for the one and only internal interrupt controller */
+static struct metag_internal_irq_priv metag_internal_irq_priv;
+
+static unsigned int metag_internal_irq_startup(struct irq_data *data);
+static void metag_internal_irq_shutdown(struct irq_data *data);
+static void metag_internal_irq_ack(struct irq_data *data);
+static void metag_internal_irq_mask(struct irq_data *data);
+static void metag_internal_irq_unmask(struct irq_data *data);
+#ifdef CONFIG_SMP
+static int metag_internal_irq_set_affinity(struct irq_data *data,
+ const struct cpumask *cpumask, bool force);
+#endif
+
+static struct irq_chip internal_irq_edge_chip = {
+ .name = "HWSTATMETA-IRQ",
+ .irq_startup = metag_internal_irq_startup,
+ .irq_shutdown = metag_internal_irq_shutdown,
+ .irq_ack = metag_internal_irq_ack,
+ .irq_mask = metag_internal_irq_mask,
+ .irq_unmask = metag_internal_irq_unmask,
+#ifdef CONFIG_SMP
+ .irq_set_affinity = metag_internal_irq_set_affinity,
+#endif
+};
+
+/*
+ * metag_hwvec_addr - get the address of *VECINT regs of irq
+ *
+ * This function is a table of supported triggers on HWSTATMETA
+ * Could do with a structure, but better keep it simple. Changes
+ * in this code should be rare.
+ */
+static inline void __iomem *metag_hwvec_addr(irq_hw_number_t hw)
+{
+ void __iomem *addr;
+
+ switch (hw) {
+ case PERF0TRIG_OFFSET:
+ addr = (void __iomem *)PERF0VECINT;
+ break;
+ case PERF1TRIG_OFFSET:
+ addr = (void __iomem *)PERF1VECINT;
+ break;
+ default:
+ addr = NULL;
+ break;
+ }
+ return addr;
+}
+
+/*
+ * metag_internal_startup - setup an internal irq
+ * @irq: the irq to startup
+ *
+ * Multiplex interrupts for @irq onto TR1. Clear any pending
+ * interrupts.
+ */
+static unsigned int metag_internal_irq_startup(struct irq_data *data)
+{
+ /* Clear (toggle) the bit in HWSTATMETA for our interrupt. */
+ metag_internal_irq_ack(data);
+
+ /* Enable the interrupt by unmasking it */
+ metag_internal_irq_unmask(data);
+
+ return 0;
+}
+
+/*
+ * metag_internal_irq_shutdown - turn off the irq
+ * @irq: the irq number to turn off
+ *
+ * Mask @irq and clear any pending interrupts.
+ * Stop muxing @irq onto TR1.
+ */
+static void metag_internal_irq_shutdown(struct irq_data *data)
+{
+ /* Disable the IRQ at the core by masking it. */
+ metag_internal_irq_mask(data);
+
+ /* Clear (toggle) the bit in HWSTATMETA for our interrupt. */
+ metag_internal_irq_ack(data);
+}
+
+/*
+ * metag_internal_irq_ack - acknowledge irq
+ * @irq: the irq to ack
+ */
+static void metag_internal_irq_ack(struct irq_data *data)
+{
+ irq_hw_number_t hw = data->hwirq;
+ unsigned int bit = 1 << hw;
+
+ if (metag_in32(HWSTATMETA) & bit)
+ metag_out32(bit, HWSTATMETA);
+}
+
+/**
+ * metag_internal_irq_mask() - mask an internal irq by unvectoring
+ * @data: data for the internal irq to mask
+ *
+ * HWSTATMETA has no mask register. Instead the IRQ is unvectored from the core
+ * and retriggered if necessary later.
+ */
+static void metag_internal_irq_mask(struct irq_data *data)
+{
+ struct metag_internal_irq_priv *priv = &metag_internal_irq_priv;
+ irq_hw_number_t hw = data->hwirq;
+ void __iomem *vec_addr = metag_hwvec_addr(hw);
+
+ clear_bit(hw, &priv->unmasked);
+
+ /* there is no interrupt mask, so unvector the interrupt */
+ metag_out32(0, vec_addr);
+}
+
+/**
+ * meta_intc_unmask_edge_irq_nomask() - unmask an edge irq by revectoring
+ * @data: data for the internal irq to unmask
+ *
+ * HWSTATMETA has no mask register. Instead the IRQ is revectored back to the
+ * core and retriggered if necessary.
+ */
+static void metag_internal_irq_unmask(struct irq_data *data)
+{
+ struct metag_internal_irq_priv *priv = &metag_internal_irq_priv;
+ irq_hw_number_t hw = data->hwirq;
+ unsigned int bit = 1 << hw;
+ void __iomem *vec_addr = metag_hwvec_addr(hw);
+ unsigned int thread = hard_processor_id();
+
+ set_bit(hw, &priv->unmasked);
+
+ /* there is no interrupt mask, so revector the interrupt */
+ metag_out32(TBI_TRIG_VEC(TBID_SIGNUM_TR1(thread)), vec_addr);
+
+ /*
+ * Re-trigger interrupt
+ *
+ * Writing a 1 toggles, and a 0->1 transition triggers. We only
+ * retrigger if the status bit is already set, which means we
+ * need to clear it first. Retriggering is fundamentally racy
+ * because if the interrupt fires again after we clear it we
+ * could end up clearing it again and the interrupt handler
+ * thinking it hasn't fired. Therefore we need to keep trying to
+ * retrigger until the bit is set.
+ */
+ if (metag_in32(HWSTATMETA) & bit) {
+ metag_out32(bit, HWSTATMETA);
+ while (!(metag_in32(HWSTATMETA) & bit))
+ metag_out32(bit, HWSTATMETA);
+ }
+}
+
+#ifdef CONFIG_SMP
+/*
+ * metag_internal_irq_set_affinity - set the affinity for an interrupt
+ */
+static int metag_internal_irq_set_affinity(struct irq_data *data,
+ const struct cpumask *cpumask, bool force)
+{
+ unsigned int cpu, thread;
+ irq_hw_number_t hw = data->hwirq;
+ /*
+ * Wire up this interrupt from *VECINT to the Meta core.
+ *
+ * Note that we can't wire up *VECINT to interrupt more than
+ * one cpu (the interrupt code doesn't support it), so we just
+ * pick the first cpu we find in 'cpumask'.
+ */
+ cpu = cpumask_any(cpumask);
+ thread = cpu_2_hwthread_id[cpu];
+
+ metag_out32(TBI_TRIG_VEC(TBID_SIGNUM_TR1(thread)),
+ metag_hwvec_addr(hw));
+
+ return 0;
+}
+#endif
+
+/*
+ * metag_internal_irq_demux - irq de-multiplexer
+ * @irq: the interrupt number
+ * @desc: the interrupt description structure for this irq
+ *
+ * The cpu receives an interrupt on TR1 when an interrupt has
+ * occurred. It is this function's job to demux this irq and
+ * figure out exactly which trigger needs servicing.
+ */
+static void metag_internal_irq_demux(unsigned int irq, struct irq_desc *desc)
+{
+ struct metag_internal_irq_priv *priv = irq_desc_get_handler_data(desc);
+ irq_hw_number_t hw;
+ unsigned int irq_no;
+ u32 status;
+
+recalculate:
+ status = metag_in32(HWSTATMETA) & priv->unmasked;
+
+ for (hw = 0; status != 0; status >>= 1, ++hw) {
+ if (status & 0x1) {
+ /*
+ * Map the hardware IRQ number to a virtual Linux IRQ
+ * number.
+ */
+ irq_no = irq_linear_revmap(priv->domain, hw);
+
+ /*
+ * Only fire off interrupts that are
+ * registered to be handled by the kernel.
+ * Other interrupts are probably being
+ * handled by other Meta hardware threads.
+ */
+ generic_handle_irq(irq_no);
+
+ /*
+ * The handler may have re-enabled interrupts
+ * which could have caused a nested invocation
+ * of this code and make the copy of the
+ * status register we are using invalid.
+ */
+ goto recalculate;
+ }
+ }
+}
+
+/**
+ * internal_irq_map() - Map an internal meta IRQ to a virtual IRQ number.
+ * @hw: Number of the internal IRQ. Must be in range.
+ *
+ * Returns: The virtual IRQ number of the Meta internal IRQ specified by
+ * @hw.
+ */
+int internal_irq_map(unsigned int hw)
+{
+ struct metag_internal_irq_priv *priv = &metag_internal_irq_priv;
+ if (!priv->domain)
+ return -ENODEV;
+ return irq_create_mapping(priv->domain, hw);
+}
+
+/**
+ * metag_internal_irq_init_cpu - regsister with the Meta cpu
+ * @cpu: the CPU to register on
+ *
+ * Configure @cpu's TR1 irq so that we can demux irqs.
+ */
+static void metag_internal_irq_init_cpu(struct metag_internal_irq_priv *priv,
+ int cpu)
+{
+ unsigned int thread = cpu_2_hwthread_id[cpu];
+ unsigned int signum = TBID_SIGNUM_TR1(thread);
+ int irq = tbisig_map(signum);
+
+ /* Register the multiplexed IRQ handler */
+ irq_set_handler_data(irq, priv);
+ irq_set_chained_handler(irq, metag_internal_irq_demux);
+ irq_set_irq_type(irq, IRQ_TYPE_LEVEL_LOW);
+}
+
+/**
+ * metag_internal_intc_map() - map an internal irq
+ * @d: irq domain of internal trigger block
+ * @irq: virtual irq number
+ * @hw: hardware irq number within internal trigger block
+ *
+ * This sets up a virtual irq for a specified hardware interrupt. The irq chip
+ * and handler is configured.
+ */
+static int metag_internal_intc_map(struct irq_domain *d, unsigned int irq,
+ irq_hw_number_t hw)
+{
+ /* only register interrupt if it is mapped */
+ if (!metag_hwvec_addr(hw))
+ return -EINVAL;
+
+ irq_set_chip_and_handler(irq, &internal_irq_edge_chip,
+ handle_edge_irq);
+ return 0;
+}
+
+static const struct irq_domain_ops metag_internal_intc_domain_ops = {
+ .map = metag_internal_intc_map,
+};
+
+/**
+ * metag_internal_irq_register - register internal IRQs
+ *
+ * Register the irq chip and handler function for all internal IRQs
+ */
+int __init init_internal_IRQ(void)
+{
+ struct metag_internal_irq_priv *priv = &metag_internal_irq_priv;
+ unsigned int cpu;
+
+ /* Set up an IRQ domain */
+ priv->domain = irq_domain_add_linear(NULL, 32,
+ &metag_internal_intc_domain_ops,
+ priv);
+ if (unlikely(!priv->domain)) {
+ pr_err("meta-internal-intc: cannot add IRQ domain\n");
+ return -ENOMEM;
+ }
+
+ /* Setup TR1 for all cpus. */
+ for_each_possible_cpu(cpu)
+ metag_internal_irq_init_cpu(priv, cpu);
+
+ return 0;
+};
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 91a02eeeb31..4d8d90b4fe7 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -154,17 +154,6 @@ config MD_RAID456
If unsure, say Y.
-config MULTICORE_RAID456
- bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)"
- depends on MD_RAID456
- depends on SMP
- depends on EXPERIMENTAL
- ---help---
- Enable the raid456 module to dispatch per-stripe raid operations to a
- thread pool.
-
- If unsure, say N.
-
config MD_MULTIPATH
tristate "Multipath I/O support"
depends on BLK_DEV_MD
@@ -210,7 +199,7 @@ config DM_DEBUG
config DM_BUFIO
tristate
- depends on BLK_DEV_DM && EXPERIMENTAL
+ depends on BLK_DEV_DM
---help---
This interface allows you to do buffered I/O on a device and acts
as a cache, holding recently-read blocks in memory and performing
@@ -218,7 +207,7 @@ config DM_BUFIO
config DM_BIO_PRISON
tristate
- depends on BLK_DEV_DM && EXPERIMENTAL
+ depends on BLK_DEV_DM
---help---
Some bio locking schemes used by other device-mapper targets
including thin provisioning.
@@ -251,8 +240,8 @@ config DM_SNAPSHOT
Allow volume managers to take writable snapshots of a device.
config DM_THIN_PROVISIONING
- tristate "Thin provisioning target (EXPERIMENTAL)"
- depends on BLK_DEV_DM && EXPERIMENTAL
+ tristate "Thin provisioning target"
+ depends on BLK_DEV_DM
select DM_PERSISTENT_DATA
select DM_BIO_PRISON
---help---
@@ -268,6 +257,37 @@ config DM_DEBUG_BLOCK_STACK_TRACING
If unsure, say N.
+config DM_CACHE
+ tristate "Cache target (EXPERIMENTAL)"
+ depends on BLK_DEV_DM
+ default n
+ select DM_PERSISTENT_DATA
+ select DM_BIO_PRISON
+ ---help---
+ dm-cache attempts to improve performance of a block device by
+ moving frequently used data to a smaller, higher performance
+ device. Different 'policy' plugins can be used to change the
+ algorithms used to select which blocks are promoted, demoted,
+ cleaned etc. It supports writeback and writethrough modes.
+
+config DM_CACHE_MQ
+ tristate "MQ Cache Policy (EXPERIMENTAL)"
+ depends on DM_CACHE
+ default y
+ ---help---
+ A cache policy that uses a multiqueue ordered by recent hit
+ count to select which blocks should be promoted and demoted.
+ This is meant to be a general purpose policy. It prioritises
+ reads over writes.
+
+config DM_CACHE_CLEANER
+ tristate "Cleaner Cache Policy (EXPERIMENTAL)"
+ depends on DM_CACHE
+ default y
+ ---help---
+ A simple cache policy that writes back all data to the
+ origin. Used when decommissioning a dm-cache.
+
config DM_MIRROR
tristate "Mirror target"
depends on BLK_DEV_DM
@@ -302,8 +322,8 @@ config DM_RAID
in one of the available parity distribution methods.
config DM_LOG_USERSPACE
- tristate "Mirror userspace logging (EXPERIMENTAL)"
- depends on DM_MIRROR && EXPERIMENTAL && NET
+ tristate "Mirror userspace logging"
+ depends on DM_MIRROR && NET
select CONNECTOR
---help---
The userspace logging module provides a mechanism for
@@ -350,8 +370,8 @@ config DM_MULTIPATH_ST
If unsure, say N.
config DM_DELAY
- tristate "I/O delaying target (EXPERIMENTAL)"
- depends on BLK_DEV_DM && EXPERIMENTAL
+ tristate "I/O delaying target"
+ depends on BLK_DEV_DM
---help---
A target that delays reads and/or writes and can send
them to different devices. Useful for testing.
@@ -365,14 +385,14 @@ config DM_UEVENT
Generate udev events for DM events.
config DM_FLAKEY
- tristate "Flakey target (EXPERIMENTAL)"
- depends on BLK_DEV_DM && EXPERIMENTAL
+ tristate "Flakey target"
+ depends on BLK_DEV_DM
---help---
A target that intermittently fails I/O for debugging purposes.
config DM_VERITY
- tristate "Verity target support (EXPERIMENTAL)"
- depends on BLK_DEV_DM && EXPERIMENTAL
+ tristate "Verity target support"
+ depends on BLK_DEV_DM
select CRYPTO
select CRYPTO_HASH
select DM_BUFIO
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 94dce8b4932..7ceeaefc0e9 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -11,6 +11,9 @@ dm-mirror-y += dm-raid1.o
dm-log-userspace-y \
+= dm-log-userspace-base.o dm-log-userspace-transfer.o
dm-thin-pool-y += dm-thin.o dm-thin-metadata.o
+dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
+dm-cache-mq-y += dm-cache-policy-mq.o
+dm-cache-cleaner-y += dm-cache-policy-cleaner.o
md-mod-y += md.o bitmap.o
raid456-y += raid5.o
@@ -44,6 +47,9 @@ obj-$(CONFIG_DM_ZERO) += dm-zero.o
obj-$(CONFIG_DM_RAID) += dm-raid.o
obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
obj-$(CONFIG_DM_VERITY) += dm-verity.o
+obj-$(CONFIG_DM_CACHE) += dm-cache.o
+obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o
+obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
index d9d3f1c7b66..85f0b707425 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
@@ -14,14 +14,6 @@
/*----------------------------------------------------------------*/
-struct dm_bio_prison_cell {
- struct hlist_node list;
- struct dm_bio_prison *prison;
- struct dm_cell_key key;
- struct bio *holder;
- struct bio_list bios;
-};
-
struct dm_bio_prison {
spinlock_t lock;
mempool_t *cell_pool;
@@ -87,6 +79,19 @@ void dm_bio_prison_destroy(struct dm_bio_prison *prison)
}
EXPORT_SYMBOL_GPL(dm_bio_prison_destroy);
+struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison, gfp_t gfp)
+{
+ return mempool_alloc(prison->cell_pool, gfp);
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell);
+
+void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell)
+{
+ mempool_free(cell, prison->cell_pool);
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell);
+
static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key)
{
const unsigned long BIG_PRIME = 4294967291UL;
@@ -114,91 +119,95 @@ static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket,
return NULL;
}
-/*
- * This may block if a new cell needs allocating. You must ensure that
- * cells will be unlocked even if the calling thread is blocked.
- *
- * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
- */
-int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key,
- struct bio *inmate, struct dm_bio_prison_cell **ref)
+static void __setup_new_cell(struct dm_bio_prison *prison,
+ struct dm_cell_key *key,
+ struct bio *holder,
+ uint32_t hash,
+ struct dm_bio_prison_cell *cell)
{
- int r = 1;
- unsigned long flags;
- uint32_t hash = hash_key(prison, key);
- struct dm_bio_prison_cell *cell, *cell2;
-
- BUG_ON(hash > prison->nr_buckets);
-
- spin_lock_irqsave(&prison->lock, flags);
-
- cell = __search_bucket(prison->cells + hash, key);
- if (cell) {
- bio_list_add(&cell->bios, inmate);
- goto out;
- }
+ memcpy(&cell->key, key, sizeof(cell->key));
+ cell->holder = holder;
+ bio_list_init(&cell->bios);
+ hlist_add_head(&cell->list, prison->cells + hash);
+}
- /*
- * Allocate a new cell
- */
- spin_unlock_irqrestore(&prison->lock, flags);
- cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
- spin_lock_irqsave(&prison->lock, flags);
+static int __bio_detain(struct dm_bio_prison *prison,
+ struct dm_cell_key *key,
+ struct bio *inmate,
+ struct dm_bio_prison_cell *cell_prealloc,
+ struct dm_bio_prison_cell **cell_result)
+{
+ uint32_t hash = hash_key(prison, key);
+ struct dm_bio_prison_cell *cell;
- /*
- * We've been unlocked, so we have to double check that
- * nobody else has inserted this cell in the meantime.
- */
cell = __search_bucket(prison->cells + hash, key);
if (cell) {
- mempool_free(cell2, prison->cell_pool);
- bio_list_add(&cell->bios, inmate);
- goto out;
+ if (inmate)
+ bio_list_add(&cell->bios, inmate);
+ *cell_result = cell;
+ return 1;
}
- /*
- * Use new cell.
- */
- cell = cell2;
-
- cell->prison = prison;
- memcpy(&cell->key, key, sizeof(cell->key));
- cell->holder = inmate;
- bio_list_init(&cell->bios);
- hlist_add_head(&cell->list, prison->cells + hash);
+ __setup_new_cell(prison, key, inmate, hash, cell_prealloc);
+ *cell_result = cell_prealloc;
+ return 0;
+}
- r = 0;
+static int bio_detain(struct dm_bio_prison *prison,
+ struct dm_cell_key *key,
+ struct bio *inmate,
+ struct dm_bio_prison_cell *cell_prealloc,
+ struct dm_bio_prison_cell **cell_result)
+{
+ int r;
+ unsigned long flags;
-out:
+ spin_lock_irqsave(&prison->lock, flags);
+ r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result);
spin_unlock_irqrestore(&prison->lock, flags);
- *ref = cell;
-
return r;
}
+
+int dm_bio_detain(struct dm_bio_prison *prison,
+ struct dm_cell_key *key,
+ struct bio *inmate,
+ struct dm_bio_prison_cell *cell_prealloc,
+ struct dm_bio_prison_cell **cell_result)
+{
+ return bio_detain(prison, key, inmate, cell_prealloc, cell_result);
+}
EXPORT_SYMBOL_GPL(dm_bio_detain);
+int dm_get_cell(struct dm_bio_prison *prison,
+ struct dm_cell_key *key,
+ struct dm_bio_prison_cell *cell_prealloc,
+ struct dm_bio_prison_cell **cell_result)
+{
+ return bio_detain(prison, key, NULL, cell_prealloc, cell_result);
+}
+EXPORT_SYMBOL_GPL(dm_get_cell);
+
/*
* @inmates must have been initialised prior to this call
*/
-static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
+static void __cell_release(struct dm_bio_prison_cell *cell,
+ struct bio_list *inmates)
{
- struct dm_bio_prison *prison = cell->prison;
-
hlist_del(&cell->list);
if (inmates) {
- bio_list_add(inmates, cell->holder);
+ if (cell->holder)
+ bio_list_add(inmates, cell->holder);
bio_list_merge(inmates, &cell->bios);
}
-
- mempool_free(cell, prison->cell_pool);
}
-void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios)
+void dm_cell_release(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell,
+ struct bio_list *bios)
{
unsigned long flags;
- struct dm_bio_prison *prison = cell->prison;
spin_lock_irqsave(&prison->lock, flags);
__cell_release(cell, bios);
@@ -209,20 +218,18 @@ EXPORT_SYMBOL_GPL(dm_cell_release);
/*
* Sometimes we don't want the holder, just the additional bios.
*/
-static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
+static void __cell_release_no_holder(struct dm_bio_prison_cell *cell,
+ struct bio_list *inmates)
{
- struct dm_bio_prison *prison = cell->prison;
-
hlist_del(&cell->list);
bio_list_merge(inmates, &cell->bios);
-
- mempool_free(cell, prison->cell_pool);
}
-void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
+void dm_cell_release_no_holder(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell,
+ struct bio_list *inmates)
{
unsigned long flags;
- struct dm_bio_prison *prison = cell->prison;
spin_lock_irqsave(&prison->lock, flags);
__cell_release_no_holder(cell, inmates);
@@ -230,9 +237,9 @@ void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list
}
EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
-void dm_cell_error(struct dm_bio_prison_cell *cell)
+void dm_cell_error(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell)
{
- struct dm_bio_prison *prison = cell->prison;
struct bio_list bios;
struct bio *bio;
unsigned long flags;
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
index 53d1a7a84e2..3f833190ead 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison.h
@@ -22,7 +22,6 @@
* subsequently unlocked the bios become available.
*/
struct dm_bio_prison;
-struct dm_bio_prison_cell;
/* FIXME: this needs to be more abstract */
struct dm_cell_key {
@@ -31,21 +30,62 @@ struct dm_cell_key {
dm_block_t block;
};
+/*
+ * Treat this as opaque, only in header so callers can manage allocation
+ * themselves.
+ */
+struct dm_bio_prison_cell {
+ struct hlist_node list;
+ struct dm_cell_key key;
+ struct bio *holder;
+ struct bio_list bios;
+};
+
struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells);
void dm_bio_prison_destroy(struct dm_bio_prison *prison);
/*
- * This may block if a new cell needs allocating. You must ensure that
- * cells will be unlocked even if the calling thread is blocked.
+ * These two functions just wrap a mempool. This is a transitory step:
+ * Eventually all bio prison clients should manage their own cell memory.
*
- * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
+ * Like mempool_alloc(), dm_bio_prison_alloc_cell() can only fail if called
+ * in interrupt context or passed GFP_NOWAIT.
*/
-int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key,
- struct bio *inmate, struct dm_bio_prison_cell **ref);
+struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison,
+ gfp_t gfp);
+void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell);
-void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios);
-void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates);
-void dm_cell_error(struct dm_bio_prison_cell *cell);
+/*
+ * Creates, or retrieves a cell for the given key.
+ *
+ * Returns 1 if pre-existing cell returned, zero if new cell created using
+ * @cell_prealloc.
+ */
+int dm_get_cell(struct dm_bio_prison *prison,
+ struct dm_cell_key *key,
+ struct dm_bio_prison_cell *cell_prealloc,
+ struct dm_bio_prison_cell **cell_result);
+
+/*
+ * An atomic op that combines retrieving a cell, and adding a bio to it.
+ *
+ * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
+ */
+int dm_bio_detain(struct dm_bio_prison *prison,
+ struct dm_cell_key *key,
+ struct bio *inmate,
+ struct dm_bio_prison_cell *cell_prealloc,
+ struct dm_bio_prison_cell **cell_result);
+
+void dm_cell_release(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell,
+ struct bio_list *bios);
+void dm_cell_release_no_holder(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell,
+ struct bio_list *inmates);
+void dm_cell_error(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell);
/*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 93205e32a00..3c955e10a61 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1192,7 +1192,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
int dm_bufio_issue_flush(struct dm_bufio_client *c)
{
struct dm_io_request io_req = {
- .bi_rw = REQ_FLUSH,
+ .bi_rw = WRITE_FLUSH,
.mem.type = DM_IO_KMEM,
.mem.ptr.addr = NULL,
.client = c->dm_io,
diff --git a/drivers/md/dm-cache-block-types.h b/drivers/md/dm-cache-block-types.h
new file mode 100644
index 00000000000..bed4ad4e1b7
--- /dev/null
+++ b/drivers/md/dm-cache-block-types.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_CACHE_BLOCK_TYPES_H
+#define DM_CACHE_BLOCK_TYPES_H
+
+#include "persistent-data/dm-block-manager.h"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * It's helpful to get sparse to differentiate between indexes into the
+ * origin device, indexes into the cache device, and indexes into the
+ * discard bitset.
+ */
+
+typedef dm_block_t __bitwise__ dm_oblock_t;
+typedef uint32_t __bitwise__ dm_cblock_t;
+typedef dm_block_t __bitwise__ dm_dblock_t;
+
+static inline dm_oblock_t to_oblock(dm_block_t b)
+{
+ return (__force dm_oblock_t) b;
+}
+
+static inline dm_block_t from_oblock(dm_oblock_t b)
+{
+ return (__force dm_block_t) b;
+}
+
+static inline dm_cblock_t to_cblock(uint32_t b)
+{
+ return (__force dm_cblock_t) b;
+}
+
+static inline uint32_t from_cblock(dm_cblock_t b)
+{
+ return (__force uint32_t) b;
+}
+
+static inline dm_dblock_t to_dblock(dm_block_t b)
+{
+ return (__force dm_dblock_t) b;
+}
+
+static inline dm_block_t from_dblock(dm_dblock_t b)
+{
+ return (__force dm_block_t) b;
+}
+
+#endif /* DM_CACHE_BLOCK_TYPES_H */
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
new file mode 100644
index 00000000000..fbd3625f274
--- /dev/null
+++ b/drivers/md/dm-cache-metadata.c
@@ -0,0 +1,1146 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-metadata.h"
+
+#include "persistent-data/dm-array.h"
+#include "persistent-data/dm-bitset.h"
+#include "persistent-data/dm-space-map.h"
+#include "persistent-data/dm-space-map-disk.h"
+#include "persistent-data/dm-transaction-manager.h"
+
+#include <linux/device-mapper.h>
+
+/*----------------------------------------------------------------*/
+
+#define DM_MSG_PREFIX "cache metadata"
+
+#define CACHE_SUPERBLOCK_MAGIC 06142003
+#define CACHE_SUPERBLOCK_LOCATION 0
+#define CACHE_VERSION 1
+#define CACHE_METADATA_CACHE_SIZE 64
+
+/*
+ * 3 for btree insert +
+ * 2 for btree lookup used within space map
+ */
+#define CACHE_MAX_CONCURRENT_LOCKS 5
+#define SPACE_MAP_ROOT_SIZE 128
+
+enum superblock_flag_bits {
+ /* for spotting crashes that would invalidate the dirty bitset */
+ CLEAN_SHUTDOWN,
+};
+
+/*
+ * Each mapping from cache block -> origin block carries a set of flags.
+ */
+enum mapping_bits {
+ /*
+ * A valid mapping. Because we're using an array we clear this
+ * flag for an non existant mapping.
+ */
+ M_VALID = 1,
+
+ /*
+ * The data on the cache is different from that on the origin.
+ */
+ M_DIRTY = 2
+};
+
+struct cache_disk_superblock {
+ __le32 csum;
+ __le32 flags;
+ __le64 blocknr;
+
+ __u8 uuid[16];
+ __le64 magic;
+ __le32 version;
+
+ __u8 policy_name[CACHE_POLICY_NAME_SIZE];
+ __le32 policy_hint_size;
+
+ __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+ __le64 mapping_root;
+ __le64 hint_root;
+
+ __le64 discard_root;
+ __le64 discard_block_size;
+ __le64 discard_nr_blocks;
+
+ __le32 data_block_size;
+ __le32 metadata_block_size;
+ __le32 cache_blocks;
+
+ __le32 compat_flags;
+ __le32 compat_ro_flags;
+ __le32 incompat_flags;
+
+ __le32 read_hits;
+ __le32 read_misses;
+ __le32 write_hits;
+ __le32 write_misses;
+} __packed;
+
+struct dm_cache_metadata {
+ struct block_device *bdev;
+ struct dm_block_manager *bm;
+ struct dm_space_map *metadata_sm;
+ struct dm_transaction_manager *tm;
+
+ struct dm_array_info info;
+ struct dm_array_info hint_info;
+ struct dm_disk_bitset discard_info;
+
+ struct rw_semaphore root_lock;
+ dm_block_t root;
+ dm_block_t hint_root;
+ dm_block_t discard_root;
+
+ sector_t discard_block_size;
+ dm_dblock_t discard_nr_blocks;
+
+ sector_t data_block_size;
+ dm_cblock_t cache_blocks;
+ bool changed:1;
+ bool clean_when_opened:1;
+
+ char policy_name[CACHE_POLICY_NAME_SIZE];
+ size_t policy_hint_size;
+ struct dm_cache_statistics stats;
+};
+
+/*-------------------------------------------------------------------
+ * superblock validator
+ *-----------------------------------------------------------------*/
+
+#define SUPERBLOCK_CSUM_XOR 9031977
+
+static void sb_prepare_for_write(struct dm_block_validator *v,
+ struct dm_block *b,
+ size_t sb_block_size)
+{
+ struct cache_disk_superblock *disk_super = dm_block_data(b);
+
+ disk_super->blocknr = cpu_to_le64(dm_block_location(b));
+ disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
+ sb_block_size - sizeof(__le32),
+ SUPERBLOCK_CSUM_XOR));
+}
+
+static int sb_check(struct dm_block_validator *v,
+ struct dm_block *b,
+ size_t sb_block_size)
+{
+ struct cache_disk_superblock *disk_super = dm_block_data(b);
+ __le32 csum_le;
+
+ if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
+ DMERR("sb_check failed: blocknr %llu: wanted %llu",
+ le64_to_cpu(disk_super->blocknr),
+ (unsigned long long)dm_block_location(b));
+ return -ENOTBLK;
+ }
+
+ if (le64_to_cpu(disk_super->magic) != CACHE_SUPERBLOCK_MAGIC) {
+ DMERR("sb_check failed: magic %llu: wanted %llu",
+ le64_to_cpu(disk_super->magic),
+ (unsigned long long)CACHE_SUPERBLOCK_MAGIC);
+ return -EILSEQ;
+ }
+
+ csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
+ sb_block_size - sizeof(__le32),
+ SUPERBLOCK_CSUM_XOR));
+ if (csum_le != disk_super->csum) {
+ DMERR("sb_check failed: csum %u: wanted %u",
+ le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
+ return -EILSEQ;
+ }
+
+ return 0;
+}
+
+static struct dm_block_validator sb_validator = {
+ .name = "superblock",
+ .prepare_for_write = sb_prepare_for_write,
+ .check = sb_check
+};
+
+/*----------------------------------------------------------------*/
+
+static int superblock_read_lock(struct dm_cache_metadata *cmd,
+ struct dm_block **sblock)
+{
+ return dm_bm_read_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
+ &sb_validator, sblock);
+}
+
+static int superblock_lock_zero(struct dm_cache_metadata *cmd,
+ struct dm_block **sblock)
+{
+ return dm_bm_write_lock_zero(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
+ &sb_validator, sblock);
+}
+
+static int superblock_lock(struct dm_cache_metadata *cmd,
+ struct dm_block **sblock)
+{
+ return dm_bm_write_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
+ &sb_validator, sblock);
+}
+
+/*----------------------------------------------------------------*/
+
+static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
+{
+ int r;
+ unsigned i;
+ struct dm_block *b;
+ __le64 *data_le, zero = cpu_to_le64(0);
+ unsigned sb_block_size = dm_bm_block_size(bm) / sizeof(__le64);
+
+ /*
+ * We can't use a validator here - it may be all zeroes.
+ */
+ r = dm_bm_read_lock(bm, CACHE_SUPERBLOCK_LOCATION, NULL, &b);
+ if (r)
+ return r;
+
+ data_le = dm_block_data(b);
+ *result = 1;
+ for (i = 0; i < sb_block_size; i++) {
+ if (data_le[i] != zero) {
+ *result = 0;
+ break;
+ }
+ }
+
+ return dm_bm_unlock(b);
+}
+
+static void __setup_mapping_info(struct dm_cache_metadata *cmd)
+{
+ struct dm_btree_value_type vt;
+
+ vt.context = NULL;
+ vt.size = sizeof(__le64);
+ vt.inc = NULL;
+ vt.dec = NULL;
+ vt.equal = NULL;
+ dm_array_info_init(&cmd->info, cmd->tm, &vt);
+
+ if (cmd->policy_hint_size) {
+ vt.size = sizeof(__le32);
+ dm_array_info_init(&cmd->hint_info, cmd->tm, &vt);
+ }
+}
+
+static int __write_initial_superblock(struct dm_cache_metadata *cmd)
+{
+ int r;
+ struct dm_block *sblock;
+ size_t metadata_len;
+ struct cache_disk_superblock *disk_super;
+ sector_t bdev_size = i_size_read(cmd->bdev->bd_inode) >> SECTOR_SHIFT;
+
+ /* FIXME: see if we can lose the max sectors limit */
+ if (bdev_size > DM_CACHE_METADATA_MAX_SECTORS)
+ bdev_size = DM_CACHE_METADATA_MAX_SECTORS;
+
+ r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
+ if (r < 0)
+ return r;
+
+ r = dm_tm_pre_commit(cmd->tm);
+ if (r < 0)
+ return r;
+
+ r = superblock_lock_zero(cmd, &sblock);
+ if (r)
+ return r;
+
+ disk_super = dm_block_data(sblock);
+ disk_super->flags = 0;
+ memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
+ disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC);
+ disk_super->version = cpu_to_le32(CACHE_VERSION);
+ memset(disk_super->policy_name, 0, CACHE_POLICY_NAME_SIZE);
+ disk_super->policy_hint_size = 0;
+
+ r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root,
+ metadata_len);
+ if (r < 0)
+ goto bad_locked;
+
+ disk_super->mapping_root = cpu_to_le64(cmd->root);
+ disk_super->hint_root = cpu_to_le64(cmd->hint_root);
+ disk_super->discard_root = cpu_to_le64(cmd->discard_root);
+ disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
+ disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
+ disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
+ disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
+ disk_super->cache_blocks = cpu_to_le32(0);
+ memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
+
+ disk_super->read_hits = cpu_to_le32(0);
+ disk_super->read_misses = cpu_to_le32(0);
+ disk_super->write_hits = cpu_to_le32(0);
+ disk_super->write_misses = cpu_to_le32(0);
+
+ return dm_tm_commit(cmd->tm, sblock);
+
+bad_locked:
+ dm_bm_unlock(sblock);
+ return r;
+}
+
+static int __format_metadata(struct dm_cache_metadata *cmd)
+{
+ int r;
+
+ r = dm_tm_create_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
+ &cmd->tm, &cmd->metadata_sm);
+ if (r < 0) {
+ DMERR("tm_create_with_sm failed");
+ return r;
+ }
+
+ __setup_mapping_info(cmd);
+
+ r = dm_array_empty(&cmd->info, &cmd->root);
+ if (r < 0)
+ goto bad;
+
+ dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
+
+ r = dm_bitset_empty(&cmd->discard_info, &cmd->discard_root);
+ if (r < 0)
+ goto bad;
+
+ cmd->discard_block_size = 0;
+ cmd->discard_nr_blocks = 0;
+
+ r = __write_initial_superblock(cmd);
+ if (r)
+ goto bad;
+
+ cmd->clean_when_opened = true;
+ return 0;
+
+bad:
+ dm_tm_destroy(cmd->tm);
+ dm_sm_destroy(cmd->metadata_sm);
+
+ return r;
+}
+
+static int __check_incompat_features(struct cache_disk_superblock *disk_super,
+ struct dm_cache_metadata *cmd)
+{
+ uint32_t features;
+
+ features = le32_to_cpu(disk_super->incompat_flags) & ~DM_CACHE_FEATURE_INCOMPAT_SUPP;
+ if (features) {
+ DMERR("could not access metadata due to unsupported optional features (%lx).",
+ (unsigned long)features);
+ return -EINVAL;
+ }
+
+ /*
+ * Check for read-only metadata to skip the following RDWR checks.
+ */
+ if (get_disk_ro(cmd->bdev->bd_disk))
+ return 0;
+
+ features = le32_to_cpu(disk_super->compat_ro_flags) & ~DM_CACHE_FEATURE_COMPAT_RO_SUPP;
+ if (features) {
+ DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
+ (unsigned long)features);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int __open_metadata(struct dm_cache_metadata *cmd)
+{
+ int r;
+ struct dm_block *sblock;
+ struct cache_disk_superblock *disk_super;
+ unsigned long sb_flags;
+
+ r = superblock_read_lock(cmd, &sblock);
+ if (r < 0) {
+ DMERR("couldn't read lock superblock");
+ return r;
+ }
+
+ disk_super = dm_block_data(sblock);
+
+ r = __check_incompat_features(disk_super, cmd);
+ if (r < 0)
+ goto bad;
+
+ r = dm_tm_open_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
+ disk_super->metadata_space_map_root,
+ sizeof(disk_super->metadata_space_map_root),
+ &cmd->tm, &cmd->metadata_sm);
+ if (r < 0) {
+ DMERR("tm_open_with_sm failed");
+ goto bad;
+ }
+
+ __setup_mapping_info(cmd);
+ dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
+ sb_flags = le32_to_cpu(disk_super->flags);
+ cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags);
+ return dm_bm_unlock(sblock);
+
+bad:
+ dm_bm_unlock(sblock);
+ return r;
+}
+
+static int __open_or_format_metadata(struct dm_cache_metadata *cmd,
+ bool format_device)
+{
+ int r, unformatted;
+
+ r = __superblock_all_zeroes(cmd->bm, &unformatted);
+ if (r)
+ return r;
+
+ if (unformatted)
+ return format_device ? __format_metadata(cmd) : -EPERM;
+
+ return __open_metadata(cmd);
+}
+
+static int __create_persistent_data_objects(struct dm_cache_metadata *cmd,
+ bool may_format_device)
+{
+ int r;
+ cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE,
+ CACHE_METADATA_CACHE_SIZE,
+ CACHE_MAX_CONCURRENT_LOCKS);
+ if (IS_ERR(cmd->bm)) {
+ DMERR("could not create block manager");
+ return PTR_ERR(cmd->bm);
+ }
+
+ r = __open_or_format_metadata(cmd, may_format_device);
+ if (r)
+ dm_block_manager_destroy(cmd->bm);
+
+ return r;
+}
+
+static void __destroy_persistent_data_objects(struct dm_cache_metadata *cmd)
+{
+ dm_sm_destroy(cmd->metadata_sm);
+ dm_tm_destroy(cmd->tm);
+ dm_block_manager_destroy(cmd->bm);
+}
+
+typedef unsigned long (*flags_mutator)(unsigned long);
+
+static void update_flags(struct cache_disk_superblock *disk_super,
+ flags_mutator mutator)
+{
+ uint32_t sb_flags = mutator(le32_to_cpu(disk_super->flags));
+ disk_super->flags = cpu_to_le32(sb_flags);
+}
+
+static unsigned long set_clean_shutdown(unsigned long flags)
+{
+ set_bit(CLEAN_SHUTDOWN, &flags);
+ return flags;
+}
+
+static unsigned long clear_clean_shutdown(unsigned long flags)
+{
+ clear_bit(CLEAN_SHUTDOWN, &flags);
+ return flags;
+}
+
+static void read_superblock_fields(struct dm_cache_metadata *cmd,
+ struct cache_disk_superblock *disk_super)
+{
+ cmd->root = le64_to_cpu(disk_super->mapping_root);
+ cmd->hint_root = le64_to_cpu(disk_super->hint_root);
+ cmd->discard_root = le64_to_cpu(disk_super->discard_root);
+ cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size);
+ cmd->discard_nr_blocks = to_dblock(le64_to_cpu(disk_super->discard_nr_blocks));
+ cmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
+ cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks));
+ strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name));
+ cmd->policy_hint_size = le32_to_cpu(disk_super->policy_hint_size);
+
+ cmd->stats.read_hits = le32_to_cpu(disk_super->read_hits);
+ cmd->stats.read_misses = le32_to_cpu(disk_super->read_misses);
+ cmd->stats.write_hits = le32_to_cpu(disk_super->write_hits);
+ cmd->stats.write_misses = le32_to_cpu(disk_super->write_misses);
+
+ cmd->changed = false;
+}
+
+/*
+ * The mutator updates the superblock flags.
+ */
+static int __begin_transaction_flags(struct dm_cache_metadata *cmd,
+ flags_mutator mutator)
+{
+ int r;
+ struct cache_disk_superblock *disk_super;
+ struct dm_block *sblock;
+
+ r = superblock_lock(cmd, &sblock);
+ if (r)
+ return r;
+
+ disk_super = dm_block_data(sblock);
+ update_flags(disk_super, mutator);
+ read_superblock_fields(cmd, disk_super);
+
+ return dm_bm_flush_and_unlock(cmd->bm, sblock);
+}
+
+static int __begin_transaction(struct dm_cache_metadata *cmd)
+{
+ int r;
+ struct cache_disk_superblock *disk_super;
+ struct dm_block *sblock;
+
+ /*
+ * We re-read the superblock every time. Shouldn't need to do this
+ * really.
+ */
+ r = superblock_read_lock(cmd, &sblock);
+ if (r)
+ return r;
+
+ disk_super = dm_block_data(sblock);
+ read_superblock_fields(cmd, disk_super);
+ dm_bm_unlock(sblock);
+
+ return 0;
+}
+
+static int __commit_transaction(struct dm_cache_metadata *cmd,
+ flags_mutator mutator)
+{
+ int r;
+ size_t metadata_len;
+ struct cache_disk_superblock *disk_super;
+ struct dm_block *sblock;
+
+ /*
+ * We need to know if the cache_disk_superblock exceeds a 512-byte sector.
+ */
+ BUILD_BUG_ON(sizeof(struct cache_disk_superblock) > 512);
+
+ r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root,
+ &cmd->discard_root);
+ if (r)
+ return r;
+
+ r = dm_tm_pre_commit(cmd->tm);
+ if (r < 0)
+ return r;
+
+ r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
+ if (r < 0)
+ return r;
+
+ r = superblock_lock(cmd, &sblock);
+ if (r)
+ return r;
+
+ disk_super = dm_block_data(sblock);
+
+ if (mutator)
+ update_flags(disk_super, mutator);
+
+ disk_super->mapping_root = cpu_to_le64(cmd->root);
+ disk_super->hint_root = cpu_to_le64(cmd->hint_root);
+ disk_super->discard_root = cpu_to_le64(cmd->discard_root);
+ disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
+ disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
+ disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks));
+ strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name));
+
+ disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits);
+ disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses);
+ disk_super->write_hits = cpu_to_le32(cmd->stats.write_hits);
+ disk_super->write_misses = cpu_to_le32(cmd->stats.write_misses);
+
+ r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root,
+ metadata_len);
+ if (r < 0) {
+ dm_bm_unlock(sblock);
+ return r;
+ }
+
+ return dm_tm_commit(cmd->tm, sblock);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * The mappings are held in a dm-array that has 64-bit values stored in
+ * little-endian format. The index is the cblock, the high 48bits of the
+ * value are the oblock and the low 16 bit the flags.
+ */
+#define FLAGS_MASK ((1 << 16) - 1)
+
+static __le64 pack_value(dm_oblock_t block, unsigned flags)
+{
+ uint64_t value = from_oblock(block);
+ value <<= 16;
+ value = value | (flags & FLAGS_MASK);
+ return cpu_to_le64(value);
+}
+
+static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned *flags)
+{
+ uint64_t value = le64_to_cpu(value_le);
+ uint64_t b = value >> 16;
+ *block = to_oblock(b);
+ *flags = value & FLAGS_MASK;
+}
+
+/*----------------------------------------------------------------*/
+
+struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
+ sector_t data_block_size,
+ bool may_format_device,
+ size_t policy_hint_size)
+{
+ int r;
+ struct dm_cache_metadata *cmd;
+
+ cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+ if (!cmd) {
+ DMERR("could not allocate metadata struct");
+ return NULL;
+ }
+
+ init_rwsem(&cmd->root_lock);
+ cmd->bdev = bdev;
+ cmd->data_block_size = data_block_size;
+ cmd->cache_blocks = 0;
+ cmd->policy_hint_size = policy_hint_size;
+ cmd->changed = true;
+
+ r = __create_persistent_data_objects(cmd, may_format_device);
+ if (r) {
+ kfree(cmd);
+ return ERR_PTR(r);
+ }
+
+ r = __begin_transaction_flags(cmd, clear_clean_shutdown);
+ if (r < 0) {
+ dm_cache_metadata_close(cmd);
+ return ERR_PTR(r);
+ }
+
+ return cmd;
+}
+
+void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
+{
+ __destroy_persistent_data_objects(cmd);
+ kfree(cmd);
+}
+
+int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
+{
+ int r;
+ __le64 null_mapping = pack_value(0, 0);
+
+ down_write(&cmd->root_lock);
+ __dm_bless_for_disk(&null_mapping);
+ r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks),
+ from_cblock(new_cache_size),
+ &null_mapping, &cmd->root);
+ if (!r)
+ cmd->cache_blocks = new_cache_size;
+ cmd->changed = true;
+ up_write(&cmd->root_lock);
+
+ return r;
+}
+
+int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
+ sector_t discard_block_size,
+ dm_dblock_t new_nr_entries)
+{
+ int r;
+
+ down_write(&cmd->root_lock);
+ r = dm_bitset_resize(&cmd->discard_info,
+ cmd->discard_root,
+ from_dblock(cmd->discard_nr_blocks),
+ from_dblock(new_nr_entries),
+ false, &cmd->discard_root);
+ if (!r) {
+ cmd->discard_block_size = discard_block_size;
+ cmd->discard_nr_blocks = new_nr_entries;
+ }
+
+ cmd->changed = true;
+ up_write(&cmd->root_lock);
+
+ return r;
+}
+
+static int __set_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
+{
+ return dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root,
+ from_dblock(b), &cmd->discard_root);
+}
+
+static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
+{
+ return dm_bitset_clear_bit(&cmd->discard_info, cmd->discard_root,
+ from_dblock(b), &cmd->discard_root);
+}
+
+static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b,
+ bool *is_discarded)
+{
+ return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root,
+ from_dblock(b), &cmd->discard_root,
+ is_discarded);
+}
+
+static int __discard(struct dm_cache_metadata *cmd,
+ dm_dblock_t dblock, bool discard)
+{
+ int r;
+
+ r = (discard ? __set_discard : __clear_discard)(cmd, dblock);
+ if (r)
+ return r;
+
+ cmd->changed = true;
+ return 0;
+}
+
+int dm_cache_set_discard(struct dm_cache_metadata *cmd,
+ dm_dblock_t dblock, bool discard)
+{
+ int r;
+
+ down_write(&cmd->root_lock);
+ r = __discard(cmd, dblock, discard);
+ up_write(&cmd->root_lock);
+
+ return r;
+}
+
+static int __load_discards(struct dm_cache_metadata *cmd,
+ load_discard_fn fn, void *context)
+{
+ int r = 0;
+ dm_block_t b;
+ bool discard;
+
+ for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
+ dm_dblock_t dblock = to_dblock(b);
+
+ if (cmd->clean_when_opened) {
+ r = __is_discarded(cmd, dblock, &discard);
+ if (r)
+ return r;
+ } else
+ discard = false;
+
+ r = fn(context, cmd->discard_block_size, dblock, discard);
+ if (r)
+ break;
+ }
+
+ return r;
+}
+
+int dm_cache_load_discards(struct dm_cache_metadata *cmd,
+ load_discard_fn fn, void *context)
+{
+ int r;
+
+ down_read(&cmd->root_lock);
+ r = __load_discards(cmd, fn, context);
+ up_read(&cmd->root_lock);
+
+ return r;
+}
+
+dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd)
+{
+ dm_cblock_t r;
+
+ down_read(&cmd->root_lock);
+ r = cmd->cache_blocks;
+ up_read(&cmd->root_lock);
+
+ return r;
+}
+
+static int __remove(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
+{
+ int r;
+ __le64 value = pack_value(0, 0);
+
+ __dm_bless_for_disk(&value);
+ r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
+ &value, &cmd->root);
+ if (r)
+ return r;
+
+ cmd->changed = true;
+ return 0;
+}
+
+int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
+{
+ int r;
+
+ down_write(&cmd->root_lock);
+ r = __remove(cmd, cblock);
+ up_write(&cmd->root_lock);
+
+ return r;
+}
+
+static int __insert(struct dm_cache_metadata *cmd,
+ dm_cblock_t cblock, dm_oblock_t oblock)
+{
+ int r;
+ __le64 value = pack_value(oblock, M_VALID);
+ __dm_bless_for_disk(&value);
+
+ r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
+ &value, &cmd->root);
+ if (r)
+ return r;
+
+ cmd->changed = true;
+ return 0;
+}
+
+int dm_cache_insert_mapping(struct dm_cache_metadata *cmd,
+ dm_cblock_t cblock, dm_oblock_t oblock)
+{
+ int r;
+
+ down_write(&cmd->root_lock);
+ r = __insert(cmd, cblock, oblock);
+ up_write(&cmd->root_lock);
+
+ return r;
+}
+
+struct thunk {
+ load_mapping_fn fn;
+ void *context;
+
+ struct dm_cache_metadata *cmd;
+ bool respect_dirty_flags;
+ bool hints_valid;
+};
+
+static bool hints_array_initialized(struct dm_cache_metadata *cmd)
+{
+ return cmd->hint_root && cmd->policy_hint_size;
+}
+
+static bool hints_array_available(struct dm_cache_metadata *cmd,
+ const char *policy_name)
+{
+ bool policy_names_match = !strncmp(cmd->policy_name, policy_name,
+ sizeof(cmd->policy_name));
+
+ return cmd->clean_when_opened && policy_names_match &&
+ hints_array_initialized(cmd);
+}
+
+static int __load_mapping(void *context, uint64_t cblock, void *leaf)
+{
+ int r = 0;
+ bool dirty;
+ __le64 value;
+ __le32 hint_value = 0;
+ dm_oblock_t oblock;
+ unsigned flags;
+ struct thunk *thunk = context;
+ struct dm_cache_metadata *cmd = thunk->cmd;
+
+ memcpy(&value, leaf, sizeof(value));
+ unpack_value(value, &oblock, &flags);
+
+ if (flags & M_VALID) {
+ if (thunk->hints_valid) {
+ r = dm_array_get_value(&cmd->hint_info, cmd->hint_root,
+ cblock, &hint_value);
+ if (r && r != -ENODATA)
+ return r;
+ }
+
+ dirty = thunk->respect_dirty_flags ? (flags & M_DIRTY) : true;
+ r = thunk->fn(thunk->context, oblock, to_cblock(cblock),
+ dirty, le32_to_cpu(hint_value), thunk->hints_valid);
+ }
+
+ return r;
+}
+
+static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_name,
+ load_mapping_fn fn, void *context)
+{
+ struct thunk thunk;
+
+ thunk.fn = fn;
+ thunk.context = context;
+
+ thunk.cmd = cmd;
+ thunk.respect_dirty_flags = cmd->clean_when_opened;
+ thunk.hints_valid = hints_array_available(cmd, policy_name);
+
+ return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk);
+}
+
+int dm_cache_load_mappings(struct dm_cache_metadata *cmd, const char *policy_name,
+ load_mapping_fn fn, void *context)
+{
+ int r;
+
+ down_read(&cmd->root_lock);
+ r = __load_mappings(cmd, policy_name, fn, context);
+ up_read(&cmd->root_lock);
+
+ return r;
+}
+
+static int __dump_mapping(void *context, uint64_t cblock, void *leaf)
+{
+ int r = 0;
+ __le64 value;
+ dm_oblock_t oblock;
+ unsigned flags;
+
+ memcpy(&value, leaf, sizeof(value));
+ unpack_value(value, &oblock, &flags);
+
+ return r;
+}
+
+static int __dump_mappings(struct dm_cache_metadata *cmd)
+{
+ return dm_array_walk(&cmd->info, cmd->root, __dump_mapping, NULL);
+}
+
+void dm_cache_dump(struct dm_cache_metadata *cmd)
+{
+ down_read(&cmd->root_lock);
+ __dump_mappings(cmd);
+ up_read(&cmd->root_lock);
+}
+
+int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd)
+{
+ int r;
+
+ down_read(&cmd->root_lock);
+ r = cmd->changed;
+ up_read(&cmd->root_lock);
+
+ return r;
+}
+
+static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty)
+{
+ int r;
+ unsigned flags;
+ dm_oblock_t oblock;
+ __le64 value;
+
+ r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(cblock), &value);
+ if (r)
+ return r;
+
+ unpack_value(value, &oblock, &flags);
+
+ if (((flags & M_DIRTY) && dirty) || (!(flags & M_DIRTY) && !dirty))
+ /* nothing to be done */
+ return 0;
+
+ value = pack_value(oblock, flags | (dirty ? M_DIRTY : 0));
+ __dm_bless_for_disk(&value);
+
+ r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
+ &value, &cmd->root);
+ if (r)
+ return r;
+
+ cmd->changed = true;
+ return 0;
+
+}
+
+int dm_cache_set_dirty(struct dm_cache_metadata *cmd,
+ dm_cblock_t cblock, bool dirty)
+{
+ int r;
+
+ down_write(&cmd->root_lock);
+ r = __dirty(cmd, cblock, dirty);
+ up_write(&cmd->root_lock);
+
+ return r;
+}
+
+void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
+ struct dm_cache_statistics *stats)
+{
+ down_read(&cmd->root_lock);
+ memcpy(stats, &cmd->stats, sizeof(*stats));
+ up_read(&cmd->root_lock);
+}
+
+void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
+ struct dm_cache_statistics *stats)
+{
+ down_write(&cmd->root_lock);
+ memcpy(&cmd->stats, stats, sizeof(*stats));
+ up_write(&cmd->root_lock);
+}
+
+int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
+{
+ int r;
+ flags_mutator mutator = (clean_shutdown ? set_clean_shutdown :
+ clear_clean_shutdown);
+
+ down_write(&cmd->root_lock);
+ r = __commit_transaction(cmd, mutator);
+ if (r)
+ goto out;
+
+ r = __begin_transaction(cmd);
+
+out:
+ up_write(&cmd->root_lock);
+ return r;
+}
+
+int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd,
+ dm_block_t *result)
+{
+ int r = -EINVAL;
+
+ down_read(&cmd->root_lock);
+ r = dm_sm_get_nr_free(cmd->metadata_sm, result);
+ up_read(&cmd->root_lock);
+
+ return r;
+}
+
+int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd,
+ dm_block_t *result)
+{
+ int r = -EINVAL;
+
+ down_read(&cmd->root_lock);
+ r = dm_sm_get_nr_blocks(cmd->metadata_sm, result);
+ up_read(&cmd->root_lock);
+
+ return r;
+}
+
+/*----------------------------------------------------------------*/
+
+static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
+{
+ int r;
+ __le32 value;
+ size_t hint_size;
+ const char *policy_name = dm_cache_policy_get_name(policy);
+
+ if (!policy_name[0] ||
+ (strlen(policy_name) > sizeof(cmd->policy_name) - 1))
+ return -EINVAL;
+
+ if (strcmp(cmd->policy_name, policy_name)) {
+ strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name));
+
+ hint_size = dm_cache_policy_get_hint_size(policy);
+ if (!hint_size)
+ return 0; /* short-circuit hints initialization */
+ cmd->policy_hint_size = hint_size;
+
+ if (cmd->hint_root) {
+ r = dm_array_del(&cmd->hint_info, cmd->hint_root);
+ if (r)
+ return r;
+ }
+
+ r = dm_array_empty(&cmd->hint_info, &cmd->hint_root);
+ if (r)
+ return r;
+
+ value = cpu_to_le32(0);
+ __dm_bless_for_disk(&value);
+ r = dm_array_resize(&cmd->hint_info, cmd->hint_root, 0,
+ from_cblock(cmd->cache_blocks),
+ &value, &cmd->hint_root);
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
+int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
+{
+ int r;
+
+ down_write(&cmd->root_lock);
+ r = begin_hints(cmd, policy);
+ up_write(&cmd->root_lock);
+
+ return r;
+}
+
+static int save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock,
+ uint32_t hint)
+{
+ int r;
+ __le32 value = cpu_to_le32(hint);
+ __dm_bless_for_disk(&value);
+
+ r = dm_array_set_value(&cmd->hint_info, cmd->hint_root,
+ from_cblock(cblock), &value, &cmd->hint_root);
+ cmd->changed = true;
+
+ return r;
+}
+
+int dm_cache_save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock,
+ uint32_t hint)
+{
+ int r;
+
+ if (!hints_array_initialized(cmd))
+ return 0;
+
+ down_write(&cmd->root_lock);
+ r = save_hint(cmd, cblock, hint);
+ up_write(&cmd->root_lock);
+
+ return r;
+}
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
new file mode 100644
index 00000000000..135864ea0ee
--- /dev/null
+++ b/drivers/md/dm-cache-metadata.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_CACHE_METADATA_H
+#define DM_CACHE_METADATA_H
+
+#include "dm-cache-block-types.h"
+#include "dm-cache-policy-internal.h"
+
+/*----------------------------------------------------------------*/
+
+#define DM_CACHE_METADATA_BLOCK_SIZE 4096
+
+/* FIXME: remove this restriction */
+/*
+ * The metadata device is currently limited in size.
+ *
+ * We have one block of index, which can hold 255 index entries. Each
+ * index entry contains allocation info about 16k metadata blocks.
+ */
+#define DM_CACHE_METADATA_MAX_SECTORS (255 * (1 << 14) * (DM_CACHE_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
+
+/*
+ * A metadata device larger than 16GB triggers a warning.
+ */
+#define DM_CACHE_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT))
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Ext[234]-style compat feature flags.
+ *
+ * A new feature which old metadata will still be compatible with should
+ * define a DM_CACHE_FEATURE_COMPAT_* flag (rarely useful).
+ *
+ * A new feature that is not compatible with old code should define a
+ * DM_CACHE_FEATURE_INCOMPAT_* flag and guard the relevant code with
+ * that flag.
+ *
+ * A new feature that is not compatible with old code accessing the
+ * metadata RDWR should define a DM_CACHE_FEATURE_RO_COMPAT_* flag and
+ * guard the relevant code with that flag.
+ *
+ * As these various flags are defined they should be added to the
+ * following masks.
+ */
+#define DM_CACHE_FEATURE_COMPAT_SUPP 0UL
+#define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL
+#define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL
+
+/*
+ * Reopens or creates a new, empty metadata volume.
+ * Returns an ERR_PTR on failure.
+ */
+struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
+ sector_t data_block_size,
+ bool may_format_device,
+ size_t policy_hint_size);
+
+void dm_cache_metadata_close(struct dm_cache_metadata *cmd);
+
+/*
+ * The metadata needs to know how many cache blocks there are. We don't
+ * care about the origin, assuming the core target is giving us valid
+ * origin blocks to map to.
+ */
+int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size);
+dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd);
+
+int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
+ sector_t discard_block_size,
+ dm_dblock_t new_nr_entries);
+
+typedef int (*load_discard_fn)(void *context, sector_t discard_block_size,
+ dm_dblock_t dblock, bool discarded);
+int dm_cache_load_discards(struct dm_cache_metadata *cmd,
+ load_discard_fn fn, void *context);
+
+int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_dblock_t dblock, bool discard);
+
+int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock);
+int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock, dm_oblock_t oblock);
+int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd);
+
+typedef int (*load_mapping_fn)(void *context, dm_oblock_t oblock,
+ dm_cblock_t cblock, bool dirty,
+ uint32_t hint, bool hint_valid);
+int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
+ const char *policy_name,
+ load_mapping_fn fn,
+ void *context);
+
+int dm_cache_set_dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty);
+
+struct dm_cache_statistics {
+ uint32_t read_hits;
+ uint32_t read_misses;
+ uint32_t write_hits;
+ uint32_t write_misses;
+};
+
+void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
+ struct dm_cache_statistics *stats);
+void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
+ struct dm_cache_statistics *stats);
+
+int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown);
+
+int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd,
+ dm_block_t *result);
+
+int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd,
+ dm_block_t *result);
+
+void dm_cache_dump(struct dm_cache_metadata *cmd);
+
+/*
+ * The policy is invited to save a 32bit hint value for every cblock (eg,
+ * for a hit count). These are stored against the policy name. If
+ * policies are changed, then hints will be lost. If the machine crashes,
+ * hints will be lost.
+ *
+ * The hints are indexed by the cblock, but many policies will not
+ * neccessarily have a fast way of accessing efficiently via cblock. So
+ * rather than querying the policy for each cblock, we let it walk its data
+ * structures and fill in the hints in whatever order it wishes.
+ */
+
+int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *p);
+
+/*
+ * requests hints for every cblock and stores in the metadata device.
+ */
+int dm_cache_save_hint(struct dm_cache_metadata *cmd,
+ dm_cblock_t cblock, uint32_t hint);
+
+/*----------------------------------------------------------------*/
+
+#endif /* DM_CACHE_METADATA_H */
diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c
new file mode 100644
index 00000000000..cc05d70b3cb
--- /dev/null
+++ b/drivers/md/dm-cache-policy-cleaner.c
@@ -0,0 +1,464 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * writeback cache policy supporting flushing out dirty cache blocks.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-policy.h"
+#include "dm.h"
+
+#include <linux/hash.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+/*----------------------------------------------------------------*/
+
+#define DM_MSG_PREFIX "cache cleaner"
+#define CLEANER_VERSION "1.0.0"
+
+/* Cache entry struct. */
+struct wb_cache_entry {
+ struct list_head list;
+ struct hlist_node hlist;
+
+ dm_oblock_t oblock;
+ dm_cblock_t cblock;
+ bool dirty:1;
+ bool pending:1;
+};
+
+struct hash {
+ struct hlist_head *table;
+ dm_block_t hash_bits;
+ unsigned nr_buckets;
+};
+
+struct policy {
+ struct dm_cache_policy policy;
+ spinlock_t lock;
+
+ struct list_head free;
+ struct list_head clean;
+ struct list_head clean_pending;
+ struct list_head dirty;
+
+ /*
+ * We know exactly how many cblocks will be needed,
+ * so we can allocate them up front.
+ */
+ dm_cblock_t cache_size, nr_cblocks_allocated;
+ struct wb_cache_entry *cblocks;
+ struct hash chash;
+};
+
+/*----------------------------------------------------------------------------*/
+
+/*
+ * Low-level functions.
+ */
+static unsigned next_power(unsigned n, unsigned min)
+{
+ return roundup_pow_of_two(max(n, min));
+}
+
+static struct policy *to_policy(struct dm_cache_policy *p)
+{
+ return container_of(p, struct policy, policy);
+}
+
+static struct list_head *list_pop(struct list_head *q)
+{
+ struct list_head *r = q->next;
+
+ list_del(r);
+
+ return r;
+}
+
+/*----------------------------------------------------------------------------*/
+
+/* Allocate/free various resources. */
+static int alloc_hash(struct hash *hash, unsigned elts)
+{
+ hash->nr_buckets = next_power(elts >> 4, 16);
+ hash->hash_bits = ffs(hash->nr_buckets) - 1;
+ hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets);
+
+ return hash->table ? 0 : -ENOMEM;
+}
+
+static void free_hash(struct hash *hash)
+{
+ vfree(hash->table);
+}
+
+static int alloc_cache_blocks_with_hash(struct policy *p, dm_cblock_t cache_size)
+{
+ int r = -ENOMEM;
+
+ p->cblocks = vzalloc(sizeof(*p->cblocks) * from_cblock(cache_size));
+ if (p->cblocks) {
+ unsigned u = from_cblock(cache_size);
+
+ while (u--)
+ list_add(&p->cblocks[u].list, &p->free);
+
+ p->nr_cblocks_allocated = 0;
+
+ /* Cache entries hash. */
+ r = alloc_hash(&p->chash, from_cblock(cache_size));
+ if (r)
+ vfree(p->cblocks);
+ }
+
+ return r;
+}
+
+static void free_cache_blocks_and_hash(struct policy *p)
+{
+ free_hash(&p->chash);
+ vfree(p->cblocks);
+}
+
+static struct wb_cache_entry *alloc_cache_entry(struct policy *p)
+{
+ struct wb_cache_entry *e;
+
+ BUG_ON(from_cblock(p->nr_cblocks_allocated) >= from_cblock(p->cache_size));
+
+ e = list_entry(list_pop(&p->free), struct wb_cache_entry, list);
+ p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) + 1);
+
+ return e;
+}
+
+/*----------------------------------------------------------------------------*/
+
+/* Hash functions (lookup, insert, remove). */
+static struct wb_cache_entry *lookup_cache_entry(struct policy *p, dm_oblock_t oblock)
+{
+ struct hash *hash = &p->chash;
+ unsigned h = hash_64(from_oblock(oblock), hash->hash_bits);
+ struct wb_cache_entry *cur;
+ struct hlist_head *bucket = &hash->table[h];
+
+ hlist_for_each_entry(cur, bucket, hlist) {
+ if (cur->oblock == oblock) {
+ /* Move upfront bucket for faster access. */
+ hlist_del(&cur->hlist);
+ hlist_add_head(&cur->hlist, bucket);
+ return cur;
+ }
+ }
+
+ return NULL;
+}
+
+static void insert_cache_hash_entry(struct policy *p, struct wb_cache_entry *e)
+{
+ unsigned h = hash_64(from_oblock(e->oblock), p->chash.hash_bits);
+
+ hlist_add_head(&e->hlist, &p->chash.table[h]);
+}
+
+static void remove_cache_hash_entry(struct wb_cache_entry *e)
+{
+ hlist_del(&e->hlist);
+}
+
+/* Public interface (see dm-cache-policy.h */
+static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock,
+ bool can_block, bool can_migrate, bool discarded_oblock,
+ struct bio *bio, struct policy_result *result)
+{
+ struct policy *p = to_policy(pe);
+ struct wb_cache_entry *e;
+ unsigned long flags;
+
+ result->op = POLICY_MISS;
+
+ if (can_block)
+ spin_lock_irqsave(&p->lock, flags);
+
+ else if (!spin_trylock_irqsave(&p->lock, flags))
+ return -EWOULDBLOCK;
+
+ e = lookup_cache_entry(p, oblock);
+ if (e) {
+ result->op = POLICY_HIT;
+ result->cblock = e->cblock;
+
+ }
+
+ spin_unlock_irqrestore(&p->lock, flags);
+
+ return 0;
+}
+
+static int wb_lookup(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t *cblock)
+{
+ int r;
+ struct policy *p = to_policy(pe);
+ struct wb_cache_entry *e;
+ unsigned long flags;
+
+ if (!spin_trylock_irqsave(&p->lock, flags))
+ return -EWOULDBLOCK;
+
+ e = lookup_cache_entry(p, oblock);
+ if (e) {
+ *cblock = e->cblock;
+ r = 0;
+
+ } else
+ r = -ENOENT;
+
+ spin_unlock_irqrestore(&p->lock, flags);
+
+ return r;
+}
+
+static void __set_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock, bool set)
+{
+ struct policy *p = to_policy(pe);
+ struct wb_cache_entry *e;
+
+ e = lookup_cache_entry(p, oblock);
+ BUG_ON(!e);
+
+ if (set) {
+ if (!e->dirty) {
+ e->dirty = true;
+ list_move(&e->list, &p->dirty);
+ }
+
+ } else {
+ if (e->dirty) {
+ e->pending = false;
+ e->dirty = false;
+ list_move(&e->list, &p->clean);
+ }
+ }
+}
+
+static void wb_set_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
+{
+ struct policy *p = to_policy(pe);
+ unsigned long flags;
+
+ spin_lock_irqsave(&p->lock, flags);
+ __set_clear_dirty(pe, oblock, true);
+ spin_unlock_irqrestore(&p->lock, flags);
+}
+
+static void wb_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
+{
+ struct policy *p = to_policy(pe);
+ unsigned long flags;
+
+ spin_lock_irqsave(&p->lock, flags);
+ __set_clear_dirty(pe, oblock, false);
+ spin_unlock_irqrestore(&p->lock, flags);
+}
+
+static void add_cache_entry(struct policy *p, struct wb_cache_entry *e)
+{
+ insert_cache_hash_entry(p, e);
+ if (e->dirty)
+ list_add(&e->list, &p->dirty);
+ else
+ list_add(&e->list, &p->clean);
+}
+
+static int wb_load_mapping(struct dm_cache_policy *pe,
+ dm_oblock_t oblock, dm_cblock_t cblock,
+ uint32_t hint, bool hint_valid)
+{
+ int r;
+ struct policy *p = to_policy(pe);
+ struct wb_cache_entry *e = alloc_cache_entry(p);
+
+ if (e) {
+ e->cblock = cblock;
+ e->oblock = oblock;
+ e->dirty = false; /* blocks default to clean */
+ add_cache_entry(p, e);
+ r = 0;
+
+ } else
+ r = -ENOMEM;
+
+ return r;
+}
+
+static void wb_destroy(struct dm_cache_policy *pe)
+{
+ struct policy *p = to_policy(pe);
+
+ free_cache_blocks_and_hash(p);
+ kfree(p);
+}
+
+static struct wb_cache_entry *__wb_force_remove_mapping(struct policy *p, dm_oblock_t oblock)
+{
+ struct wb_cache_entry *r = lookup_cache_entry(p, oblock);
+
+ BUG_ON(!r);
+
+ remove_cache_hash_entry(r);
+ list_del(&r->list);
+
+ return r;
+}
+
+static void wb_remove_mapping(struct dm_cache_policy *pe, dm_oblock_t oblock)
+{
+ struct policy *p = to_policy(pe);
+ struct wb_cache_entry *e;
+ unsigned long flags;
+
+ spin_lock_irqsave(&p->lock, flags);
+ e = __wb_force_remove_mapping(p, oblock);
+ list_add_tail(&e->list, &p->free);
+ BUG_ON(!from_cblock(p->nr_cblocks_allocated));
+ p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) - 1);
+ spin_unlock_irqrestore(&p->lock, flags);
+}
+
+static void wb_force_mapping(struct dm_cache_policy *pe,
+ dm_oblock_t current_oblock, dm_oblock_t oblock)
+{
+ struct policy *p = to_policy(pe);
+ struct wb_cache_entry *e;
+ unsigned long flags;
+
+ spin_lock_irqsave(&p->lock, flags);
+ e = __wb_force_remove_mapping(p, current_oblock);
+ e->oblock = oblock;
+ add_cache_entry(p, e);
+ spin_unlock_irqrestore(&p->lock, flags);
+}
+
+static struct wb_cache_entry *get_next_dirty_entry(struct policy *p)
+{
+ struct list_head *l;
+ struct wb_cache_entry *r;
+
+ if (list_empty(&p->dirty))
+ return NULL;
+
+ l = list_pop(&p->dirty);
+ r = container_of(l, struct wb_cache_entry, list);
+ list_add(l, &p->clean_pending);
+
+ return r;
+}
+
+static int wb_writeback_work(struct dm_cache_policy *pe,
+ dm_oblock_t *oblock,
+ dm_cblock_t *cblock)
+{
+ int r = -ENOENT;
+ struct policy *p = to_policy(pe);
+ struct wb_cache_entry *e;
+ unsigned long flags;
+
+ spin_lock_irqsave(&p->lock, flags);
+
+ e = get_next_dirty_entry(p);
+ if (e) {
+ *oblock = e->oblock;
+ *cblock = e->cblock;
+ r = 0;
+ }
+
+ spin_unlock_irqrestore(&p->lock, flags);
+
+ return r;
+}
+
+static dm_cblock_t wb_residency(struct dm_cache_policy *pe)
+{
+ return to_policy(pe)->nr_cblocks_allocated;
+}
+
+/* Init the policy plugin interface function pointers. */
+static void init_policy_functions(struct policy *p)
+{
+ p->policy.destroy = wb_destroy;
+ p->policy.map = wb_map;
+ p->policy.lookup = wb_lookup;
+ p->policy.set_dirty = wb_set_dirty;
+ p->policy.clear_dirty = wb_clear_dirty;
+ p->policy.load_mapping = wb_load_mapping;
+ p->policy.walk_mappings = NULL;
+ p->policy.remove_mapping = wb_remove_mapping;
+ p->policy.writeback_work = wb_writeback_work;
+ p->policy.force_mapping = wb_force_mapping;
+ p->policy.residency = wb_residency;
+ p->policy.tick = NULL;
+}
+
+static struct dm_cache_policy *wb_create(dm_cblock_t cache_size,
+ sector_t origin_size,
+ sector_t cache_block_size)
+{
+ int r;
+ struct policy *p = kzalloc(sizeof(*p), GFP_KERNEL);
+
+ if (!p)
+ return NULL;
+
+ init_policy_functions(p);
+ INIT_LIST_HEAD(&p->free);
+ INIT_LIST_HEAD(&p->clean);
+ INIT_LIST_HEAD(&p->clean_pending);
+ INIT_LIST_HEAD(&p->dirty);
+
+ p->cache_size = cache_size;
+ spin_lock_init(&p->lock);
+
+ /* Allocate cache entry structs and add them to free list. */
+ r = alloc_cache_blocks_with_hash(p, cache_size);
+ if (!r)
+ return &p->policy;
+
+ kfree(p);
+
+ return NULL;
+}
+/*----------------------------------------------------------------------------*/
+
+static struct dm_cache_policy_type wb_policy_type = {
+ .name = "cleaner",
+ .hint_size = 0,
+ .owner = THIS_MODULE,
+ .create = wb_create
+};
+
+static int __init wb_init(void)
+{
+ int r = dm_cache_policy_register(&wb_policy_type);
+
+ if (r < 0)
+ DMERR("register failed %d", r);
+ else
+ DMINFO("version " CLEANER_VERSION " loaded");
+
+ return r;
+}
+
+static void __exit wb_exit(void)
+{
+ dm_cache_policy_unregister(&wb_policy_type);
+}
+
+module_init(wb_init);
+module_exit(wb_exit);
+
+MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("cleaner cache policy");
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
new file mode 100644
index 00000000000..52a75beeced
--- /dev/null
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_CACHE_POLICY_INTERNAL_H
+#define DM_CACHE_POLICY_INTERNAL_H
+
+#include "dm-cache-policy.h"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Little inline functions that simplify calling the policy methods.
+ */
+static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock,
+ bool can_block, bool can_migrate, bool discarded_oblock,
+ struct bio *bio, struct policy_result *result)
+{
+ return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, result);
+}
+
+static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
+{
+ BUG_ON(!p->lookup);
+ return p->lookup(p, oblock, cblock);
+}
+
+static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+ if (p->set_dirty)
+ p->set_dirty(p, oblock);
+}
+
+static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+ if (p->clear_dirty)
+ p->clear_dirty(p, oblock);
+}
+
+static inline int policy_load_mapping(struct dm_cache_policy *p,
+ dm_oblock_t oblock, dm_cblock_t cblock,
+ uint32_t hint, bool hint_valid)
+{
+ return p->load_mapping(p, oblock, cblock, hint, hint_valid);
+}
+
+static inline int policy_walk_mappings(struct dm_cache_policy *p,
+ policy_walk_fn fn, void *context)
+{
+ return p->walk_mappings ? p->walk_mappings(p, fn, context) : 0;
+}
+
+static inline int policy_writeback_work(struct dm_cache_policy *p,
+ dm_oblock_t *oblock,
+ dm_cblock_t *cblock)
+{
+ return p->writeback_work ? p->writeback_work(p, oblock, cblock) : -ENOENT;
+}
+
+static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+ return p->remove_mapping(p, oblock);
+}
+
+static inline void policy_force_mapping(struct dm_cache_policy *p,
+ dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+ return p->force_mapping(p, current_oblock, new_oblock);
+}
+
+static inline dm_cblock_t policy_residency(struct dm_cache_policy *p)
+{
+ return p->residency(p);
+}
+
+static inline void policy_tick(struct dm_cache_policy *p)
+{
+ if (p->tick)
+ return p->tick(p);
+}
+
+static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
+{
+ ssize_t sz = 0;
+ if (p->emit_config_values)
+ return p->emit_config_values(p, result, maxlen);
+
+ DMEMIT("0");
+ return 0;
+}
+
+static inline int policy_set_config_value(struct dm_cache_policy *p,
+ const char *key, const char *value)
+{
+ return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Creates a new cache policy given a policy name, a cache size, an origin size and the block size.
+ */
+struct dm_cache_policy *dm_cache_policy_create(const char *name, dm_cblock_t cache_size,
+ sector_t origin_size, sector_t block_size);
+
+/*
+ * Destroys the policy. This drops references to the policy module as well
+ * as calling it's destroy method. So always use this rather than calling
+ * the policy->destroy method directly.
+ */
+void dm_cache_policy_destroy(struct dm_cache_policy *p);
+
+/*
+ * In case we've forgotten.
+ */
+const char *dm_cache_policy_get_name(struct dm_cache_policy *p);
+
+size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p);
+
+/*----------------------------------------------------------------*/
+
+#endif /* DM_CACHE_POLICY_INTERNAL_H */
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
new file mode 100644
index 00000000000..96415325507
--- /dev/null
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -0,0 +1,1195 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-policy.h"
+#include "dm.h"
+
+#include <linux/hash.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "cache-policy-mq"
+#define MQ_VERSION "1.0.0"
+
+static struct kmem_cache *mq_entry_cache;
+
+/*----------------------------------------------------------------*/
+
+static unsigned next_power(unsigned n, unsigned min)
+{
+ return roundup_pow_of_two(max(n, min));
+}
+
+/*----------------------------------------------------------------*/
+
+static unsigned long *alloc_bitset(unsigned nr_entries)
+{
+ size_t s = sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
+ return vzalloc(s);
+}
+
+static void free_bitset(unsigned long *bits)
+{
+ vfree(bits);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Large, sequential ios are probably better left on the origin device since
+ * spindles tend to have good bandwidth.
+ *
+ * The io_tracker tries to spot when the io is in one of these sequential
+ * modes.
+ *
+ * Two thresholds to switch between random and sequential io mode are defaulting
+ * as follows and can be adjusted via the constructor and message interfaces.
+ */
+#define RANDOM_THRESHOLD_DEFAULT 4
+#define SEQUENTIAL_THRESHOLD_DEFAULT 512
+
+enum io_pattern {
+ PATTERN_SEQUENTIAL,
+ PATTERN_RANDOM
+};
+
+struct io_tracker {
+ enum io_pattern pattern;
+
+ unsigned nr_seq_samples;
+ unsigned nr_rand_samples;
+ unsigned thresholds[2];
+
+ dm_oblock_t last_end_oblock;
+};
+
+static void iot_init(struct io_tracker *t,
+ int sequential_threshold, int random_threshold)
+{
+ t->pattern = PATTERN_RANDOM;
+ t->nr_seq_samples = 0;
+ t->nr_rand_samples = 0;
+ t->last_end_oblock = 0;
+ t->thresholds[PATTERN_RANDOM] = random_threshold;
+ t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold;
+}
+
+static enum io_pattern iot_pattern(struct io_tracker *t)
+{
+ return t->pattern;
+}
+
+static void iot_update_stats(struct io_tracker *t, struct bio *bio)
+{
+ if (bio->bi_sector == from_oblock(t->last_end_oblock) + 1)
+ t->nr_seq_samples++;
+ else {
+ /*
+ * Just one non-sequential IO is enough to reset the
+ * counters.
+ */
+ if (t->nr_seq_samples) {
+ t->nr_seq_samples = 0;
+ t->nr_rand_samples = 0;
+ }
+
+ t->nr_rand_samples++;
+ }
+
+ t->last_end_oblock = to_oblock(bio->bi_sector + bio_sectors(bio) - 1);
+}
+
+static void iot_check_for_pattern_switch(struct io_tracker *t)
+{
+ switch (t->pattern) {
+ case PATTERN_SEQUENTIAL:
+ if (t->nr_rand_samples >= t->thresholds[PATTERN_RANDOM]) {
+ t->pattern = PATTERN_RANDOM;
+ t->nr_seq_samples = t->nr_rand_samples = 0;
+ }
+ break;
+
+ case PATTERN_RANDOM:
+ if (t->nr_seq_samples >= t->thresholds[PATTERN_SEQUENTIAL]) {
+ t->pattern = PATTERN_SEQUENTIAL;
+ t->nr_seq_samples = t->nr_rand_samples = 0;
+ }
+ break;
+ }
+}
+
+static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
+{
+ iot_update_stats(t, bio);
+ iot_check_for_pattern_switch(t);
+}
+
+/*----------------------------------------------------------------*/
+
+
+/*
+ * This queue is divided up into different levels. Allowing us to push
+ * entries to the back of any of the levels. Think of it as a partially
+ * sorted queue.
+ */
+#define NR_QUEUE_LEVELS 16u
+
+struct queue {
+ struct list_head qs[NR_QUEUE_LEVELS];
+};
+
+static void queue_init(struct queue *q)
+{
+ unsigned i;
+
+ for (i = 0; i < NR_QUEUE_LEVELS; i++)
+ INIT_LIST_HEAD(q->qs + i);
+}
+
+/*
+ * Insert an entry to the back of the given level.
+ */
+static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
+{
+ list_add_tail(elt, q->qs + level);
+}
+
+static void queue_remove(struct list_head *elt)
+{
+ list_del(elt);
+}
+
+/*
+ * Shifts all regions down one level. This has no effect on the order of
+ * the queue.
+ */
+static void queue_shift_down(struct queue *q)
+{
+ unsigned level;
+
+ for (level = 1; level < NR_QUEUE_LEVELS; level++)
+ list_splice_init(q->qs + level, q->qs + level - 1);
+}
+
+/*
+ * Gives us the oldest entry of the lowest popoulated level. If the first
+ * level is emptied then we shift down one level.
+ */
+static struct list_head *queue_pop(struct queue *q)
+{
+ unsigned level;
+ struct list_head *r;
+
+ for (level = 0; level < NR_QUEUE_LEVELS; level++)
+ if (!list_empty(q->qs + level)) {
+ r = q->qs[level].next;
+ list_del(r);
+
+ /* have we just emptied the bottom level? */
+ if (level == 0 && list_empty(q->qs))
+ queue_shift_down(q);
+
+ return r;
+ }
+
+ return NULL;
+}
+
+static struct list_head *list_pop(struct list_head *lh)
+{
+ struct list_head *r = lh->next;
+
+ BUG_ON(!r);
+ list_del_init(r);
+
+ return r;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Describes a cache entry. Used in both the cache and the pre_cache.
+ */
+struct entry {
+ struct hlist_node hlist;
+ struct list_head list;
+ dm_oblock_t oblock;
+ dm_cblock_t cblock; /* valid iff in_cache */
+
+ /*
+ * FIXME: pack these better
+ */
+ bool in_cache:1;
+ unsigned hit_count;
+ unsigned generation;
+ unsigned tick;
+};
+
+struct mq_policy {
+ struct dm_cache_policy policy;
+
+ /* protects everything */
+ struct mutex lock;
+ dm_cblock_t cache_size;
+ struct io_tracker tracker;
+
+ /*
+ * We maintain two queues of entries. The cache proper contains
+ * the currently active mappings. Whereas the pre_cache tracks
+ * blocks that are being hit frequently and potential candidates
+ * for promotion to the cache.
+ */
+ struct queue pre_cache;
+ struct queue cache;
+
+ /*
+ * Keeps track of time, incremented by the core. We use this to
+ * avoid attributing multiple hits within the same tick.
+ *
+ * Access to tick_protected should be done with the spin lock held.
+ * It's copied to tick at the start of the map function (within the
+ * mutex).
+ */
+ spinlock_t tick_lock;
+ unsigned tick_protected;
+ unsigned tick;
+
+ /*
+ * A count of the number of times the map function has been called
+ * and found an entry in the pre_cache or cache. Currently used to
+ * calculate the generation.
+ */
+ unsigned hit_count;
+
+ /*
+ * A generation is a longish period that is used to trigger some
+ * book keeping effects. eg, decrementing hit counts on entries.
+ * This is needed to allow the cache to evolve as io patterns
+ * change.
+ */
+ unsigned generation;
+ unsigned generation_period; /* in lookups (will probably change) */
+
+ /*
+ * Entries in the pre_cache whose hit count passes the promotion
+ * threshold move to the cache proper. Working out the correct
+ * value for the promotion_threshold is crucial to this policy.
+ */
+ unsigned promote_threshold;
+
+ /*
+ * We need cache_size entries for the cache, and choose to have
+ * cache_size entries for the pre_cache too. One motivation for
+ * using the same size is to make the hit counts directly
+ * comparable between pre_cache and cache.
+ */
+ unsigned nr_entries;
+ unsigned nr_entries_allocated;
+ struct list_head free;
+
+ /*
+ * Cache blocks may be unallocated. We store this info in a
+ * bitset.
+ */
+ unsigned long *allocation_bitset;
+ unsigned nr_cblocks_allocated;
+ unsigned find_free_nr_words;
+ unsigned find_free_last_word;
+
+ /*
+ * The hash table allows us to quickly find an entry by origin
+ * block. Both pre_cache and cache entries are in here.
+ */
+ unsigned nr_buckets;
+ dm_block_t hash_bits;
+ struct hlist_head *table;
+};
+
+/*----------------------------------------------------------------*/
+/* Free/alloc mq cache entry structures. */
+static void takeout_queue(struct list_head *lh, struct queue *q)
+{
+ unsigned level;
+
+ for (level = 0; level < NR_QUEUE_LEVELS; level++)
+ list_splice(q->qs + level, lh);
+}
+
+static void free_entries(struct mq_policy *mq)
+{
+ struct entry *e, *tmp;
+
+ takeout_queue(&mq->free, &mq->pre_cache);
+ takeout_queue(&mq->free, &mq->cache);
+
+ list_for_each_entry_safe(e, tmp, &mq->free, list)
+ kmem_cache_free(mq_entry_cache, e);
+}
+
+static int alloc_entries(struct mq_policy *mq, unsigned elts)
+{
+ unsigned u = mq->nr_entries;
+
+ INIT_LIST_HEAD(&mq->free);
+ mq->nr_entries_allocated = 0;
+
+ while (u--) {
+ struct entry *e = kmem_cache_zalloc(mq_entry_cache, GFP_KERNEL);
+
+ if (!e) {
+ free_entries(mq);
+ return -ENOMEM;
+ }
+
+
+ list_add(&e->list, &mq->free);
+ }
+
+ return 0;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Simple hash table implementation. Should replace with the standard hash
+ * table that's making its way upstream.
+ */
+static void hash_insert(struct mq_policy *mq, struct entry *e)
+{
+ unsigned h = hash_64(from_oblock(e->oblock), mq->hash_bits);
+
+ hlist_add_head(&e->hlist, mq->table + h);
+}
+
+static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock)
+{
+ unsigned h = hash_64(from_oblock(oblock), mq->hash_bits);
+ struct hlist_head *bucket = mq->table + h;
+ struct entry *e;
+
+ hlist_for_each_entry(e, bucket, hlist)
+ if (e->oblock == oblock) {
+ hlist_del(&e->hlist);
+ hlist_add_head(&e->hlist, bucket);
+ return e;
+ }
+
+ return NULL;
+}
+
+static void hash_remove(struct entry *e)
+{
+ hlist_del(&e->hlist);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Allocates a new entry structure. The memory is allocated in one lump,
+ * so we just handing it out here. Returns NULL if all entries have
+ * already been allocated. Cannot fail otherwise.
+ */
+static struct entry *alloc_entry(struct mq_policy *mq)
+{
+ struct entry *e;
+
+ if (mq->nr_entries_allocated >= mq->nr_entries) {
+ BUG_ON(!list_empty(&mq->free));
+ return NULL;
+ }
+
+ e = list_entry(list_pop(&mq->free), struct entry, list);
+ INIT_LIST_HEAD(&e->list);
+ INIT_HLIST_NODE(&e->hlist);
+
+ mq->nr_entries_allocated++;
+ return e;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Mark cache blocks allocated or not in the bitset.
+ */
+static void alloc_cblock(struct mq_policy *mq, dm_cblock_t cblock)
+{
+ BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size));
+ BUG_ON(test_bit(from_cblock(cblock), mq->allocation_bitset));
+
+ set_bit(from_cblock(cblock), mq->allocation_bitset);
+ mq->nr_cblocks_allocated++;
+}
+
+static void free_cblock(struct mq_policy *mq, dm_cblock_t cblock)
+{
+ BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size));
+ BUG_ON(!test_bit(from_cblock(cblock), mq->allocation_bitset));
+
+ clear_bit(from_cblock(cblock), mq->allocation_bitset);
+ mq->nr_cblocks_allocated--;
+}
+
+static bool any_free_cblocks(struct mq_policy *mq)
+{
+ return mq->nr_cblocks_allocated < from_cblock(mq->cache_size);
+}
+
+/*
+ * Fills result out with a cache block that isn't in use, or return
+ * -ENOSPC. This does _not_ mark the cblock as allocated, the caller is
+ * reponsible for that.
+ */
+static int __find_free_cblock(struct mq_policy *mq, unsigned begin, unsigned end,
+ dm_cblock_t *result, unsigned *last_word)
+{
+ int r = -ENOSPC;
+ unsigned w;
+
+ for (w = begin; w < end; w++) {
+ /*
+ * ffz is undefined if no zero exists
+ */
+ if (mq->allocation_bitset[w] != ~0UL) {
+ *last_word = w;
+ *result = to_cblock((w * BITS_PER_LONG) + ffz(mq->allocation_bitset[w]));
+ if (from_cblock(*result) < from_cblock(mq->cache_size))
+ r = 0;
+
+ break;
+ }
+ }
+
+ return r;
+}
+
+static int find_free_cblock(struct mq_policy *mq, dm_cblock_t *result)
+{
+ int r;
+
+ if (!any_free_cblocks(mq))
+ return -ENOSPC;
+
+ r = __find_free_cblock(mq, mq->find_free_last_word, mq->find_free_nr_words, result, &mq->find_free_last_word);
+ if (r == -ENOSPC && mq->find_free_last_word)
+ r = __find_free_cblock(mq, 0, mq->find_free_last_word, result, &mq->find_free_last_word);
+
+ return r;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Now we get to the meat of the policy. This section deals with deciding
+ * when to to add entries to the pre_cache and cache, and move between
+ * them.
+ */
+
+/*
+ * The queue level is based on the log2 of the hit count.
+ */
+static unsigned queue_level(struct entry *e)
+{
+ return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u);
+}
+
+/*
+ * Inserts the entry into the pre_cache or the cache. Ensures the cache
+ * block is marked as allocated if necc. Inserts into the hash table. Sets the
+ * tick which records when the entry was last moved about.
+ */
+static void push(struct mq_policy *mq, struct entry *e)
+{
+ e->tick = mq->tick;
+ hash_insert(mq, e);
+
+ if (e->in_cache) {
+ alloc_cblock(mq, e->cblock);
+ queue_push(&mq->cache, queue_level(e), &e->list);
+ } else
+ queue_push(&mq->pre_cache, queue_level(e), &e->list);
+}
+
+/*
+ * Removes an entry from pre_cache or cache. Removes from the hash table.
+ * Frees off the cache block if necc.
+ */
+static void del(struct mq_policy *mq, struct entry *e)
+{
+ queue_remove(&e->list);
+ hash_remove(e);
+ if (e->in_cache)
+ free_cblock(mq, e->cblock);
+}
+
+/*
+ * Like del, except it removes the first entry in the queue (ie. the least
+ * recently used).
+ */
+static struct entry *pop(struct mq_policy *mq, struct queue *q)
+{
+ struct entry *e = container_of(queue_pop(q), struct entry, list);
+
+ if (e) {
+ hash_remove(e);
+
+ if (e->in_cache)
+ free_cblock(mq, e->cblock);
+ }
+
+ return e;
+}
+
+/*
+ * Has this entry already been updated?
+ */
+static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
+{
+ return mq->tick == e->tick;
+}
+
+/*
+ * The promotion threshold is adjusted every generation. As are the counts
+ * of the entries.
+ *
+ * At the moment the threshold is taken by averaging the hit counts of some
+ * of the entries in the cache (the first 20 entries of the first level).
+ *
+ * We can be much cleverer than this though. For example, each promotion
+ * could bump up the threshold helping to prevent churn. Much more to do
+ * here.
+ */
+
+#define MAX_TO_AVERAGE 20
+
+static void check_generation(struct mq_policy *mq)
+{
+ unsigned total = 0, nr = 0, count = 0, level;
+ struct list_head *head;
+ struct entry *e;
+
+ if ((mq->hit_count >= mq->generation_period) &&
+ (mq->nr_cblocks_allocated == from_cblock(mq->cache_size))) {
+
+ mq->hit_count = 0;
+ mq->generation++;
+
+ for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) {
+ head = mq->cache.qs + level;
+ list_for_each_entry(e, head, list) {
+ nr++;
+ total += e->hit_count;
+
+ if (++count >= MAX_TO_AVERAGE)
+ break;
+ }
+ }
+
+ mq->promote_threshold = nr ? total / nr : 1;
+ if (mq->promote_threshold * nr < total)
+ mq->promote_threshold++;
+ }
+}
+
+/*
+ * Whenever we use an entry we bump up it's hit counter, and push it to the
+ * back to it's current level.
+ */
+static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
+{
+ if (updated_this_tick(mq, e))
+ return;
+
+ e->hit_count++;
+ mq->hit_count++;
+ check_generation(mq);
+
+ /* generation adjustment, to stop the counts increasing forever. */
+ /* FIXME: divide? */
+ /* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */
+ e->generation = mq->generation;
+
+ del(mq, e);
+ push(mq, e);
+}
+
+/*
+ * Demote the least recently used entry from the cache to the pre_cache.
+ * Returns the new cache entry to use, and the old origin block it was
+ * mapped to.
+ *
+ * We drop the hit count on the demoted entry back to 1 to stop it bouncing
+ * straight back into the cache if it's subsequently hit. There are
+ * various options here, and more experimentation would be good:
+ *
+ * - just forget about the demoted entry completely (ie. don't insert it
+ into the pre_cache).
+ * - divide the hit count rather that setting to some hard coded value.
+ * - set the hit count to a hard coded value other than 1, eg, is it better
+ * if it goes in at level 2?
+ */
+static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
+{
+ dm_cblock_t result;
+ struct entry *demoted = pop(mq, &mq->cache);
+
+ BUG_ON(!demoted);
+ result = demoted->cblock;
+ *oblock = demoted->oblock;
+ demoted->in_cache = false;
+ demoted->hit_count = 1;
+ push(mq, demoted);
+
+ return result;
+}
+
+/*
+ * We modify the basic promotion_threshold depending on the specific io.
+ *
+ * If the origin block has been discarded then there's no cost to copy it
+ * to the cache.
+ *
+ * We bias towards reads, since they can be demoted at no cost if they
+ * haven't been dirtied.
+ */
+#define DISCARDED_PROMOTE_THRESHOLD 1
+#define READ_PROMOTE_THRESHOLD 4
+#define WRITE_PROMOTE_THRESHOLD 8
+
+static unsigned adjusted_promote_threshold(struct mq_policy *mq,
+ bool discarded_oblock, int data_dir)
+{
+ if (discarded_oblock && any_free_cblocks(mq) && data_dir == WRITE)
+ /*
+ * We don't need to do any copying at all, so give this a
+ * very low threshold. In practice this only triggers
+ * during initial population after a format.
+ */
+ return DISCARDED_PROMOTE_THRESHOLD;
+
+ return data_dir == READ ?
+ (mq->promote_threshold + READ_PROMOTE_THRESHOLD) :
+ (mq->promote_threshold + WRITE_PROMOTE_THRESHOLD);
+}
+
+static bool should_promote(struct mq_policy *mq, struct entry *e,
+ bool discarded_oblock, int data_dir)
+{
+ return e->hit_count >=
+ adjusted_promote_threshold(mq, discarded_oblock, data_dir);
+}
+
+static int cache_entry_found(struct mq_policy *mq,
+ struct entry *e,
+ struct policy_result *result)
+{
+ requeue_and_update_tick(mq, e);
+
+ if (e->in_cache) {
+ result->op = POLICY_HIT;
+ result->cblock = e->cblock;
+ }
+
+ return 0;
+}
+
+/*
+ * Moves and entry from the pre_cache to the cache. The main work is
+ * finding which cache block to use.
+ */
+static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
+ struct policy_result *result)
+{
+ dm_cblock_t cblock;
+
+ if (find_free_cblock(mq, &cblock) == -ENOSPC) {
+ result->op = POLICY_REPLACE;
+ cblock = demote_cblock(mq, &result->old_oblock);
+ } else
+ result->op = POLICY_NEW;
+
+ result->cblock = e->cblock = cblock;
+
+ del(mq, e);
+ e->in_cache = true;
+ push(mq, e);
+
+ return 0;
+}
+
+static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
+ bool can_migrate, bool discarded_oblock,
+ int data_dir, struct policy_result *result)
+{
+ int r = 0;
+ bool updated = updated_this_tick(mq, e);
+
+ requeue_and_update_tick(mq, e);
+
+ if ((!discarded_oblock && updated) ||
+ !should_promote(mq, e, discarded_oblock, data_dir))
+ result->op = POLICY_MISS;
+ else if (!can_migrate)
+ r = -EWOULDBLOCK;
+ else
+ r = pre_cache_to_cache(mq, e, result);
+
+ return r;
+}
+
+static void insert_in_pre_cache(struct mq_policy *mq,
+ dm_oblock_t oblock)
+{
+ struct entry *e = alloc_entry(mq);
+
+ if (!e)
+ /*
+ * There's no spare entry structure, so we grab the least
+ * used one from the pre_cache.
+ */
+ e = pop(mq, &mq->pre_cache);
+
+ if (unlikely(!e)) {
+ DMWARN("couldn't pop from pre cache");
+ return;
+ }
+
+ e->in_cache = false;
+ e->oblock = oblock;
+ e->hit_count = 1;
+ e->generation = mq->generation;
+ push(mq, e);
+}
+
+static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
+ struct policy_result *result)
+{
+ struct entry *e;
+ dm_cblock_t cblock;
+
+ if (find_free_cblock(mq, &cblock) == -ENOSPC) {
+ result->op = POLICY_MISS;
+ insert_in_pre_cache(mq, oblock);
+ return;
+ }
+
+ e = alloc_entry(mq);
+ if (unlikely(!e)) {
+ result->op = POLICY_MISS;
+ return;
+ }
+
+ e->oblock = oblock;
+ e->cblock = cblock;
+ e->in_cache = true;
+ e->hit_count = 1;
+ e->generation = mq->generation;
+ push(mq, e);
+
+ result->op = POLICY_NEW;
+ result->cblock = e->cblock;
+}
+
+static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
+ bool can_migrate, bool discarded_oblock,
+ int data_dir, struct policy_result *result)
+{
+ if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) == 1) {
+ if (can_migrate)
+ insert_in_cache(mq, oblock, result);
+ else
+ return -EWOULDBLOCK;
+ } else {
+ insert_in_pre_cache(mq, oblock);
+ result->op = POLICY_MISS;
+ }
+
+ return 0;
+}
+
+/*
+ * Looks the oblock up in the hash table, then decides whether to put in
+ * pre_cache, or cache etc.
+ */
+static int map(struct mq_policy *mq, dm_oblock_t oblock,
+ bool can_migrate, bool discarded_oblock,
+ int data_dir, struct policy_result *result)
+{
+ int r = 0;
+ struct entry *e = hash_lookup(mq, oblock);
+
+ if (e && e->in_cache)
+ r = cache_entry_found(mq, e, result);
+ else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
+ result->op = POLICY_MISS;
+ else if (e)
+ r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
+ data_dir, result);
+ else
+ r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
+ data_dir, result);
+
+ if (r == -EWOULDBLOCK)
+ result->op = POLICY_MISS;
+
+ return r;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Public interface, via the policy struct. See dm-cache-policy.h for a
+ * description of these.
+ */
+
+static struct mq_policy *to_mq_policy(struct dm_cache_policy *p)
+{
+ return container_of(p, struct mq_policy, policy);
+}
+
+static void mq_destroy(struct dm_cache_policy *p)
+{
+ struct mq_policy *mq = to_mq_policy(p);
+
+ free_bitset(mq->allocation_bitset);
+ kfree(mq->table);
+ free_entries(mq);
+ kfree(mq);
+}
+
+static void copy_tick(struct mq_policy *mq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&mq->tick_lock, flags);
+ mq->tick = mq->tick_protected;
+ spin_unlock_irqrestore(&mq->tick_lock, flags);
+}
+
+static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
+ bool can_block, bool can_migrate, bool discarded_oblock,
+ struct bio *bio, struct policy_result *result)
+{
+ int r;
+ struct mq_policy *mq = to_mq_policy(p);
+
+ result->op = POLICY_MISS;
+
+ if (can_block)
+ mutex_lock(&mq->lock);
+ else if (!mutex_trylock(&mq->lock))
+ return -EWOULDBLOCK;
+
+ copy_tick(mq);
+
+ iot_examine_bio(&mq->tracker, bio);
+ r = map(mq, oblock, can_migrate, discarded_oblock,
+ bio_data_dir(bio), result);
+
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
+{
+ int r;
+ struct mq_policy *mq = to_mq_policy(p);
+ struct entry *e;
+
+ if (!mutex_trylock(&mq->lock))
+ return -EWOULDBLOCK;
+
+ e = hash_lookup(mq, oblock);
+ if (e && e->in_cache) {
+ *cblock = e->cblock;
+ r = 0;
+ } else
+ r = -ENOENT;
+
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+static int mq_load_mapping(struct dm_cache_policy *p,
+ dm_oblock_t oblock, dm_cblock_t cblock,
+ uint32_t hint, bool hint_valid)
+{
+ struct mq_policy *mq = to_mq_policy(p);
+ struct entry *e;
+
+ e = alloc_entry(mq);
+ if (!e)
+ return -ENOMEM;
+
+ e->cblock = cblock;
+ e->oblock = oblock;
+ e->in_cache = true;
+ e->hit_count = hint_valid ? hint : 1;
+ e->generation = mq->generation;
+ push(mq, e);
+
+ return 0;
+}
+
+static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
+ void *context)
+{
+ struct mq_policy *mq = to_mq_policy(p);
+ int r = 0;
+ struct entry *e;
+ unsigned level;
+
+ mutex_lock(&mq->lock);
+
+ for (level = 0; level < NR_QUEUE_LEVELS; level++)
+ list_for_each_entry(e, &mq->cache.qs[level], list) {
+ r = fn(context, e->cblock, e->oblock, e->hit_count);
+ if (r)
+ goto out;
+ }
+
+out:
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+static void remove_mapping(struct mq_policy *mq, dm_oblock_t oblock)
+{
+ struct entry *e = hash_lookup(mq, oblock);
+
+ BUG_ON(!e || !e->in_cache);
+
+ del(mq, e);
+ e->in_cache = false;
+ push(mq, e);
+}
+
+static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+ struct mq_policy *mq = to_mq_policy(p);
+
+ mutex_lock(&mq->lock);
+ remove_mapping(mq, oblock);
+ mutex_unlock(&mq->lock);
+}
+
+static void force_mapping(struct mq_policy *mq,
+ dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+ struct entry *e = hash_lookup(mq, current_oblock);
+
+ BUG_ON(!e || !e->in_cache);
+
+ del(mq, e);
+ e->oblock = new_oblock;
+ push(mq, e);
+}
+
+static void mq_force_mapping(struct dm_cache_policy *p,
+ dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+ struct mq_policy *mq = to_mq_policy(p);
+
+ mutex_lock(&mq->lock);
+ force_mapping(mq, current_oblock, new_oblock);
+ mutex_unlock(&mq->lock);
+}
+
+static dm_cblock_t mq_residency(struct dm_cache_policy *p)
+{
+ struct mq_policy *mq = to_mq_policy(p);
+
+ /* FIXME: lock mutex, not sure we can block here */
+ return to_cblock(mq->nr_cblocks_allocated);
+}
+
+static void mq_tick(struct dm_cache_policy *p)
+{
+ struct mq_policy *mq = to_mq_policy(p);
+ unsigned long flags;
+
+ spin_lock_irqsave(&mq->tick_lock, flags);
+ mq->tick_protected++;
+ spin_unlock_irqrestore(&mq->tick_lock, flags);
+}
+
+static int mq_set_config_value(struct dm_cache_policy *p,
+ const char *key, const char *value)
+{
+ struct mq_policy *mq = to_mq_policy(p);
+ enum io_pattern pattern;
+ unsigned long tmp;
+
+ if (!strcasecmp(key, "random_threshold"))
+ pattern = PATTERN_RANDOM;
+ else if (!strcasecmp(key, "sequential_threshold"))
+ pattern = PATTERN_SEQUENTIAL;
+ else
+ return -EINVAL;
+
+ if (kstrtoul(value, 10, &tmp))
+ return -EINVAL;
+
+ mq->tracker.thresholds[pattern] = tmp;
+
+ return 0;
+}
+
+static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
+{
+ ssize_t sz = 0;
+ struct mq_policy *mq = to_mq_policy(p);
+
+ DMEMIT("4 random_threshold %u sequential_threshold %u",
+ mq->tracker.thresholds[PATTERN_RANDOM],
+ mq->tracker.thresholds[PATTERN_SEQUENTIAL]);
+
+ return 0;
+}
+
+/* Init the policy plugin interface function pointers. */
+static void init_policy_functions(struct mq_policy *mq)
+{
+ mq->policy.destroy = mq_destroy;
+ mq->policy.map = mq_map;
+ mq->policy.lookup = mq_lookup;
+ mq->policy.load_mapping = mq_load_mapping;
+ mq->policy.walk_mappings = mq_walk_mappings;
+ mq->policy.remove_mapping = mq_remove_mapping;
+ mq->policy.writeback_work = NULL;
+ mq->policy.force_mapping = mq_force_mapping;
+ mq->policy.residency = mq_residency;
+ mq->policy.tick = mq_tick;
+ mq->policy.emit_config_values = mq_emit_config_values;
+ mq->policy.set_config_value = mq_set_config_value;
+}
+
+static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
+ sector_t origin_size,
+ sector_t cache_block_size)
+{
+ int r;
+ struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
+
+ if (!mq)
+ return NULL;
+
+ init_policy_functions(mq);
+ iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT);
+
+ mq->cache_size = cache_size;
+ mq->tick_protected = 0;
+ mq->tick = 0;
+ mq->hit_count = 0;
+ mq->generation = 0;
+ mq->promote_threshold = 0;
+ mutex_init(&mq->lock);
+ spin_lock_init(&mq->tick_lock);
+ mq->find_free_nr_words = dm_div_up(from_cblock(mq->cache_size), BITS_PER_LONG);
+ mq->find_free_last_word = 0;
+
+ queue_init(&mq->pre_cache);
+ queue_init(&mq->cache);
+ mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);
+
+ mq->nr_entries = 2 * from_cblock(cache_size);
+ r = alloc_entries(mq, mq->nr_entries);
+ if (r)
+ goto bad_cache_alloc;
+
+ mq->nr_entries_allocated = 0;
+ mq->nr_cblocks_allocated = 0;
+
+ mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
+ mq->hash_bits = ffs(mq->nr_buckets) - 1;
+ mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL);
+ if (!mq->table)
+ goto bad_alloc_table;
+
+ mq->allocation_bitset = alloc_bitset(from_cblock(cache_size));
+ if (!mq->allocation_bitset)
+ goto bad_alloc_bitset;
+
+ return &mq->policy;
+
+bad_alloc_bitset:
+ kfree(mq->table);
+bad_alloc_table:
+ free_entries(mq);
+bad_cache_alloc:
+ kfree(mq);
+
+ return NULL;
+}
+
+/*----------------------------------------------------------------*/
+
+static struct dm_cache_policy_type mq_policy_type = {
+ .name = "mq",
+ .hint_size = 4,
+ .owner = THIS_MODULE,
+ .create = mq_create
+};
+
+static struct dm_cache_policy_type default_policy_type = {
+ .name = "default",
+ .hint_size = 4,
+ .owner = THIS_MODULE,
+ .create = mq_create
+};
+
+static int __init mq_init(void)
+{
+ int r;
+
+ mq_entry_cache = kmem_cache_create("dm_mq_policy_cache_entry",
+ sizeof(struct entry),
+ __alignof__(struct entry),
+ 0, NULL);
+ if (!mq_entry_cache)
+ goto bad;
+
+ r = dm_cache_policy_register(&mq_policy_type);
+ if (r) {
+ DMERR("register failed %d", r);
+ goto bad_register_mq;
+ }
+
+ r = dm_cache_policy_register(&default_policy_type);
+ if (!r) {
+ DMINFO("version " MQ_VERSION " loaded");
+ return 0;
+ }
+
+ DMERR("register failed (as default) %d", r);
+
+ dm_cache_policy_unregister(&mq_policy_type);
+bad_register_mq:
+ kmem_cache_destroy(mq_entry_cache);
+bad:
+ return -ENOMEM;
+}
+
+static void __exit mq_exit(void)
+{
+ dm_cache_policy_unregister(&mq_policy_type);
+ dm_cache_policy_unregister(&default_policy_type);
+
+ kmem_cache_destroy(mq_entry_cache);
+}
+
+module_init(mq_init);
+module_exit(mq_exit);
+
+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("mq cache policy");
+
+MODULE_ALIAS("dm-cache-default");
diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c
new file mode 100644
index 00000000000..2cbf5fdaac5
--- /dev/null
+++ b/drivers/md/dm-cache-policy.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-policy-internal.h"
+#include "dm.h"
+
+#include <linux/module.h>
+#include <linux/slab.h>
+
+/*----------------------------------------------------------------*/
+
+#define DM_MSG_PREFIX "cache-policy"
+
+static DEFINE_SPINLOCK(register_lock);
+static LIST_HEAD(register_list);
+
+static struct dm_cache_policy_type *__find_policy(const char *name)
+{
+ struct dm_cache_policy_type *t;
+
+ list_for_each_entry(t, &register_list, list)
+ if (!strcmp(t->name, name))
+ return t;
+
+ return NULL;
+}
+
+static struct dm_cache_policy_type *__get_policy_once(const char *name)
+{
+ struct dm_cache_policy_type *t = __find_policy(name);
+
+ if (t && !try_module_get(t->owner)) {
+ DMWARN("couldn't get module %s", name);
+ t = ERR_PTR(-EINVAL);
+ }
+
+ return t;
+}
+
+static struct dm_cache_policy_type *get_policy_once(const char *name)
+{
+ struct dm_cache_policy_type *t;
+
+ spin_lock(&register_lock);
+ t = __get_policy_once(name);
+ spin_unlock(&register_lock);
+
+ return t;
+}
+
+static struct dm_cache_policy_type *get_policy(const char *name)
+{
+ struct dm_cache_policy_type *t;
+
+ t = get_policy_once(name);
+ if (IS_ERR(t))
+ return NULL;
+
+ if (t)
+ return t;
+
+ request_module("dm-cache-%s", name);
+
+ t = get_policy_once(name);
+ if (IS_ERR(t))
+ return NULL;
+
+ return t;
+}
+
+static void put_policy(struct dm_cache_policy_type *t)
+{
+ module_put(t->owner);
+}
+
+int dm_cache_policy_register(struct dm_cache_policy_type *type)
+{
+ int r;
+
+ /* One size fits all for now */
+ if (type->hint_size != 0 && type->hint_size != 4) {
+ DMWARN("hint size must be 0 or 4 but %llu supplied.", (unsigned long long) type->hint_size);
+ return -EINVAL;
+ }
+
+ spin_lock(&register_lock);
+ if (__find_policy(type->name)) {
+ DMWARN("attempt to register policy under duplicate name %s", type->name);
+ r = -EINVAL;
+ } else {
+ list_add(&type->list, &register_list);
+ r = 0;
+ }
+ spin_unlock(&register_lock);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_register);
+
+void dm_cache_policy_unregister(struct dm_cache_policy_type *type)
+{
+ spin_lock(&register_lock);
+ list_del_init(&type->list);
+ spin_unlock(&register_lock);
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_unregister);
+
+struct dm_cache_policy *dm_cache_policy_create(const char *name,
+ dm_cblock_t cache_size,
+ sector_t origin_size,
+ sector_t cache_block_size)
+{
+ struct dm_cache_policy *p = NULL;
+ struct dm_cache_policy_type *type;
+
+ type = get_policy(name);
+ if (!type) {
+ DMWARN("unknown policy type");
+ return NULL;
+ }
+
+ p = type->create(cache_size, origin_size, cache_block_size);
+ if (!p) {
+ put_policy(type);
+ return NULL;
+ }
+ p->private = type;
+
+ return p;
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_create);
+
+void dm_cache_policy_destroy(struct dm_cache_policy *p)
+{
+ struct dm_cache_policy_type *t = p->private;
+
+ p->destroy(p);
+ put_policy(t);
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_destroy);
+
+const char *dm_cache_policy_get_name(struct dm_cache_policy *p)
+{
+ struct dm_cache_policy_type *t = p->private;
+
+ return t->name;
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_get_name);
+
+size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p)
+{
+ struct dm_cache_policy_type *t = p->private;
+
+ return t->hint_size;
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_get_hint_size);
+
+/*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
new file mode 100644
index 00000000000..f0f51b26054
--- /dev/null
+++ b/drivers/md/dm-cache-policy.h
@@ -0,0 +1,228 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_CACHE_POLICY_H
+#define DM_CACHE_POLICY_H
+
+#include "dm-cache-block-types.h"
+
+#include <linux/device-mapper.h>
+
+/*----------------------------------------------------------------*/
+
+/* FIXME: make it clear which methods are optional. Get debug policy to
+ * double check this at start.
+ */
+
+/*
+ * The cache policy makes the important decisions about which blocks get to
+ * live on the faster cache device.
+ *
+ * When the core target has to remap a bio it calls the 'map' method of the
+ * policy. This returns an instruction telling the core target what to do.
+ *
+ * POLICY_HIT:
+ * That block is in the cache. Remap to the cache and carry on.
+ *
+ * POLICY_MISS:
+ * This block is on the origin device. Remap and carry on.
+ *
+ * POLICY_NEW:
+ * This block is currently on the origin device, but the policy wants to
+ * move it. The core should:
+ *
+ * - hold any further io to this origin block
+ * - copy the origin to the given cache block
+ * - release all the held blocks
+ * - remap the original block to the cache
+ *
+ * POLICY_REPLACE:
+ * This block is currently on the origin device. The policy wants to
+ * move it to the cache, with the added complication that the destination
+ * cache block needs a writeback first. The core should:
+ *
+ * - hold any further io to this origin block
+ * - hold any further io to the origin block that's being written back
+ * - writeback
+ * - copy new block to cache
+ * - release held blocks
+ * - remap bio to cache and reissue.
+ *
+ * Should the core run into trouble while processing a POLICY_NEW or
+ * POLICY_REPLACE instruction it will roll back the policies mapping using
+ * remove_mapping() or force_mapping(). These methods must not fail. This
+ * approach avoids having transactional semantics in the policy (ie, the
+ * core informing the policy when a migration is complete), and hence makes
+ * it easier to write new policies.
+ *
+ * In general policy methods should never block, except in the case of the
+ * map function when can_migrate is set. So be careful to implement using
+ * bounded, preallocated memory.
+ */
+enum policy_operation {
+ POLICY_HIT,
+ POLICY_MISS,
+ POLICY_NEW,
+ POLICY_REPLACE
+};
+
+/*
+ * This is the instruction passed back to the core target.
+ */
+struct policy_result {
+ enum policy_operation op;
+ dm_oblock_t old_oblock; /* POLICY_REPLACE */
+ dm_cblock_t cblock; /* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */
+};
+
+typedef int (*policy_walk_fn)(void *context, dm_cblock_t cblock,
+ dm_oblock_t oblock, uint32_t hint);
+
+/*
+ * The cache policy object. Just a bunch of methods. It is envisaged that
+ * this structure will be embedded in a bigger, policy specific structure
+ * (ie. use container_of()).
+ */
+struct dm_cache_policy {
+
+ /*
+ * FIXME: make it clear which methods are optional, and which may
+ * block.
+ */
+
+ /*
+ * Destroys this object.
+ */
+ void (*destroy)(struct dm_cache_policy *p);
+
+ /*
+ * See large comment above.
+ *
+ * oblock - the origin block we're interested in.
+ *
+ * can_block - indicates whether the current thread is allowed to
+ * block. -EWOULDBLOCK returned if it can't and would.
+ *
+ * can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE
+ * instructions. If denied and the policy would have
+ * returned one of these instructions it should
+ * return -EWOULDBLOCK.
+ *
+ * discarded_oblock - indicates whether the whole origin block is
+ * in a discarded state (FIXME: better to tell the
+ * policy about this sooner, so it can recycle that
+ * cache block if it wants.)
+ * bio - the bio that triggered this call.
+ * result - gets filled in with the instruction.
+ *
+ * May only return 0, or -EWOULDBLOCK (if !can_migrate)
+ */
+ int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock,
+ bool can_block, bool can_migrate, bool discarded_oblock,
+ struct bio *bio, struct policy_result *result);
+
+ /*
+ * Sometimes we want to see if a block is in the cache, without
+ * triggering any update of stats. (ie. it's not a real hit).
+ *
+ * Must not block.
+ *
+ * Returns 1 iff in cache, 0 iff not, < 0 on error (-EWOULDBLOCK
+ * would be typical).
+ */
+ int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock);
+
+ /*
+ * oblock must be a mapped block. Must not block.
+ */
+ void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
+ void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
+
+ /*
+ * Called when a cache target is first created. Used to load a
+ * mapping from the metadata device into the policy.
+ */
+ int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock,
+ dm_cblock_t cblock, uint32_t hint, bool hint_valid);
+
+ int (*walk_mappings)(struct dm_cache_policy *p, policy_walk_fn fn,
+ void *context);
+
+ /*
+ * Override functions used on the error paths of the core target.
+ * They must succeed.
+ */
+ void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock);
+ void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock,
+ dm_oblock_t new_oblock);
+
+ int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock);
+
+
+ /*
+ * How full is the cache?
+ */
+ dm_cblock_t (*residency)(struct dm_cache_policy *p);
+
+ /*
+ * Because of where we sit in the block layer, we can be asked to
+ * map a lot of little bios that are all in the same block (no
+ * queue merging has occurred). To stop the policy being fooled by
+ * these the core target sends regular tick() calls to the policy.
+ * The policy should only count an entry as hit once per tick.
+ */
+ void (*tick)(struct dm_cache_policy *p);
+
+ /*
+ * Configuration.
+ */
+ int (*emit_config_values)(struct dm_cache_policy *p,
+ char *result, unsigned maxlen);
+ int (*set_config_value)(struct dm_cache_policy *p,
+ const char *key, const char *value);
+
+ /*
+ * Book keeping ptr for the policy register, not for general use.
+ */
+ void *private;
+};
+
+/*----------------------------------------------------------------*/
+
+/*
+ * We maintain a little register of the different policy types.
+ */
+#define CACHE_POLICY_NAME_SIZE 16
+
+struct dm_cache_policy_type {
+ /* For use by the register code only. */
+ struct list_head list;
+
+ /*
+ * Policy writers should fill in these fields. The name field is
+ * what gets passed on the target line to select your policy.
+ */
+ char name[CACHE_POLICY_NAME_SIZE];
+
+ /*
+ * Policies may store a hint for each each cache block.
+ * Currently the size of this hint must be 0 or 4 bytes but we
+ * expect to relax this in future.
+ */
+ size_t hint_size;
+
+ struct module *owner;
+ struct dm_cache_policy *(*create)(dm_cblock_t cache_size,
+ sector_t origin_size,
+ sector_t block_size);
+};
+
+int dm_cache_policy_register(struct dm_cache_policy_type *type);
+void dm_cache_policy_unregister(struct dm_cache_policy_type *type);
+
+/*----------------------------------------------------------------*/
+
+#endif /* DM_CACHE_POLICY_H */
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
new file mode 100644
index 00000000000..0f4e84b15c3
--- /dev/null
+++ b/drivers/md/dm-cache-target.c
@@ -0,0 +1,2584 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+#include "dm-bio-prison.h"
+#include "dm-cache-metadata.h"
+
+#include <linux/dm-io.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/init.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "cache"
+
+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
+ "A percentage of time allocated for copying to and/or from cache");
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Glossary:
+ *
+ * oblock: index of an origin block
+ * cblock: index of a cache block
+ * promotion: movement of a block from origin to cache
+ * demotion: movement of a block from cache to origin
+ * migration: movement of a block between the origin and cache device,
+ * either direction
+ */
+
+/*----------------------------------------------------------------*/
+
+static size_t bitset_size_in_bytes(unsigned nr_entries)
+{
+ return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
+}
+
+static unsigned long *alloc_bitset(unsigned nr_entries)
+{
+ size_t s = bitset_size_in_bytes(nr_entries);
+ return vzalloc(s);
+}
+
+static void clear_bitset(void *bitset, unsigned nr_entries)
+{
+ size_t s = bitset_size_in_bytes(nr_entries);
+ memset(bitset, 0, s);
+}
+
+static void free_bitset(unsigned long *bits)
+{
+ vfree(bits);
+}
+
+/*----------------------------------------------------------------*/
+
+#define PRISON_CELLS 1024
+#define MIGRATION_POOL_SIZE 128
+#define COMMIT_PERIOD HZ
+#define MIGRATION_COUNT_WINDOW 10
+
+/*
+ * The block size of the device holding cache data must be >= 32KB
+ */
+#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
+
+/*
+ * FIXME: the cache is read/write for the time being.
+ */
+enum cache_mode {
+ CM_WRITE, /* metadata may be changed */
+ CM_READ_ONLY, /* metadata may not be changed */
+};
+
+struct cache_features {
+ enum cache_mode mode;
+ bool write_through:1;
+};
+
+struct cache_stats {
+ atomic_t read_hit;
+ atomic_t read_miss;
+ atomic_t write_hit;
+ atomic_t write_miss;
+ atomic_t demotion;
+ atomic_t promotion;
+ atomic_t copies_avoided;
+ atomic_t cache_cell_clash;
+ atomic_t commit_count;
+ atomic_t discard_count;
+};
+
+struct cache {
+ struct dm_target *ti;
+ struct dm_target_callbacks callbacks;
+
+ /*
+ * Metadata is written to this device.
+ */
+ struct dm_dev *metadata_dev;
+
+ /*
+ * The slower of the two data devices. Typically a spindle.
+ */
+ struct dm_dev *origin_dev;
+
+ /*
+ * The faster of the two data devices. Typically an SSD.
+ */
+ struct dm_dev *cache_dev;
+
+ /*
+ * Cache features such as write-through.
+ */
+ struct cache_features features;
+
+ /*
+ * Size of the origin device in _complete_ blocks and native sectors.
+ */
+ dm_oblock_t origin_blocks;
+ sector_t origin_sectors;
+
+ /*
+ * Size of the cache device in blocks.
+ */
+ dm_cblock_t cache_size;
+
+ /*
+ * Fields for converting from sectors to blocks.
+ */
+ uint32_t sectors_per_block;
+ int sectors_per_block_shift;
+
+ struct dm_cache_metadata *cmd;
+
+ spinlock_t lock;
+ struct bio_list deferred_bios;
+ struct bio_list deferred_flush_bios;
+ struct list_head quiesced_migrations;
+ struct list_head completed_migrations;
+ struct list_head need_commit_migrations;
+ sector_t migration_threshold;
+ atomic_t nr_migrations;
+ wait_queue_head_t migration_wait;
+
+ /*
+ * cache_size entries, dirty if set
+ */
+ dm_cblock_t nr_dirty;
+ unsigned long *dirty_bitset;
+
+ /*
+ * origin_blocks entries, discarded if set.
+ */
+ sector_t discard_block_size; /* a power of 2 times sectors per block */
+ dm_dblock_t discard_nr_blocks;
+ unsigned long *discard_bitset;
+
+ struct dm_kcopyd_client *copier;
+ struct workqueue_struct *wq;
+ struct work_struct worker;
+
+ struct delayed_work waker;
+ unsigned long last_commit_jiffies;
+
+ struct dm_bio_prison *prison;
+ struct dm_deferred_set *all_io_ds;
+
+ mempool_t *migration_pool;
+ struct dm_cache_migration *next_migration;
+
+ struct dm_cache_policy *policy;
+ unsigned policy_nr_args;
+
+ bool need_tick_bio:1;
+ bool sized:1;
+ bool quiescing:1;
+ bool commit_requested:1;
+ bool loaded_mappings:1;
+ bool loaded_discards:1;
+
+ struct cache_stats stats;
+
+ /*
+ * Rather than reconstructing the table line for the status we just
+ * save it and regurgitate.
+ */
+ unsigned nr_ctr_args;
+ const char **ctr_args;
+};
+
+struct per_bio_data {
+ bool tick:1;
+ unsigned req_nr:2;
+ struct dm_deferred_entry *all_io_entry;
+};
+
+struct dm_cache_migration {
+ struct list_head list;
+ struct cache *cache;
+
+ unsigned long start_jiffies;
+ dm_oblock_t old_oblock;
+ dm_oblock_t new_oblock;
+ dm_cblock_t cblock;
+
+ bool err:1;
+ bool writeback:1;
+ bool demote:1;
+ bool promote:1;
+
+ struct dm_bio_prison_cell *old_ocell;
+ struct dm_bio_prison_cell *new_ocell;
+};
+
+/*
+ * Processing a bio in the worker thread may require these memory
+ * allocations. We prealloc to avoid deadlocks (the same worker thread
+ * frees them back to the mempool).
+ */
+struct prealloc {
+ struct dm_cache_migration *mg;
+ struct dm_bio_prison_cell *cell1;
+ struct dm_bio_prison_cell *cell2;
+};
+
+static void wake_worker(struct cache *cache)
+{
+ queue_work(cache->wq, &cache->worker);
+}
+
+/*----------------------------------------------------------------*/
+
+static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
+{
+ /* FIXME: change to use a local slab. */
+ return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
+}
+
+static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
+{
+ dm_bio_prison_free_cell(cache->prison, cell);
+}
+
+static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
+{
+ if (!p->mg) {
+ p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
+ if (!p->mg)
+ return -ENOMEM;
+ }
+
+ if (!p->cell1) {
+ p->cell1 = alloc_prison_cell(cache);
+ if (!p->cell1)
+ return -ENOMEM;
+ }
+
+ if (!p->cell2) {
+ p->cell2 = alloc_prison_cell(cache);
+ if (!p->cell2)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
+{
+ if (p->cell2)
+ free_prison_cell(cache, p->cell2);
+
+ if (p->cell1)
+ free_prison_cell(cache, p->cell1);
+
+ if (p->mg)
+ mempool_free(p->mg, cache->migration_pool);
+}
+
+static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
+{
+ struct dm_cache_migration *mg = p->mg;
+
+ BUG_ON(!mg);
+ p->mg = NULL;
+
+ return mg;
+}
+
+/*
+ * You must have a cell within the prealloc struct to return. If not this
+ * function will BUG() rather than returning NULL.
+ */
+static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
+{
+ struct dm_bio_prison_cell *r = NULL;
+
+ if (p->cell1) {
+ r = p->cell1;
+ p->cell1 = NULL;
+
+ } else if (p->cell2) {
+ r = p->cell2;
+ p->cell2 = NULL;
+ } else
+ BUG();
+
+ return r;
+}
+
+/*
+ * You can't have more than two cells in a prealloc struct. BUG() will be
+ * called if you try and overfill.
+ */
+static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
+{
+ if (!p->cell2)
+ p->cell2 = cell;
+
+ else if (!p->cell1)
+ p->cell1 = cell;
+
+ else
+ BUG();
+}
+
+/*----------------------------------------------------------------*/
+
+static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
+{
+ key->virtual = 0;
+ key->dev = 0;
+ key->block = from_oblock(oblock);
+}
+
+/*
+ * The caller hands in a preallocated cell, and a free function for it.
+ * The cell will be freed if there's an error, or if it wasn't used because
+ * a cell with that key already exists.
+ */
+typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
+
+static int bio_detain(struct cache *cache, dm_oblock_t oblock,
+ struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
+ cell_free_fn free_fn, void *free_context,
+ struct dm_bio_prison_cell **cell_result)
+{
+ int r;
+ struct dm_cell_key key;
+
+ build_key(oblock, &key);
+ r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
+ if (r)
+ free_fn(free_context, cell_prealloc);
+
+ return r;
+}
+
+static int get_cell(struct cache *cache,
+ dm_oblock_t oblock,
+ struct prealloc *structs,
+ struct dm_bio_prison_cell **cell_result)
+{
+ int r;
+ struct dm_cell_key key;
+ struct dm_bio_prison_cell *cell_prealloc;
+
+ cell_prealloc = prealloc_get_cell(structs);
+
+ build_key(oblock, &key);
+ r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
+ if (r)
+ prealloc_put_cell(structs, cell_prealloc);
+
+ return r;
+}
+
+ /*----------------------------------------------------------------*/
+
+static bool is_dirty(struct cache *cache, dm_cblock_t b)
+{
+ return test_bit(from_cblock(b), cache->dirty_bitset);
+}
+
+static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
+{
+ if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
+ cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1);
+ policy_set_dirty(cache->policy, oblock);
+ }
+}
+
+static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
+{
+ if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
+ policy_clear_dirty(cache->policy, oblock);
+ cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1);
+ if (!from_cblock(cache->nr_dirty))
+ dm_table_event(cache->ti->table);
+ }
+}
+
+/*----------------------------------------------------------------*/
+static bool block_size_is_power_of_two(struct cache *cache)
+{
+ return cache->sectors_per_block_shift >= 0;
+}
+
+static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
+{
+ sector_t discard_blocks = cache->discard_block_size;
+ dm_block_t b = from_oblock(oblock);
+
+ if (!block_size_is_power_of_two(cache))
+ (void) sector_div(discard_blocks, cache->sectors_per_block);
+ else
+ discard_blocks >>= cache->sectors_per_block_shift;
+
+ (void) sector_div(b, discard_blocks);
+
+ return to_dblock(b);
+}
+
+static void set_discard(struct cache *cache, dm_dblock_t b)
+{
+ unsigned long flags;
+
+ atomic_inc(&cache->stats.discard_count);
+
+ spin_lock_irqsave(&cache->lock, flags);
+ set_bit(from_dblock(b), cache->discard_bitset);
+ spin_unlock_irqrestore(&cache->lock, flags);
+}
+
+static void clear_discard(struct cache *cache, dm_dblock_t b)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ clear_bit(from_dblock(b), cache->discard_bitset);
+ spin_unlock_irqrestore(&cache->lock, flags);
+}
+
+static bool is_discarded(struct cache *cache, dm_dblock_t b)
+{
+ int r;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ r = test_bit(from_dblock(b), cache->discard_bitset);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ return r;
+}
+
+static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
+{
+ int r;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
+ cache->discard_bitset);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ return r;
+}
+
+/*----------------------------------------------------------------*/
+
+static void load_stats(struct cache *cache)
+{
+ struct dm_cache_statistics stats;
+
+ dm_cache_metadata_get_stats(cache->cmd, &stats);
+ atomic_set(&cache->stats.read_hit, stats.read_hits);
+ atomic_set(&cache->stats.read_miss, stats.read_misses);
+ atomic_set(&cache->stats.write_hit, stats.write_hits);
+ atomic_set(&cache->stats.write_miss, stats.write_misses);
+}
+
+static void save_stats(struct cache *cache)
+{
+ struct dm_cache_statistics stats;
+
+ stats.read_hits = atomic_read(&cache->stats.read_hit);
+ stats.read_misses = atomic_read(&cache->stats.read_miss);
+ stats.write_hits = atomic_read(&cache->stats.write_hit);
+ stats.write_misses = atomic_read(&cache->stats.write_miss);
+
+ dm_cache_metadata_set_stats(cache->cmd, &stats);
+}
+
+/*----------------------------------------------------------------
+ * Per bio data
+ *--------------------------------------------------------------*/
+static struct per_bio_data *get_per_bio_data(struct bio *bio)
+{
+ struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
+ BUG_ON(!pb);
+ return pb;
+}
+
+static struct per_bio_data *init_per_bio_data(struct bio *bio)
+{
+ struct per_bio_data *pb = get_per_bio_data(bio);
+
+ pb->tick = false;
+ pb->req_nr = dm_bio_get_target_bio_nr(bio);
+ pb->all_io_entry = NULL;
+
+ return pb;
+}
+
+/*----------------------------------------------------------------
+ * Remapping
+ *--------------------------------------------------------------*/
+static void remap_to_origin(struct cache *cache, struct bio *bio)
+{
+ bio->bi_bdev = cache->origin_dev->bdev;
+}
+
+static void remap_to_cache(struct cache *cache, struct bio *bio,
+ dm_cblock_t cblock)
+{
+ sector_t bi_sector = bio->bi_sector;
+
+ bio->bi_bdev = cache->cache_dev->bdev;
+ if (!block_size_is_power_of_two(cache))
+ bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
+ sector_div(bi_sector, cache->sectors_per_block);
+ else
+ bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) |
+ (bi_sector & (cache->sectors_per_block - 1));
+}
+
+static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
+{
+ unsigned long flags;
+ struct per_bio_data *pb = get_per_bio_data(bio);
+
+ spin_lock_irqsave(&cache->lock, flags);
+ if (cache->need_tick_bio &&
+ !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
+ pb->tick = true;
+ cache->need_tick_bio = false;
+ }
+ spin_unlock_irqrestore(&cache->lock, flags);
+}
+
+static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
+ dm_oblock_t oblock)
+{
+ check_if_tick_bio_needed(cache, bio);
+ remap_to_origin(cache, bio);
+ if (bio_data_dir(bio) == WRITE)
+ clear_discard(cache, oblock_to_dblock(cache, oblock));
+}
+
+static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
+ dm_oblock_t oblock, dm_cblock_t cblock)
+{
+ remap_to_cache(cache, bio, cblock);
+ if (bio_data_dir(bio) == WRITE) {
+ set_dirty(cache, oblock, cblock);
+ clear_discard(cache, oblock_to_dblock(cache, oblock));
+ }
+}
+
+static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
+{
+ sector_t block_nr = bio->bi_sector;
+
+ if (!block_size_is_power_of_two(cache))
+ (void) sector_div(block_nr, cache->sectors_per_block);
+ else
+ block_nr >>= cache->sectors_per_block_shift;
+
+ return to_oblock(block_nr);
+}
+
+static int bio_triggers_commit(struct cache *cache, struct bio *bio)
+{
+ return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
+}
+
+static void issue(struct cache *cache, struct bio *bio)
+{
+ unsigned long flags;
+
+ if (!bio_triggers_commit(cache, bio)) {
+ generic_make_request(bio);
+ return;
+ }
+
+ /*
+ * Batch together any bios that trigger commits and then issue a
+ * single commit for them in do_worker().
+ */
+ spin_lock_irqsave(&cache->lock, flags);
+ cache->commit_requested = true;
+ bio_list_add(&cache->deferred_flush_bios, bio);
+ spin_unlock_irqrestore(&cache->lock, flags);
+}
+
+/*----------------------------------------------------------------
+ * Migration processing
+ *
+ * Migration covers moving data from the origin device to the cache, or
+ * vice versa.
+ *--------------------------------------------------------------*/
+static void free_migration(struct dm_cache_migration *mg)
+{
+ mempool_free(mg, mg->cache->migration_pool);
+}
+
+static void inc_nr_migrations(struct cache *cache)
+{
+ atomic_inc(&cache->nr_migrations);
+}
+
+static void dec_nr_migrations(struct cache *cache)
+{
+ atomic_dec(&cache->nr_migrations);
+
+ /*
+ * Wake the worker in case we're suspending the target.
+ */
+ wake_up(&cache->migration_wait);
+}
+
+static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
+ bool holder)
+{
+ (holder ? dm_cell_release : dm_cell_release_no_holder)
+ (cache->prison, cell, &cache->deferred_bios);
+ free_prison_cell(cache, cell);
+}
+
+static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
+ bool holder)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ __cell_defer(cache, cell, holder);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ wake_worker(cache);
+}
+
+static void cleanup_migration(struct dm_cache_migration *mg)
+{
+ dec_nr_migrations(mg->cache);
+ free_migration(mg);
+}
+
+static void migration_failure(struct dm_cache_migration *mg)
+{
+ struct cache *cache = mg->cache;
+
+ if (mg->writeback) {
+ DMWARN_LIMIT("writeback failed; couldn't copy block");
+ set_dirty(cache, mg->old_oblock, mg->cblock);
+ cell_defer(cache, mg->old_ocell, false);
+
+ } else if (mg->demote) {
+ DMWARN_LIMIT("demotion failed; couldn't copy block");
+ policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
+
+ cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
+ if (mg->promote)
+ cell_defer(cache, mg->new_ocell, 1);
+ } else {
+ DMWARN_LIMIT("promotion failed; couldn't copy block");
+ policy_remove_mapping(cache->policy, mg->new_oblock);
+ cell_defer(cache, mg->new_ocell, 1);
+ }
+
+ cleanup_migration(mg);
+}
+
+static void migration_success_pre_commit(struct dm_cache_migration *mg)
+{
+ unsigned long flags;
+ struct cache *cache = mg->cache;
+
+ if (mg->writeback) {
+ cell_defer(cache, mg->old_ocell, false);
+ clear_dirty(cache, mg->old_oblock, mg->cblock);
+ cleanup_migration(mg);
+ return;
+
+ } else if (mg->demote) {
+ if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
+ DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
+ policy_force_mapping(cache->policy, mg->new_oblock,
+ mg->old_oblock);
+ if (mg->promote)
+ cell_defer(cache, mg->new_ocell, true);
+ cleanup_migration(mg);
+ return;
+ }
+ } else {
+ if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
+ DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
+ policy_remove_mapping(cache->policy, mg->new_oblock);
+ cleanup_migration(mg);
+ return;
+ }
+ }
+
+ spin_lock_irqsave(&cache->lock, flags);
+ list_add_tail(&mg->list, &cache->need_commit_migrations);
+ cache->commit_requested = true;
+ spin_unlock_irqrestore(&cache->lock, flags);
+}
+
+static void migration_success_post_commit(struct dm_cache_migration *mg)
+{
+ unsigned long flags;
+ struct cache *cache = mg->cache;
+
+ if (mg->writeback) {
+ DMWARN("writeback unexpectedly triggered commit");
+ return;
+
+ } else if (mg->demote) {
+ cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
+
+ if (mg->promote) {
+ mg->demote = false;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ list_add_tail(&mg->list, &cache->quiesced_migrations);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ } else
+ cleanup_migration(mg);
+
+ } else {
+ cell_defer(cache, mg->new_ocell, true);
+ clear_dirty(cache, mg->new_oblock, mg->cblock);
+ cleanup_migration(mg);
+ }
+}
+
+static void copy_complete(int read_err, unsigned long write_err, void *context)
+{
+ unsigned long flags;
+ struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
+ struct cache *cache = mg->cache;
+
+ if (read_err || write_err)
+ mg->err = true;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ list_add_tail(&mg->list, &cache->completed_migrations);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ wake_worker(cache);
+}
+
+static void issue_copy_real(struct dm_cache_migration *mg)
+{
+ int r;
+ struct dm_io_region o_region, c_region;
+ struct cache *cache = mg->cache;
+
+ o_region.bdev = cache->origin_dev->bdev;
+ o_region.count = cache->sectors_per_block;
+
+ c_region.bdev = cache->cache_dev->bdev;
+ c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
+ c_region.count = cache->sectors_per_block;
+
+ if (mg->writeback || mg->demote) {
+ /* demote */
+ o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
+ r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
+ } else {
+ /* promote */
+ o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
+ r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
+ }
+
+ if (r < 0)
+ migration_failure(mg);
+}
+
+static void avoid_copy(struct dm_cache_migration *mg)
+{
+ atomic_inc(&mg->cache->stats.copies_avoided);
+ migration_success_pre_commit(mg);
+}
+
+static void issue_copy(struct dm_cache_migration *mg)
+{
+ bool avoid;
+ struct cache *cache = mg->cache;
+
+ if (mg->writeback || mg->demote)
+ avoid = !is_dirty(cache, mg->cblock) ||
+ is_discarded_oblock(cache, mg->old_oblock);
+ else
+ avoid = is_discarded_oblock(cache, mg->new_oblock);
+
+ avoid ? avoid_copy(mg) : issue_copy_real(mg);
+}
+
+static void complete_migration(struct dm_cache_migration *mg)
+{
+ if (mg->err)
+ migration_failure(mg);
+ else
+ migration_success_pre_commit(mg);
+}
+
+static void process_migrations(struct cache *cache, struct list_head *head,
+ void (*fn)(struct dm_cache_migration *))
+{
+ unsigned long flags;
+ struct list_head list;
+ struct dm_cache_migration *mg, *tmp;
+
+ INIT_LIST_HEAD(&list);
+ spin_lock_irqsave(&cache->lock, flags);
+ list_splice_init(head, &list);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ list_for_each_entry_safe(mg, tmp, &list, list)
+ fn(mg);
+}
+
+static void __queue_quiesced_migration(struct dm_cache_migration *mg)
+{
+ list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
+}
+
+static void queue_quiesced_migration(struct dm_cache_migration *mg)
+{
+ unsigned long flags;
+ struct cache *cache = mg->cache;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ __queue_quiesced_migration(mg);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ wake_worker(cache);
+}
+
+static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
+{
+ unsigned long flags;
+ struct dm_cache_migration *mg, *tmp;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ list_for_each_entry_safe(mg, tmp, work, list)
+ __queue_quiesced_migration(mg);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ wake_worker(cache);
+}
+
+static void check_for_quiesced_migrations(struct cache *cache,
+ struct per_bio_data *pb)
+{
+ struct list_head work;
+
+ if (!pb->all_io_entry)
+ return;
+
+ INIT_LIST_HEAD(&work);
+ if (pb->all_io_entry)
+ dm_deferred_entry_dec(pb->all_io_entry, &work);
+
+ if (!list_empty(&work))
+ queue_quiesced_migrations(cache, &work);
+}
+
+static void quiesce_migration(struct dm_cache_migration *mg)
+{
+ if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
+ queue_quiesced_migration(mg);
+}
+
+static void promote(struct cache *cache, struct prealloc *structs,
+ dm_oblock_t oblock, dm_cblock_t cblock,
+ struct dm_bio_prison_cell *cell)
+{
+ struct dm_cache_migration *mg = prealloc_get_migration(structs);
+
+ mg->err = false;
+ mg->writeback = false;
+ mg->demote = false;
+ mg->promote = true;
+ mg->cache = cache;
+ mg->new_oblock = oblock;
+ mg->cblock = cblock;
+ mg->old_ocell = NULL;
+ mg->new_ocell = cell;
+ mg->start_jiffies = jiffies;
+
+ inc_nr_migrations(cache);
+ quiesce_migration(mg);
+}
+
+static void writeback(struct cache *cache, struct prealloc *structs,
+ dm_oblock_t oblock, dm_cblock_t cblock,
+ struct dm_bio_prison_cell *cell)
+{
+ struct dm_cache_migration *mg = prealloc_get_migration(structs);
+
+ mg->err = false;
+ mg->writeback = true;
+ mg->demote = false;
+ mg->promote = false;
+ mg->cache = cache;
+ mg->old_oblock = oblock;
+ mg->cblock = cblock;
+ mg->old_ocell = cell;
+ mg->new_ocell = NULL;
+ mg->start_jiffies = jiffies;
+
+ inc_nr_migrations(cache);
+ quiesce_migration(mg);
+}
+
+static void demote_then_promote(struct cache *cache, struct prealloc *structs,
+ dm_oblock_t old_oblock, dm_oblock_t new_oblock,
+ dm_cblock_t cblock,
+ struct dm_bio_prison_cell *old_ocell,
+ struct dm_bio_prison_cell *new_ocell)
+{
+ struct dm_cache_migration *mg = prealloc_get_migration(structs);
+
+ mg->err = false;
+ mg->writeback = false;
+ mg->demote = true;
+ mg->promote = true;
+ mg->cache = cache;
+ mg->old_oblock = old_oblock;
+ mg->new_oblock = new_oblock;
+ mg->cblock = cblock;
+ mg->old_ocell = old_ocell;
+ mg->new_ocell = new_ocell;
+ mg->start_jiffies = jiffies;
+
+ inc_nr_migrations(cache);
+ quiesce_migration(mg);
+}
+
+/*----------------------------------------------------------------
+ * bio processing
+ *--------------------------------------------------------------*/
+static void defer_bio(struct cache *cache, struct bio *bio)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ bio_list_add(&cache->deferred_bios, bio);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ wake_worker(cache);
+}
+
+static void process_flush_bio(struct cache *cache, struct bio *bio)
+{
+ struct per_bio_data *pb = get_per_bio_data(bio);
+
+ BUG_ON(bio->bi_size);
+ if (!pb->req_nr)
+ remap_to_origin(cache, bio);
+ else
+ remap_to_cache(cache, bio, 0);
+
+ issue(cache, bio);
+}
+
+/*
+ * People generally discard large parts of a device, eg, the whole device
+ * when formatting. Splitting these large discards up into cache block
+ * sized ios and then quiescing (always neccessary for discard) takes too
+ * long.
+ *
+ * We keep it simple, and allow any size of discard to come in, and just
+ * mark off blocks on the discard bitset. No passdown occurs!
+ *
+ * To implement passdown we need to change the bio_prison such that a cell
+ * can have a key that spans many blocks.
+ */
+static void process_discard_bio(struct cache *cache, struct bio *bio)
+{
+ dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
+ cache->discard_block_size);
+ dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
+ dm_block_t b;
+
+ (void) sector_div(end_block, cache->discard_block_size);
+
+ for (b = start_block; b < end_block; b++)
+ set_discard(cache, to_dblock(b));
+
+ bio_endio(bio, 0);
+}
+
+static bool spare_migration_bandwidth(struct cache *cache)
+{
+ sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
+ cache->sectors_per_block;
+ return current_volume < cache->migration_threshold;
+}
+
+static bool is_writethrough_io(struct cache *cache, struct bio *bio,
+ dm_cblock_t cblock)
+{
+ return bio_data_dir(bio) == WRITE &&
+ cache->features.write_through && !is_dirty(cache, cblock);
+}
+
+static void inc_hit_counter(struct cache *cache, struct bio *bio)
+{
+ atomic_inc(bio_data_dir(bio) == READ ?
+ &cache->stats.read_hit : &cache->stats.write_hit);
+}
+
+static void inc_miss_counter(struct cache *cache, struct bio *bio)
+{
+ atomic_inc(bio_data_dir(bio) == READ ?
+ &cache->stats.read_miss : &cache->stats.write_miss);
+}
+
+static void process_bio(struct cache *cache, struct prealloc *structs,
+ struct bio *bio)
+{
+ int r;
+ bool release_cell = true;
+ dm_oblock_t block = get_bio_block(cache, bio);
+ struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
+ struct policy_result lookup_result;
+ struct per_bio_data *pb = get_per_bio_data(bio);
+ bool discarded_block = is_discarded_oblock(cache, block);
+ bool can_migrate = discarded_block || spare_migration_bandwidth(cache);
+
+ /*
+ * Check to see if that block is currently migrating.
+ */
+ cell_prealloc = prealloc_get_cell(structs);
+ r = bio_detain(cache, block, bio, cell_prealloc,
+ (cell_free_fn) prealloc_put_cell,
+ structs, &new_ocell);
+ if (r > 0)
+ return;
+
+ r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
+ bio, &lookup_result);
+
+ if (r == -EWOULDBLOCK)
+ /* migration has been denied */
+ lookup_result.op = POLICY_MISS;
+
+ switch (lookup_result.op) {
+ case POLICY_HIT:
+ inc_hit_counter(cache, bio);
+ pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+
+ if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
+ /*
+ * No need to mark anything dirty in write through mode.
+ */
+ pb->req_nr == 0 ?
+ remap_to_cache(cache, bio, lookup_result.cblock) :
+ remap_to_origin_clear_discard(cache, bio, block);
+ } else
+ remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+
+ issue(cache, bio);
+ break;
+
+ case POLICY_MISS:
+ inc_miss_counter(cache, bio);
+ pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+
+ if (pb->req_nr != 0) {
+ /*
+ * This is a duplicate writethrough io that is no
+ * longer needed because the block has been demoted.
+ */
+ bio_endio(bio, 0);
+ } else {
+ remap_to_origin_clear_discard(cache, bio, block);
+ issue(cache, bio);
+ }
+ break;
+
+ case POLICY_NEW:
+ atomic_inc(&cache->stats.promotion);
+ promote(cache, structs, block, lookup_result.cblock, new_ocell);
+ release_cell = false;
+ break;
+
+ case POLICY_REPLACE:
+ cell_prealloc = prealloc_get_cell(structs);
+ r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
+ (cell_free_fn) prealloc_put_cell,
+ structs, &old_ocell);
+ if (r > 0) {
+ /*
+ * We have to be careful to avoid lock inversion of
+ * the cells. So we back off, and wait for the
+ * old_ocell to become free.
+ */
+ policy_force_mapping(cache->policy, block,
+ lookup_result.old_oblock);
+ atomic_inc(&cache->stats.cache_cell_clash);
+ break;
+ }
+ atomic_inc(&cache->stats.demotion);
+ atomic_inc(&cache->stats.promotion);
+
+ demote_then_promote(cache, structs, lookup_result.old_oblock,
+ block, lookup_result.cblock,
+ old_ocell, new_ocell);
+ release_cell = false;
+ break;
+
+ default:
+ DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
+ (unsigned) lookup_result.op);
+ bio_io_error(bio);
+ }
+
+ if (release_cell)
+ cell_defer(cache, new_ocell, false);
+}
+
+static int need_commit_due_to_time(struct cache *cache)
+{
+ return jiffies < cache->last_commit_jiffies ||
+ jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
+}
+
+static int commit_if_needed(struct cache *cache)
+{
+ if (dm_cache_changed_this_transaction(cache->cmd) &&
+ (cache->commit_requested || need_commit_due_to_time(cache))) {
+ atomic_inc(&cache->stats.commit_count);
+ cache->last_commit_jiffies = jiffies;
+ cache->commit_requested = false;
+ return dm_cache_commit(cache->cmd, false);
+ }
+
+ return 0;
+}
+
+static void process_deferred_bios(struct cache *cache)
+{
+ unsigned long flags;
+ struct bio_list bios;
+ struct bio *bio;
+ struct prealloc structs;
+
+ memset(&structs, 0, sizeof(structs));
+ bio_list_init(&bios);
+
+ spin_lock_irqsave(&cache->lock, flags);
+ bio_list_merge(&bios, &cache->deferred_bios);
+ bio_list_init(&cache->deferred_bios);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ while (!bio_list_empty(&bios)) {
+ /*
+ * If we've got no free migration structs, and processing
+ * this bio might require one, we pause until there are some
+ * prepared mappings to process.
+ */
+ if (prealloc_data_structs(cache, &structs)) {
+ spin_lock_irqsave(&cache->lock, flags);
+ bio_list_merge(&cache->deferred_bios, &bios);
+ spin_unlock_irqrestore(&cache->lock, flags);
+ break;
+ }
+
+ bio = bio_list_pop(&bios);
+
+ if (bio->bi_rw & REQ_FLUSH)
+ process_flush_bio(cache, bio);
+ else if (bio->bi_rw & REQ_DISCARD)
+ process_discard_bio(cache, bio);
+ else
+ process_bio(cache, &structs, bio);
+ }
+
+ prealloc_free_structs(cache, &structs);
+}
+
+static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
+{
+ unsigned long flags;
+ struct bio_list bios;
+ struct bio *bio;
+
+ bio_list_init(&bios);
+
+ spin_lock_irqsave(&cache->lock, flags);
+ bio_list_merge(&bios, &cache->deferred_flush_bios);
+ bio_list_init(&cache->deferred_flush_bios);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ while ((bio = bio_list_pop(&bios)))
+ submit_bios ? generic_make_request(bio) : bio_io_error(bio);
+}
+
+static void writeback_some_dirty_blocks(struct cache *cache)
+{
+ int r = 0;
+ dm_oblock_t oblock;
+ dm_cblock_t cblock;
+ struct prealloc structs;
+ struct dm_bio_prison_cell *old_ocell;
+
+ memset(&structs, 0, sizeof(structs));
+
+ while (spare_migration_bandwidth(cache)) {
+ if (prealloc_data_structs(cache, &structs))
+ break;
+
+ r = policy_writeback_work(cache->policy, &oblock, &cblock);
+ if (r)
+ break;
+
+ r = get_cell(cache, oblock, &structs, &old_ocell);
+ if (r) {
+ policy_set_dirty(cache->policy, oblock);
+ break;
+ }
+
+ writeback(cache, &structs, oblock, cblock, old_ocell);
+ }
+
+ prealloc_free_structs(cache, &structs);
+}
+
+/*----------------------------------------------------------------
+ * Main worker loop
+ *--------------------------------------------------------------*/
+static void start_quiescing(struct cache *cache)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ cache->quiescing = 1;
+ spin_unlock_irqrestore(&cache->lock, flags);
+}
+
+static void stop_quiescing(struct cache *cache)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ cache->quiescing = 0;
+ spin_unlock_irqrestore(&cache->lock, flags);
+}
+
+static bool is_quiescing(struct cache *cache)
+{
+ int r;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ r = cache->quiescing;
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ return r;
+}
+
+static void wait_for_migrations(struct cache *cache)
+{
+ wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
+}
+
+static void stop_worker(struct cache *cache)
+{
+ cancel_delayed_work(&cache->waker);
+ flush_workqueue(cache->wq);
+}
+
+static void requeue_deferred_io(struct cache *cache)
+{
+ struct bio *bio;
+ struct bio_list bios;
+
+ bio_list_init(&bios);
+ bio_list_merge(&bios, &cache->deferred_bios);
+ bio_list_init(&cache->deferred_bios);
+
+ while ((bio = bio_list_pop(&bios)))
+ bio_endio(bio, DM_ENDIO_REQUEUE);
+}
+
+static int more_work(struct cache *cache)
+{
+ if (is_quiescing(cache))
+ return !list_empty(&cache->quiesced_migrations) ||
+ !list_empty(&cache->completed_migrations) ||
+ !list_empty(&cache->need_commit_migrations);
+ else
+ return !bio_list_empty(&cache->deferred_bios) ||
+ !bio_list_empty(&cache->deferred_flush_bios) ||
+ !list_empty(&cache->quiesced_migrations) ||
+ !list_empty(&cache->completed_migrations) ||
+ !list_empty(&cache->need_commit_migrations);
+}
+
+static void do_worker(struct work_struct *ws)
+{
+ struct cache *cache = container_of(ws, struct cache, worker);
+
+ do {
+ if (!is_quiescing(cache))
+ process_deferred_bios(cache);
+
+ process_migrations(cache, &cache->quiesced_migrations, issue_copy);
+ process_migrations(cache, &cache->completed_migrations, complete_migration);
+
+ writeback_some_dirty_blocks(cache);
+
+ if (commit_if_needed(cache)) {
+ process_deferred_flush_bios(cache, false);
+
+ /*
+ * FIXME: rollback metadata or just go into a
+ * failure mode and error everything
+ */
+ } else {
+ process_deferred_flush_bios(cache, true);
+ process_migrations(cache, &cache->need_commit_migrations,
+ migration_success_post_commit);
+ }
+ } while (more_work(cache));
+}
+
+/*
+ * We want to commit periodically so that not too much
+ * unwritten metadata builds up.
+ */
+static void do_waker(struct work_struct *ws)
+{
+ struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
+ wake_worker(cache);
+ queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
+}
+
+/*----------------------------------------------------------------*/
+
+static int is_congested(struct dm_dev *dev, int bdi_bits)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+ return bdi_congested(&q->backing_dev_info, bdi_bits);
+}
+
+static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
+{
+ struct cache *cache = container_of(cb, struct cache, callbacks);
+
+ return is_congested(cache->origin_dev, bdi_bits) ||
+ is_congested(cache->cache_dev, bdi_bits);
+}
+
+/*----------------------------------------------------------------
+ * Target methods
+ *--------------------------------------------------------------*/
+
+/*
+ * This function gets called on the error paths of the constructor, so we
+ * have to cope with a partially initialised struct.
+ */
+static void destroy(struct cache *cache)
+{
+ unsigned i;
+
+ if (cache->next_migration)
+ mempool_free(cache->next_migration, cache->migration_pool);
+
+ if (cache->migration_pool)
+ mempool_destroy(cache->migration_pool);
+
+ if (cache->all_io_ds)
+ dm_deferred_set_destroy(cache->all_io_ds);
+
+ if (cache->prison)
+ dm_bio_prison_destroy(cache->prison);
+
+ if (cache->wq)
+ destroy_workqueue(cache->wq);
+
+ if (cache->dirty_bitset)
+ free_bitset(cache->dirty_bitset);
+
+ if (cache->discard_bitset)
+ free_bitset(cache->discard_bitset);
+
+ if (cache->copier)
+ dm_kcopyd_client_destroy(cache->copier);
+
+ if (cache->cmd)
+ dm_cache_metadata_close(cache->cmd);
+
+ if (cache->metadata_dev)
+ dm_put_device(cache->ti, cache->metadata_dev);
+
+ if (cache->origin_dev)
+ dm_put_device(cache->ti, cache->origin_dev);
+
+ if (cache->cache_dev)
+ dm_put_device(cache->ti, cache->cache_dev);
+
+ if (cache->policy)
+ dm_cache_policy_destroy(cache->policy);
+
+ for (i = 0; i < cache->nr_ctr_args ; i++)
+ kfree(cache->ctr_args[i]);
+ kfree(cache->ctr_args);
+
+ kfree(cache);
+}
+
+static void cache_dtr(struct dm_target *ti)
+{
+ struct cache *cache = ti->private;
+
+ destroy(cache);
+}
+
+static sector_t get_dev_size(struct dm_dev *dev)
+{
+ return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Construct a cache device mapping.
+ *
+ * cache <metadata dev> <cache dev> <origin dev> <block size>
+ * <#feature args> [<feature arg>]*
+ * <policy> <#policy args> [<policy arg>]*
+ *
+ * metadata dev : fast device holding the persistent metadata
+ * cache dev : fast device holding cached data blocks
+ * origin dev : slow device holding original data blocks
+ * block size : cache unit size in sectors
+ *
+ * #feature args : number of feature arguments passed
+ * feature args : writethrough. (The default is writeback.)
+ *
+ * policy : the replacement policy to use
+ * #policy args : an even number of policy arguments corresponding
+ * to key/value pairs passed to the policy
+ * policy args : key/value pairs passed to the policy
+ * E.g. 'sequential_threshold 1024'
+ * See cache-policies.txt for details.
+ *
+ * Optional feature arguments are:
+ * writethrough : write through caching that prohibits cache block
+ * content from being different from origin block content.
+ * Without this argument, the default behaviour is to write
+ * back cache block contents later for performance reasons,
+ * so they may differ from the corresponding origin blocks.
+ */
+struct cache_args {
+ struct dm_target *ti;
+
+ struct dm_dev *metadata_dev;
+
+ struct dm_dev *cache_dev;
+ sector_t cache_sectors;
+
+ struct dm_dev *origin_dev;
+ sector_t origin_sectors;
+
+ uint32_t block_size;
+
+ const char *policy_name;
+ int policy_argc;
+ const char **policy_argv;
+
+ struct cache_features features;
+};
+
+static void destroy_cache_args(struct cache_args *ca)
+{
+ if (ca->metadata_dev)
+ dm_put_device(ca->ti, ca->metadata_dev);
+
+ if (ca->cache_dev)
+ dm_put_device(ca->ti, ca->cache_dev);
+
+ if (ca->origin_dev)
+ dm_put_device(ca->ti, ca->origin_dev);
+
+ kfree(ca);
+}
+
+static bool at_least_one_arg(struct dm_arg_set *as, char **error)
+{
+ if (!as->argc) {
+ *error = "Insufficient args";
+ return false;
+ }
+
+ return true;
+}
+
+static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
+ char **error)
+{
+ int r;
+ sector_t metadata_dev_size;
+ char b[BDEVNAME_SIZE];
+
+ if (!at_least_one_arg(as, error))
+ return -EINVAL;
+
+ r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
+ &ca->metadata_dev);
+ if (r) {
+ *error = "Error opening metadata device";
+ return r;
+ }
+
+ metadata_dev_size = get_dev_size(ca->metadata_dev);
+ if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
+ DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
+ bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
+
+ return 0;
+}
+
+static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
+ char **error)
+{
+ int r;
+
+ if (!at_least_one_arg(as, error))
+ return -EINVAL;
+
+ r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
+ &ca->cache_dev);
+ if (r) {
+ *error = "Error opening cache device";
+ return r;
+ }
+ ca->cache_sectors = get_dev_size(ca->cache_dev);
+
+ return 0;
+}
+
+static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
+ char **error)
+{
+ int r;
+
+ if (!at_least_one_arg(as, error))
+ return -EINVAL;
+
+ r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
+ &ca->origin_dev);
+ if (r) {
+ *error = "Error opening origin device";
+ return r;
+ }
+
+ ca->origin_sectors = get_dev_size(ca->origin_dev);
+ if (ca->ti->len > ca->origin_sectors) {
+ *error = "Device size larger than cached device";
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
+ char **error)
+{
+ unsigned long tmp;
+
+ if (!at_least_one_arg(as, error))
+ return -EINVAL;
+
+ if (kstrtoul(dm_shift_arg(as), 10, &tmp) || !tmp ||
+ tmp < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
+ tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
+ *error = "Invalid data block size";
+ return -EINVAL;
+ }
+
+ if (tmp > ca->cache_sectors) {
+ *error = "Data block size is larger than the cache device";
+ return -EINVAL;
+ }
+
+ ca->block_size = tmp;
+
+ return 0;
+}
+
+static void init_features(struct cache_features *cf)
+{
+ cf->mode = CM_WRITE;
+ cf->write_through = false;
+}
+
+static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
+ char **error)
+{
+ static struct dm_arg _args[] = {
+ {0, 1, "Invalid number of cache feature arguments"},
+ };
+
+ int r;
+ unsigned argc;
+ const char *arg;
+ struct cache_features *cf = &ca->features;
+
+ init_features(cf);
+
+ r = dm_read_arg_group(_args, as, &argc, error);
+ if (r)
+ return -EINVAL;
+
+ while (argc--) {
+ arg = dm_shift_arg(as);
+
+ if (!strcasecmp(arg, "writeback"))
+ cf->write_through = false;
+
+ else if (!strcasecmp(arg, "writethrough"))
+ cf->write_through = true;
+
+ else {
+ *error = "Unrecognised cache feature requested";
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
+ char **error)
+{
+ static struct dm_arg _args[] = {
+ {0, 1024, "Invalid number of policy arguments"},
+ };
+
+ int r;
+
+ if (!at_least_one_arg(as, error))
+ return -EINVAL;
+
+ ca->policy_name = dm_shift_arg(as);
+
+ r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
+ if (r)
+ return -EINVAL;
+
+ ca->policy_argv = (const char **)as->argv;
+ dm_consume_args(as, ca->policy_argc);
+
+ return 0;
+}
+
+static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
+ char **error)
+{
+ int r;
+ struct dm_arg_set as;
+
+ as.argc = argc;
+ as.argv = argv;
+
+ r = parse_metadata_dev(ca, &as, error);
+ if (r)
+ return r;
+
+ r = parse_cache_dev(ca, &as, error);
+ if (r)
+ return r;
+
+ r = parse_origin_dev(ca, &as, error);
+ if (r)
+ return r;
+
+ r = parse_block_size(ca, &as, error);
+ if (r)
+ return r;
+
+ r = parse_features(ca, &as, error);
+ if (r)
+ return r;
+
+ r = parse_policy(ca, &as, error);
+ if (r)
+ return r;
+
+ return 0;
+}
+
+/*----------------------------------------------------------------*/
+
+static struct kmem_cache *migration_cache;
+
+static int set_config_values(struct dm_cache_policy *p, int argc, const char **argv)
+{
+ int r = 0;
+
+ if (argc & 1) {
+ DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
+ return -EINVAL;
+ }
+
+ while (argc) {
+ r = policy_set_config_value(p, argv[0], argv[1]);
+ if (r) {
+ DMWARN("policy_set_config_value failed: key = '%s', value = '%s'",
+ argv[0], argv[1]);
+ return r;
+ }
+
+ argc -= 2;
+ argv += 2;
+ }
+
+ return r;
+}
+
+static int create_cache_policy(struct cache *cache, struct cache_args *ca,
+ char **error)
+{
+ int r;
+
+ cache->policy = dm_cache_policy_create(ca->policy_name,
+ cache->cache_size,
+ cache->origin_sectors,
+ cache->sectors_per_block);
+ if (!cache->policy) {
+ *error = "Error creating cache's policy";
+ return -ENOMEM;
+ }
+
+ r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv);
+ if (r)
+ dm_cache_policy_destroy(cache->policy);
+
+ return r;
+}
+
+/*
+ * We want the discard block size to be a power of two, at least the size
+ * of the cache block size, and have no more than 2^14 discard blocks
+ * across the origin.
+ */
+#define MAX_DISCARD_BLOCKS (1 << 14)
+
+static bool too_many_discard_blocks(sector_t discard_block_size,
+ sector_t origin_size)
+{
+ (void) sector_div(origin_size, discard_block_size);
+
+ return origin_size > MAX_DISCARD_BLOCKS;
+}
+
+static sector_t calculate_discard_block_size(sector_t cache_block_size,
+ sector_t origin_size)
+{
+ sector_t discard_block_size;
+
+ discard_block_size = roundup_pow_of_two(cache_block_size);
+
+ if (origin_size)
+ while (too_many_discard_blocks(discard_block_size, origin_size))
+ discard_block_size *= 2;
+
+ return discard_block_size;
+}
+
+#define DEFAULT_MIGRATION_THRESHOLD (2048 * 100)
+
+static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio);
+
+static int cache_create(struct cache_args *ca, struct cache **result)
+{
+ int r = 0;
+ char **error = &ca->ti->error;
+ struct cache *cache;
+ struct dm_target *ti = ca->ti;
+ dm_block_t origin_blocks;
+ struct dm_cache_metadata *cmd;
+ bool may_format = ca->features.mode == CM_WRITE;
+
+ cache = kzalloc(sizeof(*cache), GFP_KERNEL);
+ if (!cache)
+ return -ENOMEM;
+
+ cache->ti = ca->ti;
+ ti->private = cache;
+ ti->per_bio_data_size = sizeof(struct per_bio_data);
+ ti->num_flush_bios = 2;
+ ti->flush_supported = true;
+
+ ti->num_discard_bios = 1;
+ ti->discards_supported = true;
+ ti->discard_zeroes_data_unsupported = true;
+
+ memcpy(&cache->features, &ca->features, sizeof(cache->features));
+
+ if (cache->features.write_through)
+ ti->num_write_bios = cache_num_write_bios;
+
+ cache->callbacks.congested_fn = cache_is_congested;
+ dm_table_add_target_callbacks(ti->table, &cache->callbacks);
+
+ cache->metadata_dev = ca->metadata_dev;
+ cache->origin_dev = ca->origin_dev;
+ cache->cache_dev = ca->cache_dev;
+
+ ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
+
+ /* FIXME: factor out this whole section */
+ origin_blocks = cache->origin_sectors = ca->origin_sectors;
+ (void) sector_div(origin_blocks, ca->block_size);
+ cache->origin_blocks = to_oblock(origin_blocks);
+
+ cache->sectors_per_block = ca->block_size;
+ if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
+ r = -EINVAL;
+ goto bad;
+ }
+
+ if (ca->block_size & (ca->block_size - 1)) {
+ dm_block_t cache_size = ca->cache_sectors;
+
+ cache->sectors_per_block_shift = -1;
+ (void) sector_div(cache_size, ca->block_size);
+ cache->cache_size = to_cblock(cache_size);
+ } else {
+ cache->sectors_per_block_shift = __ffs(ca->block_size);
+ cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
+ }
+
+ r = create_cache_policy(cache, ca, error);
+ if (r)
+ goto bad;
+ cache->policy_nr_args = ca->policy_argc;
+
+ cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
+ ca->block_size, may_format,
+ dm_cache_policy_get_hint_size(cache->policy));
+ if (IS_ERR(cmd)) {
+ *error = "Error creating metadata object";
+ r = PTR_ERR(cmd);
+ goto bad;
+ }
+ cache->cmd = cmd;
+
+ spin_lock_init(&cache->lock);
+ bio_list_init(&cache->deferred_bios);
+ bio_list_init(&cache->deferred_flush_bios);
+ INIT_LIST_HEAD(&cache->quiesced_migrations);
+ INIT_LIST_HEAD(&cache->completed_migrations);
+ INIT_LIST_HEAD(&cache->need_commit_migrations);
+ cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
+ atomic_set(&cache->nr_migrations, 0);
+ init_waitqueue_head(&cache->migration_wait);
+
+ cache->nr_dirty = 0;
+ cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
+ if (!cache->dirty_bitset) {
+ *error = "could not allocate dirty bitset";
+ goto bad;
+ }
+ clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
+
+ cache->discard_block_size =
+ calculate_discard_block_size(cache->sectors_per_block,
+ cache->origin_sectors);
+ cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
+ cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
+ if (!cache->discard_bitset) {
+ *error = "could not allocate discard bitset";
+ goto bad;
+ }
+ clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
+
+ cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
+ if (IS_ERR(cache->copier)) {
+ *error = "could not create kcopyd client";
+ r = PTR_ERR(cache->copier);
+ goto bad;
+ }
+
+ cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
+ if (!cache->wq) {
+ *error = "could not create workqueue for metadata object";
+ goto bad;
+ }
+ INIT_WORK(&cache->worker, do_worker);
+ INIT_DELAYED_WORK(&cache->waker, do_waker);
+ cache->last_commit_jiffies = jiffies;
+
+ cache->prison = dm_bio_prison_create(PRISON_CELLS);
+ if (!cache->prison) {
+ *error = "could not create bio prison";
+ goto bad;
+ }
+
+ cache->all_io_ds = dm_deferred_set_create();
+ if (!cache->all_io_ds) {
+ *error = "could not create all_io deferred set";
+ goto bad;
+ }
+
+ cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
+ migration_cache);
+ if (!cache->migration_pool) {
+ *error = "Error creating cache's migration mempool";
+ goto bad;
+ }
+
+ cache->next_migration = NULL;
+
+ cache->need_tick_bio = true;
+ cache->sized = false;
+ cache->quiescing = false;
+ cache->commit_requested = false;
+ cache->loaded_mappings = false;
+ cache->loaded_discards = false;
+
+ load_stats(cache);
+
+ atomic_set(&cache->stats.demotion, 0);
+ atomic_set(&cache->stats.promotion, 0);
+ atomic_set(&cache->stats.copies_avoided, 0);
+ atomic_set(&cache->stats.cache_cell_clash, 0);
+ atomic_set(&cache->stats.commit_count, 0);
+ atomic_set(&cache->stats.discard_count, 0);
+
+ *result = cache;
+ return 0;
+
+bad:
+ destroy(cache);
+ return r;
+}
+
+static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
+{
+ unsigned i;
+ const char **copy;
+
+ copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
+ if (!copy)
+ return -ENOMEM;
+ for (i = 0; i < argc; i++) {
+ copy[i] = kstrdup(argv[i], GFP_KERNEL);
+ if (!copy[i]) {
+ while (i--)
+ kfree(copy[i]);
+ kfree(copy);
+ return -ENOMEM;
+ }
+ }
+
+ cache->nr_ctr_args = argc;
+ cache->ctr_args = copy;
+
+ return 0;
+}
+
+static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ int r = -EINVAL;
+ struct cache_args *ca;
+ struct cache *cache = NULL;
+
+ ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+ if (!ca) {
+ ti->error = "Error allocating memory for cache";
+ return -ENOMEM;
+ }
+ ca->ti = ti;
+
+ r = parse_cache_args(ca, argc, argv, &ti->error);
+ if (r)
+ goto out;
+
+ r = cache_create(ca, &cache);
+
+ r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
+ if (r) {
+ destroy(cache);
+ goto out;
+ }
+
+ ti->private = cache;
+
+out:
+ destroy_cache_args(ca);
+ return r;
+}
+
+static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio)
+{
+ int r;
+ struct cache *cache = ti->private;
+ dm_oblock_t block = get_bio_block(cache, bio);
+ dm_cblock_t cblock;
+
+ r = policy_lookup(cache->policy, block, &cblock);
+ if (r < 0)
+ return 2; /* assume the worst */
+
+ return (!r && !is_dirty(cache, cblock)) ? 2 : 1;
+}
+
+static int cache_map(struct dm_target *ti, struct bio *bio)
+{
+ struct cache *cache = ti->private;
+
+ int r;
+ dm_oblock_t block = get_bio_block(cache, bio);
+ bool can_migrate = false;
+ bool discarded_block;
+ struct dm_bio_prison_cell *cell;
+ struct policy_result lookup_result;
+ struct per_bio_data *pb;
+
+ if (from_oblock(block) > from_oblock(cache->origin_blocks)) {
+ /*
+ * This can only occur if the io goes to a partial block at
+ * the end of the origin device. We don't cache these.
+ * Just remap to the origin and carry on.
+ */
+ remap_to_origin_clear_discard(cache, bio, block);
+ return DM_MAPIO_REMAPPED;
+ }
+
+ pb = init_per_bio_data(bio);
+
+ if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
+ defer_bio(cache, bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ /*
+ * Check to see if that block is currently migrating.
+ */
+ cell = alloc_prison_cell(cache);
+ if (!cell) {
+ defer_bio(cache, bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ r = bio_detain(cache, block, bio, cell,
+ (cell_free_fn) free_prison_cell,
+ cache, &cell);
+ if (r) {
+ if (r < 0)
+ defer_bio(cache, bio);
+
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ discarded_block = is_discarded_oblock(cache, block);
+
+ r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
+ bio, &lookup_result);
+ if (r == -EWOULDBLOCK) {
+ cell_defer(cache, cell, true);
+ return DM_MAPIO_SUBMITTED;
+
+ } else if (r) {
+ DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
+ bio_io_error(bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ switch (lookup_result.op) {
+ case POLICY_HIT:
+ inc_hit_counter(cache, bio);
+ pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+
+ if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
+ /*
+ * No need to mark anything dirty in write through mode.
+ */
+ pb->req_nr == 0 ?
+ remap_to_cache(cache, bio, lookup_result.cblock) :
+ remap_to_origin_clear_discard(cache, bio, block);
+ cell_defer(cache, cell, false);
+ } else {
+ remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+ cell_defer(cache, cell, false);
+ }
+ break;
+
+ case POLICY_MISS:
+ inc_miss_counter(cache, bio);
+ pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+
+ if (pb->req_nr != 0) {
+ /*
+ * This is a duplicate writethrough io that is no
+ * longer needed because the block has been demoted.
+ */
+ bio_endio(bio, 0);
+ cell_defer(cache, cell, false);
+ return DM_MAPIO_SUBMITTED;
+ } else {
+ remap_to_origin_clear_discard(cache, bio, block);
+ cell_defer(cache, cell, false);
+ }
+ break;
+
+ default:
+ DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
+ (unsigned) lookup_result.op);
+ bio_io_error(bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ return DM_MAPIO_REMAPPED;
+}
+
+static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
+{
+ struct cache *cache = ti->private;
+ unsigned long flags;
+ struct per_bio_data *pb = get_per_bio_data(bio);
+
+ if (pb->tick) {
+ policy_tick(cache->policy);
+
+ spin_lock_irqsave(&cache->lock, flags);
+ cache->need_tick_bio = true;
+ spin_unlock_irqrestore(&cache->lock, flags);
+ }
+
+ check_for_quiesced_migrations(cache, pb);
+
+ return 0;
+}
+
+static int write_dirty_bitset(struct cache *cache)
+{
+ unsigned i, r;
+
+ for (i = 0; i < from_cblock(cache->cache_size); i++) {
+ r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
+ is_dirty(cache, to_cblock(i)));
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
+static int write_discard_bitset(struct cache *cache)
+{
+ unsigned i, r;
+
+ r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
+ cache->discard_nr_blocks);
+ if (r) {
+ DMERR("could not resize on-disk discard bitset");
+ return r;
+ }
+
+ for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
+ r = dm_cache_set_discard(cache->cmd, to_dblock(i),
+ is_discarded(cache, to_dblock(i)));
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
+static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
+ uint32_t hint)
+{
+ struct cache *cache = context;
+ return dm_cache_save_hint(cache->cmd, cblock, hint);
+}
+
+static int write_hints(struct cache *cache)
+{
+ int r;
+
+ r = dm_cache_begin_hints(cache->cmd, cache->policy);
+ if (r) {
+ DMERR("dm_cache_begin_hints failed");
+ return r;
+ }
+
+ r = policy_walk_mappings(cache->policy, save_hint, cache);
+ if (r)
+ DMERR("policy_walk_mappings failed");
+
+ return r;
+}
+
+/*
+ * returns true on success
+ */
+static bool sync_metadata(struct cache *cache)
+{
+ int r1, r2, r3, r4;
+
+ r1 = write_dirty_bitset(cache);
+ if (r1)
+ DMERR("could not write dirty bitset");
+
+ r2 = write_discard_bitset(cache);
+ if (r2)
+ DMERR("could not write discard bitset");
+
+ save_stats(cache);
+
+ r3 = write_hints(cache);
+ if (r3)
+ DMERR("could not write hints");
+
+ /*
+ * If writing the above metadata failed, we still commit, but don't
+ * set the clean shutdown flag. This will effectively force every
+ * dirty bit to be set on reload.
+ */
+ r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
+ if (r4)
+ DMERR("could not write cache metadata. Data loss may occur.");
+
+ return !r1 && !r2 && !r3 && !r4;
+}
+
+static void cache_postsuspend(struct dm_target *ti)
+{
+ struct cache *cache = ti->private;
+
+ start_quiescing(cache);
+ wait_for_migrations(cache);
+ stop_worker(cache);
+ requeue_deferred_io(cache);
+ stop_quiescing(cache);
+
+ (void) sync_metadata(cache);
+}
+
+static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
+ bool dirty, uint32_t hint, bool hint_valid)
+{
+ int r;
+ struct cache *cache = context;
+
+ r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
+ if (r)
+ return r;
+
+ if (dirty)
+ set_dirty(cache, oblock, cblock);
+ else
+ clear_dirty(cache, oblock, cblock);
+
+ return 0;
+}
+
+static int load_discard(void *context, sector_t discard_block_size,
+ dm_dblock_t dblock, bool discard)
+{
+ struct cache *cache = context;
+
+ /* FIXME: handle mis-matched block size */
+
+ if (discard)
+ set_discard(cache, dblock);
+ else
+ clear_discard(cache, dblock);
+
+ return 0;
+}
+
+static int cache_preresume(struct dm_target *ti)
+{
+ int r = 0;
+ struct cache *cache = ti->private;
+ sector_t actual_cache_size = get_dev_size(cache->cache_dev);
+ (void) sector_div(actual_cache_size, cache->sectors_per_block);
+
+ /*
+ * Check to see if the cache has resized.
+ */
+ if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) {
+ cache->cache_size = to_cblock(actual_cache_size);
+
+ r = dm_cache_resize(cache->cmd, cache->cache_size);
+ if (r) {
+ DMERR("could not resize cache metadata");
+ return r;
+ }
+
+ cache->sized = true;
+ }
+
+ if (!cache->loaded_mappings) {
+ r = dm_cache_load_mappings(cache->cmd,
+ dm_cache_policy_get_name(cache->policy),
+ load_mapping, cache);
+ if (r) {
+ DMERR("could not load cache mappings");
+ return r;
+ }
+
+ cache->loaded_mappings = true;
+ }
+
+ if (!cache->loaded_discards) {
+ r = dm_cache_load_discards(cache->cmd, load_discard, cache);
+ if (r) {
+ DMERR("could not load origin discards");
+ return r;
+ }
+
+ cache->loaded_discards = true;
+ }
+
+ return r;
+}
+
+static void cache_resume(struct dm_target *ti)
+{
+ struct cache *cache = ti->private;
+
+ cache->need_tick_bio = true;
+ do_waker(&cache->waker.work);
+}
+
+/*
+ * Status format:
+ *
+ * <#used metadata blocks>/<#total metadata blocks>
+ * <#read hits> <#read misses> <#write hits> <#write misses>
+ * <#demotions> <#promotions> <#blocks in cache> <#dirty>
+ * <#features> <features>*
+ * <#core args> <core args>
+ * <#policy args> <policy args>*
+ */
+static void cache_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ int r = 0;
+ unsigned i;
+ ssize_t sz = 0;
+ dm_block_t nr_free_blocks_metadata = 0;
+ dm_block_t nr_blocks_metadata = 0;
+ char buf[BDEVNAME_SIZE];
+ struct cache *cache = ti->private;
+ dm_cblock_t residency;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ /* Commit to ensure statistics aren't out-of-date */
+ if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
+ r = dm_cache_commit(cache->cmd, false);
+ if (r)
+ DMERR("could not commit metadata for accurate status");
+ }
+
+ r = dm_cache_get_free_metadata_block_count(cache->cmd,
+ &nr_free_blocks_metadata);
+ if (r) {
+ DMERR("could not get metadata free block count");
+ goto err;
+ }
+
+ r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
+ if (r) {
+ DMERR("could not get metadata device size");
+ goto err;
+ }
+
+ residency = policy_residency(cache->policy);
+
+ DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
+ (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
+ (unsigned long long)nr_blocks_metadata,
+ (unsigned) atomic_read(&cache->stats.read_hit),
+ (unsigned) atomic_read(&cache->stats.read_miss),
+ (unsigned) atomic_read(&cache->stats.write_hit),
+ (unsigned) atomic_read(&cache->stats.write_miss),
+ (unsigned) atomic_read(&cache->stats.demotion),
+ (unsigned) atomic_read(&cache->stats.promotion),
+ (unsigned long long) from_cblock(residency),
+ cache->nr_dirty);
+
+ if (cache->features.write_through)
+ DMEMIT("1 writethrough ");
+ else
+ DMEMIT("0 ");
+
+ DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
+ if (sz < maxlen) {
+ r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
+ if (r)
+ DMERR("policy_emit_config_values returned %d", r);
+ }
+
+ break;
+
+ case STATUSTYPE_TABLE:
+ format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
+ DMEMIT("%s ", buf);
+ format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
+ DMEMIT("%s ", buf);
+ format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
+ DMEMIT("%s", buf);
+
+ for (i = 0; i < cache->nr_ctr_args - 1; i++)
+ DMEMIT(" %s", cache->ctr_args[i]);
+ if (cache->nr_ctr_args)
+ DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
+ }
+
+ return;
+
+err:
+ DMEMIT("Error");
+}
+
+#define NOT_CORE_OPTION 1
+
+static int process_config_option(struct cache *cache, char **argv)
+{
+ unsigned long tmp;
+
+ if (!strcasecmp(argv[0], "migration_threshold")) {
+ if (kstrtoul(argv[1], 10, &tmp))
+ return -EINVAL;
+
+ cache->migration_threshold = tmp;
+ return 0;
+ }
+
+ return NOT_CORE_OPTION;
+}
+
+/*
+ * Supports <key> <value>.
+ *
+ * The key migration_threshold is supported by the cache target core.
+ */
+static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+ int r;
+ struct cache *cache = ti->private;
+
+ if (argc != 2)
+ return -EINVAL;
+
+ r = process_config_option(cache, argv);
+ if (r == NOT_CORE_OPTION)
+ return policy_set_config_value(cache->policy, argv[0], argv[1]);
+
+ return r;
+}
+
+static int cache_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ int r = 0;
+ struct cache *cache = ti->private;
+
+ r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
+ if (!r)
+ r = fn(ti, cache->origin_dev, 0, ti->len, data);
+
+ return r;
+}
+
+/*
+ * We assume I/O is going to the origin (which is the volume
+ * more likely to have restrictions e.g. by being striped).
+ * (Looking up the exact location of the data would be expensive
+ * and could always be out of date by the time the bio is submitted.)
+ */
+static int cache_bvec_merge(struct dm_target *ti,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct cache *cache = ti->private;
+ struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
+
+ if (!q->merge_bvec_fn)
+ return max_size;
+
+ bvm->bi_bdev = cache->origin_dev->bdev;
+ return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
+{
+ /*
+ * FIXME: these limits may be incompatible with the cache device
+ */
+ limits->max_discard_sectors = cache->discard_block_size * 1024;
+ limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
+}
+
+static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ struct cache *cache = ti->private;
+
+ blk_limits_io_min(limits, 0);
+ blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
+ set_discard_limits(cache, limits);
+}
+
+/*----------------------------------------------------------------*/
+
+static struct target_type cache_target = {
+ .name = "cache",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .ctr = cache_ctr,
+ .dtr = cache_dtr,
+ .map = cache_map,
+ .end_io = cache_end_io,
+ .postsuspend = cache_postsuspend,
+ .preresume = cache_preresume,
+ .resume = cache_resume,
+ .status = cache_status,
+ .message = cache_message,
+ .iterate_devices = cache_iterate_devices,
+ .merge = cache_bvec_merge,
+ .io_hints = cache_io_hints,
+};
+
+static int __init dm_cache_init(void)
+{
+ int r;
+
+ r = dm_register_target(&cache_target);
+ if (r) {
+ DMERR("cache target registration failed: %d", r);
+ return r;
+ }
+
+ migration_cache = KMEM_CACHE(dm_cache_migration, 0);
+ if (!migration_cache) {
+ dm_unregister_target(&cache_target);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void __exit dm_cache_exit(void)
+{
+ dm_unregister_target(&cache_target);
+ kmem_cache_destroy(migration_cache);
+}
+
+module_init(dm_cache_init);
+module_exit(dm_cache_exit);
+
+MODULE_DESCRIPTION(DM_NAME " cache target");
+MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index f7369f9d859..13c15480d94 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1234,20 +1234,6 @@ static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
return 0;
}
-/*
- * Encode key into its hex representation
- */
-static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
-{
- unsigned int i;
-
- for (i = 0; i < size; i++) {
- sprintf(hex, "%02x", *key);
- hex += 2;
- key++;
- }
-}
-
static void crypt_free_tfms(struct crypt_config *cc)
{
unsigned i;
@@ -1651,7 +1637,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (opt_params == 1 && opt_string &&
!strcasecmp(opt_string, "allow_discards"))
- ti->num_discard_requests = 1;
+ ti->num_discard_bios = 1;
else if (opt_params) {
ret = -EINVAL;
ti->error = "Invalid feature arguments";
@@ -1679,7 +1665,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
- ti->num_flush_requests = 1;
+ ti->num_flush_bios = 1;
ti->discard_zeroes_data_unsupported = true;
return 0;
@@ -1717,11 +1703,11 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_SUBMITTED;
}
-static int crypt_status(struct dm_target *ti, status_type_t type,
- unsigned status_flags, char *result, unsigned maxlen)
+static void crypt_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
{
struct crypt_config *cc = ti->private;
- unsigned int sz = 0;
+ unsigned i, sz = 0;
switch (type) {
case STATUSTYPE_INFO:
@@ -1731,27 +1717,20 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
case STATUSTYPE_TABLE:
DMEMIT("%s ", cc->cipher_string);
- if (cc->key_size > 0) {
- if ((maxlen - sz) < ((cc->key_size << 1) + 1))
- return -ENOMEM;
-
- crypt_encode_key(result + sz, cc->key, cc->key_size);
- sz += cc->key_size << 1;
- } else {
- if (sz >= maxlen)
- return -ENOMEM;
- result[sz++] = '-';
- }
+ if (cc->key_size > 0)
+ for (i = 0; i < cc->key_size; i++)
+ DMEMIT("%02x", cc->key[i]);
+ else
+ DMEMIT("-");
DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset,
cc->dev->name, (unsigned long long)cc->start);
- if (ti->num_discard_requests)
+ if (ti->num_discard_bios)
DMEMIT(" 1 allow_discards");
break;
}
- return 0;
}
static void crypt_postsuspend(struct dm_target *ti)
@@ -1845,7 +1824,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 12, 0},
+ .version = {1, 12, 1},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index cc1bd048acb..496d5f3646a 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -198,8 +198,8 @@ out:
mutex_init(&dc->timer_lock);
atomic_set(&dc->may_delay, 1);
- ti->num_flush_requests = 1;
- ti->num_discard_requests = 1;
+ ti->num_flush_bios = 1;
+ ti->num_discard_bios = 1;
ti->private = dc;
return 0;
@@ -293,8 +293,8 @@ static int delay_map(struct dm_target *ti, struct bio *bio)
return delay_bio(dc, dc->read_delay, bio);
}
-static int delay_status(struct dm_target *ti, status_type_t type,
- unsigned status_flags, char *result, unsigned maxlen)
+static void delay_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
{
struct delay_c *dc = ti->private;
int sz = 0;
@@ -314,8 +314,6 @@ static int delay_status(struct dm_target *ti, status_type_t type,
dc->write_delay);
break;
}
-
- return 0;
}
static int delay_iterate_devices(struct dm_target *ti,
@@ -337,7 +335,7 @@ out:
static struct target_type delay_target = {
.name = "delay",
- .version = {1, 2, 0},
+ .version = {1, 2, 1},
.module = THIS_MODULE,
.ctr = delay_ctr,
.dtr = delay_dtr,
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 9721f2ffb1a..7fcf21cb4ff 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -216,8 +216,8 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
- ti->num_flush_requests = 1;
- ti->num_discard_requests = 1;
+ ti->num_flush_bios = 1;
+ ti->num_discard_bios = 1;
ti->per_bio_data_size = sizeof(struct per_bio_data);
ti->private = fc;
return 0;
@@ -337,8 +337,8 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error)
return error;
}
-static int flakey_status(struct dm_target *ti, status_type_t type,
- unsigned status_flags, char *result, unsigned maxlen)
+static void flakey_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
{
unsigned sz = 0;
struct flakey_c *fc = ti->private;
@@ -368,7 +368,6 @@ static int flakey_status(struct dm_target *ti, status_type_t type,
break;
}
- return 0;
}
static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg)
@@ -411,7 +410,7 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_
static struct target_type flakey_target = {
.name = "flakey",
- .version = {1, 3, 0},
+ .version = {1, 3, 1},
.module = THIS_MODULE,
.ctr = flakey_ctr,
.dtr = flakey_dtr,
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 0666b5d14b8..aa04f022464 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1067,6 +1067,7 @@ static void retrieve_status(struct dm_table *table,
num_targets = dm_table_get_num_targets(table);
for (i = 0; i < num_targets; i++) {
struct dm_target *ti = dm_table_get_target(table, i);
+ size_t l;
remaining = len - (outptr - outbuf);
if (remaining <= sizeof(struct dm_target_spec)) {
@@ -1093,14 +1094,17 @@ static void retrieve_status(struct dm_table *table,
if (ti->type->status) {
if (param->flags & DM_NOFLUSH_FLAG)
status_flags |= DM_STATUS_NOFLUSH_FLAG;
- if (ti->type->status(ti, type, status_flags, outptr, remaining)) {
- param->flags |= DM_BUFFER_FULL_FLAG;
- break;
- }
+ ti->type->status(ti, type, status_flags, outptr, remaining);
} else
outptr[0] = '\0';
- outptr += strlen(outptr) + 1;
+ l = strlen(outptr) + 1;
+ if (l == remaining) {
+ param->flags |= DM_BUFFER_FULL_FLAG;
+ break;
+ }
+
+ outptr += l;
used = param->data_start + (outptr - outbuf);
outptr = align_ptr(outptr);
@@ -1410,6 +1414,22 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
return 0;
}
+static bool buffer_test_overflow(char *result, unsigned maxlen)
+{
+ return !maxlen || strlen(result) + 1 >= maxlen;
+}
+
+/*
+ * Process device-mapper dependent messages.
+ * Returns a number <= 1 if message was processed by device mapper.
+ * Returns 2 if message should be delivered to the target.
+ */
+static int message_for_md(struct mapped_device *md, unsigned argc, char **argv,
+ char *result, unsigned maxlen)
+{
+ return 2;
+}
+
/*
* Pass a message to the target that's at the supplied device offset.
*/
@@ -1421,6 +1441,8 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
struct dm_table *table;
struct dm_target *ti;
struct dm_target_msg *tmsg = (void *) param + param->data_start;
+ size_t maxlen;
+ char *result = get_result_buffer(param, param_size, &maxlen);
md = find_device(param);
if (!md)
@@ -1444,6 +1466,10 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
goto out_argv;
}
+ r = message_for_md(md, argc, argv, result, maxlen);
+ if (r <= 1)
+ goto out_argv;
+
table = dm_get_live_table(md);
if (!table)
goto out_argv;
@@ -1469,44 +1495,68 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
out_argv:
kfree(argv);
out:
- param->data_size = 0;
+ if (r >= 0)
+ __dev_status(md, param);
+
+ if (r == 1) {
+ param->flags |= DM_DATA_OUT_FLAG;
+ if (buffer_test_overflow(result, maxlen))
+ param->flags |= DM_BUFFER_FULL_FLAG;
+ else
+ param->data_size = param->data_start + strlen(result) + 1;
+ r = 0;
+ }
+
dm_put(md);
return r;
}
+/*
+ * The ioctl parameter block consists of two parts, a dm_ioctl struct
+ * followed by a data buffer. This flag is set if the second part,
+ * which has a variable size, is not used by the function processing
+ * the ioctl.
+ */
+#define IOCTL_FLAGS_NO_PARAMS 1
+
/*-----------------------------------------------------------------
* Implementation of open/close/ioctl on the special char
* device.
*---------------------------------------------------------------*/
-static ioctl_fn lookup_ioctl(unsigned int cmd)
+static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags)
{
static struct {
int cmd;
+ int flags;
ioctl_fn fn;
} _ioctls[] = {
- {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */
- {DM_REMOVE_ALL_CMD, remove_all},
- {DM_LIST_DEVICES_CMD, list_devices},
-
- {DM_DEV_CREATE_CMD, dev_create},
- {DM_DEV_REMOVE_CMD, dev_remove},
- {DM_DEV_RENAME_CMD, dev_rename},
- {DM_DEV_SUSPEND_CMD, dev_suspend},
- {DM_DEV_STATUS_CMD, dev_status},
- {DM_DEV_WAIT_CMD, dev_wait},
-
- {DM_TABLE_LOAD_CMD, table_load},
- {DM_TABLE_CLEAR_CMD, table_clear},
- {DM_TABLE_DEPS_CMD, table_deps},
- {DM_TABLE_STATUS_CMD, table_status},
-
- {DM_LIST_VERSIONS_CMD, list_versions},
-
- {DM_TARGET_MSG_CMD, target_message},
- {DM_DEV_SET_GEOMETRY_CMD, dev_set_geometry}
+ {DM_VERSION_CMD, 0, NULL}, /* version is dealt with elsewhere */
+ {DM_REMOVE_ALL_CMD, IOCTL_FLAGS_NO_PARAMS, remove_all},
+ {DM_LIST_DEVICES_CMD, 0, list_devices},
+
+ {DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_create},
+ {DM_DEV_REMOVE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_remove},
+ {DM_DEV_RENAME_CMD, 0, dev_rename},
+ {DM_DEV_SUSPEND_CMD, IOCTL_FLAGS_NO_PARAMS, dev_suspend},
+ {DM_DEV_STATUS_CMD, IOCTL_FLAGS_NO_PARAMS, dev_status},
+ {DM_DEV_WAIT_CMD, 0, dev_wait},
+
+ {DM_TABLE_LOAD_CMD, 0, table_load},
+ {DM_TABLE_CLEAR_CMD, IOCTL_FLAGS_NO_PARAMS, table_clear},
+ {DM_TABLE_DEPS_CMD, 0, table_deps},
+ {DM_TABLE_STATUS_CMD, 0, table_status},
+
+ {DM_LIST_VERSIONS_CMD, 0, list_versions},
+
+ {DM_TARGET_MSG_CMD, 0, target_message},
+ {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}
};
- return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn;
+ if (unlikely(cmd >= ARRAY_SIZE(_ioctls)))
+ return NULL;
+
+ *ioctl_flags = _ioctls[cmd].flags;
+ return _ioctls[cmd].fn;
}
/*
@@ -1543,7 +1593,8 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
return r;
}
-#define DM_PARAMS_VMALLOC 0x0001 /* Params alloced with vmalloc not kmalloc */
+#define DM_PARAMS_KMALLOC 0x0001 /* Params alloced with kmalloc */
+#define DM_PARAMS_VMALLOC 0x0002 /* Params alloced with vmalloc */
#define DM_WIPE_BUFFER 0x0010 /* Wipe input buffer before returning from ioctl */
static void free_params(struct dm_ioctl *param, size_t param_size, int param_flags)
@@ -1551,66 +1602,80 @@ static void free_params(struct dm_ioctl *param, size_t param_size, int param_fla
if (param_flags & DM_WIPE_BUFFER)
memset(param, 0, param_size);
+ if (param_flags & DM_PARAMS_KMALLOC)
+ kfree(param);
if (param_flags & DM_PARAMS_VMALLOC)
vfree(param);
- else
- kfree(param);
}
-static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param, int *param_flags)
+static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kernel,
+ int ioctl_flags,
+ struct dm_ioctl **param, int *param_flags)
{
- struct dm_ioctl tmp, *dmi;
+ struct dm_ioctl *dmi;
int secure_data;
+ const size_t minimum_data_size = sizeof(*param_kernel) - sizeof(param_kernel->data);
- if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data)))
+ if (copy_from_user(param_kernel, user, minimum_data_size))
return -EFAULT;
- if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data)))
+ if (param_kernel->data_size < minimum_data_size)
return -EINVAL;
- secure_data = tmp.flags & DM_SECURE_DATA_FLAG;
+ secure_data = param_kernel->flags & DM_SECURE_DATA_FLAG;
*param_flags = secure_data ? DM_WIPE_BUFFER : 0;
+ if (ioctl_flags & IOCTL_FLAGS_NO_PARAMS) {
+ dmi = param_kernel;
+ dmi->data_size = minimum_data_size;
+ goto data_copied;
+ }
+
/*
* Try to avoid low memory issues when a device is suspended.
* Use kmalloc() rather than vmalloc() when we can.
*/
dmi = NULL;
- if (tmp.data_size <= KMALLOC_MAX_SIZE)
- dmi = kmalloc(tmp.data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ if (param_kernel->data_size <= KMALLOC_MAX_SIZE) {
+ dmi = kmalloc(param_kernel->data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ if (dmi)
+ *param_flags |= DM_PARAMS_KMALLOC;
+ }
if (!dmi) {
- dmi = __vmalloc(tmp.data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH, PAGE_KERNEL);
- *param_flags |= DM_PARAMS_VMALLOC;
+ dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH, PAGE_KERNEL);
+ if (dmi)
+ *param_flags |= DM_PARAMS_VMALLOC;
}
if (!dmi) {
- if (secure_data && clear_user(user, tmp.data_size))
+ if (secure_data && clear_user(user, param_kernel->data_size))
return -EFAULT;
return -ENOMEM;
}
- if (copy_from_user(dmi, user, tmp.data_size))
+ if (copy_from_user(dmi, user, param_kernel->data_size))
goto bad;
+data_copied:
/*
* Abort if something changed the ioctl data while it was being copied.
*/
- if (dmi->data_size != tmp.data_size) {
+ if (dmi->data_size != param_kernel->data_size) {
DMERR("rejecting ioctl: data size modified while processing parameters");
goto bad;
}
/* Wipe the user buffer so we do not return it to userspace */
- if (secure_data && clear_user(user, tmp.data_size))
+ if (secure_data && clear_user(user, param_kernel->data_size))
goto bad;
*param = dmi;
return 0;
bad:
- free_params(dmi, tmp.data_size, *param_flags);
+ free_params(dmi, param_kernel->data_size, *param_flags);
return -EFAULT;
}
@@ -1621,6 +1686,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
param->flags &= ~DM_BUFFER_FULL_FLAG;
param->flags &= ~DM_UEVENT_GENERATED_FLAG;
param->flags &= ~DM_SECURE_DATA_FLAG;
+ param->flags &= ~DM_DATA_OUT_FLAG;
/* Ignores parameters */
if (cmd == DM_REMOVE_ALL_CMD ||
@@ -1648,11 +1714,13 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
{
int r = 0;
+ int ioctl_flags;
int param_flags;
unsigned int cmd;
struct dm_ioctl *uninitialized_var(param);
ioctl_fn fn = NULL;
size_t input_param_size;
+ struct dm_ioctl param_kernel;
/* only root can play with this */
if (!capable(CAP_SYS_ADMIN))
@@ -1677,7 +1745,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
if (cmd == DM_VERSION_CMD)
return 0;
- fn = lookup_ioctl(cmd);
+ fn = lookup_ioctl(cmd, &ioctl_flags);
if (!fn) {
DMWARN("dm_ctl_ioctl: unknown command 0x%x", command);
return -ENOTTY;
@@ -1686,7 +1754,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
/*
* Copy the parameters into kernel space.
*/
- r = copy_params(user, &param, &param_flags);
+ r = copy_params(user, &param_kernel, ioctl_flags, &param, &param_flags);
if (r)
return r;
@@ -1699,6 +1767,10 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
param->data_size = sizeof(*param);
r = fn(param, input_param_size);
+ if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) &&
+ unlikely(ioctl_flags & IOCTL_FLAGS_NO_PARAMS))
+ DMERR("ioctl %d tried to output some data but has IOCTL_FLAGS_NO_PARAMS set", cmd);
+
/*
* Copy the results back to userland.
*/
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 68c02673263..d581fe5d2fa 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -22,6 +22,7 @@
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
+#include <linux/delay.h>
#include <linux/device-mapper.h>
#include <linux/dm-kcopyd.h>
@@ -51,6 +52,8 @@ struct dm_kcopyd_client {
struct workqueue_struct *kcopyd_wq;
struct work_struct kcopyd_work;
+ struct dm_kcopyd_throttle *throttle;
+
/*
* We maintain three lists of jobs:
*
@@ -68,6 +71,117 @@ struct dm_kcopyd_client {
static struct page_list zero_page_list;
+static DEFINE_SPINLOCK(throttle_spinlock);
+
+/*
+ * IO/IDLE accounting slowly decays after (1 << ACCOUNT_INTERVAL_SHIFT) period.
+ * When total_period >= (1 << ACCOUNT_INTERVAL_SHIFT) the counters are divided
+ * by 2.
+ */
+#define ACCOUNT_INTERVAL_SHIFT SHIFT_HZ
+
+/*
+ * Sleep this number of milliseconds.
+ *
+ * The value was decided experimentally.
+ * Smaller values seem to cause an increased copy rate above the limit.
+ * The reason for this is unknown but possibly due to jiffies rounding errors
+ * or read/write cache inside the disk.
+ */
+#define SLEEP_MSEC 100
+
+/*
+ * Maximum number of sleep events. There is a theoretical livelock if more
+ * kcopyd clients do work simultaneously which this limit avoids.
+ */
+#define MAX_SLEEPS 10
+
+static void io_job_start(struct dm_kcopyd_throttle *t)
+{
+ unsigned throttle, now, difference;
+ int slept = 0, skew;
+
+ if (unlikely(!t))
+ return;
+
+try_again:
+ spin_lock_irq(&throttle_spinlock);
+
+ throttle = ACCESS_ONCE(t->throttle);
+
+ if (likely(throttle >= 100))
+ goto skip_limit;
+
+ now = jiffies;
+ difference = now - t->last_jiffies;
+ t->last_jiffies = now;
+ if (t->num_io_jobs)
+ t->io_period += difference;
+ t->total_period += difference;
+
+ /*
+ * Maintain sane values if we got a temporary overflow.
+ */
+ if (unlikely(t->io_period > t->total_period))
+ t->io_period = t->total_period;
+
+ if (unlikely(t->total_period >= (1 << ACCOUNT_INTERVAL_SHIFT))) {
+ int shift = fls(t->total_period >> ACCOUNT_INTERVAL_SHIFT);
+ t->total_period >>= shift;
+ t->io_period >>= shift;
+ }
+
+ skew = t->io_period - throttle * t->total_period / 100;
+
+ if (unlikely(skew > 0) && slept < MAX_SLEEPS) {
+ slept++;
+ spin_unlock_irq(&throttle_spinlock);
+ msleep(SLEEP_MSEC);
+ goto try_again;
+ }
+
+skip_limit:
+ t->num_io_jobs++;
+
+ spin_unlock_irq(&throttle_spinlock);
+}
+
+static void io_job_finish(struct dm_kcopyd_throttle *t)
+{
+ unsigned long flags;
+
+ if (unlikely(!t))
+ return;
+
+ spin_lock_irqsave(&throttle_spinlock, flags);
+
+ t->num_io_jobs--;
+
+ if (likely(ACCESS_ONCE(t->throttle) >= 100))
+ goto skip_limit;
+
+ if (!t->num_io_jobs) {
+ unsigned now, difference;
+
+ now = jiffies;
+ difference = now - t->last_jiffies;
+ t->last_jiffies = now;
+
+ t->io_period += difference;
+ t->total_period += difference;
+
+ /*
+ * Maintain sane values if we got a temporary overflow.
+ */
+ if (unlikely(t->io_period > t->total_period))
+ t->io_period = t->total_period;
+ }
+
+skip_limit:
+ spin_unlock_irqrestore(&throttle_spinlock, flags);
+}
+
+
static void wake(struct dm_kcopyd_client *kc)
{
queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
@@ -348,6 +462,8 @@ static void complete_io(unsigned long error, void *context)
struct kcopyd_job *job = (struct kcopyd_job *) context;
struct dm_kcopyd_client *kc = job->kc;
+ io_job_finish(kc->throttle);
+
if (error) {
if (job->rw & WRITE)
job->write_err |= error;
@@ -389,6 +505,8 @@ static int run_io_job(struct kcopyd_job *job)
.client = job->kc->io_client,
};
+ io_job_start(job->kc->throttle);
+
if (job->rw == READ)
r = dm_io(&io_req, 1, &job->source, NULL);
else
@@ -695,7 +813,7 @@ int kcopyd_cancel(struct kcopyd_job *job, int block)
/*-----------------------------------------------------------------
* Client setup
*---------------------------------------------------------------*/
-struct dm_kcopyd_client *dm_kcopyd_client_create(void)
+struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle)
{
int r = -ENOMEM;
struct dm_kcopyd_client *kc;
@@ -708,6 +826,7 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(void)
INIT_LIST_HEAD(&kc->complete_jobs);
INIT_LIST_HEAD(&kc->io_jobs);
INIT_LIST_HEAD(&kc->pages_jobs);
+ kc->throttle = throttle;
kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
if (!kc->job_pool)
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 328cad5617a..4f99d267340 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -53,9 +53,9 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
- ti->num_flush_requests = 1;
- ti->num_discard_requests = 1;
- ti->num_write_same_requests = 1;
+ ti->num_flush_bios = 1;
+ ti->num_discard_bios = 1;
+ ti->num_write_same_bios = 1;
ti->private = lc;
return 0;
@@ -95,8 +95,8 @@ static int linear_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
}
-static int linear_status(struct dm_target *ti, status_type_t type,
- unsigned status_flags, char *result, unsigned maxlen)
+static void linear_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
{
struct linear_c *lc = (struct linear_c *) ti->private;
@@ -110,7 +110,6 @@ static int linear_status(struct dm_target *ti, status_type_t type,
(unsigned long long)lc->start);
break;
}
- return 0;
}
static int linear_ioctl(struct dm_target *ti, unsigned int cmd,
@@ -155,7 +154,7 @@ static int linear_iterate_devices(struct dm_target *ti,
static struct target_type linear_target = {
.name = "linear",
- .version = {1, 2, 0},
+ .version = {1, 2, 1},
.module = THIS_MODULE,
.ctr = linear_ctr,
.dtr = linear_dtr,
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 573bd04591b..51bb81676be 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -905,8 +905,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
goto bad;
}
- ti->num_flush_requests = 1;
- ti->num_discard_requests = 1;
+ ti->num_flush_bios = 1;
+ ti->num_discard_bios = 1;
return 0;
@@ -1378,8 +1378,8 @@ static void multipath_resume(struct dm_target *ti)
* [priority selector-name num_ps_args [ps_args]*
* num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
*/
-static int multipath_status(struct dm_target *ti, status_type_t type,
- unsigned status_flags, char *result, unsigned maxlen)
+static void multipath_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
{
int sz = 0;
unsigned long flags;
@@ -1485,8 +1485,6 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
}
spin_unlock_irqrestore(&m->lock, flags);
-
- return 0;
}
static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
@@ -1695,7 +1693,7 @@ out:
*---------------------------------------------------------------*/
static struct target_type multipath_target = {
.name = "multipath",
- .version = {1, 5, 0},
+ .version = {1, 5, 1},
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 9e58dbd8d8c..311e3d35b27 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -91,15 +91,44 @@ static struct raid_type {
{"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
};
+static char *raid10_md_layout_to_format(int layout)
+{
+ /*
+ * Bit 16 and 17 stand for "offset" and "use_far_sets"
+ * Refer to MD's raid10.c for details
+ */
+ if ((layout & 0x10000) && (layout & 0x20000))
+ return "offset";
+
+ if ((layout & 0xFF) > 1)
+ return "near";
+
+ return "far";
+}
+
static unsigned raid10_md_layout_to_copies(int layout)
{
- return layout & 0xFF;
+ if ((layout & 0xFF) > 1)
+ return layout & 0xFF;
+ return (layout >> 8) & 0xFF;
}
static int raid10_format_to_md_layout(char *format, unsigned copies)
{
- /* 1 "far" copy, and 'copies' "near" copies */
- return (1 << 8) | (copies & 0xFF);
+ unsigned n = 1, f = 1;
+
+ if (!strcmp("near", format))
+ n = copies;
+ else
+ f = copies;
+
+ if (!strcmp("offset", format))
+ return 0x30000 | (f << 8) | n;
+
+ if (!strcmp("far", format))
+ return 0x20000 | (f << 8) | n;
+
+ return (f << 8) | n;
}
static struct raid_type *get_raid_type(char *name)
@@ -352,6 +381,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
{
unsigned i, rebuild_cnt = 0;
unsigned rebuilds_per_group, copies, d;
+ unsigned group_size, last_group_start;
for (i = 0; i < rs->md.raid_disks; i++)
if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
@@ -379,9 +409,6 @@ static int validate_raid_redundancy(struct raid_set *rs)
* as long as the failed devices occur in different mirror
* groups (i.e. different stripes).
*
- * Right now, we only allow for "near" copies. When other
- * formats are added, we will have to check those too.
- *
* When checking "near" format, make sure no adjacent devices
* have failed beyond what can be handled. In addition to the
* simple case where the number of devices is a multiple of the
@@ -391,14 +418,41 @@ static int validate_raid_redundancy(struct raid_set *rs)
* A A B B C
* C D D E E
*/
- for (i = 0; i < rs->md.raid_disks * copies; i++) {
- if (!(i % copies))
+ if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) {
+ for (i = 0; i < rs->md.raid_disks * copies; i++) {
+ if (!(i % copies))
+ rebuilds_per_group = 0;
+ d = i % rs->md.raid_disks;
+ if ((!rs->dev[d].rdev.sb_page ||
+ !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
+ (++rebuilds_per_group >= copies))
+ goto too_many;
+ }
+ break;
+ }
+
+ /*
+ * When checking "far" and "offset" formats, we need to ensure
+ * that the device that holds its copy is not also dead or
+ * being rebuilt. (Note that "far" and "offset" formats only
+ * support two copies right now. These formats also only ever
+ * use the 'use_far_sets' variant.)
+ *
+ * This check is somewhat complicated by the need to account
+ * for arrays that are not a multiple of (far) copies. This
+ * results in the need to treat the last (potentially larger)
+ * set differently.
+ */
+ group_size = (rs->md.raid_disks / copies);
+ last_group_start = (rs->md.raid_disks / group_size) - 1;
+ last_group_start *= group_size;
+ for (i = 0; i < rs->md.raid_disks; i++) {
+ if (!(i % copies) && !(i > last_group_start))
rebuilds_per_group = 0;
- d = i % rs->md.raid_disks;
- if ((!rs->dev[d].rdev.sb_page ||
- !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
+ if ((!rs->dev[i].rdev.sb_page ||
+ !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
(++rebuilds_per_group >= copies))
- goto too_many;
+ goto too_many;
}
break;
default:
@@ -433,7 +487,7 @@ too_many:
*
* RAID10-only options:
* [raid10_copies <# copies>] Number of copies. (Default: 2)
- * [raid10_format <near>] Layout algorithm. (Default: near)
+ * [raid10_format <near|far|offset>] Layout algorithm. (Default: near)
*/
static int parse_raid_params(struct raid_set *rs, char **argv,
unsigned num_raid_params)
@@ -520,7 +574,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
return -EINVAL;
}
- if (strcmp("near", argv[i])) {
+ if (strcmp("near", argv[i]) &&
+ strcmp("far", argv[i]) &&
+ strcmp("offset", argv[i])) {
rs->ti->error = "Invalid 'raid10_format' value given";
return -EINVAL;
}
@@ -644,6 +700,15 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
return -EINVAL;
}
+ /*
+ * If the format is not "near", we only support
+ * two copies at the moment.
+ */
+ if (strcmp("near", raid10_format) && (raid10_copies > 2)) {
+ rs->ti->error = "Too many copies for given RAID10 format.";
+ return -EINVAL;
+ }
+
/* (Len * #mirrors) / #devices */
sectors_per_dev = rs->ti->len * raid10_copies;
sector_div(sectors_per_dev, rs->md.raid_disks);
@@ -854,17 +919,30 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
/*
* Reshaping is not currently allowed
*/
- if ((le32_to_cpu(sb->level) != mddev->level) ||
- (le32_to_cpu(sb->layout) != mddev->layout) ||
- (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
- DMERR("Reshaping arrays not yet supported.");
+ if (le32_to_cpu(sb->level) != mddev->level) {
+ DMERR("Reshaping arrays not yet supported. (RAID level change)");
+ return -EINVAL;
+ }
+ if (le32_to_cpu(sb->layout) != mddev->layout) {
+ DMERR("Reshaping arrays not yet supported. (RAID layout change)");
+ DMERR(" 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
+ DMERR(" Old layout: %s w/ %d copies",
+ raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
+ raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
+ DMERR(" New layout: %s w/ %d copies",
+ raid10_md_layout_to_format(mddev->layout),
+ raid10_md_layout_to_copies(mddev->layout));
+ return -EINVAL;
+ }
+ if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) {
+ DMERR("Reshaping arrays not yet supported. (stripe sectors change)");
return -EINVAL;
}
/* We can only change the number of devices in RAID1 right now */
if ((rs->raid_type->level != 1) &&
(le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
- DMERR("Reshaping arrays not yet supported.");
+ DMERR("Reshaping arrays not yet supported. (device count change)");
return -EINVAL;
}
@@ -1151,7 +1229,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
INIT_WORK(&rs->md.event_work, do_table_event);
ti->private = rs;
- ti->num_flush_requests = 1;
+ ti->num_flush_bios = 1;
mutex_lock(&rs->md.reconfig_mutex);
ret = md_run(&rs->md);
@@ -1201,8 +1279,8 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_SUBMITTED;
}
-static int raid_status(struct dm_target *ti, status_type_t type,
- unsigned status_flags, char *result, unsigned maxlen)
+static void raid_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
{
struct raid_set *rs = ti->private;
unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
@@ -1329,7 +1407,8 @@ static int raid_status(struct dm_target *ti, status_type_t type,
raid10_md_layout_to_copies(rs->md.layout));
if (rs->print_flags & DMPF_RAID10_FORMAT)
- DMEMIT(" raid10_format near");
+ DMEMIT(" raid10_format %s",
+ raid10_md_layout_to_format(rs->md.layout));
DMEMIT(" %d", rs->md.raid_disks);
for (i = 0; i < rs->md.raid_disks; i++) {
@@ -1344,8 +1423,6 @@ static int raid_status(struct dm_target *ti, status_type_t type,
DMEMIT(" -");
}
}
-
- return 0;
}
static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
@@ -1405,7 +1482,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = {
.name = "raid",
- .version = {1, 4, 1},
+ .version = {1, 4, 2},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
@@ -1420,6 +1497,10 @@ static struct target_type raid_target = {
static int __init dm_raid_init(void)
{
+ DMINFO("Loading target version %u.%u.%u",
+ raid_target.version[0],
+ raid_target.version[1],
+ raid_target.version[2]);
return dm_register_target(&raid_target);
}
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index fa519185ebb..d053098c6a9 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -82,6 +82,9 @@ struct mirror_set {
struct mirror mirror[0];
};
+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(raid1_resync_throttle,
+ "A percentage of time allocated for raid resynchronization");
+
static void wakeup_mirrord(void *context)
{
struct mirror_set *ms = context;
@@ -1072,8 +1075,8 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (r)
goto err_free_context;
- ti->num_flush_requests = 1;
- ti->num_discard_requests = 1;
+ ti->num_flush_bios = 1;
+ ti->num_discard_bios = 1;
ti->per_bio_data_size = sizeof(struct dm_raid1_bio_record);
ti->discard_zeroes_data_unsupported = true;
@@ -1111,7 +1114,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto err_destroy_wq;
}
- ms->kcopyd_client = dm_kcopyd_client_create();
+ ms->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
if (IS_ERR(ms->kcopyd_client)) {
r = PTR_ERR(ms->kcopyd_client);
goto err_destroy_wq;
@@ -1347,8 +1350,8 @@ static char device_status_char(struct mirror *m)
}
-static int mirror_status(struct dm_target *ti, status_type_t type,
- unsigned status_flags, char *result, unsigned maxlen)
+static void mirror_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
{
unsigned int m, sz = 0;
struct mirror_set *ms = (struct mirror_set *) ti->private;
@@ -1383,8 +1386,6 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
if (ms->features & DM_RAID1_HANDLE_ERRORS)
DMEMIT(" 1 handle_errors");
}
-
- return 0;
}
static int mirror_iterate_devices(struct dm_target *ti,
@@ -1403,7 +1404,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
static struct target_type mirror_target = {
.name = "mirror",
- .version = {1, 13, 1},
+ .version = {1, 13, 2},
.module = THIS_MODULE,
.ctr = mirror_ctr,
.dtr = mirror_dtr,
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 10079e07edf..c0e07026a8d 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -124,6 +124,9 @@ struct dm_snapshot {
#define RUNNING_MERGE 0
#define SHUTDOWN_MERGE 1
+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
+ "A percentage of time allocated for copy on write");
+
struct dm_dev *dm_snap_origin(struct dm_snapshot *s)
{
return s->origin;
@@ -1037,7 +1040,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
int i;
int r = -EINVAL;
char *origin_path, *cow_path;
- unsigned args_used, num_flush_requests = 1;
+ unsigned args_used, num_flush_bios = 1;
fmode_t origin_mode = FMODE_READ;
if (argc != 4) {
@@ -1047,7 +1050,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
if (dm_target_is_snapshot_merge(ti)) {
- num_flush_requests = 2;
+ num_flush_bios = 2;
origin_mode = FMODE_WRITE;
}
@@ -1108,7 +1111,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_hash_tables;
}
- s->kcopyd_client = dm_kcopyd_client_create();
+ s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
if (IS_ERR(s->kcopyd_client)) {
r = PTR_ERR(s->kcopyd_client);
ti->error = "Could not create kcopyd client";
@@ -1127,7 +1130,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
spin_lock_init(&s->tracked_chunk_lock);
ti->private = s;
- ti->num_flush_requests = num_flush_requests;
+ ti->num_flush_bios = num_flush_bios;
ti->per_bio_data_size = sizeof(struct dm_snap_tracked_chunk);
/* Add snapshot to the list of snapshots for this origin */
@@ -1691,7 +1694,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
init_tracked_chunk(bio);
if (bio->bi_rw & REQ_FLUSH) {
- if (!dm_bio_get_target_request_nr(bio))
+ if (!dm_bio_get_target_bio_nr(bio))
bio->bi_bdev = s->origin->bdev;
else
bio->bi_bdev = s->cow->bdev;
@@ -1836,8 +1839,8 @@ static void snapshot_merge_resume(struct dm_target *ti)
start_merge(s);
}
-static int snapshot_status(struct dm_target *ti, status_type_t type,
- unsigned status_flags, char *result, unsigned maxlen)
+static void snapshot_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
{
unsigned sz = 0;
struct dm_snapshot *snap = ti->private;
@@ -1883,8 +1886,6 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
maxlen - sz);
break;
}
-
- return 0;
}
static int snapshot_iterate_devices(struct dm_target *ti,
@@ -2104,7 +2105,7 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
ti->private = dev;
- ti->num_flush_requests = 1;
+ ti->num_flush_bios = 1;
return 0;
}
@@ -2138,8 +2139,8 @@ static void origin_resume(struct dm_target *ti)
ti->max_io_len = get_origin_minimum_chunksize(dev->bdev);
}
-static int origin_status(struct dm_target *ti, status_type_t type,
- unsigned status_flags, char *result, unsigned maxlen)
+static void origin_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
{
struct dm_dev *dev = ti->private;
@@ -2152,8 +2153,6 @@ static int origin_status(struct dm_target *ti, status_type_t type,
snprintf(result, maxlen, "%s", dev->name);
break;
}
-
- return 0;
}
static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
@@ -2180,7 +2179,7 @@ static int origin_iterate_devices(struct dm_target *ti,
static struct target_type origin_target = {
.name = "snapshot-origin",
- .version = {1, 8, 0},
+ .version = {1, 8, 1},
.module = THIS_MODULE,
.ctr = origin_ctr,
.dtr = origin_dtr,
@@ -2193,7 +2192,7 @@ static struct target_type origin_target = {
static struct target_type snapshot_target = {
.name = "snapshot",
- .version = {1, 11, 0},
+ .version = {1, 11, 1},
.module = THIS_MODULE,
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
@@ -2306,3 +2305,5 @@ module_exit(dm_snapshot_exit);
MODULE_DESCRIPTION(DM_NAME " snapshot target");
MODULE_AUTHOR("Joe Thornber");
MODULE_LICENSE("GPL");
+MODULE_ALIAS("dm-snapshot-origin");
+MODULE_ALIAS("dm-snapshot-merge");
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index c89cde86d40..d8837d313f5 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -160,9 +160,9 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (r)
return r;
- ti->num_flush_requests = stripes;
- ti->num_discard_requests = stripes;
- ti->num_write_same_requests = stripes;
+ ti->num_flush_bios = stripes;
+ ti->num_discard_bios = stripes;
+ ti->num_write_same_bios = stripes;
sc->chunk_size = chunk_size;
if (chunk_size & (chunk_size - 1))
@@ -276,19 +276,19 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
{
struct stripe_c *sc = ti->private;
uint32_t stripe;
- unsigned target_request_nr;
+ unsigned target_bio_nr;
if (bio->bi_rw & REQ_FLUSH) {
- target_request_nr = dm_bio_get_target_request_nr(bio);
- BUG_ON(target_request_nr >= sc->stripes);
- bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev;
+ target_bio_nr = dm_bio_get_target_bio_nr(bio);
+ BUG_ON(target_bio_nr >= sc->stripes);
+ bio->bi_bdev = sc->stripe[target_bio_nr].dev->bdev;
return DM_MAPIO_REMAPPED;
}
if (unlikely(bio->bi_rw & REQ_DISCARD) ||
unlikely(bio->bi_rw & REQ_WRITE_SAME)) {
- target_request_nr = dm_bio_get_target_request_nr(bio);
- BUG_ON(target_request_nr >= sc->stripes);
- return stripe_map_range(sc, bio, target_request_nr);
+ target_bio_nr = dm_bio_get_target_bio_nr(bio);
+ BUG_ON(target_bio_nr >= sc->stripes);
+ return stripe_map_range(sc, bio, target_bio_nr);
}
stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector);
@@ -312,8 +312,8 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
*
*/
-static int stripe_status(struct dm_target *ti, status_type_t type,
- unsigned status_flags, char *result, unsigned maxlen)
+static void stripe_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
{
struct stripe_c *sc = (struct stripe_c *) ti->private;
char buffer[sc->stripes + 1];
@@ -340,7 +340,6 @@ static int stripe_status(struct dm_target *ti, status_type_t type,
(unsigned long long)sc->stripe[i].physical_start);
break;
}
- return 0;
}
static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error)
@@ -428,7 +427,7 @@ static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
static struct target_type stripe_target = {
.name = "striped",
- .version = {1, 5, 0},
+ .version = {1, 5, 1},
.module = THIS_MODULE,
.ctr = stripe_ctr,
.dtr = stripe_dtr,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index daf25d0890b..e50dad0c65f 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -217,7 +217,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
if (alloc_targets(t, num_targets)) {
kfree(t);
- t = NULL;
return -ENOMEM;
}
@@ -823,8 +822,8 @@ int dm_table_add_target(struct dm_table *t, const char *type,
t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
- if (!tgt->num_discard_requests && tgt->discards_supported)
- DMWARN("%s: %s: ignoring discards_supported because num_discard_requests is zero.",
+ if (!tgt->num_discard_bios && tgt->discards_supported)
+ DMWARN("%s: %s: ignoring discards_supported because num_discard_bios is zero.",
dm_device_name(t->md), type);
return 0;
@@ -1360,7 +1359,7 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
while (i < dm_table_get_num_targets(t)) {
ti = dm_table_get_target(t, i++);
- if (!ti->num_flush_requests)
+ if (!ti->num_flush_bios)
continue;
if (ti->flush_supported)
@@ -1439,7 +1438,7 @@ static bool dm_table_supports_write_same(struct dm_table *t)
while (i < dm_table_get_num_targets(t)) {
ti = dm_table_get_target(t, i++);
- if (!ti->num_write_same_requests)
+ if (!ti->num_write_same_bios)
return false;
if (!ti->type->iterate_devices ||
@@ -1657,7 +1656,7 @@ bool dm_table_supports_discards(struct dm_table *t)
while (i < dm_table_get_num_targets(t)) {
ti = dm_table_get_target(t, i++);
- if (!ti->num_discard_requests)
+ if (!ti->num_discard_bios)
continue;
if (ti->discards_supported)
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 617d21a7725..37ba5db71cd 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -116,7 +116,7 @@ static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args)
/*
* Return error for discards instead of -EOPNOTSUPP
*/
- tt->num_discard_requests = 1;
+ tt->num_discard_bios = 1;
return 0;
}
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 4d6e85367b8..00cee02f8fc 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -280,7 +280,7 @@ static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
*t = v & ((1 << 24) - 1);
}
-static void data_block_inc(void *context, void *value_le)
+static void data_block_inc(void *context, const void *value_le)
{
struct dm_space_map *sm = context;
__le64 v_le;
@@ -292,7 +292,7 @@ static void data_block_inc(void *context, void *value_le)
dm_sm_inc_block(sm, b);
}
-static void data_block_dec(void *context, void *value_le)
+static void data_block_dec(void *context, const void *value_le)
{
struct dm_space_map *sm = context;
__le64 v_le;
@@ -304,7 +304,7 @@ static void data_block_dec(void *context, void *value_le)
dm_sm_dec_block(sm, b);
}
-static int data_block_equal(void *context, void *value1_le, void *value2_le)
+static int data_block_equal(void *context, const void *value1_le, const void *value2_le)
{
__le64 v1_le, v2_le;
uint64_t b1, b2;
@@ -318,7 +318,7 @@ static int data_block_equal(void *context, void *value1_le, void *value2_le)
return b1 == b2;
}
-static void subtree_inc(void *context, void *value)
+static void subtree_inc(void *context, const void *value)
{
struct dm_btree_info *info = context;
__le64 root_le;
@@ -329,7 +329,7 @@ static void subtree_inc(void *context, void *value)
dm_tm_inc(info->tm, root);
}
-static void subtree_dec(void *context, void *value)
+static void subtree_dec(void *context, const void *value)
{
struct dm_btree_info *info = context;
__le64 root_le;
@@ -341,7 +341,7 @@ static void subtree_dec(void *context, void *value)
DMERR("btree delete failed\n");
}
-static int subtree_equal(void *context, void *value1_le, void *value2_le)
+static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
{
__le64 v1_le, v2_le;
memcpy(&v1_le, value1_le, sizeof(v1_le));
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 5409607d487..009339d6282 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -26,6 +26,9 @@
#define PRISON_CELLS 1024
#define COMMIT_PERIOD HZ
+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
+ "A percentage of time allocated for copy on write");
+
/*
* The block size of the device holding pool data must be
* between 64KB and 1GB.
@@ -227,6 +230,78 @@ struct thin_c {
/*----------------------------------------------------------------*/
/*
+ * wake_worker() is used when new work is queued and when pool_resume is
+ * ready to continue deferred IO processing.
+ */
+static void wake_worker(struct pool *pool)
+{
+ queue_work(pool->wq, &pool->worker);
+}
+
+/*----------------------------------------------------------------*/
+
+static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
+ struct dm_bio_prison_cell **cell_result)
+{
+ int r;
+ struct dm_bio_prison_cell *cell_prealloc;
+
+ /*
+ * Allocate a cell from the prison's mempool.
+ * This might block but it can't fail.
+ */
+ cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);
+
+ r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
+ if (r)
+ /*
+ * We reused an old cell; we can get rid of
+ * the new one.
+ */
+ dm_bio_prison_free_cell(pool->prison, cell_prealloc);
+
+ return r;
+}
+
+static void cell_release(struct pool *pool,
+ struct dm_bio_prison_cell *cell,
+ struct bio_list *bios)
+{
+ dm_cell_release(pool->prison, cell, bios);
+ dm_bio_prison_free_cell(pool->prison, cell);
+}
+
+static void cell_release_no_holder(struct pool *pool,
+ struct dm_bio_prison_cell *cell,
+ struct bio_list *bios)
+{
+ dm_cell_release_no_holder(pool->prison, cell, bios);
+ dm_bio_prison_free_cell(pool->prison, cell);
+}
+
+static void cell_defer_no_holder_no_free(struct thin_c *tc,
+ struct dm_bio_prison_cell *cell)
+{
+ struct pool *pool = tc->pool;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ dm_cell_release_no_holder(pool->prison, cell, &pool->deferred_bios);
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ wake_worker(pool);
+}
+
+static void cell_error(struct pool *pool,
+ struct dm_bio_prison_cell *cell)
+{
+ dm_cell_error(pool->prison, cell);
+ dm_bio_prison_free_cell(pool->prison, cell);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
* A global list of pools that uses a struct mapped_device as a key.
*/
static struct dm_thin_pool_table {
@@ -330,14 +405,20 @@ static void requeue_io(struct thin_c *tc)
* target.
*/
+static bool block_size_is_power_of_two(struct pool *pool)
+{
+ return pool->sectors_per_block_shift >= 0;
+}
+
static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
+ struct pool *pool = tc->pool;
sector_t block_nr = bio->bi_sector;
- if (tc->pool->sectors_per_block_shift < 0)
- (void) sector_div(block_nr, tc->pool->sectors_per_block);
+ if (block_size_is_power_of_two(pool))
+ block_nr >>= pool->sectors_per_block_shift;
else
- block_nr >>= tc->pool->sectors_per_block_shift;
+ (void) sector_div(block_nr, pool->sectors_per_block);
return block_nr;
}
@@ -348,12 +429,12 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
sector_t bi_sector = bio->bi_sector;
bio->bi_bdev = tc->pool_dev->bdev;
- if (tc->pool->sectors_per_block_shift < 0)
- bio->bi_sector = (block * pool->sectors_per_block) +
- sector_div(bi_sector, pool->sectors_per_block);
- else
+ if (block_size_is_power_of_two(pool))
bio->bi_sector = (block << pool->sectors_per_block_shift) |
(bi_sector & (pool->sectors_per_block - 1));
+ else
+ bio->bi_sector = (block * pool->sectors_per_block) +
+ sector_div(bi_sector, pool->sectors_per_block);
}
static void remap_to_origin(struct thin_c *tc, struct bio *bio)
@@ -420,15 +501,6 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
issue(tc, bio);
}
-/*
- * wake_worker() is used when new work is queued and when pool_resume is
- * ready to continue deferred IO processing.
- */
-static void wake_worker(struct pool *pool)
-{
- queue_work(pool->wq, &pool->worker);
-}
-
/*----------------------------------------------------------------*/
/*
@@ -515,14 +587,14 @@ static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell)
unsigned long flags;
spin_lock_irqsave(&pool->lock, flags);
- dm_cell_release(cell, &pool->deferred_bios);
+ cell_release(pool, cell, &pool->deferred_bios);
spin_unlock_irqrestore(&tc->pool->lock, flags);
wake_worker(pool);
}
/*
- * Same as cell_defer except it omits the original holder of the cell.
+ * Same as cell_defer above, except it omits the original holder of the cell.
*/
static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
@@ -530,7 +602,7 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c
unsigned long flags;
spin_lock_irqsave(&pool->lock, flags);
- dm_cell_release_no_holder(cell, &pool->deferred_bios);
+ cell_release_no_holder(pool, cell, &pool->deferred_bios);
spin_unlock_irqrestore(&pool->lock, flags);
wake_worker(pool);
@@ -540,13 +612,15 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
{
if (m->bio)
m->bio->bi_end_io = m->saved_bi_end_io;
- dm_cell_error(m->cell);
+ cell_error(m->tc->pool, m->cell);
list_del(&m->list);
mempool_free(m, m->tc->pool->mapping_pool);
}
+
static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
struct thin_c *tc = m->tc;
+ struct pool *pool = tc->pool;
struct bio *bio;
int r;
@@ -555,7 +629,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
bio->bi_end_io = m->saved_bi_end_io;
if (m->err) {
- dm_cell_error(m->cell);
+ cell_error(pool, m->cell);
goto out;
}
@@ -567,7 +641,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
if (r) {
DMERR_LIMIT("dm_thin_insert_block() failed");
- dm_cell_error(m->cell);
+ cell_error(pool, m->cell);
goto out;
}
@@ -585,7 +659,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
out:
list_del(&m->list);
- mempool_free(m, tc->pool->mapping_pool);
+ mempool_free(m, pool->mapping_pool);
}
static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
@@ -736,7 +810,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
if (r < 0) {
mempool_free(m, pool->mapping_pool);
DMERR_LIMIT("dm_kcopyd_copy() failed");
- dm_cell_error(cell);
+ cell_error(pool, cell);
}
}
}
@@ -802,7 +876,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
if (r < 0) {
mempool_free(m, pool->mapping_pool);
DMERR_LIMIT("dm_kcopyd_zero() failed");
- dm_cell_error(cell);
+ cell_error(pool, cell);
}
}
}
@@ -908,13 +982,13 @@ static void retry_on_resume(struct bio *bio)
spin_unlock_irqrestore(&pool->lock, flags);
}
-static void no_space(struct dm_bio_prison_cell *cell)
+static void no_space(struct pool *pool, struct dm_bio_prison_cell *cell)
{
struct bio *bio;
struct bio_list bios;
bio_list_init(&bios);
- dm_cell_release(cell, &bios);
+ cell_release(pool, cell, &bios);
while ((bio = bio_list_pop(&bios)))
retry_on_resume(bio);
@@ -932,7 +1006,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
struct dm_thin_new_mapping *m;
build_virtual_key(tc->td, block, &key);
- if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
+ if (bio_detain(tc->pool, &key, bio, &cell))
return;
r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
@@ -944,7 +1018,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
* on this block.
*/
build_data_key(tc->td, lookup_result.block, &key2);
- if (dm_bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
+ if (bio_detain(tc->pool, &key2, bio, &cell2)) {
cell_defer_no_holder(tc, cell);
break;
}
@@ -1020,13 +1094,13 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
break;
case -ENOSPC:
- no_space(cell);
+ no_space(tc->pool, cell);
break;
default:
DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
__func__, r);
- dm_cell_error(cell);
+ cell_error(tc->pool, cell);
break;
}
}
@@ -1044,7 +1118,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
* of being broken so we have nothing further to do here.
*/
build_data_key(tc->td, lookup_result->block, &key);
- if (dm_bio_detain(pool->prison, &key, bio, &cell))
+ if (bio_detain(pool, &key, bio, &cell))
return;
if (bio_data_dir(bio) == WRITE && bio->bi_size)
@@ -1065,12 +1139,13 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
{
int r;
dm_block_t data_block;
+ struct pool *pool = tc->pool;
/*
* Remap empty bios (flushes) immediately, without provisioning.
*/
if (!bio->bi_size) {
- inc_all_io_entry(tc->pool, bio);
+ inc_all_io_entry(pool, bio);
cell_defer_no_holder(tc, cell);
remap_and_issue(tc, bio, 0);
@@ -1097,14 +1172,14 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
break;
case -ENOSPC:
- no_space(cell);
+ no_space(pool, cell);
break;
default:
DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
__func__, r);
- set_pool_mode(tc->pool, PM_READ_ONLY);
- dm_cell_error(cell);
+ set_pool_mode(pool, PM_READ_ONLY);
+ cell_error(pool, cell);
break;
}
}
@@ -1112,6 +1187,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
static void process_bio(struct thin_c *tc, struct bio *bio)
{
int r;
+ struct pool *pool = tc->pool;
dm_block_t block = get_bio_block(tc, bio);
struct dm_bio_prison_cell *cell;
struct dm_cell_key key;
@@ -1122,7 +1198,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
* being provisioned so we have nothing further to do here.
*/
build_virtual_key(tc->td, block, &key);
- if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
+ if (bio_detain(pool, &key, bio, &cell))
return;
r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
@@ -1130,9 +1206,9 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
case 0:
if (lookup_result.shared) {
process_shared_bio(tc, bio, block, &lookup_result);
- cell_defer_no_holder(tc, cell);
+ cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */
} else {
- inc_all_io_entry(tc->pool, bio);
+ inc_all_io_entry(pool, bio);
cell_defer_no_holder(tc, cell);
remap_and_issue(tc, bio, lookup_result.block);
@@ -1141,7 +1217,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
case -ENODATA:
if (bio_data_dir(bio) == READ && tc->origin_dev) {
- inc_all_io_entry(tc->pool, bio);
+ inc_all_io_entry(pool, bio);
cell_defer_no_holder(tc, cell);
remap_to_origin_and_issue(tc, bio);
@@ -1378,7 +1454,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
dm_block_t block = get_bio_block(tc, bio);
struct dm_thin_device *td = tc->td;
struct dm_thin_lookup_result result;
- struct dm_bio_prison_cell *cell1, *cell2;
+ struct dm_bio_prison_cell cell1, cell2;
+ struct dm_bio_prison_cell *cell_result;
struct dm_cell_key key;
thin_hook_bio(tc, bio);
@@ -1420,18 +1497,18 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
}
build_virtual_key(tc->td, block, &key);
- if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1))
+ if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result))
return DM_MAPIO_SUBMITTED;
build_data_key(tc->td, result.block, &key);
- if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2)) {
- cell_defer_no_holder(tc, cell1);
+ if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) {
+ cell_defer_no_holder_no_free(tc, &cell1);
return DM_MAPIO_SUBMITTED;
}
inc_all_io_entry(tc->pool, bio);
- cell_defer_no_holder(tc, cell2);
- cell_defer_no_holder(tc, cell1);
+ cell_defer_no_holder_no_free(tc, &cell2);
+ cell_defer_no_holder_no_free(tc, &cell1);
remap(tc, bio, result.block);
return DM_MAPIO_REMAPPED;
@@ -1636,7 +1713,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
goto bad_prison;
}
- pool->copier = dm_kcopyd_client_create();
+ pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
if (IS_ERR(pool->copier)) {
r = PTR_ERR(pool->copier);
*error = "Error creating pool's kcopyd client";
@@ -1938,7 +2015,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
pt->data_dev = data_dev;
pt->low_water_blocks = low_water_blocks;
pt->adjusted_pf = pt->requested_pf = pf;
- ti->num_flush_requests = 1;
+ ti->num_flush_bios = 1;
/*
* Only need to enable discards if the pool should pass
@@ -1946,7 +2023,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
* processing will cause mappings to be removed from the btree.
*/
if (pf.discard_enabled && pf.discard_passdown) {
- ti->num_discard_requests = 1;
+ ti->num_discard_bios = 1;
/*
* Setting 'discards_supported' circumvents the normal
@@ -2299,8 +2376,8 @@ static void emit_flags(struct pool_features *pf, char *result,
* <transaction id> <used metadata sectors>/<total metadata sectors>
* <used data sectors>/<total data sectors> <held metadata root>
*/
-static int pool_status(struct dm_target *ti, status_type_t type,
- unsigned status_flags, char *result, unsigned maxlen)
+static void pool_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
{
int r;
unsigned sz = 0;
@@ -2326,32 +2403,41 @@ static int pool_status(struct dm_target *ti, status_type_t type,
if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
(void) commit_or_fallback(pool);
- r = dm_pool_get_metadata_transaction_id(pool->pmd,
- &transaction_id);
- if (r)
- return r;
+ r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
+ if (r) {
+ DMERR("dm_pool_get_metadata_transaction_id returned %d", r);
+ goto err;
+ }
- r = dm_pool_get_free_metadata_block_count(pool->pmd,
- &nr_free_blocks_metadata);
- if (r)
- return r;
+ r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
+ if (r) {
+ DMERR("dm_pool_get_free_metadata_block_count returned %d", r);
+ goto err;
+ }
r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
- if (r)
- return r;
+ if (r) {
+ DMERR("dm_pool_get_metadata_dev_size returned %d", r);
+ goto err;
+ }
- r = dm_pool_get_free_block_count(pool->pmd,
- &nr_free_blocks_data);
- if (r)
- return r;
+ r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
+ if (r) {
+ DMERR("dm_pool_get_free_block_count returned %d", r);
+ goto err;
+ }
r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
- if (r)
- return r;
+ if (r) {
+ DMERR("dm_pool_get_data_dev_size returned %d", r);
+ goto err;
+ }
r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
- if (r)
- return r;
+ if (r) {
+ DMERR("dm_pool_get_metadata_snap returned %d", r);
+ goto err;
+ }
DMEMIT("%llu %llu/%llu %llu/%llu ",
(unsigned long long)transaction_id,
@@ -2388,8 +2474,10 @@ static int pool_status(struct dm_target *ti, status_type_t type,
emit_flags(&pt->requested_pf, result, sz, maxlen);
break;
}
+ return;
- return 0;
+err:
+ DMEMIT("Error");
}
static int pool_iterate_devices(struct dm_target *ti,
@@ -2414,11 +2502,6 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}
-static bool block_size_is_power_of_two(struct pool *pool)
-{
- return pool->sectors_per_block_shift >= 0;
-}
-
static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
{
struct pool *pool = pt->pool;
@@ -2432,15 +2515,8 @@ static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
if (pt->adjusted_pf.discard_passdown) {
data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
limits->discard_granularity = data_limits->discard_granularity;
- } else if (block_size_is_power_of_two(pool))
+ } else
limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
- else
- /*
- * Use largest power of 2 that is a factor of sectors_per_block
- * but at least DATA_DEV_BLOCK_SIZE_MIN_SECTORS.
- */
- limits->discard_granularity = max(1 << (ffs(pool->sectors_per_block) - 1),
- DATA_DEV_BLOCK_SIZE_MIN_SECTORS) << SECTOR_SHIFT;
}
static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
@@ -2468,7 +2544,7 @@ static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 6, 0},
+ .version = {1, 6, 1},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -2588,17 +2664,17 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (r)
goto bad_thin_open;
- ti->num_flush_requests = 1;
+ ti->num_flush_bios = 1;
ti->flush_supported = true;
ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook);
/* In case the pool supports discards, pass them on. */
if (tc->pool->pf.discard_enabled) {
ti->discards_supported = true;
- ti->num_discard_requests = 1;
+ ti->num_discard_bios = 1;
ti->discard_zeroes_data_unsupported = true;
- /* Discard requests must be split on a block boundary */
- ti->split_discard_requests = true;
+ /* Discard bios must be split on a block boundary */
+ ti->split_discard_bios = true;
}
dm_put(pool_md);
@@ -2676,8 +2752,8 @@ static void thin_postsuspend(struct dm_target *ti)
/*
* <nr mapped sectors> <highest mapped sector>
*/
-static int thin_status(struct dm_target *ti, status_type_t type,
- unsigned status_flags, char *result, unsigned maxlen)
+static void thin_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
{
int r;
ssize_t sz = 0;
@@ -2687,7 +2763,7 @@ static int thin_status(struct dm_target *ti, status_type_t type,
if (get_pool_mode(tc->pool) == PM_FAIL) {
DMEMIT("Fail");
- return 0;
+ return;
}
if (!tc->td)
@@ -2696,12 +2772,16 @@ static int thin_status(struct dm_target *ti, status_type_t type,
switch (type) {
case STATUSTYPE_INFO:
r = dm_thin_get_mapped_count(tc->td, &mapped);
- if (r)
- return r;
+ if (r) {
+ DMERR("dm_thin_get_mapped_count returned %d", r);
+ goto err;
+ }
r = dm_thin_get_highest_mapped_block(tc->td, &highest);
- if (r < 0)
- return r;
+ if (r < 0) {
+ DMERR("dm_thin_get_highest_mapped_block returned %d", r);
+ goto err;
+ }
DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
if (r)
@@ -2721,7 +2801,10 @@ static int thin_status(struct dm_target *ti, status_type_t type,
}
}
- return 0;
+ return;
+
+err:
+ DMEMIT("Error");
}
static int thin_iterate_devices(struct dm_target *ti,
@@ -2748,7 +2831,7 @@ static int thin_iterate_devices(struct dm_target *ti,
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 7, 0},
+ .version = {1, 7, 1},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 52cde982164..6ad538375c3 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -508,8 +508,8 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
/*
* Status: V (valid) or C (corruption found)
*/
-static int verity_status(struct dm_target *ti, status_type_t type,
- unsigned status_flags, char *result, unsigned maxlen)
+static void verity_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
{
struct dm_verity *v = ti->private;
unsigned sz = 0;
@@ -540,8 +540,6 @@ static int verity_status(struct dm_target *ti, status_type_t type,
DMEMIT("%02x", v->salt[x]);
break;
}
-
- return 0;
}
static int verity_ioctl(struct dm_target *ti, unsigned cmd,
@@ -860,7 +858,7 @@ bad:
static struct target_type verity_target = {
.name = "verity",
- .version = {1, 1, 0},
+ .version = {1, 1, 1},
.module = THIS_MODULE,
.ctr = verity_ctr,
.dtr = verity_dtr,
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index 69a5c3b3b34..c99003e0d47 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -25,7 +25,7 @@ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
/*
* Silently drop discards, avoiding -EOPNOTSUPP.
*/
- ti->num_discard_requests = 1;
+ ti->num_discard_bios = 1;
return 0;
}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index bb2cd3ce9b0..7e469260fe5 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -163,7 +163,6 @@ struct mapped_device {
* io objects are allocated from here.
*/
mempool_t *io_pool;
- mempool_t *tio_pool;
struct bio_set *bs;
@@ -197,7 +196,6 @@ struct mapped_device {
*/
struct dm_md_mempools {
mempool_t *io_pool;
- mempool_t *tio_pool;
struct bio_set *bs;
};
@@ -205,12 +203,6 @@ struct dm_md_mempools {
static struct kmem_cache *_io_cache;
static struct kmem_cache *_rq_tio_cache;
-/*
- * Unused now, and needs to be deleted. But since io_pool is overloaded and it's
- * still used for _io_cache, I'm leaving this for a later cleanup
- */
-static struct kmem_cache *_rq_bio_info_cache;
-
static int __init local_init(void)
{
int r = -ENOMEM;
@@ -224,13 +216,9 @@ static int __init local_init(void)
if (!_rq_tio_cache)
goto out_free_io_cache;
- _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
- if (!_rq_bio_info_cache)
- goto out_free_rq_tio_cache;
-
r = dm_uevent_init();
if (r)
- goto out_free_rq_bio_info_cache;
+ goto out_free_rq_tio_cache;
_major = major;
r = register_blkdev(_major, _name);
@@ -244,8 +232,6 @@ static int __init local_init(void)
out_uevent_exit:
dm_uevent_exit();
-out_free_rq_bio_info_cache:
- kmem_cache_destroy(_rq_bio_info_cache);
out_free_rq_tio_cache:
kmem_cache_destroy(_rq_tio_cache);
out_free_io_cache:
@@ -256,7 +242,6 @@ out_free_io_cache:
static void local_exit(void)
{
- kmem_cache_destroy(_rq_bio_info_cache);
kmem_cache_destroy(_rq_tio_cache);
kmem_cache_destroy(_io_cache);
unregister_blkdev(_major, _name);
@@ -448,12 +433,12 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
gfp_t gfp_mask)
{
- return mempool_alloc(md->tio_pool, gfp_mask);
+ return mempool_alloc(md->io_pool, gfp_mask);
}
static void free_rq_tio(struct dm_rq_target_io *tio)
{
- mempool_free(tio, tio->md->tio_pool);
+ mempool_free(tio, tio->md->io_pool);
}
static int md_in_flight(struct mapped_device *md)
@@ -985,12 +970,13 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
}
EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
-static void __map_bio(struct dm_target *ti, struct dm_target_io *tio)
+static void __map_bio(struct dm_target_io *tio)
{
int r;
sector_t sector;
struct mapped_device *md;
struct bio *clone = &tio->clone;
+ struct dm_target *ti = tio->ti;
clone->bi_end_io = clone_endio;
clone->bi_private = tio;
@@ -1031,32 +1017,54 @@ struct clone_info {
unsigned short idx;
};
+static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len)
+{
+ bio->bi_sector = sector;
+ bio->bi_size = to_bytes(len);
+}
+
+static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count)
+{
+ bio->bi_idx = idx;
+ bio->bi_vcnt = idx + bv_count;
+ bio->bi_flags &= ~(1 << BIO_SEG_VALID);
+}
+
+static void clone_bio_integrity(struct bio *bio, struct bio *clone,
+ unsigned short idx, unsigned len, unsigned offset,
+ unsigned trim)
+{
+ if (!bio_integrity(bio))
+ return;
+
+ bio_integrity_clone(clone, bio, GFP_NOIO);
+
+ if (trim)
+ bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len);
+}
+
/*
* Creates a little bio that just does part of a bvec.
*/
-static void split_bvec(struct dm_target_io *tio, struct bio *bio,
- sector_t sector, unsigned short idx, unsigned int offset,
- unsigned int len, struct bio_set *bs)
+static void clone_split_bio(struct dm_target_io *tio, struct bio *bio,
+ sector_t sector, unsigned short idx,
+ unsigned offset, unsigned len)
{
struct bio *clone = &tio->clone;
struct bio_vec *bv = bio->bi_io_vec + idx;
*clone->bi_io_vec = *bv;
- clone->bi_sector = sector;
+ bio_setup_sector(clone, sector, len);
+
clone->bi_bdev = bio->bi_bdev;
clone->bi_rw = bio->bi_rw;
clone->bi_vcnt = 1;
- clone->bi_size = to_bytes(len);
clone->bi_io_vec->bv_offset = offset;
clone->bi_io_vec->bv_len = clone->bi_size;
clone->bi_flags |= 1 << BIO_CLONED;
- if (bio_integrity(bio)) {
- bio_integrity_clone(clone, bio, GFP_NOIO);
- bio_integrity_trim(clone,
- bio_sector_offset(bio, idx, offset), len);
- }
+ clone_bio_integrity(bio, clone, idx, len, offset, 1);
}
/*
@@ -1064,29 +1072,23 @@ static void split_bvec(struct dm_target_io *tio, struct bio *bio,
*/
static void clone_bio(struct dm_target_io *tio, struct bio *bio,
sector_t sector, unsigned short idx,
- unsigned short bv_count, unsigned int len,
- struct bio_set *bs)
+ unsigned short bv_count, unsigned len)
{
struct bio *clone = &tio->clone;
+ unsigned trim = 0;
__bio_clone(clone, bio);
- clone->bi_sector = sector;
- clone->bi_idx = idx;
- clone->bi_vcnt = idx + bv_count;
- clone->bi_size = to_bytes(len);
- clone->bi_flags &= ~(1 << BIO_SEG_VALID);
-
- if (bio_integrity(bio)) {
- bio_integrity_clone(clone, bio, GFP_NOIO);
-
- if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
- bio_integrity_trim(clone,
- bio_sector_offset(bio, idx, 0), len);
- }
+ bio_setup_sector(clone, sector, len);
+ bio_setup_bv(clone, idx, bv_count);
+
+ if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
+ trim = 1;
+ clone_bio_integrity(bio, clone, idx, len, 0, trim);
}
static struct dm_target_io *alloc_tio(struct clone_info *ci,
- struct dm_target *ti, int nr_iovecs)
+ struct dm_target *ti, int nr_iovecs,
+ unsigned target_bio_nr)
{
struct dm_target_io *tio;
struct bio *clone;
@@ -1097,96 +1099,104 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci,
tio->io = ci->io;
tio->ti = ti;
memset(&tio->info, 0, sizeof(tio->info));
- tio->target_request_nr = 0;
+ tio->target_bio_nr = target_bio_nr;
return tio;
}
-static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
- unsigned request_nr, sector_t len)
+static void __clone_and_map_simple_bio(struct clone_info *ci,
+ struct dm_target *ti,
+ unsigned target_bio_nr, sector_t len)
{
- struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs);
+ struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr);
struct bio *clone = &tio->clone;
- tio->target_request_nr = request_nr;
-
/*
* Discard requests require the bio's inline iovecs be initialized.
* ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
* and discard, so no need for concern about wasted bvec allocations.
*/
-
__bio_clone(clone, ci->bio);
- if (len) {
- clone->bi_sector = ci->sector;
- clone->bi_size = to_bytes(len);
- }
+ if (len)
+ bio_setup_sector(clone, ci->sector, len);
- __map_bio(ti, tio);
+ __map_bio(tio);
}
-static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
- unsigned num_requests, sector_t len)
+static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
+ unsigned num_bios, sector_t len)
{
- unsigned request_nr;
+ unsigned target_bio_nr;
- for (request_nr = 0; request_nr < num_requests; request_nr++)
- __issue_target_request(ci, ti, request_nr, len);
+ for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
+ __clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
}
-static int __clone_and_map_empty_flush(struct clone_info *ci)
+static int __send_empty_flush(struct clone_info *ci)
{
unsigned target_nr = 0;
struct dm_target *ti;
BUG_ON(bio_has_data(ci->bio));
while ((ti = dm_table_get_target(ci->map, target_nr++)))
- __issue_target_requests(ci, ti, ti->num_flush_requests, 0);
+ __send_duplicate_bios(ci, ti, ti->num_flush_bios, 0);
return 0;
}
-/*
- * Perform all io with a single clone.
- */
-static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti)
+static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
+ sector_t sector, int nr_iovecs,
+ unsigned short idx, unsigned short bv_count,
+ unsigned offset, unsigned len,
+ unsigned split_bvec)
{
struct bio *bio = ci->bio;
struct dm_target_io *tio;
+ unsigned target_bio_nr;
+ unsigned num_target_bios = 1;
+
+ /*
+ * Does the target want to receive duplicate copies of the bio?
+ */
+ if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
+ num_target_bios = ti->num_write_bios(ti, bio);
- tio = alloc_tio(ci, ti, bio->bi_max_vecs);
- clone_bio(tio, bio, ci->sector, ci->idx, bio->bi_vcnt - ci->idx,
- ci->sector_count, ci->md->bs);
- __map_bio(ti, tio);
- ci->sector_count = 0;
+ for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
+ tio = alloc_tio(ci, ti, nr_iovecs, target_bio_nr);
+ if (split_bvec)
+ clone_split_bio(tio, bio, sector, idx, offset, len);
+ else
+ clone_bio(tio, bio, sector, idx, bv_count, len);
+ __map_bio(tio);
+ }
}
-typedef unsigned (*get_num_requests_fn)(struct dm_target *ti);
+typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
-static unsigned get_num_discard_requests(struct dm_target *ti)
+static unsigned get_num_discard_bios(struct dm_target *ti)
{
- return ti->num_discard_requests;
+ return ti->num_discard_bios;
}
-static unsigned get_num_write_same_requests(struct dm_target *ti)
+static unsigned get_num_write_same_bios(struct dm_target *ti)
{
- return ti->num_write_same_requests;
+ return ti->num_write_same_bios;
}
typedef bool (*is_split_required_fn)(struct dm_target *ti);
static bool is_split_required_for_discard(struct dm_target *ti)
{
- return ti->split_discard_requests;
+ return ti->split_discard_bios;
}
-static int __clone_and_map_changing_extent_only(struct clone_info *ci,
- get_num_requests_fn get_num_requests,
- is_split_required_fn is_split_required)
+static int __send_changing_extent_only(struct clone_info *ci,
+ get_num_bios_fn get_num_bios,
+ is_split_required_fn is_split_required)
{
struct dm_target *ti;
sector_t len;
- unsigned num_requests;
+ unsigned num_bios;
do {
ti = dm_table_find_target(ci->map, ci->sector);
@@ -1199,8 +1209,8 @@ static int __clone_and_map_changing_extent_only(struct clone_info *ci,
* reconfiguration might also have changed that since the
* check was performed.
*/
- num_requests = get_num_requests ? get_num_requests(ti) : 0;
- if (!num_requests)
+ num_bios = get_num_bios ? get_num_bios(ti) : 0;
+ if (!num_bios)
return -EOPNOTSUPP;
if (is_split_required && !is_split_required(ti))
@@ -1208,7 +1218,7 @@ static int __clone_and_map_changing_extent_only(struct clone_info *ci,
else
len = min(ci->sector_count, max_io_len(ci->sector, ti));
- __issue_target_requests(ci, ti, num_requests, len);
+ __send_duplicate_bios(ci, ti, num_bios, len);
ci->sector += len;
} while (ci->sector_count -= len);
@@ -1216,108 +1226,129 @@ static int __clone_and_map_changing_extent_only(struct clone_info *ci,
return 0;
}
-static int __clone_and_map_discard(struct clone_info *ci)
+static int __send_discard(struct clone_info *ci)
{
- return __clone_and_map_changing_extent_only(ci, get_num_discard_requests,
- is_split_required_for_discard);
+ return __send_changing_extent_only(ci, get_num_discard_bios,
+ is_split_required_for_discard);
}
-static int __clone_and_map_write_same(struct clone_info *ci)
+static int __send_write_same(struct clone_info *ci)
{
- return __clone_and_map_changing_extent_only(ci, get_num_write_same_requests, NULL);
+ return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
}
-static int __clone_and_map(struct clone_info *ci)
+/*
+ * Find maximum number of sectors / bvecs we can process with a single bio.
+ */
+static sector_t __len_within_target(struct clone_info *ci, sector_t max, int *idx)
{
struct bio *bio = ci->bio;
- struct dm_target *ti;
- sector_t len = 0, max;
- struct dm_target_io *tio;
-
- if (unlikely(bio->bi_rw & REQ_DISCARD))
- return __clone_and_map_discard(ci);
- else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
- return __clone_and_map_write_same(ci);
+ sector_t bv_len, total_len = 0;
- ti = dm_table_find_target(ci->map, ci->sector);
- if (!dm_target_is_valid(ti))
- return -EIO;
+ for (*idx = ci->idx; max && (*idx < bio->bi_vcnt); (*idx)++) {
+ bv_len = to_sector(bio->bi_io_vec[*idx].bv_len);
- max = max_io_len(ci->sector, ti);
+ if (bv_len > max)
+ break;
- if (ci->sector_count <= max) {
- /*
- * Optimise for the simple case where we can do all of
- * the remaining io with a single clone.
- */
- __clone_and_map_simple(ci, ti);
+ max -= bv_len;
+ total_len += bv_len;
+ }
- } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
- /*
- * There are some bvecs that don't span targets.
- * Do as many of these as possible.
- */
- int i;
- sector_t remaining = max;
- sector_t bv_len;
+ return total_len;
+}
- for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
- bv_len = to_sector(bio->bi_io_vec[i].bv_len);
+static int __split_bvec_across_targets(struct clone_info *ci,
+ struct dm_target *ti, sector_t max)
+{
+ struct bio *bio = ci->bio;
+ struct bio_vec *bv = bio->bi_io_vec + ci->idx;
+ sector_t remaining = to_sector(bv->bv_len);
+ unsigned offset = 0;
+ sector_t len;
- if (bv_len > remaining)
- break;
+ do {
+ if (offset) {
+ ti = dm_table_find_target(ci->map, ci->sector);
+ if (!dm_target_is_valid(ti))
+ return -EIO;
- remaining -= bv_len;
- len += bv_len;
+ max = max_io_len(ci->sector, ti);
}
- tio = alloc_tio(ci, ti, bio->bi_max_vecs);
- clone_bio(tio, bio, ci->sector, ci->idx, i - ci->idx, len,
- ci->md->bs);
- __map_bio(ti, tio);
+ len = min(remaining, max);
+
+ __clone_and_map_data_bio(ci, ti, ci->sector, 1, ci->idx, 0,
+ bv->bv_offset + offset, len, 1);
ci->sector += len;
ci->sector_count -= len;
- ci->idx = i;
+ offset += to_bytes(len);
+ } while (remaining -= len);
- } else {
- /*
- * Handle a bvec that must be split between two or more targets.
- */
- struct bio_vec *bv = bio->bi_io_vec + ci->idx;
- sector_t remaining = to_sector(bv->bv_len);
- unsigned int offset = 0;
+ ci->idx++;
+
+ return 0;
+}
+
+/*
+ * Select the correct strategy for processing a non-flush bio.
+ */
+static int __split_and_process_non_flush(struct clone_info *ci)
+{
+ struct bio *bio = ci->bio;
+ struct dm_target *ti;
+ sector_t len, max;
+ int idx;
+
+ if (unlikely(bio->bi_rw & REQ_DISCARD))
+ return __send_discard(ci);
+ else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
+ return __send_write_same(ci);
- do {
- if (offset) {
- ti = dm_table_find_target(ci->map, ci->sector);
- if (!dm_target_is_valid(ti))
- return -EIO;
+ ti = dm_table_find_target(ci->map, ci->sector);
+ if (!dm_target_is_valid(ti))
+ return -EIO;
- max = max_io_len(ci->sector, ti);
- }
+ max = max_io_len(ci->sector, ti);
- len = min(remaining, max);
+ /*
+ * Optimise for the simple case where we can do all of
+ * the remaining io with a single clone.
+ */
+ if (ci->sector_count <= max) {
+ __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
+ ci->idx, bio->bi_vcnt - ci->idx, 0,
+ ci->sector_count, 0);
+ ci->sector_count = 0;
+ return 0;
+ }
- tio = alloc_tio(ci, ti, 1);
- split_bvec(tio, bio, ci->sector, ci->idx,
- bv->bv_offset + offset, len, ci->md->bs);
+ /*
+ * There are some bvecs that don't span targets.
+ * Do as many of these as possible.
+ */
+ if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
+ len = __len_within_target(ci, max, &idx);
- __map_bio(ti, tio);
+ __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
+ ci->idx, idx - ci->idx, 0, len, 0);
- ci->sector += len;
- ci->sector_count -= len;
- offset += to_bytes(len);
- } while (remaining -= len);
+ ci->sector += len;
+ ci->sector_count -= len;
+ ci->idx = idx;
- ci->idx++;
+ return 0;
}
- return 0;
+ /*
+ * Handle a bvec that must be split between two or more targets.
+ */
+ return __split_bvec_across_targets(ci, ti, max);
}
/*
- * Split the bio into several clones and submit it to targets.
+ * Entry point to split a bio into clones and submit them to the targets.
*/
static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
{
@@ -1341,16 +1372,17 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
ci.idx = bio->bi_idx;
start_io_acct(ci.io);
+
if (bio->bi_rw & REQ_FLUSH) {
ci.bio = &ci.md->flush_bio;
ci.sector_count = 0;
- error = __clone_and_map_empty_flush(&ci);
+ error = __send_empty_flush(&ci);
/* dec_pending submits any data associated with flush */
} else {
ci.bio = bio;
ci.sector_count = bio_sectors(bio);
while (ci.sector_count && !error)
- error = __clone_and_map(&ci);
+ error = __split_and_process_non_flush(&ci);
}
/* drop the extra reference count */
@@ -1923,8 +1955,6 @@ static void free_dev(struct mapped_device *md)
unlock_fs(md);
bdput(md->bdev);
destroy_workqueue(md->wq);
- if (md->tio_pool)
- mempool_destroy(md->tio_pool);
if (md->io_pool)
mempool_destroy(md->io_pool);
if (md->bs)
@@ -1947,24 +1977,33 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
{
struct dm_md_mempools *p = dm_table_get_md_mempools(t);
- if (md->io_pool && (md->tio_pool || dm_table_get_type(t) == DM_TYPE_BIO_BASED) && md->bs) {
- /*
- * The md already has necessary mempools. Reload just the
- * bioset because front_pad may have changed because
- * a different table was loaded.
- */
- bioset_free(md->bs);
- md->bs = p->bs;
- p->bs = NULL;
+ if (md->io_pool && md->bs) {
+ /* The md already has necessary mempools. */
+ if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
+ /*
+ * Reload bioset because front_pad may have changed
+ * because a different table was loaded.
+ */
+ bioset_free(md->bs);
+ md->bs = p->bs;
+ p->bs = NULL;
+ } else if (dm_table_get_type(t) == DM_TYPE_REQUEST_BASED) {
+ /*
+ * There's no need to reload with request-based dm
+ * because the size of front_pad doesn't change.
+ * Note for future: If you are to reload bioset,
+ * prep-ed requests in the queue may refer
+ * to bio from the old bioset, so you must walk
+ * through the queue to unprep.
+ */
+ }
goto out;
}
- BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);
+ BUG_ON(!p || md->io_pool || md->bs);
md->io_pool = p->io_pool;
p->io_pool = NULL;
- md->tio_pool = p->tio_pool;
- p->tio_pool = NULL;
md->bs = p->bs;
p->bs = NULL;
@@ -2395,7 +2434,7 @@ static void dm_queue_flush(struct mapped_device *md)
*/
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
- struct dm_table *live_map, *map = ERR_PTR(-EINVAL);
+ struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
struct queue_limits limits;
int r;
@@ -2418,10 +2457,12 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
dm_table_put(live_map);
}
- r = dm_calculate_queue_limits(table, &limits);
- if (r) {
- map = ERR_PTR(r);
- goto out;
+ if (!live_map) {
+ r = dm_calculate_queue_limits(table, &limits);
+ if (r) {
+ map = ERR_PTR(r);
+ goto out;
+ }
}
map = __bind(md, table, &limits);
@@ -2719,52 +2760,42 @@ EXPORT_SYMBOL_GPL(dm_noflush_suspending);
struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size)
{
- struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
- unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;
+ struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
+ struct kmem_cache *cachep;
+ unsigned int pool_size;
+ unsigned int front_pad;
if (!pools)
return NULL;
- per_bio_data_size = roundup(per_bio_data_size, __alignof__(struct dm_target_io));
+ if (type == DM_TYPE_BIO_BASED) {
+ cachep = _io_cache;
+ pool_size = 16;
+ front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
+ } else if (type == DM_TYPE_REQUEST_BASED) {
+ cachep = _rq_tio_cache;
+ pool_size = MIN_IOS;
+ front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
+ /* per_bio_data_size is not used. See __bind_mempools(). */
+ WARN_ON(per_bio_data_size != 0);
+ } else
+ goto out;
- pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
- mempool_create_slab_pool(MIN_IOS, _io_cache) :
- mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
+ pools->io_pool = mempool_create_slab_pool(MIN_IOS, cachep);
if (!pools->io_pool)
- goto free_pools_and_out;
-
- pools->tio_pool = NULL;
- if (type == DM_TYPE_REQUEST_BASED) {
- pools->tio_pool = mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
- if (!pools->tio_pool)
- goto free_io_pool_and_out;
- }
+ goto out;
- pools->bs = (type == DM_TYPE_BIO_BASED) ?
- bioset_create(pool_size,
- per_bio_data_size + offsetof(struct dm_target_io, clone)) :
- bioset_create(pool_size,
- offsetof(struct dm_rq_clone_bio_info, clone));
+ pools->bs = bioset_create(pool_size, front_pad);
if (!pools->bs)
- goto free_tio_pool_and_out;
+ goto out;
if (integrity && bioset_integrity_create(pools->bs, pool_size))
- goto free_bioset_and_out;
+ goto out;
return pools;
-free_bioset_and_out:
- bioset_free(pools->bs);
-
-free_tio_pool_and_out:
- if (pools->tio_pool)
- mempool_destroy(pools->tio_pool);
-
-free_io_pool_and_out:
- mempool_destroy(pools->io_pool);
-
-free_pools_and_out:
- kfree(pools);
+out:
+ dm_free_md_mempools(pools);
return NULL;
}
@@ -2777,9 +2808,6 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
if (pools->io_pool)
mempool_destroy(pools->io_pool);
- if (pools->tio_pool)
- mempool_destroy(pools->tio_pool);
-
if (pools->bs)
bioset_free(pools->bs);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3db3d1b271f..fcb878f8879 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -307,6 +307,10 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
bio_io_error(bio);
return;
}
+ if (mddev->ro == 1 && unlikely(rw == WRITE)) {
+ bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
+ return;
+ }
smp_rmb(); /* Ensure implications of 'active' are visible */
rcu_read_lock();
if (mddev->suspended) {
@@ -2994,6 +2998,9 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
} else if (!sectors)
sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
rdev->data_offset;
+ if (!my_mddev->pers->resize)
+ /* Cannot change size for RAID0 or Linear etc */
+ return -EINVAL;
}
if (sectors < my_mddev->dev_sectors)
return -EINVAL; /* component must fit device */
@@ -6525,7 +6532,17 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
mddev->ro = 0;
sysfs_notify_dirent_safe(mddev->sysfs_state);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- md_wakeup_thread(mddev->thread);
+ /* mddev_unlock will wake thread */
+ /* If a device failed while we were read-only, we
+ * need to make sure the metadata is updated now.
+ */
+ if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
+ mddev_unlock(mddev);
+ wait_event(mddev->sb_wait,
+ !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
+ !test_bit(MD_CHANGE_PENDING, &mddev->flags));
+ mddev_lock(mddev);
+ }
} else {
err = -EROFS;
goto abort_unlock;
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index ceb359050a5..19b26879541 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -1,6 +1,6 @@
config DM_PERSISTENT_DATA
tristate
- depends on BLK_DEV_DM && EXPERIMENTAL
+ depends on BLK_DEV_DM
select LIBCRC32C
select DM_BUFIO
---help---
diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile
index d8e7cb767c1..ff528792c35 100644
--- a/drivers/md/persistent-data/Makefile
+++ b/drivers/md/persistent-data/Makefile
@@ -1,5 +1,7 @@
obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o
dm-persistent-data-objs := \
+ dm-array.o \
+ dm-bitset.o \
dm-block-manager.o \
dm-space-map-common.o \
dm-space-map-disk.o \
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
new file mode 100644
index 00000000000..172147eb1d4
--- /dev/null
+++ b/drivers/md/persistent-data/dm-array.c
@@ -0,0 +1,808 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-array.h"
+#include "dm-space-map.h"
+#include "dm-transaction-manager.h"
+
+#include <linux/export.h>
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "array"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * The array is implemented as a fully populated btree, which points to
+ * blocks that contain the packed values. This is more space efficient
+ * than just using a btree since we don't store 1 key per value.
+ */
+struct array_block {
+ __le32 csum;
+ __le32 max_entries;
+ __le32 nr_entries;
+ __le32 value_size;
+ __le64 blocknr; /* Block this node is supposed to live in. */
+} __packed;
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Validator methods. As usual we calculate a checksum, and also write the
+ * block location into the header (paranoia about ssds remapping areas by
+ * mistake).
+ */
+#define CSUM_XOR 595846735
+
+static void array_block_prepare_for_write(struct dm_block_validator *v,
+ struct dm_block *b,
+ size_t size_of_block)
+{
+ struct array_block *bh_le = dm_block_data(b);
+
+ bh_le->blocknr = cpu_to_le64(dm_block_location(b));
+ bh_le->csum = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries,
+ size_of_block - sizeof(__le32),
+ CSUM_XOR));
+}
+
+static int array_block_check(struct dm_block_validator *v,
+ struct dm_block *b,
+ size_t size_of_block)
+{
+ struct array_block *bh_le = dm_block_data(b);
+ __le32 csum_disk;
+
+ if (dm_block_location(b) != le64_to_cpu(bh_le->blocknr)) {
+ DMERR_LIMIT("array_block_check failed: blocknr %llu != wanted %llu",
+ (unsigned long long) le64_to_cpu(bh_le->blocknr),
+ (unsigned long long) dm_block_location(b));
+ return -ENOTBLK;
+ }
+
+ csum_disk = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries,
+ size_of_block - sizeof(__le32),
+ CSUM_XOR));
+ if (csum_disk != bh_le->csum) {
+ DMERR_LIMIT("array_block_check failed: csum %u != wanted %u",
+ (unsigned) le32_to_cpu(csum_disk),
+ (unsigned) le32_to_cpu(bh_le->csum));
+ return -EILSEQ;
+ }
+
+ return 0;
+}
+
+static struct dm_block_validator array_validator = {
+ .name = "array",
+ .prepare_for_write = array_block_prepare_for_write,
+ .check = array_block_check
+};
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Functions for manipulating the array blocks.
+ */
+
+/*
+ * Returns a pointer to a value within an array block.
+ *
+ * index - The index into _this_ specific block.
+ */
+static void *element_at(struct dm_array_info *info, struct array_block *ab,
+ unsigned index)
+{
+ unsigned char *entry = (unsigned char *) (ab + 1);
+
+ entry += index * info->value_type.size;
+
+ return entry;
+}
+
+/*
+ * Utility function that calls one of the value_type methods on every value
+ * in an array block.
+ */
+static void on_entries(struct dm_array_info *info, struct array_block *ab,
+ void (*fn)(void *, const void *))
+{
+ unsigned i, nr_entries = le32_to_cpu(ab->nr_entries);
+
+ for (i = 0; i < nr_entries; i++)
+ fn(info->value_type.context, element_at(info, ab, i));
+}
+
+/*
+ * Increment every value in an array block.
+ */
+static void inc_ablock_entries(struct dm_array_info *info, struct array_block *ab)
+{
+ struct dm_btree_value_type *vt = &info->value_type;
+
+ if (vt->inc)
+ on_entries(info, ab, vt->inc);
+}
+
+/*
+ * Decrement every value in an array block.
+ */
+static void dec_ablock_entries(struct dm_array_info *info, struct array_block *ab)
+{
+ struct dm_btree_value_type *vt = &info->value_type;
+
+ if (vt->dec)
+ on_entries(info, ab, vt->dec);
+}
+
+/*
+ * Each array block can hold this many values.
+ */
+static uint32_t calc_max_entries(size_t value_size, size_t size_of_block)
+{
+ return (size_of_block - sizeof(struct array_block)) / value_size;
+}
+
+/*
+ * Allocate a new array block. The caller will need to unlock block.
+ */
+static int alloc_ablock(struct dm_array_info *info, size_t size_of_block,
+ uint32_t max_entries,
+ struct dm_block **block, struct array_block **ab)
+{
+ int r;
+
+ r = dm_tm_new_block(info->btree_info.tm, &array_validator, block);
+ if (r)
+ return r;
+
+ (*ab) = dm_block_data(*block);
+ (*ab)->max_entries = cpu_to_le32(max_entries);
+ (*ab)->nr_entries = cpu_to_le32(0);
+ (*ab)->value_size = cpu_to_le32(info->value_type.size);
+
+ return 0;
+}
+
+/*
+ * Pad an array block out with a particular value. Every instance will
+ * cause an increment of the value_type. new_nr must always be more than
+ * the current number of entries.
+ */
+static void fill_ablock(struct dm_array_info *info, struct array_block *ab,
+ const void *value, unsigned new_nr)
+{
+ unsigned i;
+ uint32_t nr_entries;
+ struct dm_btree_value_type *vt = &info->value_type;
+
+ BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
+ BUG_ON(new_nr < le32_to_cpu(ab->nr_entries));
+
+ nr_entries = le32_to_cpu(ab->nr_entries);
+ for (i = nr_entries; i < new_nr; i++) {
+ if (vt->inc)
+ vt->inc(vt->context, value);
+ memcpy(element_at(info, ab, i), value, vt->size);
+ }
+ ab->nr_entries = cpu_to_le32(new_nr);
+}
+
+/*
+ * Remove some entries from the back of an array block. Every value
+ * removed will be decremented. new_nr must be <= the current number of
+ * entries.
+ */
+static void trim_ablock(struct dm_array_info *info, struct array_block *ab,
+ unsigned new_nr)
+{
+ unsigned i;
+ uint32_t nr_entries;
+ struct dm_btree_value_type *vt = &info->value_type;
+
+ BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
+ BUG_ON(new_nr > le32_to_cpu(ab->nr_entries));
+
+ nr_entries = le32_to_cpu(ab->nr_entries);
+ for (i = nr_entries; i > new_nr; i--)
+ if (vt->dec)
+ vt->dec(vt->context, element_at(info, ab, i - 1));
+ ab->nr_entries = cpu_to_le32(new_nr);
+}
+
+/*
+ * Read locks a block, and coerces it to an array block. The caller must
+ * unlock 'block' when finished.
+ */
+static int get_ablock(struct dm_array_info *info, dm_block_t b,
+ struct dm_block **block, struct array_block **ab)
+{
+ int r;
+
+ r = dm_tm_read_lock(info->btree_info.tm, b, &array_validator, block);
+ if (r)
+ return r;
+
+ *ab = dm_block_data(*block);
+ return 0;
+}
+
+/*
+ * Unlocks an array block.
+ */
+static int unlock_ablock(struct dm_array_info *info, struct dm_block *block)
+{
+ return dm_tm_unlock(info->btree_info.tm, block);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Btree manipulation.
+ */
+
+/*
+ * Looks up an array block in the btree, and then read locks it.
+ *
+ * index is the index of the index of the array_block, (ie. the array index
+ * / max_entries).
+ */
+static int lookup_ablock(struct dm_array_info *info, dm_block_t root,
+ unsigned index, struct dm_block **block,
+ struct array_block **ab)
+{
+ int r;
+ uint64_t key = index;
+ __le64 block_le;
+
+ r = dm_btree_lookup(&info->btree_info, root, &key, &block_le);
+ if (r)
+ return r;
+
+ return get_ablock(info, le64_to_cpu(block_le), block, ab);
+}
+
+/*
+ * Insert an array block into the btree. The block is _not_ unlocked.
+ */
+static int insert_ablock(struct dm_array_info *info, uint64_t index,
+ struct dm_block *block, dm_block_t *root)
+{
+ __le64 block_le = cpu_to_le64(dm_block_location(block));
+
+ __dm_bless_for_disk(block_le);
+ return dm_btree_insert(&info->btree_info, *root, &index, &block_le, root);
+}
+
+/*
+ * Looks up an array block in the btree. Then shadows it, and updates the
+ * btree to point to this new shadow. 'root' is an input/output parameter
+ * for both the current root block, and the new one.
+ */
+static int shadow_ablock(struct dm_array_info *info, dm_block_t *root,
+ unsigned index, struct dm_block **block,
+ struct array_block **ab)
+{
+ int r, inc;
+ uint64_t key = index;
+ dm_block_t b;
+ __le64 block_le;
+
+ /*
+ * lookup
+ */
+ r = dm_btree_lookup(&info->btree_info, *root, &key, &block_le);
+ if (r)
+ return r;
+ b = le64_to_cpu(block_le);
+
+ /*
+ * shadow
+ */
+ r = dm_tm_shadow_block(info->btree_info.tm, b,
+ &array_validator, block, &inc);
+ if (r)
+ return r;
+
+ *ab = dm_block_data(*block);
+ if (inc)
+ inc_ablock_entries(info, *ab);
+
+ /*
+ * Reinsert.
+ *
+ * The shadow op will often be a noop. Only insert if it really
+ * copied data.
+ */
+ if (dm_block_location(*block) != b)
+ r = insert_ablock(info, index, *block, root);
+
+ return r;
+}
+
+/*
+ * Allocate an new array block, and fill it with some values.
+ */
+static int insert_new_ablock(struct dm_array_info *info, size_t size_of_block,
+ uint32_t max_entries,
+ unsigned block_index, uint32_t nr,
+ const void *value, dm_block_t *root)
+{
+ int r;
+ struct dm_block *block;
+ struct array_block *ab;
+
+ r = alloc_ablock(info, size_of_block, max_entries, &block, &ab);
+ if (r)
+ return r;
+
+ fill_ablock(info, ab, value, nr);
+ r = insert_ablock(info, block_index, block, root);
+ unlock_ablock(info, block);
+
+ return r;
+}
+
+static int insert_full_ablocks(struct dm_array_info *info, size_t size_of_block,
+ unsigned begin_block, unsigned end_block,
+ unsigned max_entries, const void *value,
+ dm_block_t *root)
+{
+ int r = 0;
+
+ for (; !r && begin_block != end_block; begin_block++)
+ r = insert_new_ablock(info, size_of_block, max_entries, begin_block, max_entries, value, root);
+
+ return r;
+}
+
+/*
+ * There are a bunch of functions involved with resizing an array. This
+ * structure holds information that commonly needed by them. Purely here
+ * to reduce parameter count.
+ */
+struct resize {
+ /*
+ * Describes the array.
+ */
+ struct dm_array_info *info;
+
+ /*
+ * The current root of the array. This gets updated.
+ */
+ dm_block_t root;
+
+ /*
+ * Metadata block size. Used to calculate the nr entries in an
+ * array block.
+ */
+ size_t size_of_block;
+
+ /*
+ * Maximum nr entries in an array block.
+ */
+ unsigned max_entries;
+
+ /*
+ * nr of completely full blocks in the array.
+ *
+ * 'old' refers to before the resize, 'new' after.
+ */
+ unsigned old_nr_full_blocks, new_nr_full_blocks;
+
+ /*
+ * Number of entries in the final block. 0 iff only full blocks in
+ * the array.
+ */
+ unsigned old_nr_entries_in_last_block, new_nr_entries_in_last_block;
+
+ /*
+ * The default value used when growing the array.
+ */
+ const void *value;
+};
+
+/*
+ * Removes a consecutive set of array blocks from the btree. The values
+ * in block are decremented as a side effect of the btree remove.
+ *
+ * begin_index - the index of the first array block to remove.
+ * end_index - the one-past-the-end value. ie. this block is not removed.
+ */
+static int drop_blocks(struct resize *resize, unsigned begin_index,
+ unsigned end_index)
+{
+ int r;
+
+ while (begin_index != end_index) {
+ uint64_t key = begin_index++;
+ r = dm_btree_remove(&resize->info->btree_info, resize->root,
+ &key, &resize->root);
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
+/*
+ * Calculates how many blocks are needed for the array.
+ */
+static unsigned total_nr_blocks_needed(unsigned nr_full_blocks,
+ unsigned nr_entries_in_last_block)
+{
+ return nr_full_blocks + (nr_entries_in_last_block ? 1 : 0);
+}
+
+/*
+ * Shrink an array.
+ */
+static int shrink(struct resize *resize)
+{
+ int r;
+ unsigned begin, end;
+ struct dm_block *block;
+ struct array_block *ab;
+
+ /*
+ * Lose some blocks from the back?
+ */
+ if (resize->new_nr_full_blocks < resize->old_nr_full_blocks) {
+ begin = total_nr_blocks_needed(resize->new_nr_full_blocks,
+ resize->new_nr_entries_in_last_block);
+ end = total_nr_blocks_needed(resize->old_nr_full_blocks,
+ resize->old_nr_entries_in_last_block);
+
+ r = drop_blocks(resize, begin, end);
+ if (r)
+ return r;
+ }
+
+ /*
+ * Trim the new tail block
+ */
+ if (resize->new_nr_entries_in_last_block) {
+ r = shadow_ablock(resize->info, &resize->root,
+ resize->new_nr_full_blocks, &block, &ab);
+ if (r)
+ return r;
+
+ trim_ablock(resize->info, ab, resize->new_nr_entries_in_last_block);
+ unlock_ablock(resize->info, block);
+ }
+
+ return 0;
+}
+
+/*
+ * Grow an array.
+ */
+static int grow_extend_tail_block(struct resize *resize, uint32_t new_nr_entries)
+{
+ int r;
+ struct dm_block *block;
+ struct array_block *ab;
+
+ r = shadow_ablock(resize->info, &resize->root,
+ resize->old_nr_full_blocks, &block, &ab);
+ if (r)
+ return r;
+
+ fill_ablock(resize->info, ab, resize->value, new_nr_entries);
+ unlock_ablock(resize->info, block);
+
+ return r;
+}
+
+static int grow_add_tail_block(struct resize *resize)
+{
+ return insert_new_ablock(resize->info, resize->size_of_block,
+ resize->max_entries,
+ resize->new_nr_full_blocks,
+ resize->new_nr_entries_in_last_block,
+ resize->value, &resize->root);
+}
+
+static int grow_needs_more_blocks(struct resize *resize)
+{
+ int r;
+
+ if (resize->old_nr_entries_in_last_block > 0) {
+ r = grow_extend_tail_block(resize, resize->max_entries);
+ if (r)
+ return r;
+ }
+
+ r = insert_full_ablocks(resize->info, resize->size_of_block,
+ resize->old_nr_full_blocks,
+ resize->new_nr_full_blocks,
+ resize->max_entries, resize->value,
+ &resize->root);
+ if (r)
+ return r;
+
+ if (resize->new_nr_entries_in_last_block)
+ r = grow_add_tail_block(resize);
+
+ return r;
+}
+
+static int grow(struct resize *resize)
+{
+ if (resize->new_nr_full_blocks > resize->old_nr_full_blocks)
+ return grow_needs_more_blocks(resize);
+
+ else if (resize->old_nr_entries_in_last_block)
+ return grow_extend_tail_block(resize, resize->new_nr_entries_in_last_block);
+
+ else
+ return grow_add_tail_block(resize);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * These are the value_type functions for the btree elements, which point
+ * to array blocks.
+ */
+static void block_inc(void *context, const void *value)
+{
+ __le64 block_le;
+ struct dm_array_info *info = context;
+
+ memcpy(&block_le, value, sizeof(block_le));
+ dm_tm_inc(info->btree_info.tm, le64_to_cpu(block_le));
+}
+
+static void block_dec(void *context, const void *value)
+{
+ int r;
+ uint64_t b;
+ __le64 block_le;
+ uint32_t ref_count;
+ struct dm_block *block;
+ struct array_block *ab;
+ struct dm_array_info *info = context;
+
+ memcpy(&block_le, value, sizeof(block_le));
+ b = le64_to_cpu(block_le);
+
+ r = dm_tm_ref(info->btree_info.tm, b, &ref_count);
+ if (r) {
+ DMERR_LIMIT("couldn't get reference count for block %llu",
+ (unsigned long long) b);
+ return;
+ }
+
+ if (ref_count == 1) {
+ /*
+ * We're about to drop the last reference to this ablock.
+ * So we need to decrement the ref count of the contents.
+ */
+ r = get_ablock(info, b, &block, &ab);
+ if (r) {
+ DMERR_LIMIT("couldn't get array block %llu",
+ (unsigned long long) b);
+ return;
+ }
+
+ dec_ablock_entries(info, ab);
+ unlock_ablock(info, block);
+ }
+
+ dm_tm_dec(info->btree_info.tm, b);
+}
+
+static int block_equal(void *context, const void *value1, const void *value2)
+{
+ return !memcmp(value1, value2, sizeof(__le64));
+}
+
+/*----------------------------------------------------------------*/
+
+void dm_array_info_init(struct dm_array_info *info,
+ struct dm_transaction_manager *tm,
+ struct dm_btree_value_type *vt)
+{
+ struct dm_btree_value_type *bvt = &info->btree_info.value_type;
+
+ memcpy(&info->value_type, vt, sizeof(info->value_type));
+ info->btree_info.tm = tm;
+ info->btree_info.levels = 1;
+
+ bvt->context = info;
+ bvt->size = sizeof(__le64);
+ bvt->inc = block_inc;
+ bvt->dec = block_dec;
+ bvt->equal = block_equal;
+}
+EXPORT_SYMBOL_GPL(dm_array_info_init);
+
+int dm_array_empty(struct dm_array_info *info, dm_block_t *root)
+{
+ return dm_btree_empty(&info->btree_info, root);
+}
+EXPORT_SYMBOL_GPL(dm_array_empty);
+
+static int array_resize(struct dm_array_info *info, dm_block_t root,
+ uint32_t old_size, uint32_t new_size,
+ const void *value, dm_block_t *new_root)
+{
+ int r;
+ struct resize resize;
+
+ if (old_size == new_size)
+ return 0;
+
+ resize.info = info;
+ resize.root = root;
+ resize.size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
+ resize.max_entries = calc_max_entries(info->value_type.size,
+ resize.size_of_block);
+
+ resize.old_nr_full_blocks = old_size / resize.max_entries;
+ resize.old_nr_entries_in_last_block = old_size % resize.max_entries;
+ resize.new_nr_full_blocks = new_size / resize.max_entries;
+ resize.new_nr_entries_in_last_block = new_size % resize.max_entries;
+ resize.value = value;
+
+ r = ((new_size > old_size) ? grow : shrink)(&resize);
+ if (r)
+ return r;
+
+ *new_root = resize.root;
+ return 0;
+}
+
+int dm_array_resize(struct dm_array_info *info, dm_block_t root,
+ uint32_t old_size, uint32_t new_size,
+ const void *value, dm_block_t *new_root)
+ __dm_written_to_disk(value)
+{
+ int r = array_resize(info, root, old_size, new_size, value, new_root);
+ __dm_unbless_for_disk(value);
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_array_resize);
+
+int dm_array_del(struct dm_array_info *info, dm_block_t root)
+{
+ return dm_btree_del(&info->btree_info, root);
+}
+EXPORT_SYMBOL_GPL(dm_array_del);
+
+int dm_array_get_value(struct dm_array_info *info, dm_block_t root,
+ uint32_t index, void *value_le)
+{
+ int r;
+ struct dm_block *block;
+ struct array_block *ab;
+ size_t size_of_block;
+ unsigned entry, max_entries;
+
+ size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
+ max_entries = calc_max_entries(info->value_type.size, size_of_block);
+
+ r = lookup_ablock(info, root, index / max_entries, &block, &ab);
+ if (r)
+ return r;
+
+ entry = index % max_entries;
+ if (entry >= le32_to_cpu(ab->nr_entries))
+ r = -ENODATA;
+ else
+ memcpy(value_le, element_at(info, ab, entry),
+ info->value_type.size);
+
+ unlock_ablock(info, block);
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_array_get_value);
+
+static int array_set_value(struct dm_array_info *info, dm_block_t root,
+ uint32_t index, const void *value, dm_block_t *new_root)
+{
+ int r;
+ struct dm_block *block;
+ struct array_block *ab;
+ size_t size_of_block;
+ unsigned max_entries;
+ unsigned entry;
+ void *old_value;
+ struct dm_btree_value_type *vt = &info->value_type;
+
+ size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
+ max_entries = calc_max_entries(info->value_type.size, size_of_block);
+
+ r = shadow_ablock(info, &root, index / max_entries, &block, &ab);
+ if (r)
+ return r;
+ *new_root = root;
+
+ entry = index % max_entries;
+ if (entry >= le32_to_cpu(ab->nr_entries)) {
+ r = -ENODATA;
+ goto out;
+ }
+
+ old_value = element_at(info, ab, entry);
+ if (vt->dec &&
+ (!vt->equal || !vt->equal(vt->context, old_value, value))) {
+ vt->dec(vt->context, old_value);
+ if (vt->inc)
+ vt->inc(vt->context, value);
+ }
+
+ memcpy(old_value, value, info->value_type.size);
+
+out:
+ unlock_ablock(info, block);
+ return r;
+}
+
+int dm_array_set_value(struct dm_array_info *info, dm_block_t root,
+ uint32_t index, const void *value, dm_block_t *new_root)
+ __dm_written_to_disk(value)
+{
+ int r;
+
+ r = array_set_value(info, root, index, value, new_root);
+ __dm_unbless_for_disk(value);
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_array_set_value);
+
+struct walk_info {
+ struct dm_array_info *info;
+ int (*fn)(void *context, uint64_t key, void *leaf);
+ void *context;
+};
+
+static int walk_ablock(void *context, uint64_t *keys, void *leaf)
+{
+ struct walk_info *wi = context;
+
+ int r;
+ unsigned i;
+ __le64 block_le;
+ unsigned nr_entries, max_entries;
+ struct dm_block *block;
+ struct array_block *ab;
+
+ memcpy(&block_le, leaf, sizeof(block_le));
+ r = get_ablock(wi->info, le64_to_cpu(block_le), &block, &ab);
+ if (r)
+ return r;
+
+ max_entries = le32_to_cpu(ab->max_entries);
+ nr_entries = le32_to_cpu(ab->nr_entries);
+ for (i = 0; i < nr_entries; i++) {
+ r = wi->fn(wi->context, keys[0] * max_entries + i,
+ element_at(wi->info, ab, i));
+
+ if (r)
+ break;
+ }
+
+ unlock_ablock(wi->info, block);
+ return r;
+}
+
+int dm_array_walk(struct dm_array_info *info, dm_block_t root,
+ int (*fn)(void *, uint64_t key, void *leaf),
+ void *context)
+{
+ struct walk_info wi;
+
+ wi.info = info;
+ wi.fn = fn;
+ wi.context = context;
+
+ return dm_btree_walk(&info->btree_info, root, walk_ablock, &wi);
+}
+EXPORT_SYMBOL_GPL(dm_array_walk);
+
+/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-array.h b/drivers/md/persistent-data/dm-array.h
new file mode 100644
index 00000000000..ea177d6fa58
--- /dev/null
+++ b/drivers/md/persistent-data/dm-array.h
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+#ifndef _LINUX_DM_ARRAY_H
+#define _LINUX_DM_ARRAY_H
+
+#include "dm-btree.h"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * The dm-array is a persistent version of an array. It packs the data
+ * more efficiently than a btree which will result in less disk space use,
+ * and a performance boost. The element get and set operations are still
+ * O(ln(n)), but with a much smaller constant.
+ *
+ * The value type structure is reused from the btree type to support proper
+ * reference counting of values.
+ *
+ * The arrays implicitly know their length, and bounds are checked for
+ * lookups and updated. It doesn't store this in an accessible place
+ * because it would waste a whole metadata block. Make sure you store the
+ * size along with the array root in your encompassing data.
+ *
+ * Array entries are indexed via an unsigned integer starting from zero.
+ * Arrays are not sparse; if you resize an array to have 'n' entries then
+ * 'n - 1' will be the last valid index.
+ *
+ * Typical use:
+ *
+ * a) initialise a dm_array_info structure. This describes the array
+ * values and ties it into a specific transaction manager. It holds no
+ * instance data; the same info can be used for many similar arrays if
+ * you wish.
+ *
+ * b) Get yourself a root. The root is the index of a block of data on the
+ * disk that holds a particular instance of an array. You may have a
+ * pre existing root in your metadata that you wish to use, or you may
+ * want to create a brand new, empty array with dm_array_empty().
+ *
+ * Like the other data structures in this library, dm_array objects are
+ * immutable between transactions. Update functions will return you the
+ * root for a _new_ array. If you've incremented the old root, via
+ * dm_tm_inc(), before calling the update function you may continue to use
+ * it in parallel with the new root.
+ *
+ * c) resize an array with dm_array_resize().
+ *
+ * d) Get a value from the array with dm_array_get_value().
+ *
+ * e) Set a value in the array with dm_array_set_value().
+ *
+ * f) Walk an array of values in index order with dm_array_walk(). More
+ * efficient than making many calls to dm_array_get_value().
+ *
+ * g) Destroy the array with dm_array_del(). This tells the transaction
+ * manager that you're no longer using this data structure so it can
+ * recycle it's blocks. (dm_array_dec() would be a better name for it,
+ * but del is in keeping with dm_btree_del()).
+ */
+
+/*
+ * Describes an array. Don't initialise this structure yourself, use the
+ * init function below.
+ */
+struct dm_array_info {
+ struct dm_transaction_manager *tm;
+ struct dm_btree_value_type value_type;
+ struct dm_btree_info btree_info;
+};
+
+/*
+ * Sets up a dm_array_info structure. You don't need to do anything with
+ * this structure when you finish using it.
+ *
+ * info - the structure being filled in.
+ * tm - the transaction manager that should supervise this structure.
+ * vt - describes the leaf values.
+ */
+void dm_array_info_init(struct dm_array_info *info,
+ struct dm_transaction_manager *tm,
+ struct dm_btree_value_type *vt);
+
+/*
+ * Create an empty, zero length array.
+ *
+ * info - describes the array
+ * root - on success this will be filled out with the root block
+ */
+int dm_array_empty(struct dm_array_info *info, dm_block_t *root);
+
+/*
+ * Resizes the array.
+ *
+ * info - describes the array
+ * root - the root block of the array on disk
+ * old_size - the caller is responsible for remembering the size of
+ * the array
+ * new_size - can be bigger or smaller than old_size
+ * value - if we're growing the array the new entries will have this value
+ * new_root - on success, points to the new root block
+ *
+ * If growing the inc function for 'value' will be called the appropriate
+ * number of times. So if the caller is holding a reference they may want
+ * to drop it.
+ */
+int dm_array_resize(struct dm_array_info *info, dm_block_t root,
+ uint32_t old_size, uint32_t new_size,
+ const void *value, dm_block_t *new_root)
+ __dm_written_to_disk(value);
+
+/*
+ * Frees a whole array. The value_type's decrement operation will be called
+ * for all values in the array
+ */
+int dm_array_del(struct dm_array_info *info, dm_block_t root);
+
+/*
+ * Lookup a value in the array
+ *
+ * info - describes the array
+ * root - root block of the array
+ * index - array index
+ * value - the value to be read. Will be in on-disk format of course.
+ *
+ * -ENODATA will be returned if the index is out of bounds.
+ */
+int dm_array_get_value(struct dm_array_info *info, dm_block_t root,
+ uint32_t index, void *value);
+
+/*
+ * Set an entry in the array.
+ *
+ * info - describes the array
+ * root - root block of the array
+ * index - array index
+ * value - value to be written to disk. Make sure you confirm the value is
+ * in on-disk format with__dm_bless_for_disk() before calling.
+ * new_root - the new root block
+ *
+ * The old value being overwritten will be decremented, the new value
+ * incremented.
+ *
+ * -ENODATA will be returned if the index is out of bounds.
+ */
+int dm_array_set_value(struct dm_array_info *info, dm_block_t root,
+ uint32_t index, const void *value, dm_block_t *new_root)
+ __dm_written_to_disk(value);
+
+/*
+ * Walk through all the entries in an array.
+ *
+ * info - describes the array
+ * root - root block of the array
+ * fn - called back for every element
+ * context - passed to the callback
+ */
+int dm_array_walk(struct dm_array_info *info, dm_block_t root,
+ int (*fn)(void *context, uint64_t key, void *leaf),
+ void *context);
+
+/*----------------------------------------------------------------*/
+
+#endif /* _LINUX_DM_ARRAY_H */
diff --git a/drivers/md/persistent-data/dm-bitset.c b/drivers/md/persistent-data/dm-bitset.c
new file mode 100644
index 00000000000..cd9a86d4cdf
--- /dev/null
+++ b/drivers/md/persistent-data/dm-bitset.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-bitset.h"
+#include "dm-transaction-manager.h"
+
+#include <linux/export.h>
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "bitset"
+#define BITS_PER_ARRAY_ENTRY 64
+
+/*----------------------------------------------------------------*/
+
+static struct dm_btree_value_type bitset_bvt = {
+ .context = NULL,
+ .size = sizeof(__le64),
+ .inc = NULL,
+ .dec = NULL,
+ .equal = NULL,
+};
+
+/*----------------------------------------------------------------*/
+
+void dm_disk_bitset_init(struct dm_transaction_manager *tm,
+ struct dm_disk_bitset *info)
+{
+ dm_array_info_init(&info->array_info, tm, &bitset_bvt);
+ info->current_index_set = false;
+}
+EXPORT_SYMBOL_GPL(dm_disk_bitset_init);
+
+int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *root)
+{
+ return dm_array_empty(&info->array_info, root);
+}
+EXPORT_SYMBOL_GPL(dm_bitset_empty);
+
+int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t root,
+ uint32_t old_nr_entries, uint32_t new_nr_entries,
+ bool default_value, dm_block_t *new_root)
+{
+ uint32_t old_blocks = dm_div_up(old_nr_entries, BITS_PER_ARRAY_ENTRY);
+ uint32_t new_blocks = dm_div_up(new_nr_entries, BITS_PER_ARRAY_ENTRY);
+ __le64 value = default_value ? cpu_to_le64(~0) : cpu_to_le64(0);
+
+ __dm_bless_for_disk(&value);
+ return dm_array_resize(&info->array_info, root, old_blocks, new_blocks,
+ &value, new_root);
+}
+EXPORT_SYMBOL_GPL(dm_bitset_resize);
+
+int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root)
+{
+ return dm_array_del(&info->array_info, root);
+}
+EXPORT_SYMBOL_GPL(dm_bitset_del);
+
+int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root,
+ dm_block_t *new_root)
+{
+ int r;
+ __le64 value;
+
+ if (!info->current_index_set)
+ return 0;
+
+ value = cpu_to_le64(info->current_bits);
+
+ __dm_bless_for_disk(&value);
+ r = dm_array_set_value(&info->array_info, root, info->current_index,
+ &value, new_root);
+ if (r)
+ return r;
+
+ info->current_index_set = false;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_flush);
+
+static int read_bits(struct dm_disk_bitset *info, dm_block_t root,
+ uint32_t array_index)
+{
+ int r;
+ __le64 value;
+
+ r = dm_array_get_value(&info->array_info, root, array_index, &value);
+ if (r)
+ return r;
+
+ info->current_bits = le64_to_cpu(value);
+ info->current_index_set = true;
+ info->current_index = array_index;
+ return 0;
+}
+
+static int get_array_entry(struct dm_disk_bitset *info, dm_block_t root,
+ uint32_t index, dm_block_t *new_root)
+{
+ int r;
+ unsigned array_index = index / BITS_PER_ARRAY_ENTRY;
+
+ if (info->current_index_set) {
+ if (info->current_index == array_index)
+ return 0;
+
+ r = dm_bitset_flush(info, root, new_root);
+ if (r)
+ return r;
+ }
+
+ return read_bits(info, root, array_index);
+}
+
+int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root,
+ uint32_t index, dm_block_t *new_root)
+{
+ int r;
+ unsigned b = index % BITS_PER_ARRAY_ENTRY;
+
+ r = get_array_entry(info, root, index, new_root);
+ if (r)
+ return r;
+
+ set_bit(b, (unsigned long *) &info->current_bits);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_set_bit);
+
+int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root,
+ uint32_t index, dm_block_t *new_root)
+{
+ int r;
+ unsigned b = index % BITS_PER_ARRAY_ENTRY;
+
+ r = get_array_entry(info, root, index, new_root);
+ if (r)
+ return r;
+
+ clear_bit(b, (unsigned long *) &info->current_bits);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_clear_bit);
+
+int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root,
+ uint32_t index, dm_block_t *new_root, bool *result)
+{
+ int r;
+ unsigned b = index % BITS_PER_ARRAY_ENTRY;
+
+ r = get_array_entry(info, root, index, new_root);
+ if (r)
+ return r;
+
+ *result = test_bit(b, (unsigned long *) &info->current_bits);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_test_bit);
+
+/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-bitset.h b/drivers/md/persistent-data/dm-bitset.h
new file mode 100644
index 00000000000..e1b9bea14aa
--- /dev/null
+++ b/drivers/md/persistent-data/dm-bitset.h
@@ -0,0 +1,165 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+#ifndef _LINUX_DM_BITSET_H
+#define _LINUX_DM_BITSET_H
+
+#include "dm-array.h"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * This bitset type is a thin wrapper round a dm_array of 64bit words. It
+ * uses a tiny, one word cache to reduce the number of array lookups and so
+ * increase performance.
+ *
+ * Like the dm-array that it's based on, the caller needs to keep track of
+ * the size of the bitset separately. The underlying dm-array implicitly
+ * knows how many words it's storing and will return -ENODATA if you try
+ * and access an out of bounds word. However, an out of bounds bit in the
+ * final word will _not_ be detected, you have been warned.
+ *
+ * Bits are indexed from zero.
+
+ * Typical use:
+ *
+ * a) Initialise a dm_disk_bitset structure with dm_disk_bitset_init().
+ * This describes the bitset and includes the cache. It's not called it
+ * dm_bitset_info in line with other data structures because it does
+ * include instance data.
+ *
+ * b) Get yourself a root. The root is the index of a block of data on the
+ * disk that holds a particular instance of an bitset. You may have a
+ * pre existing root in your metadata that you wish to use, or you may
+ * want to create a brand new, empty bitset with dm_bitset_empty().
+ *
+ * Like the other data structures in this library, dm_bitset objects are
+ * immutable between transactions. Update functions will return you the
+ * root for a _new_ array. If you've incremented the old root, via
+ * dm_tm_inc(), before calling the update function you may continue to use
+ * it in parallel with the new root.
+ *
+ * Even read operations may trigger the cache to be flushed and as such
+ * return a root for a new, updated bitset.
+ *
+ * c) resize a bitset with dm_bitset_resize().
+ *
+ * d) Set a bit with dm_bitset_set_bit().
+ *
+ * e) Clear a bit with dm_bitset_clear_bit().
+ *
+ * f) Test a bit with dm_bitset_test_bit().
+ *
+ * g) Flush all updates from the cache with dm_bitset_flush().
+ *
+ * h) Destroy the bitset with dm_bitset_del(). This tells the transaction
+ * manager that you're no longer using this data structure so it can
+ * recycle it's blocks. (dm_bitset_dec() would be a better name for it,
+ * but del is in keeping with dm_btree_del()).
+ */
+
+/*
+ * Opaque object. Unlike dm_array_info, you should have one of these per
+ * bitset. Initialise with dm_disk_bitset_init().
+ */
+struct dm_disk_bitset {
+ struct dm_array_info array_info;
+
+ uint32_t current_index;
+ uint64_t current_bits;
+
+ bool current_index_set:1;
+};
+
+/*
+ * Sets up a dm_disk_bitset structure. You don't need to do anything with
+ * this structure when you finish using it.
+ *
+ * tm - the transaction manager that should supervise this structure
+ * info - the structure being initialised
+ */
+void dm_disk_bitset_init(struct dm_transaction_manager *tm,
+ struct dm_disk_bitset *info);
+
+/*
+ * Create an empty, zero length bitset.
+ *
+ * info - describes the bitset
+ * new_root - on success, points to the new root block
+ */
+int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *new_root);
+
+/*
+ * Resize the bitset.
+ *
+ * info - describes the bitset
+ * old_root - the root block of the array on disk
+ * old_nr_entries - the number of bits in the old bitset
+ * new_nr_entries - the number of bits you want in the new bitset
+ * default_value - the value for any new bits
+ * new_root - on success, points to the new root block
+ */
+int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t old_root,
+ uint32_t old_nr_entries, uint32_t new_nr_entries,
+ bool default_value, dm_block_t *new_root);
+
+/*
+ * Frees the bitset.
+ */
+int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root);
+
+/*
+ * Set a bit.
+ *
+ * info - describes the bitset
+ * root - the root block of the bitset
+ * index - the bit index
+ * new_root - on success, points to the new root block
+ *
+ * -ENODATA will be returned if the index is out of bounds.
+ */
+int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root,
+ uint32_t index, dm_block_t *new_root);
+
+/*
+ * Clears a bit.
+ *
+ * info - describes the bitset
+ * root - the root block of the bitset
+ * index - the bit index
+ * new_root - on success, points to the new root block
+ *
+ * -ENODATA will be returned if the index is out of bounds.
+ */
+int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root,
+ uint32_t index, dm_block_t *new_root);
+
+/*
+ * Tests a bit.
+ *
+ * info - describes the bitset
+ * root - the root block of the bitset
+ * index - the bit index
+ * new_root - on success, points to the new root block (cached values may have been written)
+ * result - the bit value you're after
+ *
+ * -ENODATA will be returned if the index is out of bounds.
+ */
+int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root,
+ uint32_t index, dm_block_t *new_root, bool *result);
+
+/*
+ * Flush any cached changes to disk.
+ *
+ * info - describes the bitset
+ * root - the root block of the bitset
+ * new_root - on success, points to the new root block
+ */
+int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root,
+ dm_block_t *new_root);
+
+/*----------------------------------------------------------------*/
+
+#endif /* _LINUX_DM_BITSET_H */
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 28c3ed072a7..81b513890e2 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -613,6 +613,7 @@ int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
return dm_bufio_write_dirty_buffers(bm->bufio);
}
+EXPORT_SYMBOL_GPL(dm_bm_flush_and_unlock);
void dm_bm_set_read_only(struct dm_block_manager *bm)
{
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index accbb05f17b..37d367bb9aa 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -64,6 +64,7 @@ struct ro_spine {
void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info);
int exit_ro_spine(struct ro_spine *s);
int ro_step(struct ro_spine *s, dm_block_t new_child);
+void ro_pop(struct ro_spine *s);
struct btree_node *ro_node(struct ro_spine *s);
struct shadow_spine {
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c
index f199a0c4ed0..cf9fd676ae4 100644
--- a/drivers/md/persistent-data/dm-btree-spine.c
+++ b/drivers/md/persistent-data/dm-btree-spine.c
@@ -164,6 +164,13 @@ int ro_step(struct ro_spine *s, dm_block_t new_child)
return r;
}
+void ro_pop(struct ro_spine *s)
+{
+ BUG_ON(!s->count);
+ --s->count;
+ unlock_block(s->info, s->nodes[s->count]);
+}
+
struct btree_node *ro_node(struct ro_spine *s)
{
struct dm_block *block;
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 4caf66918cd..35865425e4b 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -807,3 +807,55 @@ int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
return r ? r : count;
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
+
+/*
+ * FIXME: We shouldn't use a recursive algorithm when we have limited stack
+ * space. Also this only works for single level trees.
+ */
+static int walk_node(struct ro_spine *s, dm_block_t block,
+ int (*fn)(void *context, uint64_t *keys, void *leaf),
+ void *context)
+{
+ int r;
+ unsigned i, nr;
+ struct btree_node *n;
+ uint64_t keys;
+
+ r = ro_step(s, block);
+ n = ro_node(s);
+
+ nr = le32_to_cpu(n->header.nr_entries);
+ for (i = 0; i < nr; i++) {
+ if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
+ r = walk_node(s, value64(n, i), fn, context);
+ if (r)
+ goto out;
+ } else {
+ keys = le64_to_cpu(*key_ptr(n, i));
+ r = fn(context, &keys, value_ptr(n, i));
+ if (r)
+ goto out;
+ }
+ }
+
+out:
+ ro_pop(s);
+ return r;
+}
+
+int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
+ int (*fn)(void *context, uint64_t *keys, void *leaf),
+ void *context)
+{
+ int r;
+ struct ro_spine spine;
+
+ BUG_ON(info->levels > 1);
+
+ init_ro_spine(&spine, info);
+ r = walk_node(&spine, root, fn, context);
+ exit_ro_spine(&spine);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_btree_walk);
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
index a2cd50441ca..8672d159e0b 100644
--- a/drivers/md/persistent-data/dm-btree.h
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -58,21 +58,21 @@ struct dm_btree_value_type {
* somewhere.) This method is _not_ called for insertion of a new
* value: It is assumed the ref count is already 1.
*/
- void (*inc)(void *context, void *value);
+ void (*inc)(void *context, const void *value);
/*
* This value is being deleted. The btree takes care of freeing
* the memory pointed to by @value. Often the del function just
* needs to decrement a reference count somewhere.
*/
- void (*dec)(void *context, void *value);
+ void (*dec)(void *context, const void *value);
/*
* A test for equality between two values. When a value is
* overwritten with a new one, the old one has the dec method
* called _unless_ the new and old value are deemed equal.
*/
- int (*equal)(void *context, void *value1, void *value2);
+ int (*equal)(void *context, const void *value1, const void *value2);
};
/*
@@ -142,4 +142,13 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
uint64_t *result_keys);
+/*
+ * Iterate through the a btree, calling fn() on each entry.
+ * It only works for single level trees and is internally recursive, so
+ * monitor stack usage carefully.
+ */
+int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
+ int (*fn)(void *context, uint64_t *keys, void *leaf),
+ void *context);
+
#endif /* _LINUX_DM_BTREE_H */
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 24b359717a7..0505452de8d 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -175,7 +175,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
rdev1->new_raid_disk = j;
}
- if (j < 0 || j >= mddev->raid_disks) {
+ if (j < 0) {
+ printk(KERN_ERR
+ "md/raid0:%s: remove inactive devices before converting to RAID0\n",
+ mdname(mddev));
+ goto abort;
+ }
+ if (j >= mddev->raid_disks) {
printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
"aborting!\n", mdname(mddev), j);
goto abort;
@@ -289,7 +295,7 @@ abort:
kfree(conf->strip_zone);
kfree(conf->devlist);
kfree(conf);
- *private_conf = NULL;
+ *private_conf = ERR_PTR(err);
return err;
}
@@ -411,7 +417,8 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
"%s does not support generic reshape\n", __func__);
rdev_for_each(rdev, mddev)
- array_sectors += rdev->sectors;
+ array_sectors += (rdev->sectors &
+ ~(sector_t)(mddev->chunk_sectors-1));
return array_sectors;
}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d5bddfc4010..fd86b372692 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -967,6 +967,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
bio_list_merge(&conf->pending_bio_list, &plug->pending);
conf->pending_count += plug->pending_cnt;
spin_unlock_irq(&conf->device_lock);
+ wake_up(&conf->wait_barrier);
md_wakeup_thread(mddev->thread);
kfree(plug);
return;
@@ -1000,6 +1001,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
const unsigned long do_discard = (bio->bi_rw
& (REQ_DISCARD | REQ_SECURE));
+ const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
struct md_rdev *blocked_rdev;
struct blk_plug_cb *cb;
struct raid1_plug_cb *plug = NULL;
@@ -1301,7 +1303,8 @@ read_again:
conf->mirrors[i].rdev->data_offset);
mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
mbio->bi_end_io = raid1_end_write_request;
- mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard;
+ mbio->bi_rw =
+ WRITE | do_flush_fua | do_sync | do_discard | do_same;
mbio->bi_private = r1_bio;
atomic_inc(&r1_bio->remaining);
@@ -2818,6 +2821,9 @@ static int run(struct mddev *mddev)
if (IS_ERR(conf))
return PTR_ERR(conf);
+ if (mddev->queue)
+ blk_queue_max_write_same_sectors(mddev->queue,
+ mddev->chunk_sectors);
rdev_for_each(rdev, mddev) {
if (!mddev->gendisk)
continue;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 64d48249c03..77b562d18a9 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -38,21 +38,36 @@
* near_copies (stored in low byte of layout)
* far_copies (stored in second byte of layout)
* far_offset (stored in bit 16 of layout )
+ * use_far_sets (stored in bit 17 of layout )
*
- * The data to be stored is divided into chunks using chunksize.
- * Each device is divided into far_copies sections.
- * In each section, chunks are laid out in a style similar to raid0, but
- * near_copies copies of each chunk is stored (each on a different drive).
- * The starting device for each section is offset near_copies from the starting
- * device of the previous section.
- * Thus they are (near_copies*far_copies) of each chunk, and each is on a different
- * drive.
- * near_copies and far_copies must be at least one, and their product is at most
- * raid_disks.
+ * The data to be stored is divided into chunks using chunksize. Each device
+ * is divided into far_copies sections. In each section, chunks are laid out
+ * in a style similar to raid0, but near_copies copies of each chunk is stored
+ * (each on a different drive). The starting device for each section is offset
+ * near_copies from the starting device of the previous section. Thus there
+ * are (near_copies * far_copies) of each chunk, and each is on a different
+ * drive. near_copies and far_copies must be at least one, and their product
+ * is at most raid_disks.
*
* If far_offset is true, then the far_copies are handled a bit differently.
- * The copies are still in different stripes, but instead of be very far apart
- * on disk, there are adjacent stripes.
+ * The copies are still in different stripes, but instead of being very far
+ * apart on disk, there are adjacent stripes.
+ *
+ * The far and offset algorithms are handled slightly differently if
+ * 'use_far_sets' is true. In this case, the array's devices are grouped into
+ * sets that are (near_copies * far_copies) in size. The far copied stripes
+ * are still shifted by 'near_copies' devices, but this shifting stays confined
+ * to the set rather than the entire array. This is done to improve the number
+ * of device combinations that can fail without causing the array to fail.
+ * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
+ * on a device):
+ * A B C D A B C D E
+ * ... ...
+ * D A B C E A B C D
+ * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
+ * [A B] [C D] [A B] [C D E]
+ * |...| |...| |...| | ... |
+ * [B A] [D C] [B A] [E C D]
*/
/*
@@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
sector_t stripe;
int dev;
int slot = 0;
+ int last_far_set_start, last_far_set_size;
+
+ last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+ last_far_set_start *= geo->far_set_size;
+
+ last_far_set_size = geo->far_set_size;
+ last_far_set_size += (geo->raid_disks % geo->far_set_size);
/* now calculate first sector/dev */
chunk = r10bio->sector >> geo->chunk_shift;
@@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
/* and calculate all the others */
for (n = 0; n < geo->near_copies; n++) {
int d = dev;
+ int set;
sector_t s = sector;
- r10bio->devs[slot].addr = sector;
r10bio->devs[slot].devnum = d;
+ r10bio->devs[slot].addr = s;
slot++;
for (f = 1; f < geo->far_copies; f++) {
+ set = d / geo->far_set_size;
d += geo->near_copies;
- if (d >= geo->raid_disks)
- d -= geo->raid_disks;
+
+ if ((geo->raid_disks % geo->far_set_size) &&
+ (d > last_far_set_start)) {
+ d -= last_far_set_start;
+ d %= last_far_set_size;
+ d += last_far_set_start;
+ } else {
+ d %= geo->far_set_size;
+ d += geo->far_set_size * set;
+ }
s += geo->stride;
r10bio->devs[slot].devnum = d;
r10bio->devs[slot].addr = s;
@@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
* or recovery, so reshape isn't happening
*/
struct geom *geo = &conf->geo;
+ int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
+ int far_set_size = geo->far_set_size;
+ int last_far_set_start;
+
+ if (geo->raid_disks % geo->far_set_size) {
+ last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+ last_far_set_start *= geo->far_set_size;
+
+ if (dev >= last_far_set_start) {
+ far_set_size = geo->far_set_size;
+ far_set_size += (geo->raid_disks % geo->far_set_size);
+ far_set_start = last_far_set_start;
+ }
+ }
offset = sector & geo->chunk_mask;
if (geo->far_offset) {
@@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
chunk = sector >> geo->chunk_shift;
fc = sector_div(chunk, geo->far_copies);
dev -= fc * geo->near_copies;
- if (dev < 0)
- dev += geo->raid_disks;
+ if (dev < far_set_start)
+ dev += far_set_size;
} else {
while (sector >= geo->stride) {
sector -= geo->stride;
- if (dev < geo->near_copies)
- dev += geo->raid_disks - geo->near_copies;
+ if (dev < (geo->near_copies + far_set_start))
+ dev += far_set_size - geo->near_copies;
else
dev -= geo->near_copies;
}
@@ -1073,6 +1119,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
bio_list_merge(&conf->pending_bio_list, &plug->pending);
conf->pending_count += plug->pending_cnt;
spin_unlock_irq(&conf->device_lock);
+ wake_up(&conf->wait_barrier);
md_wakeup_thread(mddev->thread);
kfree(plug);
return;
@@ -1105,6 +1152,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
const unsigned long do_discard = (bio->bi_rw
& (REQ_DISCARD | REQ_SECURE));
+ const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
unsigned long flags;
struct md_rdev *blocked_rdev;
struct blk_plug_cb *cb;
@@ -1460,7 +1508,8 @@ retry_write:
rdev));
mbio->bi_bdev = rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
- mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+ mbio->bi_rw =
+ WRITE | do_sync | do_fua | do_discard | do_same;
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
@@ -1502,7 +1551,8 @@ retry_write:
r10_bio, rdev));
mbio->bi_bdev = rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
- mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+ mbio->bi_rw =
+ WRITE | do_sync | do_fua | do_discard | do_same;
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
@@ -3436,7 +3486,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
disks = mddev->raid_disks + mddev->delta_disks;
break;
}
- if (layout >> 17)
+ if (layout >> 18)
return -1;
if (chunk < (PAGE_SIZE >> 9) ||
!is_power_of_2(chunk))
@@ -3448,6 +3498,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
geo->near_copies = nc;
geo->far_copies = fc;
geo->far_offset = fo;
+ geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
geo->chunk_mask = chunk - 1;
geo->chunk_shift = ffz(~chunk);
return nc*fc;
@@ -3569,6 +3620,8 @@ static int run(struct mddev *mddev)
if (mddev->queue) {
blk_queue_max_discard_sectors(mddev->queue,
mddev->chunk_sectors);
+ blk_queue_max_write_same_sectors(mddev->queue,
+ mddev->chunk_sectors);
blk_queue_io_min(mddev->queue, chunk_size);
if (conf->geo.raid_disks % conf->geo.near_copies)
blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 1054cf60234..157d69e83ff 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -33,6 +33,11 @@ struct r10conf {
* far_offset, in which case it is
* 1 stripe.
*/
+ int far_set_size; /* The number of devices in a set,
+ * where a 'set' are devices that
+ * contain far/offset copies of
+ * each other.
+ */
int chunk_shift; /* shift from chunks to sectors */
sector_t chunk_mask;
} prev, geo;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5af2d270908..3ee2912889e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1403,7 +1403,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
&sh->ops.zero_sum_result, percpu->spare_page, &submit);
}
-static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
+static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
int overlap_clear = 0, i, disks = sh->disks;
struct dma_async_tx_descriptor *tx = NULL;
@@ -1468,36 +1468,6 @@ static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
put_cpu();
}
-#ifdef CONFIG_MULTICORE_RAID456
-static void async_run_ops(void *param, async_cookie_t cookie)
-{
- struct stripe_head *sh = param;
- unsigned long ops_request = sh->ops.request;
-
- clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
- wake_up(&sh->ops.wait_for_ops);
-
- __raid_run_ops(sh, ops_request);
- release_stripe(sh);
-}
-
-static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
-{
- /* since handle_stripe can be called outside of raid5d context
- * we need to ensure sh->ops.request is de-staged before another
- * request arrives
- */
- wait_event(sh->ops.wait_for_ops,
- !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
- sh->ops.request = ops_request;
-
- atomic_inc(&sh->count);
- async_schedule(async_run_ops, sh);
-}
-#else
-#define raid_run_ops __raid_run_ops
-#endif
-
static int grow_one_stripe(struct r5conf *conf)
{
struct stripe_head *sh;
@@ -1506,9 +1476,6 @@ static int grow_one_stripe(struct r5conf *conf)
return 0;
sh->raid_conf = conf;
- #ifdef CONFIG_MULTICORE_RAID456
- init_waitqueue_head(&sh->ops.wait_for_ops);
- #endif
spin_lock_init(&sh->stripe_lock);
@@ -1627,9 +1594,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
break;
nsh->raid_conf = conf;
- #ifdef CONFIG_MULTICORE_RAID456
- init_waitqueue_head(&nsh->ops.wait_for_ops);
- #endif
spin_lock_init(&nsh->stripe_lock);
list_add(&nsh->lru, &newstripes);
diff --git a/drivers/misc/kgdbts.c b/drivers/misc/kgdbts.c
index 3aa9a969b37..36f5d52775a 100644
--- a/drivers/misc/kgdbts.c
+++ b/drivers/misc/kgdbts.c
@@ -103,6 +103,7 @@
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/module.h>
+#include <asm/sections.h>
#define v1printk(a...) do { \
if (verbose) \
@@ -222,6 +223,7 @@ static unsigned long lookup_addr(char *arg)
addr = (unsigned long)do_fork;
else if (!strcmp(arg, "hw_break_val"))
addr = (unsigned long)&hw_break_val;
+ addr = (unsigned long) dereference_function_descriptor((void *)addr);
return addr;
}
diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig
index 03f2eb5627e..557bec599f4 100644
--- a/drivers/mtd/Kconfig
+++ b/drivers/mtd/Kconfig
@@ -74,8 +74,8 @@ config MTD_REDBOOT_PARTS_READONLY
endif # MTD_REDBOOT_PARTS
config MTD_CMDLINE_PARTS
- bool "Command line partition table parsing"
- depends on MTD = "y"
+ tristate "Command line partition table parsing"
+ depends on MTD
---help---
Allow generic configuration of the MTD partition tables via the kernel
command line. Multiple flash resources are supported for hardware where
diff --git a/drivers/mtd/ar7part.c b/drivers/mtd/ar7part.c
index 7c057a05adb..ddc0a4287a4 100644
--- a/drivers/mtd/ar7part.c
+++ b/drivers/mtd/ar7part.c
@@ -142,7 +142,13 @@ static int __init ar7_parser_init(void)
return register_mtd_parser(&ar7_parser);
}
+static void __exit ar7_parser_exit(void)
+{
+ deregister_mtd_parser(&ar7_parser);
+}
+
module_init(ar7_parser_init);
+module_exit(ar7_parser_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR( "Felix Fietkau <nbd@openwrt.org>, "
diff --git a/drivers/mtd/bcm47xxpart.c b/drivers/mtd/bcm47xxpart.c
index e06d782489a..63feb75cc8e 100644
--- a/drivers/mtd/bcm47xxpart.c
+++ b/drivers/mtd/bcm47xxpart.c
@@ -14,17 +14,11 @@
#include <linux/slab.h>
#include <linux/mtd/mtd.h>
#include <linux/mtd/partitions.h>
-#include <asm/mach-bcm47xx/nvram.h>
+#include <bcm47xx_nvram.h>
/* 10 parts were found on sflash on Netgear WNDR4500 */
#define BCM47XXPART_MAX_PARTS 12
-/*
- * Amount of bytes we read when analyzing each block of flash memory.
- * Set it big enough to allow detecting partition and reading important data.
- */
-#define BCM47XXPART_BYTES_TO_READ 0x404
-
/* Magics */
#define BOARD_DATA_MAGIC 0x5246504D /* MPFR */
#define POT_MAGIC1 0x54544f50 /* POTT */
@@ -59,13 +53,21 @@ static int bcm47xxpart_parse(struct mtd_info *master,
uint32_t *buf;
size_t bytes_read;
uint32_t offset;
- uint32_t blocksize = 0x10000;
+ uint32_t blocksize = master->erasesize;
struct trx_header *trx;
+ int trx_part = -1;
+ int last_trx_part = -1;
+ int max_bytes_to_read = 0x8004;
+
+ if (blocksize <= 0x10000)
+ blocksize = 0x10000;
+ if (blocksize == 0x20000)
+ max_bytes_to_read = 0x18004;
/* Alloc */
parts = kzalloc(sizeof(struct mtd_partition) * BCM47XXPART_MAX_PARTS,
GFP_KERNEL);
- buf = kzalloc(BCM47XXPART_BYTES_TO_READ, GFP_KERNEL);
+ buf = kzalloc(max_bytes_to_read, GFP_KERNEL);
/* Parse block by block looking for magics */
for (offset = 0; offset <= master->size - blocksize;
@@ -80,7 +82,7 @@ static int bcm47xxpart_parse(struct mtd_info *master,
}
/* Read beginning of the block */
- if (mtd_read(master, offset, BCM47XXPART_BYTES_TO_READ,
+ if (mtd_read(master, offset, max_bytes_to_read,
&bytes_read, (uint8_t *)buf) < 0) {
pr_err("mtd_read error while parsing (offset: 0x%X)!\n",
offset);
@@ -95,9 +97,16 @@ static int bcm47xxpart_parse(struct mtd_info *master,
}
/* Standard NVRAM */
- if (buf[0x000 / 4] == NVRAM_HEADER) {
+ if (buf[0x000 / 4] == NVRAM_HEADER ||
+ buf[0x1000 / 4] == NVRAM_HEADER ||
+ buf[0x8000 / 4] == NVRAM_HEADER ||
+ (blocksize == 0x20000 && (
+ buf[0x10000 / 4] == NVRAM_HEADER ||
+ buf[0x11000 / 4] == NVRAM_HEADER ||
+ buf[0x18000 / 4] == NVRAM_HEADER))) {
bcm47xxpart_add_part(&parts[curr_part++], "nvram",
offset, 0);
+ offset = rounddown(offset, blocksize);
continue;
}
@@ -131,6 +140,10 @@ static int bcm47xxpart_parse(struct mtd_info *master,
if (buf[0x000 / 4] == TRX_MAGIC) {
trx = (struct trx_header *)buf;
+ trx_part = curr_part;
+ bcm47xxpart_add_part(&parts[curr_part++], "firmware",
+ offset, 0);
+
i = 0;
/* We have LZMA loader if offset[2] points to sth */
if (trx->offset[2]) {
@@ -154,6 +167,8 @@ static int bcm47xxpart_parse(struct mtd_info *master,
offset + trx->offset[i], 0);
i++;
+ last_trx_part = curr_part - 1;
+
/*
* We have whole TRX scanned, skip to the next part. Use
* roundown (not roundup), as the loop will increase
@@ -169,11 +184,15 @@ static int bcm47xxpart_parse(struct mtd_info *master,
* Assume that partitions end at the beginning of the one they are
* followed by.
*/
- for (i = 0; i < curr_part - 1; i++)
- parts[i].size = parts[i + 1].offset - parts[i].offset;
- if (curr_part > 0)
- parts[curr_part - 1].size =
- master->size - parts[curr_part - 1].offset;
+ for (i = 0; i < curr_part; i++) {
+ u64 next_part_offset = (i < curr_part - 1) ?
+ parts[i + 1].offset : master->size;
+
+ parts[i].size = next_part_offset - parts[i].offset;
+ if (i == last_trx_part && trx_part >= 0)
+ parts[trx_part].size = next_part_offset -
+ parts[trx_part].offset;
+ }
*pparts = parts;
return curr_part;
diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c
index b86197286f2..fff665d59a0 100644
--- a/drivers/mtd/chips/cfi_cmdset_0002.c
+++ b/drivers/mtd/chips/cfi_cmdset_0002.c
@@ -33,6 +33,8 @@
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/reboot.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
#include <linux/mtd/map.h>
#include <linux/mtd/mtd.h>
#include <linux/mtd/cfi.h>
@@ -74,6 +76,10 @@ static void put_chip(struct map_info *map, struct flchip *chip, unsigned long ad
static int cfi_atmel_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len);
static int cfi_atmel_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len);
+static int cfi_ppb_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len);
+static int cfi_ppb_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len);
+static int cfi_ppb_is_locked(struct mtd_info *mtd, loff_t ofs, uint64_t len);
+
static struct mtd_chip_driver cfi_amdstd_chipdrv = {
.probe = NULL, /* Not usable directly */
.destroy = cfi_amdstd_destroy,
@@ -496,6 +502,7 @@ static void cfi_fixup_m29ew_delay_after_resume(struct cfi_private *cfi)
struct mtd_info *cfi_cmdset_0002(struct map_info *map, int primary)
{
struct cfi_private *cfi = map->fldrv_priv;
+ struct device_node __maybe_unused *np = map->device_node;
struct mtd_info *mtd;
int i;
@@ -570,6 +577,17 @@ struct mtd_info *cfi_cmdset_0002(struct map_info *map, int primary)
cfi_tell_features(extp);
#endif
+#ifdef CONFIG_OF
+ if (np && of_property_read_bool(
+ np, "use-advanced-sector-protection")
+ && extp->BlkProtUnprot == 8) {
+ printk(KERN_INFO " Advanced Sector Protection (PPB Locking) supported\n");
+ mtd->_lock = cfi_ppb_lock;
+ mtd->_unlock = cfi_ppb_unlock;
+ mtd->_is_locked = cfi_ppb_is_locked;
+ }
+#endif
+
bootloc = extp->TopBottom;
if ((bootloc < 2) || (bootloc > 5)) {
printk(KERN_WARNING "%s: CFI contains unrecognised boot "
@@ -2172,6 +2190,205 @@ static int cfi_atmel_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len)
return cfi_varsize_frob(mtd, do_atmel_unlock, ofs, len, NULL);
}
+/*
+ * Advanced Sector Protection - PPB (Persistent Protection Bit) locking
+ */
+
+struct ppb_lock {
+ struct flchip *chip;
+ loff_t offset;
+ int locked;
+};
+
+#define MAX_SECTORS 512
+
+#define DO_XXLOCK_ONEBLOCK_LOCK ((void *)1)
+#define DO_XXLOCK_ONEBLOCK_UNLOCK ((void *)2)
+#define DO_XXLOCK_ONEBLOCK_GETLOCK ((void *)3)
+
+static int __maybe_unused do_ppb_xxlock(struct map_info *map,
+ struct flchip *chip,
+ unsigned long adr, int len, void *thunk)
+{
+ struct cfi_private *cfi = map->fldrv_priv;
+ unsigned long timeo;
+ int ret;
+
+ mutex_lock(&chip->mutex);
+ ret = get_chip(map, chip, adr + chip->start, FL_LOCKING);
+ if (ret) {
+ mutex_unlock(&chip->mutex);
+ return ret;
+ }
+
+ pr_debug("MTD %s(): XXLOCK 0x%08lx len %d\n", __func__, adr, len);
+
+ cfi_send_gen_cmd(0xAA, cfi->addr_unlock1, chip->start, map, cfi,
+ cfi->device_type, NULL);
+ cfi_send_gen_cmd(0x55, cfi->addr_unlock2, chip->start, map, cfi,
+ cfi->device_type, NULL);
+ /* PPB entry command */
+ cfi_send_gen_cmd(0xC0, cfi->addr_unlock1, chip->start, map, cfi,
+ cfi->device_type, NULL);
+
+ if (thunk == DO_XXLOCK_ONEBLOCK_LOCK) {
+ chip->state = FL_LOCKING;
+ map_write(map, CMD(0xA0), chip->start + adr);
+ map_write(map, CMD(0x00), chip->start + adr);
+ } else if (thunk == DO_XXLOCK_ONEBLOCK_UNLOCK) {
+ /*
+ * Unlocking of one specific sector is not supported, so we
+ * have to unlock all sectors of this device instead
+ */
+ chip->state = FL_UNLOCKING;
+ map_write(map, CMD(0x80), chip->start);
+ map_write(map, CMD(0x30), chip->start);
+ } else if (thunk == DO_XXLOCK_ONEBLOCK_GETLOCK) {
+ chip->state = FL_JEDEC_QUERY;
+ /* Return locked status: 0->locked, 1->unlocked */
+ ret = !cfi_read_query(map, adr);
+ } else
+ BUG();
+
+ /*
+ * Wait for some time as unlocking of all sectors takes quite long
+ */
+ timeo = jiffies + msecs_to_jiffies(2000); /* 2s max (un)locking */
+ for (;;) {
+ if (chip_ready(map, adr))
+ break;
+
+ if (time_after(jiffies, timeo)) {
+ printk(KERN_ERR "Waiting for chip to be ready timed out.\n");
+ ret = -EIO;
+ break;
+ }
+
+ UDELAY(map, chip, adr, 1);
+ }
+
+ /* Exit BC commands */
+ map_write(map, CMD(0x90), chip->start);
+ map_write(map, CMD(0x00), chip->start);
+
+ chip->state = FL_READY;
+ put_chip(map, chip, adr + chip->start);
+ mutex_unlock(&chip->mutex);
+
+ return ret;
+}
+
+static int __maybe_unused cfi_ppb_lock(struct mtd_info *mtd, loff_t ofs,
+ uint64_t len)
+{
+ return cfi_varsize_frob(mtd, do_ppb_xxlock, ofs, len,
+ DO_XXLOCK_ONEBLOCK_LOCK);
+}
+
+static int __maybe_unused cfi_ppb_unlock(struct mtd_info *mtd, loff_t ofs,
+ uint64_t len)
+{
+ struct mtd_erase_region_info *regions = mtd->eraseregions;
+ struct map_info *map = mtd->priv;
+ struct cfi_private *cfi = map->fldrv_priv;
+ struct ppb_lock *sect;
+ unsigned long adr;
+ loff_t offset;
+ uint64_t length;
+ int chipnum;
+ int i;
+ int sectors;
+ int ret;
+
+ /*
+ * PPB unlocking always unlocks all sectors of the flash chip.
+ * We need to re-lock all previously locked sectors. So lets
+ * first check the locking status of all sectors and save
+ * it for future use.
+ */
+ sect = kzalloc(MAX_SECTORS * sizeof(struct ppb_lock), GFP_KERNEL);
+ if (!sect)
+ return -ENOMEM;
+
+ /*
+ * This code to walk all sectors is a slightly modified version
+ * of the cfi_varsize_frob() code.
+ */
+ i = 0;
+ chipnum = 0;
+ adr = 0;
+ sectors = 0;
+ offset = 0;
+ length = mtd->size;
+
+ while (length) {
+ int size = regions[i].erasesize;
+
+ /*
+ * Only test sectors that shall not be unlocked. The other
+ * sectors shall be unlocked, so lets keep their locking
+ * status at "unlocked" (locked=0) for the final re-locking.
+ */
+ if ((adr < ofs) || (adr >= (ofs + len))) {
+ sect[sectors].chip = &cfi->chips[chipnum];
+ sect[sectors].offset = offset;
+ sect[sectors].locked = do_ppb_xxlock(
+ map, &cfi->chips[chipnum], adr, 0,
+ DO_XXLOCK_ONEBLOCK_GETLOCK);
+ }
+
+ adr += size;
+ offset += size;
+ length -= size;
+
+ if (offset == regions[i].offset + size * regions[i].numblocks)
+ i++;
+
+ if (adr >> cfi->chipshift) {
+ adr = 0;
+ chipnum++;
+
+ if (chipnum >= cfi->numchips)
+ break;
+ }
+
+ sectors++;
+ if (sectors >= MAX_SECTORS) {
+ printk(KERN_ERR "Only %d sectors for PPB locking supported!\n",
+ MAX_SECTORS);
+ kfree(sect);
+ return -EINVAL;
+ }
+ }
+
+ /* Now unlock the whole chip */
+ ret = cfi_varsize_frob(mtd, do_ppb_xxlock, ofs, len,
+ DO_XXLOCK_ONEBLOCK_UNLOCK);
+ if (ret) {
+ kfree(sect);
+ return ret;
+ }
+
+ /*
+ * PPB unlocking always unlocks all sectors of the flash chip.
+ * We need to re-lock all previously locked sectors.
+ */
+ for (i = 0; i < sectors; i++) {
+ if (sect[i].locked)
+ do_ppb_xxlock(map, sect[i].chip, sect[i].offset, 0,
+ DO_XXLOCK_ONEBLOCK_LOCK);
+ }
+
+ kfree(sect);
+ return ret;
+}
+
+static int __maybe_unused cfi_ppb_is_locked(struct mtd_info *mtd, loff_t ofs,
+ uint64_t len)
+{
+ return cfi_varsize_frob(mtd, do_ppb_xxlock, ofs, len,
+ DO_XXLOCK_ONEBLOCK_GETLOCK) ? 1 : 0;
+}
static void cfi_amdstd_sync (struct mtd_info *mtd)
{
diff --git a/drivers/mtd/cmdlinepart.c b/drivers/mtd/cmdlinepart.c
index c533f27d863..721caebbc5c 100644
--- a/drivers/mtd/cmdlinepart.c
+++ b/drivers/mtd/cmdlinepart.c
@@ -22,11 +22,22 @@
*
* mtdparts=<mtddef>[;<mtddef]
* <mtddef> := <mtd-id>:<partdef>[,<partdef>]
- * where <mtd-id> is the name from the "cat /proc/mtd" command
- * <partdef> := <size>[@offset][<name>][ro][lk]
+ * <partdef> := <size>[@<offset>][<name>][ro][lk]
* <mtd-id> := unique name used in mapping driver/device (mtd->name)
* <size> := standard linux memsize OR "-" to denote all remaining space
+ * size is automatically truncated at end of device
+ * if specified or trucated size is 0 the part is skipped
+ * <offset> := standard linux memsize
+ * if omitted the part will immediately follow the previous part
+ * or 0 if the first part
* <name> := '(' NAME ')'
+ * NAME will appear in /proc/mtd
+ *
+ * <size> and <offset> can be specified such that the parts are out of order
+ * in physical memory and may even overlap.
+ *
+ * The parts are assigned MTD numbers in the order they are specified in the
+ * command line regardless of their order in physical memory.
*
* Examples:
*
@@ -70,6 +81,7 @@ struct cmdline_mtd_partition {
static struct cmdline_mtd_partition *partitions;
/* the command line passed to mtdpart_setup() */
+static char *mtdparts;
static char *cmdline;
static int cmdline_parsed;
@@ -330,6 +342,14 @@ static int parse_cmdline_partitions(struct mtd_info *master,
if (part->parts[i].size == SIZE_REMAINING)
part->parts[i].size = master->size - offset;
+ if (offset + part->parts[i].size > master->size) {
+ printk(KERN_WARNING ERRP
+ "%s: partitioning exceeds flash size, truncating\n",
+ part->mtd_id);
+ part->parts[i].size = master->size - offset;
+ }
+ offset += part->parts[i].size;
+
if (part->parts[i].size == 0) {
printk(KERN_WARNING ERRP
"%s: skipping zero sized partition\n",
@@ -337,16 +357,8 @@ static int parse_cmdline_partitions(struct mtd_info *master,
part->num_parts--;
memmove(&part->parts[i], &part->parts[i + 1],
sizeof(*part->parts) * (part->num_parts - i));
- continue;
- }
-
- if (offset + part->parts[i].size > master->size) {
- printk(KERN_WARNING ERRP
- "%s: partitioning exceeds flash size, truncating\n",
- part->mtd_id);
- part->parts[i].size = master->size - offset;
+ i--;
}
- offset += part->parts[i].size;
}
*pparts = kmemdup(part->parts, sizeof(*part->parts) * part->num_parts,
@@ -365,7 +377,7 @@ static int parse_cmdline_partitions(struct mtd_info *master,
*
* This function needs to be visible for bootloaders.
*/
-static int mtdpart_setup(char *s)
+static int __init mtdpart_setup(char *s)
{
cmdline = s;
return 1;
@@ -381,10 +393,21 @@ static struct mtd_part_parser cmdline_parser = {
static int __init cmdline_parser_init(void)
{
+ if (mtdparts)
+ mtdpart_setup(mtdparts);
return register_mtd_parser(&cmdline_parser);
}
+static void __exit cmdline_parser_exit(void)
+{
+ deregister_mtd_parser(&cmdline_parser);
+}
+
module_init(cmdline_parser_init);
+module_exit(cmdline_parser_exit);
+
+MODULE_PARM_DESC(mtdparts, "Partitioning specification");
+module_param(mtdparts, charp, 0);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Marius Groeger <mag@sysgo.de>");
diff --git a/drivers/mtd/devices/Makefile b/drivers/mtd/devices/Makefile
index 395733a30ef..369a1943ca2 100644
--- a/drivers/mtd/devices/Makefile
+++ b/drivers/mtd/devices/Makefile
@@ -17,8 +17,10 @@ obj-$(CONFIG_MTD_LART) += lart.o
obj-$(CONFIG_MTD_BLOCK2MTD) += block2mtd.o
obj-$(CONFIG_MTD_DATAFLASH) += mtd_dataflash.o
obj-$(CONFIG_MTD_M25P80) += m25p80.o
+obj-$(CONFIG_MTD_NAND_OMAP_BCH) += elm.o
obj-$(CONFIG_MTD_SPEAR_SMI) += spear_smi.o
obj-$(CONFIG_MTD_SST25L) += sst25l.o
obj-$(CONFIG_MTD_BCM47XXSFLASH) += bcm47xxsflash.o
-CFLAGS_docg3.o += -I$(src) \ No newline at end of file
+
+CFLAGS_docg3.o += -I$(src)
diff --git a/drivers/mtd/devices/bcm47xxsflash.c b/drivers/mtd/devices/bcm47xxsflash.c
index 4714584aa99..95266285acb 100644
--- a/drivers/mtd/devices/bcm47xxsflash.c
+++ b/drivers/mtd/devices/bcm47xxsflash.c
@@ -5,6 +5,8 @@
#include <linux/platform_device.h>
#include <linux/bcma/bcma.h>
+#include "bcm47xxsflash.h"
+
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Serial flash driver for BCMA bus");
@@ -13,26 +15,28 @@ static const char *probes[] = { "bcm47xxpart", NULL };
static int bcm47xxsflash_read(struct mtd_info *mtd, loff_t from, size_t len,
size_t *retlen, u_char *buf)
{
- struct bcma_sflash *sflash = mtd->priv;
+ struct bcm47xxsflash *b47s = mtd->priv;
/* Check address range */
if ((from + len) > mtd->size)
return -EINVAL;
- memcpy_fromio(buf, (void __iomem *)KSEG0ADDR(sflash->window + from),
+ memcpy_fromio(buf, (void __iomem *)KSEG0ADDR(b47s->window + from),
len);
+ *retlen = len;
return len;
}
-static void bcm47xxsflash_fill_mtd(struct bcma_sflash *sflash,
- struct mtd_info *mtd)
+static void bcm47xxsflash_fill_mtd(struct bcm47xxsflash *b47s)
{
- mtd->priv = sflash;
+ struct mtd_info *mtd = &b47s->mtd;
+
+ mtd->priv = b47s;
mtd->name = "bcm47xxsflash";
mtd->owner = THIS_MODULE;
mtd->type = MTD_ROM;
- mtd->size = sflash->size;
+ mtd->size = b47s->size;
mtd->_read = bcm47xxsflash_read;
/* TODO: implement writing support and verify/change following code */
@@ -40,19 +44,30 @@ static void bcm47xxsflash_fill_mtd(struct bcma_sflash *sflash,
mtd->writebufsize = mtd->writesize = 1;
}
-static int bcm47xxsflash_probe(struct platform_device *pdev)
+/**************************************************
+ * BCMA
+ **************************************************/
+
+static int bcm47xxsflash_bcma_probe(struct platform_device *pdev)
{
struct bcma_sflash *sflash = dev_get_platdata(&pdev->dev);
+ struct bcm47xxsflash *b47s;
int err;
- sflash->mtd = kzalloc(sizeof(struct mtd_info), GFP_KERNEL);
- if (!sflash->mtd) {
+ b47s = kzalloc(sizeof(*b47s), GFP_KERNEL);
+ if (!b47s) {
err = -ENOMEM;
goto out;
}
- bcm47xxsflash_fill_mtd(sflash, sflash->mtd);
+ sflash->priv = b47s;
- err = mtd_device_parse_register(sflash->mtd, probes, NULL, NULL, 0);
+ b47s->window = sflash->window;
+ b47s->blocksize = sflash->blocksize;
+ b47s->numblocks = sflash->numblocks;
+ b47s->size = sflash->size;
+ bcm47xxsflash_fill_mtd(b47s);
+
+ err = mtd_device_parse_register(&b47s->mtd, probes, NULL, NULL, 0);
if (err) {
pr_err("Failed to register MTD device: %d\n", err);
goto err_dev_reg;
@@ -61,34 +76,40 @@ static int bcm47xxsflash_probe(struct platform_device *pdev)
return 0;
err_dev_reg:
- kfree(sflash->mtd);
+ kfree(&b47s->mtd);
out:
return err;
}
-static int bcm47xxsflash_remove(struct platform_device *pdev)
+static int bcm47xxsflash_bcma_remove(struct platform_device *pdev)
{
struct bcma_sflash *sflash = dev_get_platdata(&pdev->dev);
+ struct bcm47xxsflash *b47s = sflash->priv;
- mtd_device_unregister(sflash->mtd);
- kfree(sflash->mtd);
+ mtd_device_unregister(&b47s->mtd);
+ kfree(b47s);
return 0;
}
static struct platform_driver bcma_sflash_driver = {
- .remove = bcm47xxsflash_remove,
+ .probe = bcm47xxsflash_bcma_probe,
+ .remove = bcm47xxsflash_bcma_remove,
.driver = {
.name = "bcma_sflash",
.owner = THIS_MODULE,
},
};
+/**************************************************
+ * Init
+ **************************************************/
+
static int __init bcm47xxsflash_init(void)
{
int err;
- err = platform_driver_probe(&bcma_sflash_driver, bcm47xxsflash_probe);
+ err = platform_driver_register(&bcma_sflash_driver);
if (err)
pr_err("Failed to register BCMA serial flash driver: %d\n",
err);
diff --git a/drivers/mtd/devices/bcm47xxsflash.h b/drivers/mtd/devices/bcm47xxsflash.h
new file mode 100644
index 00000000000..ebf6f710e23
--- /dev/null
+++ b/drivers/mtd/devices/bcm47xxsflash.h
@@ -0,0 +1,15 @@
+#ifndef __BCM47XXSFLASH_H
+#define __BCM47XXSFLASH_H
+
+#include <linux/mtd/mtd.h>
+
+struct bcm47xxsflash {
+ u32 window;
+ u32 blocksize;
+ u16 numblocks;
+ u32 size;
+
+ struct mtd_info mtd;
+};
+
+#endif /* BCM47XXSFLASH */
diff --git a/drivers/mtd/devices/elm.c b/drivers/mtd/devices/elm.c
new file mode 100644
index 00000000000..2ec5da9ee24
--- /dev/null
+++ b/drivers/mtd/devices/elm.c
@@ -0,0 +1,404 @@
+/*
+ * Error Location Module
+ *
+ * Copyright (C) 2012 Texas Instruments Incorporated - http://www.ti.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/platform_device.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/of.h>
+#include <linux/pm_runtime.h>
+#include <linux/platform_data/elm.h>
+
+#define ELM_IRQSTATUS 0x018
+#define ELM_IRQENABLE 0x01c
+#define ELM_LOCATION_CONFIG 0x020
+#define ELM_PAGE_CTRL 0x080
+#define ELM_SYNDROME_FRAGMENT_0 0x400
+#define ELM_SYNDROME_FRAGMENT_6 0x418
+#define ELM_LOCATION_STATUS 0x800
+#define ELM_ERROR_LOCATION_0 0x880
+
+/* ELM Interrupt Status Register */
+#define INTR_STATUS_PAGE_VALID BIT(8)
+
+/* ELM Interrupt Enable Register */
+#define INTR_EN_PAGE_MASK BIT(8)
+
+/* ELM Location Configuration Register */
+#define ECC_BCH_LEVEL_MASK 0x3
+
+/* ELM syndrome */
+#define ELM_SYNDROME_VALID BIT(16)
+
+/* ELM_LOCATION_STATUS Register */
+#define ECC_CORRECTABLE_MASK BIT(8)
+#define ECC_NB_ERRORS_MASK 0x1f
+
+/* ELM_ERROR_LOCATION_0-15 Registers */
+#define ECC_ERROR_LOCATION_MASK 0x1fff
+
+#define ELM_ECC_SIZE 0x7ff
+
+#define SYNDROME_FRAGMENT_REG_SIZE 0x40
+#define ERROR_LOCATION_SIZE 0x100
+
+struct elm_info {
+ struct device *dev;
+ void __iomem *elm_base;
+ struct completion elm_completion;
+ struct list_head list;
+ enum bch_ecc bch_type;
+};
+
+static LIST_HEAD(elm_devices);
+
+static void elm_write_reg(struct elm_info *info, int offset, u32 val)
+{
+ writel(val, info->elm_base + offset);
+}
+
+static u32 elm_read_reg(struct elm_info *info, int offset)
+{
+ return readl(info->elm_base + offset);
+}
+
+/**
+ * elm_config - Configure ELM module
+ * @dev: ELM device
+ * @bch_type: Type of BCH ecc
+ */
+void elm_config(struct device *dev, enum bch_ecc bch_type)
+{
+ u32 reg_val;
+ struct elm_info *info = dev_get_drvdata(dev);
+
+ reg_val = (bch_type & ECC_BCH_LEVEL_MASK) | (ELM_ECC_SIZE << 16);
+ elm_write_reg(info, ELM_LOCATION_CONFIG, reg_val);
+ info->bch_type = bch_type;
+}
+EXPORT_SYMBOL(elm_config);
+
+/**
+ * elm_configure_page_mode - Enable/Disable page mode
+ * @info: elm info
+ * @index: index number of syndrome fragment vector
+ * @enable: enable/disable flag for page mode
+ *
+ * Enable page mode for syndrome fragment index
+ */
+static void elm_configure_page_mode(struct elm_info *info, int index,
+ bool enable)
+{
+ u32 reg_val;
+
+ reg_val = elm_read_reg(info, ELM_PAGE_CTRL);
+ if (enable)
+ reg_val |= BIT(index); /* enable page mode */
+ else
+ reg_val &= ~BIT(index); /* disable page mode */
+
+ elm_write_reg(info, ELM_PAGE_CTRL, reg_val);
+}
+
+/**
+ * elm_load_syndrome - Load ELM syndrome reg
+ * @info: elm info
+ * @err_vec: elm error vectors
+ * @ecc: buffer with calculated ecc
+ *
+ * Load syndrome fragment registers with calculated ecc in reverse order.
+ */
+static void elm_load_syndrome(struct elm_info *info,
+ struct elm_errorvec *err_vec, u8 *ecc)
+{
+ int i, offset;
+ u32 val;
+
+ for (i = 0; i < ERROR_VECTOR_MAX; i++) {
+
+ /* Check error reported */
+ if (err_vec[i].error_reported) {
+ elm_configure_page_mode(info, i, true);
+ offset = ELM_SYNDROME_FRAGMENT_0 +
+ SYNDROME_FRAGMENT_REG_SIZE * i;
+
+ /* BCH8 */
+ if (info->bch_type) {
+
+ /* syndrome fragment 0 = ecc[9-12B] */
+ val = cpu_to_be32(*(u32 *) &ecc[9]);
+ elm_write_reg(info, offset, val);
+
+ /* syndrome fragment 1 = ecc[5-8B] */
+ offset += 4;
+ val = cpu_to_be32(*(u32 *) &ecc[5]);
+ elm_write_reg(info, offset, val);
+
+ /* syndrome fragment 2 = ecc[1-4B] */
+ offset += 4;
+ val = cpu_to_be32(*(u32 *) &ecc[1]);
+ elm_write_reg(info, offset, val);
+
+ /* syndrome fragment 3 = ecc[0B] */
+ offset += 4;
+ val = ecc[0];
+ elm_write_reg(info, offset, val);
+ } else {
+ /* syndrome fragment 0 = ecc[20-52b] bits */
+ val = (cpu_to_be32(*(u32 *) &ecc[3]) >> 4) |
+ ((ecc[2] & 0xf) << 28);
+ elm_write_reg(info, offset, val);
+
+ /* syndrome fragment 1 = ecc[0-20b] bits */
+ offset += 4;
+ val = cpu_to_be32(*(u32 *) &ecc[0]) >> 12;
+ elm_write_reg(info, offset, val);
+ }
+ }
+
+ /* Update ecc pointer with ecc byte size */
+ ecc += info->bch_type ? BCH8_SIZE : BCH4_SIZE;
+ }
+}
+
+/**
+ * elm_start_processing - start elm syndrome processing
+ * @info: elm info
+ * @err_vec: elm error vectors
+ *
+ * Set syndrome valid bit for syndrome fragment registers for which
+ * elm syndrome fragment registers are loaded. This enables elm module
+ * to start processing syndrome vectors.
+ */
+static void elm_start_processing(struct elm_info *info,
+ struct elm_errorvec *err_vec)
+{
+ int i, offset;
+ u32 reg_val;
+
+ /*
+ * Set syndrome vector valid, so that ELM module
+ * will process it for vectors error is reported
+ */
+ for (i = 0; i < ERROR_VECTOR_MAX; i++) {
+ if (err_vec[i].error_reported) {
+ offset = ELM_SYNDROME_FRAGMENT_6 +
+ SYNDROME_FRAGMENT_REG_SIZE * i;
+ reg_val = elm_read_reg(info, offset);
+ reg_val |= ELM_SYNDROME_VALID;
+ elm_write_reg(info, offset, reg_val);
+ }
+ }
+}
+
+/**
+ * elm_error_correction - locate correctable error position
+ * @info: elm info
+ * @err_vec: elm error vectors
+ *
+ * On completion of processing by elm module, error location status
+ * register updated with correctable/uncorrectable error information.
+ * In case of correctable errors, number of errors located from
+ * elm location status register & read the positions from
+ * elm error location register.
+ */
+static void elm_error_correction(struct elm_info *info,
+ struct elm_errorvec *err_vec)
+{
+ int i, j, errors = 0;
+ int offset;
+ u32 reg_val;
+
+ for (i = 0; i < ERROR_VECTOR_MAX; i++) {
+
+ /* Check error reported */
+ if (err_vec[i].error_reported) {
+ offset = ELM_LOCATION_STATUS + ERROR_LOCATION_SIZE * i;
+ reg_val = elm_read_reg(info, offset);
+
+ /* Check correctable error or not */
+ if (reg_val & ECC_CORRECTABLE_MASK) {
+ offset = ELM_ERROR_LOCATION_0 +
+ ERROR_LOCATION_SIZE * i;
+
+ /* Read count of correctable errors */
+ err_vec[i].error_count = reg_val &
+ ECC_NB_ERRORS_MASK;
+
+ /* Update the error locations in error vector */
+ for (j = 0; j < err_vec[i].error_count; j++) {
+
+ reg_val = elm_read_reg(info, offset);
+ err_vec[i].error_loc[j] = reg_val &
+ ECC_ERROR_LOCATION_MASK;
+
+ /* Update error location register */
+ offset += 4;
+ }
+
+ errors += err_vec[i].error_count;
+ } else {
+ err_vec[i].error_uncorrectable = true;
+ }
+
+ /* Clearing interrupts for processed error vectors */
+ elm_write_reg(info, ELM_IRQSTATUS, BIT(i));
+
+ /* Disable page mode */
+ elm_configure_page_mode(info, i, false);
+ }
+ }
+}
+
+/**
+ * elm_decode_bch_error_page - Locate error position
+ * @dev: device pointer
+ * @ecc_calc: calculated ECC bytes from GPMC
+ * @err_vec: elm error vectors
+ *
+ * Called with one or more error reported vectors & vectors with
+ * error reported is updated in err_vec[].error_reported
+ */
+void elm_decode_bch_error_page(struct device *dev, u8 *ecc_calc,
+ struct elm_errorvec *err_vec)
+{
+ struct elm_info *info = dev_get_drvdata(dev);
+ u32 reg_val;
+
+ /* Enable page mode interrupt */
+ reg_val = elm_read_reg(info, ELM_IRQSTATUS);
+ elm_write_reg(info, ELM_IRQSTATUS, reg_val & INTR_STATUS_PAGE_VALID);
+ elm_write_reg(info, ELM_IRQENABLE, INTR_EN_PAGE_MASK);
+
+ /* Load valid ecc byte to syndrome fragment register */
+ elm_load_syndrome(info, err_vec, ecc_calc);
+
+ /* Enable syndrome processing for which syndrome fragment is updated */
+ elm_start_processing(info, err_vec);
+
+ /* Wait for ELM module to finish locating error correction */
+ wait_for_completion(&info->elm_completion);
+
+ /* Disable page mode interrupt */
+ reg_val = elm_read_reg(info, ELM_IRQENABLE);
+ elm_write_reg(info, ELM_IRQENABLE, reg_val & ~INTR_EN_PAGE_MASK);
+ elm_error_correction(info, err_vec);
+}
+EXPORT_SYMBOL(elm_decode_bch_error_page);
+
+static irqreturn_t elm_isr(int this_irq, void *dev_id)
+{
+ u32 reg_val;
+ struct elm_info *info = dev_id;
+
+ reg_val = elm_read_reg(info, ELM_IRQSTATUS);
+
+ /* All error vectors processed */
+ if (reg_val & INTR_STATUS_PAGE_VALID) {
+ elm_write_reg(info, ELM_IRQSTATUS,
+ reg_val & INTR_STATUS_PAGE_VALID);
+ complete(&info->elm_completion);
+ return IRQ_HANDLED;
+ }
+
+ return IRQ_NONE;
+}
+
+static int elm_probe(struct platform_device *pdev)
+{
+ int ret = 0;
+ struct resource *res, *irq;
+ struct elm_info *info;
+
+ info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL);
+ if (!info) {
+ dev_err(&pdev->dev, "failed to allocate memory\n");
+ return -ENOMEM;
+ }
+
+ info->dev = &pdev->dev;
+
+ irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+ if (!irq) {
+ dev_err(&pdev->dev, "no irq resource defined\n");
+ return -ENODEV;
+ }
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!res) {
+ dev_err(&pdev->dev, "no memory resource defined\n");
+ return -ENODEV;
+ }
+
+ info->elm_base = devm_request_and_ioremap(&pdev->dev, res);
+ if (!info->elm_base)
+ return -EADDRNOTAVAIL;
+
+ ret = devm_request_irq(&pdev->dev, irq->start, elm_isr, 0,
+ pdev->name, info);
+ if (ret) {
+ dev_err(&pdev->dev, "failure requesting irq %i\n", irq->start);
+ return ret;
+ }
+
+ pm_runtime_enable(&pdev->dev);
+ if (pm_runtime_get_sync(&pdev->dev)) {
+ ret = -EINVAL;
+ pm_runtime_disable(&pdev->dev);
+ dev_err(&pdev->dev, "can't enable clock\n");
+ return ret;
+ }
+
+ init_completion(&info->elm_completion);
+ INIT_LIST_HEAD(&info->list);
+ list_add(&info->list, &elm_devices);
+ platform_set_drvdata(pdev, info);
+ return ret;
+}
+
+static int elm_remove(struct platform_device *pdev)
+{
+ pm_runtime_put_sync(&pdev->dev);
+ pm_runtime_disable(&pdev->dev);
+ platform_set_drvdata(pdev, NULL);
+ return 0;
+}
+
+#ifdef CONFIG_OF
+static const struct of_device_id elm_of_match[] = {
+ { .compatible = "ti,am3352-elm" },
+ {},
+};
+MODULE_DEVICE_TABLE(of, elm_of_match);
+#endif
+
+static struct platform_driver elm_driver = {
+ .driver = {
+ .name = "elm",
+ .owner = THIS_MODULE,
+ .of_match_table = of_match_ptr(elm_of_match),
+ },
+ .probe = elm_probe,
+ .remove = elm_remove,
+};
+
+module_platform_driver(elm_driver);
+
+MODULE_DESCRIPTION("ELM driver for BCH error correction");
+MODULE_AUTHOR("Texas Instruments");
+MODULE_ALIAS("platform: elm");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c
index 4eeeb2d7f6e..5b6b0728be2 100644
--- a/drivers/mtd/devices/m25p80.c
+++ b/drivers/mtd/devices/m25p80.c
@@ -565,6 +565,96 @@ time_out:
return ret;
}
+static int m25p80_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len)
+{
+ struct m25p *flash = mtd_to_m25p(mtd);
+ uint32_t offset = ofs;
+ uint8_t status_old, status_new;
+ int res = 0;
+
+ mutex_lock(&flash->lock);
+ /* Wait until finished previous command */
+ if (wait_till_ready(flash)) {
+ res = 1;
+ goto err;
+ }
+
+ status_old = read_sr(flash);
+
+ if (offset < flash->mtd.size-(flash->mtd.size/2))
+ status_new = status_old | SR_BP2 | SR_BP1 | SR_BP0;
+ else if (offset < flash->mtd.size-(flash->mtd.size/4))
+ status_new = (status_old & ~SR_BP0) | SR_BP2 | SR_BP1;
+ else if (offset < flash->mtd.size-(flash->mtd.size/8))
+ status_new = (status_old & ~SR_BP1) | SR_BP2 | SR_BP0;
+ else if (offset < flash->mtd.size-(flash->mtd.size/16))
+ status_new = (status_old & ~(SR_BP0|SR_BP1)) | SR_BP2;
+ else if (offset < flash->mtd.size-(flash->mtd.size/32))
+ status_new = (status_old & ~SR_BP2) | SR_BP1 | SR_BP0;
+ else if (offset < flash->mtd.size-(flash->mtd.size/64))
+ status_new = (status_old & ~(SR_BP2|SR_BP0)) | SR_BP1;
+ else
+ status_new = (status_old & ~(SR_BP2|SR_BP1)) | SR_BP0;
+
+ /* Only modify protection if it will not unlock other areas */
+ if ((status_new&(SR_BP2|SR_BP1|SR_BP0)) >
+ (status_old&(SR_BP2|SR_BP1|SR_BP0))) {
+ write_enable(flash);
+ if (write_sr(flash, status_new) < 0) {
+ res = 1;
+ goto err;
+ }
+ }
+
+err: mutex_unlock(&flash->lock);
+ return res;
+}
+
+static int m25p80_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len)
+{
+ struct m25p *flash = mtd_to_m25p(mtd);
+ uint32_t offset = ofs;
+ uint8_t status_old, status_new;
+ int res = 0;
+
+ mutex_lock(&flash->lock);
+ /* Wait until finished previous command */
+ if (wait_till_ready(flash)) {
+ res = 1;
+ goto err;
+ }
+
+ status_old = read_sr(flash);
+
+ if (offset+len > flash->mtd.size-(flash->mtd.size/64))
+ status_new = status_old & ~(SR_BP2|SR_BP1|SR_BP0);
+ else if (offset+len > flash->mtd.size-(flash->mtd.size/32))
+ status_new = (status_old & ~(SR_BP2|SR_BP1)) | SR_BP0;
+ else if (offset+len > flash->mtd.size-(flash->mtd.size/16))
+ status_new = (status_old & ~(SR_BP2|SR_BP0)) | SR_BP1;
+ else if (offset+len > flash->mtd.size-(flash->mtd.size/8))
+ status_new = (status_old & ~SR_BP2) | SR_BP1 | SR_BP0;
+ else if (offset+len > flash->mtd.size-(flash->mtd.size/4))
+ status_new = (status_old & ~(SR_BP0|SR_BP1)) | SR_BP2;
+ else if (offset+len > flash->mtd.size-(flash->mtd.size/2))
+ status_new = (status_old & ~SR_BP1) | SR_BP2 | SR_BP0;
+ else
+ status_new = (status_old & ~SR_BP0) | SR_BP2 | SR_BP1;
+
+ /* Only modify protection if it will not lock other areas */
+ if ((status_new&(SR_BP2|SR_BP1|SR_BP0)) <
+ (status_old&(SR_BP2|SR_BP1|SR_BP0))) {
+ write_enable(flash);
+ if (write_sr(flash, status_new) < 0) {
+ res = 1;
+ goto err;
+ }
+ }
+
+err: mutex_unlock(&flash->lock);
+ return res;
+}
+
/****************************************************************************/
/*
@@ -642,6 +732,10 @@ static const struct spi_device_id m25p_ids[] = {
/* Everspin */
{ "mr25h256", CAT25_INFO( 32 * 1024, 1, 256, 2) },
+ /* GigaDevice */
+ { "gd25q32", INFO(0xc84016, 0, 64 * 1024, 64, SECT_4K) },
+ { "gd25q64", INFO(0xc84017, 0, 64 * 1024, 128, SECT_4K) },
+
/* Intel/Numonyx -- xxxs33b */
{ "160s33b", INFO(0x898911, 0, 64 * 1024, 32, 0) },
{ "320s33b", INFO(0x898912, 0, 64 * 1024, 64, 0) },
@@ -899,6 +993,12 @@ static int m25p_probe(struct spi_device *spi)
flash->mtd._erase = m25p80_erase;
flash->mtd._read = m25p80_read;
+ /* flash protection support for STmicro chips */
+ if (JEDEC_MFR(info->jedec_id) == CFI_MFR_ST) {
+ flash->mtd._lock = m25p80_lock;
+ flash->mtd._unlock = m25p80_unlock;
+ }
+
/* sst flash chips use AAI word program */
if (JEDEC_MFR(info->jedec_id) == CFI_MFR_SST)
flash->mtd._write = sst_write;
diff --git a/drivers/mtd/maps/Kconfig b/drivers/mtd/maps/Kconfig
index 62ba82c396c..3ed17c4d435 100644
--- a/drivers/mtd/maps/Kconfig
+++ b/drivers/mtd/maps/Kconfig
@@ -429,7 +429,7 @@ config MTD_GPIO_ADDR
config MTD_UCLINUX
bool "Generic uClinux RAM/ROM filesystem support"
- depends on MTD_RAM=y && (!MMU || COLDFIRE)
+ depends on (MTD_RAM=y || MTD_ROM=y) && (!MMU || COLDFIRE)
help
Map driver to support image based filesystems for uClinux.
diff --git a/drivers/mtd/maps/physmap_of.c b/drivers/mtd/maps/physmap_of.c
index 7901d72c924..363939dfad0 100644
--- a/drivers/mtd/maps/physmap_of.c
+++ b/drivers/mtd/maps/physmap_of.c
@@ -68,9 +68,6 @@ static int of_flash_remove(struct platform_device *dev)
kfree(info->list[i].res);
}
}
-
- kfree(info);
-
return 0;
}
@@ -199,8 +196,9 @@ static int of_flash_probe(struct platform_device *dev)
map_indirect = of_property_read_bool(dp, "no-unaligned-direct-access");
err = -ENOMEM;
- info = kzalloc(sizeof(struct of_flash) +
- sizeof(struct of_flash_list) * count, GFP_KERNEL);
+ info = devm_kzalloc(&dev->dev,
+ sizeof(struct of_flash) +
+ sizeof(struct of_flash_list) * count, GFP_KERNEL);
if (!info)
goto err_flash_remove;
@@ -241,6 +239,7 @@ static int of_flash_probe(struct platform_device *dev)
info->list[i].map.phys = res.start;
info->list[i].map.size = res_size;
info->list[i].map.bankwidth = be32_to_cpup(width);
+ info->list[i].map.device_node = dp;
err = -ENOMEM;
info->list[i].map.virt = ioremap(info->list[i].map.phys,
diff --git a/drivers/mtd/maps/uclinux.c b/drivers/mtd/maps/uclinux.c
index 299bf88a6f4..c1af83db520 100644
--- a/drivers/mtd/maps/uclinux.c
+++ b/drivers/mtd/maps/uclinux.c
@@ -23,12 +23,26 @@
/****************************************************************************/
+#ifdef CONFIG_MTD_ROM
+#define MAP_NAME "rom"
+#else
+#define MAP_NAME "ram"
+#endif
+
+/*
+ * Blackfin uses uclinux_ram_map during startup, so it must not be static.
+ * Provide a dummy declaration to make sparse happy.
+ */
+extern struct map_info uclinux_ram_map;
+
struct map_info uclinux_ram_map = {
- .name = "RAM",
- .phys = (unsigned long)__bss_stop,
+ .name = MAP_NAME,
.size = 0,
};
+static unsigned long physaddr = -1;
+module_param(physaddr, ulong, S_IRUGO);
+
static struct mtd_info *uclinux_ram_mtdinfo;
/****************************************************************************/
@@ -60,11 +74,17 @@ static int __init uclinux_mtd_init(void)
struct map_info *mapp;
mapp = &uclinux_ram_map;
+
+ if (physaddr == -1)
+ mapp->phys = (resource_size_t)__bss_stop;
+ else
+ mapp->phys = physaddr;
+
if (!mapp->size)
mapp->size = PAGE_ALIGN(ntohl(*((unsigned long *)(mapp->phys + 8))));
mapp->bankwidth = 4;
- printk("uclinux[mtd]: RAM probe address=0x%x size=0x%x\n",
+ printk("uclinux[mtd]: probe address=0x%x size=0x%x\n",
(int) mapp->phys, (int) mapp->size);
/*
@@ -82,7 +102,7 @@ static int __init uclinux_mtd_init(void)
simple_map_init(mapp);
- mtd = do_map_probe("map_ram", mapp);
+ mtd = do_map_probe("map_" MAP_NAME, mapp);
if (!mtd) {
printk("uclinux[mtd]: failed to find a mapping?\n");
return(-ENXIO);
@@ -118,6 +138,6 @@ module_exit(uclinux_mtd_cleanup);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Greg Ungerer <gerg@snapgear.com>");
-MODULE_DESCRIPTION("Generic RAM based MTD for uClinux");
+MODULE_DESCRIPTION("Generic MTD for uClinux");
/****************************************************************************/
diff --git a/drivers/mtd/nand/atmel_nand.c b/drivers/mtd/nand/atmel_nand.c
index c516a940808..ffcbcca2fd2 100644
--- a/drivers/mtd/nand/atmel_nand.c
+++ b/drivers/mtd/nand/atmel_nand.c
@@ -101,6 +101,8 @@ struct atmel_nand_host {
u8 pmecc_corr_cap;
u16 pmecc_sector_size;
u32 pmecc_lookup_table_offset;
+ u32 pmecc_lookup_table_offset_512;
+ u32 pmecc_lookup_table_offset_1024;
int pmecc_bytes_per_sector;
int pmecc_sector_number;
@@ -908,6 +910,84 @@ static void atmel_pmecc_core_init(struct mtd_info *mtd)
pmecc_writel(host->ecc, CTRL, PMECC_CTRL_ENABLE);
}
+/*
+ * Get ECC requirement in ONFI parameters, returns -1 if ONFI
+ * parameters is not supported.
+ * return 0 if success to get the ECC requirement.
+ */
+static int get_onfi_ecc_param(struct nand_chip *chip,
+ int *ecc_bits, int *sector_size)
+{
+ *ecc_bits = *sector_size = 0;
+
+ if (chip->onfi_params.ecc_bits == 0xff)
+ /* TODO: the sector_size and ecc_bits need to be find in
+ * extended ecc parameter, currently we don't support it.
+ */
+ return -1;
+
+ *ecc_bits = chip->onfi_params.ecc_bits;
+
+ /* The default sector size (ecc codeword size) is 512 */
+ *sector_size = 512;
+
+ return 0;
+}
+
+/*
+ * Get ecc requirement from ONFI parameters ecc requirement.
+ * If pmecc-cap, pmecc-sector-size in DTS are not specified, this function
+ * will set them according to ONFI ecc requirement. Otherwise, use the
+ * value in DTS file.
+ * return 0 if success. otherwise return error code.
+ */
+static int pmecc_choose_ecc(struct atmel_nand_host *host,
+ int *cap, int *sector_size)
+{
+ /* Get ECC requirement from ONFI parameters */
+ *cap = *sector_size = 0;
+ if (host->nand_chip.onfi_version) {
+ if (!get_onfi_ecc_param(&host->nand_chip, cap, sector_size))
+ dev_info(host->dev, "ONFI params, minimum required ECC: %d bits in %d bytes\n",
+ *cap, *sector_size);
+ else
+ dev_info(host->dev, "NAND chip ECC reqirement is in Extended ONFI parameter, we don't support yet.\n");
+ } else {
+ dev_info(host->dev, "NAND chip is not ONFI compliant, assume ecc_bits is 2 in 512 bytes");
+ }
+ if (*cap == 0 && *sector_size == 0) {
+ *cap = 2;
+ *sector_size = 512;
+ }
+
+ /* If dts file doesn't specify then use the one in ONFI parameters */
+ if (host->pmecc_corr_cap == 0) {
+ /* use the most fitable ecc bits (the near bigger one ) */
+ if (*cap <= 2)
+ host->pmecc_corr_cap = 2;
+ else if (*cap <= 4)
+ host->pmecc_corr_cap = 4;
+ else if (*cap < 8)
+ host->pmecc_corr_cap = 8;
+ else if (*cap < 12)
+ host->pmecc_corr_cap = 12;
+ else if (*cap < 24)
+ host->pmecc_corr_cap = 24;
+ else
+ return -EINVAL;
+ }
+ if (host->pmecc_sector_size == 0) {
+ /* use the most fitable sector size (the near smaller one ) */
+ if (*sector_size >= 1024)
+ host->pmecc_sector_size = 1024;
+ else if (*sector_size >= 512)
+ host->pmecc_sector_size = 512;
+ else
+ return -EINVAL;
+ }
+ return 0;
+}
+
static int __init atmel_pmecc_nand_init_params(struct platform_device *pdev,
struct atmel_nand_host *host)
{
@@ -916,8 +996,22 @@ static int __init atmel_pmecc_nand_init_params(struct platform_device *pdev,
struct resource *regs, *regs_pmerr, *regs_rom;
int cap, sector_size, err_no;
+ err_no = pmecc_choose_ecc(host, &cap, &sector_size);
+ if (err_no) {
+ dev_err(host->dev, "The NAND flash's ECC requirement are not support!");
+ return err_no;
+ }
+
+ if (cap != host->pmecc_corr_cap ||
+ sector_size != host->pmecc_sector_size)
+ dev_info(host->dev, "WARNING: Be Caution! Using different PMECC parameters from Nand ONFI ECC reqirement.\n");
+
cap = host->pmecc_corr_cap;
sector_size = host->pmecc_sector_size;
+ host->pmecc_lookup_table_offset = (sector_size == 512) ?
+ host->pmecc_lookup_table_offset_512 :
+ host->pmecc_lookup_table_offset_1024;
+
dev_info(host->dev, "Initialize PMECC params, cap: %d, sector: %d\n",
cap, sector_size);
@@ -1215,7 +1309,7 @@ static void atmel_nand_hwctl(struct mtd_info *mtd, int mode)
static int atmel_of_init_port(struct atmel_nand_host *host,
struct device_node *np)
{
- u32 val, table_offset;
+ u32 val;
u32 offset[2];
int ecc_mode;
struct atmel_nand_data *board = &host->board;
@@ -1259,42 +1353,41 @@ static int atmel_of_init_port(struct atmel_nand_host *host,
/* use PMECC, get correction capability, sector size and lookup
* table offset.
+ * If correction bits and sector size are not specified, then find
+ * them from NAND ONFI parameters.
*/
- if (of_property_read_u32(np, "atmel,pmecc-cap", &val) != 0) {
- dev_err(host->dev, "Cannot decide PMECC Capability\n");
- return -EINVAL;
- } else if ((val != 2) && (val != 4) && (val != 8) && (val != 12) &&
- (val != 24)) {
- dev_err(host->dev,
- "Unsupported PMECC correction capability: %d; should be 2, 4, 8, 12 or 24\n",
- val);
- return -EINVAL;
+ if (of_property_read_u32(np, "atmel,pmecc-cap", &val) == 0) {
+ if ((val != 2) && (val != 4) && (val != 8) && (val != 12) &&
+ (val != 24)) {
+ dev_err(host->dev,
+ "Unsupported PMECC correction capability: %d; should be 2, 4, 8, 12 or 24\n",
+ val);
+ return -EINVAL;
+ }
+ host->pmecc_corr_cap = (u8)val;
}
- host->pmecc_corr_cap = (u8)val;
- if (of_property_read_u32(np, "atmel,pmecc-sector-size", &val) != 0) {
- dev_err(host->dev, "Cannot decide PMECC Sector Size\n");
- return -EINVAL;
- } else if ((val != 512) && (val != 1024)) {
- dev_err(host->dev,
- "Unsupported PMECC sector size: %d; should be 512 or 1024 bytes\n",
- val);
- return -EINVAL;
+ if (of_property_read_u32(np, "atmel,pmecc-sector-size", &val) == 0) {
+ if ((val != 512) && (val != 1024)) {
+ dev_err(host->dev,
+ "Unsupported PMECC sector size: %d; should be 512 or 1024 bytes\n",
+ val);
+ return -EINVAL;
+ }
+ host->pmecc_sector_size = (u16)val;
}
- host->pmecc_sector_size = (u16)val;
if (of_property_read_u32_array(np, "atmel,pmecc-lookup-table-offset",
offset, 2) != 0) {
dev_err(host->dev, "Cannot get PMECC lookup table offset\n");
return -EINVAL;
}
- table_offset = host->pmecc_sector_size == 512 ? offset[0] : offset[1];
-
- if (!table_offset) {
+ if (!offset[0] && !offset[1]) {
dev_err(host->dev, "Invalid PMECC lookup table offset\n");
return -EINVAL;
}
- host->pmecc_lookup_table_offset = table_offset;
+ host->pmecc_lookup_table_offset_512 = offset[0];
+ host->pmecc_lookup_table_offset_1024 = offset[1];
return 0;
}
diff --git a/drivers/mtd/nand/bcm47xxnflash/bcm47xxnflash.h b/drivers/mtd/nand/bcm47xxnflash/bcm47xxnflash.h
index 0bdb2ce4da7..c005a62330b 100644
--- a/drivers/mtd/nand/bcm47xxnflash/bcm47xxnflash.h
+++ b/drivers/mtd/nand/bcm47xxnflash/bcm47xxnflash.h
@@ -1,6 +1,10 @@
#ifndef __BCM47XXNFLASH_H
#define __BCM47XXNFLASH_H
+#ifndef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#endif
+
#include <linux/mtd/mtd.h>
#include <linux/mtd/nand.h>
diff --git a/drivers/mtd/nand/bcm47xxnflash/main.c b/drivers/mtd/nand/bcm47xxnflash/main.c
index 8363a9a5fa3..7bae569fdc7 100644
--- a/drivers/mtd/nand/bcm47xxnflash/main.c
+++ b/drivers/mtd/nand/bcm47xxnflash/main.c
@@ -9,14 +9,14 @@
*
*/
+#include "bcm47xxnflash.h"
+
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/platform_device.h>
#include <linux/bcma/bcma.h>
-#include "bcm47xxnflash.h"
-
MODULE_DESCRIPTION("NAND flash driver for BCMA bus");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Rafał Miłecki");
@@ -77,6 +77,7 @@ static int bcm47xxnflash_remove(struct platform_device *pdev)
}
static struct platform_driver bcm47xxnflash_driver = {
+ .probe = bcm47xxnflash_probe,
.remove = bcm47xxnflash_remove,
.driver = {
.name = "bcma_nflash",
@@ -88,13 +89,10 @@ static int __init bcm47xxnflash_init(void)
{
int err;
- /*
- * Platform device "bcma_nflash" exists on SoCs and is registered very
- * early, it won't be added during runtime (use platform_driver_probe).
- */
- err = platform_driver_probe(&bcm47xxnflash_driver, bcm47xxnflash_probe);
+ err = platform_driver_register(&bcm47xxnflash_driver);
if (err)
- pr_err("Failed to register serial flash driver: %d\n", err);
+ pr_err("Failed to register bcm47xx nand flash driver: %d\n",
+ err);
return err;
}
diff --git a/drivers/mtd/nand/bcm47xxnflash/ops_bcm4706.c b/drivers/mtd/nand/bcm47xxnflash/ops_bcm4706.c
index 595de4012e7..b2ab373c9ee 100644
--- a/drivers/mtd/nand/bcm47xxnflash/ops_bcm4706.c
+++ b/drivers/mtd/nand/bcm47xxnflash/ops_bcm4706.c
@@ -9,13 +9,13 @@
*
*/
+#include "bcm47xxnflash.h"
+
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/bcma/bcma.h>
-#include "bcm47xxnflash.h"
-
/* Broadcom uses 1'000'000 but it seems to be too many. Tests on WNDR4500 has
* shown ~1000 retries as maxiumum. */
#define NFLASH_READY_RETRIES 10000
diff --git a/drivers/mtd/nand/davinci_nand.c b/drivers/mtd/nand/davinci_nand.c
index feae55c7b88..94e17af8e45 100644
--- a/drivers/mtd/nand/davinci_nand.c
+++ b/drivers/mtd/nand/davinci_nand.c
@@ -606,7 +606,7 @@ static int __init nand_davinci_probe(struct platform_device *pdev)
if (pdev->id < 0 || pdev->id > 3)
return -ENODEV;
- info = kzalloc(sizeof(*info), GFP_KERNEL);
+ info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL);
if (!info) {
dev_err(&pdev->dev, "unable to allocate memory\n");
ret = -ENOMEM;
@@ -623,11 +623,11 @@ static int __init nand_davinci_probe(struct platform_device *pdev)
goto err_nomem;
}
- vaddr = ioremap(res1->start, resource_size(res1));
- base = ioremap(res2->start, resource_size(res2));
+ vaddr = devm_request_and_ioremap(&pdev->dev, res1);
+ base = devm_request_and_ioremap(&pdev->dev, res2);
if (!vaddr || !base) {
dev_err(&pdev->dev, "ioremap failed\n");
- ret = -EINVAL;
+ ret = -EADDRNOTAVAIL;
goto err_ioremap;
}
@@ -717,7 +717,7 @@ static int __init nand_davinci_probe(struct platform_device *pdev)
}
info->chip.ecc.mode = ecc_mode;
- info->clk = clk_get(&pdev->dev, "aemif");
+ info->clk = devm_clk_get(&pdev->dev, "aemif");
if (IS_ERR(info->clk)) {
ret = PTR_ERR(info->clk);
dev_dbg(&pdev->dev, "unable to get AEMIF clock, err %d\n", ret);
@@ -845,8 +845,6 @@ err_timing:
clk_disable_unprepare(info->clk);
err_clk_enable:
- clk_put(info->clk);
-
spin_lock_irq(&davinci_nand_lock);
if (ecc_mode == NAND_ECC_HW_SYNDROME)
ecc4_busy = false;
@@ -855,13 +853,7 @@ err_clk_enable:
err_ecc:
err_clk:
err_ioremap:
- if (base)
- iounmap(base);
- if (vaddr)
- iounmap(vaddr);
-
err_nomem:
- kfree(info);
return ret;
}
@@ -874,15 +866,9 @@ static int __exit nand_davinci_remove(struct platform_device *pdev)
ecc4_busy = false;
spin_unlock_irq(&davinci_nand_lock);
- iounmap(info->base);
- iounmap(info->vaddr);
-
nand_release(&info->mtd);
clk_disable_unprepare(info->clk);
- clk_put(info->clk);
-
- kfree(info);
return 0;
}
diff --git a/drivers/mtd/nand/fsl_ifc_nand.c b/drivers/mtd/nand/fsl_ifc_nand.c
index ad6222627fe..f1f7f12ab50 100644
--- a/drivers/mtd/nand/fsl_ifc_nand.c
+++ b/drivers/mtd/nand/fsl_ifc_nand.c
@@ -176,8 +176,8 @@ static void set_addr(struct mtd_info *mtd, int column, int page_addr, int oob)
ifc_nand_ctrl->page = page_addr;
/* Program ROW0/COL0 */
- out_be32(&ifc->ifc_nand.row0, page_addr);
- out_be32(&ifc->ifc_nand.col0, (oob ? IFC_NAND_COL_MS : 0) | column);
+ iowrite32be(page_addr, &ifc->ifc_nand.row0);
+ iowrite32be((oob ? IFC_NAND_COL_MS : 0) | column, &ifc->ifc_nand.col0);
buf_num = page_addr & priv->bufnum_mask;
@@ -239,18 +239,19 @@ static void fsl_ifc_run_command(struct mtd_info *mtd)
int i;
/* set the chip select for NAND Transaction */
- out_be32(&ifc->ifc_nand.nand_csel, priv->bank << IFC_NAND_CSEL_SHIFT);
+ iowrite32be(priv->bank << IFC_NAND_CSEL_SHIFT,
+ &ifc->ifc_nand.nand_csel);
dev_vdbg(priv->dev,
"%s: fir0=%08x fcr0=%08x\n",
__func__,
- in_be32(&ifc->ifc_nand.nand_fir0),
- in_be32(&ifc->ifc_nand.nand_fcr0));
+ ioread32be(&ifc->ifc_nand.nand_fir0),
+ ioread32be(&ifc->ifc_nand.nand_fcr0));
ctrl->nand_stat = 0;
/* start read/write seq */
- out_be32(&ifc->ifc_nand.nandseq_strt, IFC_NAND_SEQ_STRT_FIR_STRT);
+ iowrite32be(IFC_NAND_SEQ_STRT_FIR_STRT, &ifc->ifc_nand.nandseq_strt);
/* wait for command complete flag or timeout */
wait_event_timeout(ctrl->nand_wait, ctrl->nand_stat,
@@ -273,7 +274,7 @@ static void fsl_ifc_run_command(struct mtd_info *mtd)
int sector_end = sector + chip->ecc.steps - 1;
for (i = sector / 4; i <= sector_end / 4; i++)
- eccstat[i] = in_be32(&ifc->ifc_nand.nand_eccstat[i]);
+ eccstat[i] = ioread32be(&ifc->ifc_nand.nand_eccstat[i]);
for (i = sector; i <= sector_end; i++) {
errors = check_read_ecc(mtd, ctrl, eccstat, i);
@@ -313,31 +314,33 @@ static void fsl_ifc_do_read(struct nand_chip *chip,
/* Program FIR/IFC_NAND_FCR0 for Small/Large page */
if (mtd->writesize > 512) {
- out_be32(&ifc->ifc_nand.nand_fir0,
- (IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
- (IFC_FIR_OP_CA0 << IFC_NAND_FIR0_OP1_SHIFT) |
- (IFC_FIR_OP_RA0 << IFC_NAND_FIR0_OP2_SHIFT) |
- (IFC_FIR_OP_CMD1 << IFC_NAND_FIR0_OP3_SHIFT) |
- (IFC_FIR_OP_RBCD << IFC_NAND_FIR0_OP4_SHIFT));
- out_be32(&ifc->ifc_nand.nand_fir1, 0x0);
-
- out_be32(&ifc->ifc_nand.nand_fcr0,
- (NAND_CMD_READ0 << IFC_NAND_FCR0_CMD0_SHIFT) |
- (NAND_CMD_READSTART << IFC_NAND_FCR0_CMD1_SHIFT));
+ iowrite32be((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
+ (IFC_FIR_OP_CA0 << IFC_NAND_FIR0_OP1_SHIFT) |
+ (IFC_FIR_OP_RA0 << IFC_NAND_FIR0_OP2_SHIFT) |
+ (IFC_FIR_OP_CMD1 << IFC_NAND_FIR0_OP3_SHIFT) |
+ (IFC_FIR_OP_RBCD << IFC_NAND_FIR0_OP4_SHIFT),
+ &ifc->ifc_nand.nand_fir0);
+ iowrite32be(0x0, &ifc->ifc_nand.nand_fir1);
+
+ iowrite32be((NAND_CMD_READ0 << IFC_NAND_FCR0_CMD0_SHIFT) |
+ (NAND_CMD_READSTART << IFC_NAND_FCR0_CMD1_SHIFT),
+ &ifc->ifc_nand.nand_fcr0);
} else {
- out_be32(&ifc->ifc_nand.nand_fir0,
- (IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
- (IFC_FIR_OP_CA0 << IFC_NAND_FIR0_OP1_SHIFT) |
- (IFC_FIR_OP_RA0 << IFC_NAND_FIR0_OP2_SHIFT) |
- (IFC_FIR_OP_RBCD << IFC_NAND_FIR0_OP3_SHIFT));
- out_be32(&ifc->ifc_nand.nand_fir1, 0x0);
+ iowrite32be((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
+ (IFC_FIR_OP_CA0 << IFC_NAND_FIR0_OP1_SHIFT) |
+ (IFC_FIR_OP_RA0 << IFC_NAND_FIR0_OP2_SHIFT) |
+ (IFC_FIR_OP_RBCD << IFC_NAND_FIR0_OP3_SHIFT),
+ &ifc->ifc_nand.nand_fir0);
+ iowrite32be(0x0, &ifc->ifc_nand.nand_fir1);
if (oob)
- out_be32(&ifc->ifc_nand.nand_fcr0,
- NAND_CMD_READOOB << IFC_NAND_FCR0_CMD0_SHIFT);
+ iowrite32be(NAND_CMD_READOOB <<
+ IFC_NAND_FCR0_CMD0_SHIFT,
+ &ifc->ifc_nand.nand_fcr0);
else
- out_be32(&ifc->ifc_nand.nand_fcr0,
- NAND_CMD_READ0 << IFC_NAND_FCR0_CMD0_SHIFT);
+ iowrite32be(NAND_CMD_READ0 <<
+ IFC_NAND_FCR0_CMD0_SHIFT,
+ &ifc->ifc_nand.nand_fcr0);
}
}
@@ -357,7 +360,7 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command,
switch (command) {
/* READ0 read the entire buffer to use hardware ECC. */
case NAND_CMD_READ0:
- out_be32(&ifc->ifc_nand.nand_fbcr, 0);
+ iowrite32be(0, &ifc->ifc_nand.nand_fbcr);
set_addr(mtd, 0, page_addr, 0);
ifc_nand_ctrl->read_bytes = mtd->writesize + mtd->oobsize;
@@ -372,7 +375,7 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command,
/* READOOB reads only the OOB because no ECC is performed. */
case NAND_CMD_READOOB:
- out_be32(&ifc->ifc_nand.nand_fbcr, mtd->oobsize - column);
+ iowrite32be(mtd->oobsize - column, &ifc->ifc_nand.nand_fbcr);
set_addr(mtd, column, page_addr, 1);
ifc_nand_ctrl->read_bytes = mtd->writesize + mtd->oobsize;
@@ -388,19 +391,19 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command,
if (command == NAND_CMD_PARAM)
timing = IFC_FIR_OP_RBCD;
- out_be32(&ifc->ifc_nand.nand_fir0,
- (IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
- (IFC_FIR_OP_UA << IFC_NAND_FIR0_OP1_SHIFT) |
- (timing << IFC_NAND_FIR0_OP2_SHIFT));
- out_be32(&ifc->ifc_nand.nand_fcr0,
- command << IFC_NAND_FCR0_CMD0_SHIFT);
- out_be32(&ifc->ifc_nand.row3, column);
+ iowrite32be((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
+ (IFC_FIR_OP_UA << IFC_NAND_FIR0_OP1_SHIFT) |
+ (timing << IFC_NAND_FIR0_OP2_SHIFT),
+ &ifc->ifc_nand.nand_fir0);
+ iowrite32be(command << IFC_NAND_FCR0_CMD0_SHIFT,
+ &ifc->ifc_nand.nand_fcr0);
+ iowrite32be(column, &ifc->ifc_nand.row3);
/*
* although currently it's 8 bytes for READID, we always read
* the maximum 256 bytes(for PARAM)
*/
- out_be32(&ifc->ifc_nand.nand_fbcr, 256);
+ iowrite32be(256, &ifc->ifc_nand.nand_fbcr);
ifc_nand_ctrl->read_bytes = 256;
set_addr(mtd, 0, 0, 0);
@@ -415,16 +418,16 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command,
/* ERASE2 uses the block and page address from ERASE1 */
case NAND_CMD_ERASE2:
- out_be32(&ifc->ifc_nand.nand_fir0,
- (IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
- (IFC_FIR_OP_RA0 << IFC_NAND_FIR0_OP1_SHIFT) |
- (IFC_FIR_OP_CMD1 << IFC_NAND_FIR0_OP2_SHIFT));
+ iowrite32be((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
+ (IFC_FIR_OP_RA0 << IFC_NAND_FIR0_OP1_SHIFT) |
+ (IFC_FIR_OP_CMD1 << IFC_NAND_FIR0_OP2_SHIFT),
+ &ifc->ifc_nand.nand_fir0);
- out_be32(&ifc->ifc_nand.nand_fcr0,
- (NAND_CMD_ERASE1 << IFC_NAND_FCR0_CMD0_SHIFT) |
- (NAND_CMD_ERASE2 << IFC_NAND_FCR0_CMD1_SHIFT));
+ iowrite32be((NAND_CMD_ERASE1 << IFC_NAND_FCR0_CMD0_SHIFT) |
+ (NAND_CMD_ERASE2 << IFC_NAND_FCR0_CMD1_SHIFT),
+ &ifc->ifc_nand.nand_fcr0);
- out_be32(&ifc->ifc_nand.nand_fbcr, 0);
+ iowrite32be(0, &ifc->ifc_nand.nand_fbcr);
ifc_nand_ctrl->read_bytes = 0;
fsl_ifc_run_command(mtd);
return;
@@ -440,26 +443,28 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command,
(NAND_CMD_SEQIN << IFC_NAND_FCR0_CMD0_SHIFT) |
(NAND_CMD_PAGEPROG << IFC_NAND_FCR0_CMD1_SHIFT);
- out_be32(&ifc->ifc_nand.nand_fir0,
- (IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
- (IFC_FIR_OP_CA0 << IFC_NAND_FIR0_OP1_SHIFT) |
- (IFC_FIR_OP_RA0 << IFC_NAND_FIR0_OP2_SHIFT) |
- (IFC_FIR_OP_WBCD << IFC_NAND_FIR0_OP3_SHIFT) |
- (IFC_FIR_OP_CW1 << IFC_NAND_FIR0_OP4_SHIFT));
+ iowrite32be(
+ (IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
+ (IFC_FIR_OP_CA0 << IFC_NAND_FIR0_OP1_SHIFT) |
+ (IFC_FIR_OP_RA0 << IFC_NAND_FIR0_OP2_SHIFT) |
+ (IFC_FIR_OP_WBCD << IFC_NAND_FIR0_OP3_SHIFT) |
+ (IFC_FIR_OP_CW1 << IFC_NAND_FIR0_OP4_SHIFT),
+ &ifc->ifc_nand.nand_fir0);
} else {
nand_fcr0 = ((NAND_CMD_PAGEPROG <<
IFC_NAND_FCR0_CMD1_SHIFT) |
(NAND_CMD_SEQIN <<
IFC_NAND_FCR0_CMD2_SHIFT));
- out_be32(&ifc->ifc_nand.nand_fir0,
- (IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
- (IFC_FIR_OP_CMD2 << IFC_NAND_FIR0_OP1_SHIFT) |
- (IFC_FIR_OP_CA0 << IFC_NAND_FIR0_OP2_SHIFT) |
- (IFC_FIR_OP_RA0 << IFC_NAND_FIR0_OP3_SHIFT) |
- (IFC_FIR_OP_WBCD << IFC_NAND_FIR0_OP4_SHIFT));
- out_be32(&ifc->ifc_nand.nand_fir1,
- (IFC_FIR_OP_CW1 << IFC_NAND_FIR1_OP5_SHIFT));
+ iowrite32be(
+ (IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
+ (IFC_FIR_OP_CMD2 << IFC_NAND_FIR0_OP1_SHIFT) |
+ (IFC_FIR_OP_CA0 << IFC_NAND_FIR0_OP2_SHIFT) |
+ (IFC_FIR_OP_RA0 << IFC_NAND_FIR0_OP3_SHIFT) |
+ (IFC_FIR_OP_WBCD << IFC_NAND_FIR0_OP4_SHIFT),
+ &ifc->ifc_nand.nand_fir0);
+ iowrite32be(IFC_FIR_OP_CW1 << IFC_NAND_FIR1_OP5_SHIFT,
+ &ifc->ifc_nand.nand_fir1);
if (column >= mtd->writesize)
nand_fcr0 |=
@@ -474,7 +479,7 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command,
column -= mtd->writesize;
ifc_nand_ctrl->oob = 1;
}
- out_be32(&ifc->ifc_nand.nand_fcr0, nand_fcr0);
+ iowrite32be(nand_fcr0, &ifc->ifc_nand.nand_fcr0);
set_addr(mtd, column, page_addr, ifc_nand_ctrl->oob);
return;
}
@@ -482,10 +487,11 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command,
/* PAGEPROG reuses all of the setup from SEQIN and adds the length */
case NAND_CMD_PAGEPROG: {
if (ifc_nand_ctrl->oob) {
- out_be32(&ifc->ifc_nand.nand_fbcr,
- ifc_nand_ctrl->index - ifc_nand_ctrl->column);
+ iowrite32be(ifc_nand_ctrl->index -
+ ifc_nand_ctrl->column,
+ &ifc->ifc_nand.nand_fbcr);
} else {
- out_be32(&ifc->ifc_nand.nand_fbcr, 0);
+ iowrite32be(0, &ifc->ifc_nand.nand_fbcr);
}
fsl_ifc_run_command(mtd);
@@ -493,12 +499,12 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command,
}
case NAND_CMD_STATUS:
- out_be32(&ifc->ifc_nand.nand_fir0,
- (IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
- (IFC_FIR_OP_RB << IFC_NAND_FIR0_OP1_SHIFT));
- out_be32(&ifc->ifc_nand.nand_fcr0,
- NAND_CMD_STATUS << IFC_NAND_FCR0_CMD0_SHIFT);
- out_be32(&ifc->ifc_nand.nand_fbcr, 1);
+ iowrite32be((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
+ (IFC_FIR_OP_RB << IFC_NAND_FIR0_OP1_SHIFT),
+ &ifc->ifc_nand.nand_fir0);
+ iowrite32be(NAND_CMD_STATUS << IFC_NAND_FCR0_CMD0_SHIFT,
+ &ifc->ifc_nand.nand_fcr0);
+ iowrite32be(1, &ifc->ifc_nand.nand_fbcr);
set_addr(mtd, 0, 0, 0);
ifc_nand_ctrl->read_bytes = 1;
@@ -512,10 +518,10 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command,
return;
case NAND_CMD_RESET:
- out_be32(&ifc->ifc_nand.nand_fir0,
- IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT);
- out_be32(&ifc->ifc_nand.nand_fcr0,
- NAND_CMD_RESET << IFC_NAND_FCR0_CMD0_SHIFT);
+ iowrite32be(IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT,
+ &ifc->ifc_nand.nand_fir0);
+ iowrite32be(NAND_CMD_RESET << IFC_NAND_FCR0_CMD0_SHIFT,
+ &ifc->ifc_nand.nand_fcr0);
fsl_ifc_run_command(mtd);
return;
@@ -639,18 +645,18 @@ static int fsl_ifc_wait(struct mtd_info *mtd, struct nand_chip *chip)
u32 nand_fsr;
/* Use READ_STATUS command, but wait for the device to be ready */
- out_be32(&ifc->ifc_nand.nand_fir0,
- (IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
- (IFC_FIR_OP_RDSTAT << IFC_NAND_FIR0_OP1_SHIFT));
- out_be32(&ifc->ifc_nand.nand_fcr0, NAND_CMD_STATUS <<
- IFC_NAND_FCR0_CMD0_SHIFT);
- out_be32(&ifc->ifc_nand.nand_fbcr, 1);
+ iowrite32be((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
+ (IFC_FIR_OP_RDSTAT << IFC_NAND_FIR0_OP1_SHIFT),
+ &ifc->ifc_nand.nand_fir0);
+ iowrite32be(NAND_CMD_STATUS << IFC_NAND_FCR0_CMD0_SHIFT,
+ &ifc->ifc_nand.nand_fcr0);
+ iowrite32be(1, &ifc->ifc_nand.nand_fbcr);
set_addr(mtd, 0, 0, 0);
ifc_nand_ctrl->read_bytes = 1;
fsl_ifc_run_command(mtd);
- nand_fsr = in_be32(&ifc->ifc_nand.nand_fsr);
+ nand_fsr = ioread32be(&ifc->ifc_nand.nand_fsr);
/*
* The chip always seems to report that it is
@@ -744,34 +750,34 @@ static void fsl_ifc_sram_init(struct fsl_ifc_mtd *priv)
uint32_t cs = priv->bank;
/* Save CSOR and CSOR_ext */
- csor = in_be32(&ifc->csor_cs[cs].csor);
- csor_ext = in_be32(&ifc->csor_cs[cs].csor_ext);
+ csor = ioread32be(&ifc->csor_cs[cs].csor);
+ csor_ext = ioread32be(&ifc->csor_cs[cs].csor_ext);
/* chage PageSize 8K and SpareSize 1K*/
csor_8k = (csor & ~(CSOR_NAND_PGS_MASK)) | 0x0018C000;
- out_be32(&ifc->csor_cs[cs].csor, csor_8k);
- out_be32(&ifc->csor_cs[cs].csor_ext, 0x0000400);
+ iowrite32be(csor_8k, &ifc->csor_cs[cs].csor);
+ iowrite32be(0x0000400, &ifc->csor_cs[cs].csor_ext);
/* READID */
- out_be32(&ifc->ifc_nand.nand_fir0,
- (IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
- (IFC_FIR_OP_UA << IFC_NAND_FIR0_OP1_SHIFT) |
- (IFC_FIR_OP_RB << IFC_NAND_FIR0_OP2_SHIFT));
- out_be32(&ifc->ifc_nand.nand_fcr0,
- NAND_CMD_READID << IFC_NAND_FCR0_CMD0_SHIFT);
- out_be32(&ifc->ifc_nand.row3, 0x0);
+ iowrite32be((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
+ (IFC_FIR_OP_UA << IFC_NAND_FIR0_OP1_SHIFT) |
+ (IFC_FIR_OP_RB << IFC_NAND_FIR0_OP2_SHIFT),
+ &ifc->ifc_nand.nand_fir0);
+ iowrite32be(NAND_CMD_READID << IFC_NAND_FCR0_CMD0_SHIFT,
+ &ifc->ifc_nand.nand_fcr0);
+ iowrite32be(0x0, &ifc->ifc_nand.row3);
- out_be32(&ifc->ifc_nand.nand_fbcr, 0x0);
+ iowrite32be(0x0, &ifc->ifc_nand.nand_fbcr);
/* Program ROW0/COL0 */
- out_be32(&ifc->ifc_nand.row0, 0x0);
- out_be32(&ifc->ifc_nand.col0, 0x0);
+ iowrite32be(0x0, &ifc->ifc_nand.row0);
+ iowrite32be(0x0, &ifc->ifc_nand.col0);
/* set the chip select for NAND Transaction */
- out_be32(&ifc->ifc_nand.nand_csel, cs << IFC_NAND_CSEL_SHIFT);
+ iowrite32be(cs << IFC_NAND_CSEL_SHIFT, &ifc->ifc_nand.nand_csel);
/* start read seq */
- out_be32(&ifc->ifc_nand.nandseq_strt, IFC_NAND_SEQ_STRT_FIR_STRT);
+ iowrite32be(IFC_NAND_SEQ_STRT_FIR_STRT, &ifc->ifc_nand.nandseq_strt);
/* wait for command complete flag or timeout */
wait_event_timeout(ctrl->nand_wait, ctrl->nand_stat,
@@ -781,8 +787,8 @@ static void fsl_ifc_sram_init(struct fsl_ifc_mtd *priv)
printk(KERN_ERR "fsl-ifc: Failed to Initialise SRAM\n");
/* Restore CSOR and CSOR_ext */
- out_be32(&ifc->csor_cs[cs].csor, csor);
- out_be32(&ifc->csor_cs[cs].csor_ext, csor_ext);
+ iowrite32be(csor, &ifc->csor_cs[cs].csor);
+ iowrite32be(csor_ext, &ifc->csor_cs[cs].csor_ext);
}
static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
@@ -799,7 +805,7 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
/* fill in nand_chip structure */
/* set up function call table */
- if ((in_be32(&ifc->cspr_cs[priv->bank].cspr)) & CSPR_PORT_SIZE_16)
+ if ((ioread32be(&ifc->cspr_cs[priv->bank].cspr)) & CSPR_PORT_SIZE_16)
chip->read_byte = fsl_ifc_read_byte16;
else
chip->read_byte = fsl_ifc_read_byte;
@@ -813,13 +819,13 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
chip->bbt_td = &bbt_main_descr;
chip->bbt_md = &bbt_mirror_descr;
- out_be32(&ifc->ifc_nand.ncfgr, 0x0);
+ iowrite32be(0x0, &ifc->ifc_nand.ncfgr);
/* set up nand options */
chip->bbt_options = NAND_BBT_USE_FLASH;
- if (in_be32(&ifc->cspr_cs[priv->bank].cspr) & CSPR_PORT_SIZE_16) {
+ if (ioread32be(&ifc->cspr_cs[priv->bank].cspr) & CSPR_PORT_SIZE_16) {
chip->read_byte = fsl_ifc_read_byte16;
chip->options |= NAND_BUSWIDTH_16;
} else {
@@ -832,7 +838,7 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
chip->ecc.read_page = fsl_ifc_read_page;
chip->ecc.write_page = fsl_ifc_write_page;
- csor = in_be32(&ifc->csor_cs[priv->bank].csor);
+ csor = ioread32be(&ifc->csor_cs[priv->bank].csor);
/* Hardware generates ECC per 512 Bytes */
chip->ecc.size = 512;
@@ -884,7 +890,7 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
chip->ecc.mode = NAND_ECC_SOFT;
}
- ver = in_be32(&ifc->ifc_rev);
+ ver = ioread32be(&ifc->ifc_rev);
if (ver == FSL_IFC_V1_1_0)
fsl_ifc_sram_init(priv);
@@ -910,7 +916,7 @@ static int fsl_ifc_chip_remove(struct fsl_ifc_mtd *priv)
static int match_bank(struct fsl_ifc_regs __iomem *ifc, int bank,
phys_addr_t addr)
{
- u32 cspr = in_be32(&ifc->cspr_cs[bank].cspr);
+ u32 cspr = ioread32be(&ifc->cspr_cs[bank].cspr);
if (!(cspr & CSPR_V))
return 0;
@@ -997,17 +1003,16 @@ static int fsl_ifc_nand_probe(struct platform_device *dev)
dev_set_drvdata(priv->dev, priv);
- out_be32(&ifc->ifc_nand.nand_evter_en,
- IFC_NAND_EVTER_EN_OPC_EN |
- IFC_NAND_EVTER_EN_FTOER_EN |
- IFC_NAND_EVTER_EN_WPER_EN);
+ iowrite32be(IFC_NAND_EVTER_EN_OPC_EN |
+ IFC_NAND_EVTER_EN_FTOER_EN |
+ IFC_NAND_EVTER_EN_WPER_EN,
+ &ifc->ifc_nand.nand_evter_en);
/* enable NAND Machine Interrupts */
- out_be32(&ifc->ifc_nand.nand_evter_intr_en,
- IFC_NAND_EVTER_INTR_OPCIR_EN |
- IFC_NAND_EVTER_INTR_FTOERIR_EN |
- IFC_NAND_EVTER_INTR_WPERIR_EN);
-
+ iowrite32be(IFC_NAND_EVTER_INTR_OPCIR_EN |
+ IFC_NAND_EVTER_INTR_FTOERIR_EN |
+ IFC_NAND_EVTER_INTR_WPERIR_EN,
+ &ifc->ifc_nand.nand_evter_intr_en);
priv->mtd.name = kasprintf(GFP_KERNEL, "%x.flash", (unsigned)res.start);
if (!priv->mtd.name) {
ret = -ENOMEM;
diff --git a/drivers/mtd/nand/gpmi-nand/bch-regs.h b/drivers/mtd/nand/gpmi-nand/bch-regs.h
index a0924515c39..588f5374047 100644
--- a/drivers/mtd/nand/gpmi-nand/bch-regs.h
+++ b/drivers/mtd/nand/gpmi-nand/bch-regs.h
@@ -61,6 +61,16 @@
& BM_BCH_FLASH0LAYOUT0_ECC0) \
)
+#define MX6Q_BP_BCH_FLASH0LAYOUT0_GF_13_14 10
+#define MX6Q_BM_BCH_FLASH0LAYOUT0_GF_13_14 \
+ (0x1 << MX6Q_BP_BCH_FLASH0LAYOUT0_GF_13_14)
+#define BF_BCH_FLASH0LAYOUT0_GF(v, x) \
+ ((GPMI_IS_MX6Q(x) && ((v) == 14)) \
+ ? (((1) << MX6Q_BP_BCH_FLASH0LAYOUT0_GF_13_14) \
+ & MX6Q_BM_BCH_FLASH0LAYOUT0_GF_13_14) \
+ : 0 \
+ )
+
#define BP_BCH_FLASH0LAYOUT0_DATA0_SIZE 0
#define BM_BCH_FLASH0LAYOUT0_DATA0_SIZE \
(0xfff << BP_BCH_FLASH0LAYOUT0_DATA0_SIZE)
@@ -93,6 +103,16 @@
& BM_BCH_FLASH0LAYOUT1_ECCN) \
)
+#define MX6Q_BP_BCH_FLASH0LAYOUT1_GF_13_14 10
+#define MX6Q_BM_BCH_FLASH0LAYOUT1_GF_13_14 \
+ (0x1 << MX6Q_BP_BCH_FLASH0LAYOUT1_GF_13_14)
+#define BF_BCH_FLASH0LAYOUT1_GF(v, x) \
+ ((GPMI_IS_MX6Q(x) && ((v) == 14)) \
+ ? (((1) << MX6Q_BP_BCH_FLASH0LAYOUT1_GF_13_14) \
+ & MX6Q_BM_BCH_FLASH0LAYOUT1_GF_13_14) \
+ : 0 \
+ )
+
#define BP_BCH_FLASH0LAYOUT1_DATAN_SIZE 0
#define BM_BCH_FLASH0LAYOUT1_DATAN_SIZE \
(0xfff << BP_BCH_FLASH0LAYOUT1_DATAN_SIZE)
@@ -103,4 +123,6 @@
? (((v) >> 2) & MX6Q_BM_BCH_FLASH0LAYOUT1_DATAN_SIZE) \
: ((v) & BM_BCH_FLASH0LAYOUT1_DATAN_SIZE) \
)
+
+#define HW_BCH_VERSION 0x00000160
#endif
diff --git a/drivers/mtd/nand/gpmi-nand/gpmi-lib.c b/drivers/mtd/nand/gpmi-nand/gpmi-lib.c
index d84699c7968..4f8857fa48a 100644
--- a/drivers/mtd/nand/gpmi-nand/gpmi-lib.c
+++ b/drivers/mtd/nand/gpmi-nand/gpmi-lib.c
@@ -208,6 +208,11 @@ void gpmi_dump_info(struct gpmi_nand_data *this)
}
/* start to print out the BCH info */
+ pr_err("Show BCH registers :\n");
+ for (i = 0; i <= HW_BCH_VERSION / 0x10 + 1; i++) {
+ reg = readl(r->bch_regs + i * 0x10);
+ pr_err("offset 0x%.3x : 0x%.8x\n", i * 0x10, reg);
+ }
pr_err("BCH Geometry :\n");
pr_err("GF length : %u\n", geo->gf_len);
pr_err("ECC Strength : %u\n", geo->ecc_strength);
@@ -232,6 +237,7 @@ int bch_set_geometry(struct gpmi_nand_data *this)
unsigned int metadata_size;
unsigned int ecc_strength;
unsigned int page_size;
+ unsigned int gf_len;
int ret;
if (common_nfc_set_geometry(this))
@@ -242,6 +248,7 @@ int bch_set_geometry(struct gpmi_nand_data *this)
metadata_size = bch_geo->metadata_size;
ecc_strength = bch_geo->ecc_strength >> 1;
page_size = bch_geo->page_size;
+ gf_len = bch_geo->gf_len;
ret = gpmi_enable_clk(this);
if (ret)
@@ -263,11 +270,13 @@ int bch_set_geometry(struct gpmi_nand_data *this)
writel(BF_BCH_FLASH0LAYOUT0_NBLOCKS(block_count)
| BF_BCH_FLASH0LAYOUT0_META_SIZE(metadata_size)
| BF_BCH_FLASH0LAYOUT0_ECC0(ecc_strength, this)
+ | BF_BCH_FLASH0LAYOUT0_GF(gf_len, this)
| BF_BCH_FLASH0LAYOUT0_DATA0_SIZE(block_size, this),
r->bch_regs + HW_BCH_FLASH0LAYOUT0);
writel(BF_BCH_FLASH0LAYOUT1_PAGE_SIZE(page_size)
| BF_BCH_FLASH0LAYOUT1_ECCN(ecc_strength, this)
+ | BF_BCH_FLASH0LAYOUT1_GF(gf_len, this)
| BF_BCH_FLASH0LAYOUT1_DATAN_SIZE(block_size, this),
r->bch_regs + HW_BCH_FLASH0LAYOUT1);
diff --git a/drivers/mtd/nand/gpmi-nand/gpmi-nand.c b/drivers/mtd/nand/gpmi-nand/gpmi-nand.c
index e9b1c47e3cf..717881a3d1b 100644
--- a/drivers/mtd/nand/gpmi-nand/gpmi-nand.c
+++ b/drivers/mtd/nand/gpmi-nand/gpmi-nand.c
@@ -94,6 +94,25 @@ static inline int get_ecc_strength(struct gpmi_nand_data *this)
return round_down(ecc_strength, 2);
}
+static inline bool gpmi_check_ecc(struct gpmi_nand_data *this)
+{
+ struct bch_geometry *geo = &this->bch_geometry;
+
+ /* Do the sanity check. */
+ if (GPMI_IS_MX23(this) || GPMI_IS_MX28(this)) {
+ /* The mx23/mx28 only support the GF13. */
+ if (geo->gf_len == 14)
+ return false;
+
+ if (geo->ecc_strength > MXS_ECC_STRENGTH_MAX)
+ return false;
+ } else if (GPMI_IS_MX6Q(this)) {
+ if (geo->ecc_strength > MX6_ECC_STRENGTH_MAX)
+ return false;
+ }
+ return true;
+}
+
int common_nfc_set_geometry(struct gpmi_nand_data *this)
{
struct bch_geometry *geo = &this->bch_geometry;
@@ -112,17 +131,24 @@ int common_nfc_set_geometry(struct gpmi_nand_data *this)
/* The default for the length of Galois Field. */
geo->gf_len = 13;
- /* The default for chunk size. There is no oobsize greater then 512. */
+ /* The default for chunk size. */
geo->ecc_chunk_size = 512;
- while (geo->ecc_chunk_size < mtd->oobsize)
+ while (geo->ecc_chunk_size < mtd->oobsize) {
geo->ecc_chunk_size *= 2; /* keep C >= O */
+ geo->gf_len = 14;
+ }
geo->ecc_chunk_count = mtd->writesize / geo->ecc_chunk_size;
/* We use the same ECC strength for all chunks. */
geo->ecc_strength = get_ecc_strength(this);
- if (!geo->ecc_strength) {
- pr_err("wrong ECC strength.\n");
+ if (!gpmi_check_ecc(this)) {
+ dev_err(this->dev,
+ "We can not support this nand chip."
+ " Its required ecc strength(%d) is beyond our"
+ " capability(%d).\n", geo->ecc_strength,
+ (GPMI_IS_MX6Q(this) ? MX6_ECC_STRENGTH_MAX
+ : MXS_ECC_STRENGTH_MAX));
return -EINVAL;
}
@@ -920,8 +946,7 @@ static int gpmi_ecc_read_page(struct mtd_info *mtd, struct nand_chip *chip,
dma_addr_t auxiliary_phys;
unsigned int i;
unsigned char *status;
- unsigned int failed;
- unsigned int corrected;
+ unsigned int max_bitflips = 0;
int ret;
pr_debug("page number is : %d\n", page);
@@ -945,35 +970,25 @@ static int gpmi_ecc_read_page(struct mtd_info *mtd, struct nand_chip *chip,
payload_virt, payload_phys);
if (ret) {
pr_err("Error in ECC-based read: %d\n", ret);
- goto exit_nfc;
+ return ret;
}
/* handle the block mark swapping */
block_mark_swapping(this, payload_virt, auxiliary_virt);
/* Loop over status bytes, accumulating ECC status. */
- failed = 0;
- corrected = 0;
- status = auxiliary_virt + nfc_geo->auxiliary_status_offset;
+ status = auxiliary_virt + nfc_geo->auxiliary_status_offset;
for (i = 0; i < nfc_geo->ecc_chunk_count; i++, status++) {
if ((*status == STATUS_GOOD) || (*status == STATUS_ERASED))
continue;
if (*status == STATUS_UNCORRECTABLE) {
- failed++;
+ mtd->ecc_stats.failed++;
continue;
}
- corrected += *status;
- }
-
- /*
- * Propagate ECC status to the owning MTD only when failed or
- * corrected times nearly reaches our ECC correction threshold.
- */
- if (failed || corrected >= (nfc_geo->ecc_strength - 1)) {
- mtd->ecc_stats.failed += failed;
- mtd->ecc_stats.corrected += corrected;
+ mtd->ecc_stats.corrected += *status;
+ max_bitflips = max_t(unsigned int, max_bitflips, *status);
}
if (oob_required) {
@@ -995,8 +1010,8 @@ static int gpmi_ecc_read_page(struct mtd_info *mtd, struct nand_chip *chip,
this->payload_virt, this->payload_phys,
nfc_geo->payload_size,
payload_virt, payload_phys);
-exit_nfc:
- return ret;
+
+ return max_bitflips;
}
static int gpmi_ecc_write_page(struct mtd_info *mtd, struct nand_chip *chip,
@@ -1668,8 +1683,8 @@ exit_nfc_init:
release_resources(this);
exit_acquire_resources:
platform_set_drvdata(pdev, NULL);
- kfree(this);
dev_err(this->dev, "driver registration failed: %d\n", ret);
+ kfree(this);
return ret;
}
diff --git a/drivers/mtd/nand/gpmi-nand/gpmi-nand.h b/drivers/mtd/nand/gpmi-nand/gpmi-nand.h
index 3d93a5e3909..07294773127 100644
--- a/drivers/mtd/nand/gpmi-nand/gpmi-nand.h
+++ b/drivers/mtd/nand/gpmi-nand/gpmi-nand.h
@@ -284,6 +284,10 @@ extern int gpmi_read_page(struct gpmi_nand_data *,
#define STATUS_ERASED 0xff
#define STATUS_UNCORRECTABLE 0xfe
+/* BCH's bit correction capability. */
+#define MXS_ECC_STRENGTH_MAX 20 /* mx23 and mx28 */
+#define MX6_ECC_STRENGTH_MAX 40
+
/* Use the platform_id to distinguish different Archs. */
#define IS_MX23 0x0
#define IS_MX28 0x1
diff --git a/drivers/mtd/nand/mxc_nand.c b/drivers/mtd/nand/mxc_nand.c
index 60ac5b98b71..07e5784e5cd 100644
--- a/drivers/mtd/nand/mxc_nand.c
+++ b/drivers/mtd/nand/mxc_nand.c
@@ -530,12 +530,23 @@ static void send_page_v1(struct mtd_info *mtd, unsigned int ops)
static void send_read_id_v3(struct mxc_nand_host *host)
{
+ struct nand_chip *this = &host->nand;
+
/* Read ID into main buffer */
writel(NFC_ID, NFC_V3_LAUNCH);
wait_op_done(host, true);
memcpy32_fromio(host->data_buf, host->main_area0, 16);
+
+ if (this->options & NAND_BUSWIDTH_16) {
+ /* compress the ID info */
+ host->data_buf[1] = host->data_buf[2];
+ host->data_buf[2] = host->data_buf[4];
+ host->data_buf[3] = host->data_buf[6];
+ host->data_buf[4] = host->data_buf[8];
+ host->data_buf[5] = host->data_buf[10];
+ }
}
/* Request the NANDFC to perform a read of the NAND device ID. */
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 3766682a028..43214151b88 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -825,13 +825,8 @@ static void panic_nand_wait(struct mtd_info *mtd, struct nand_chip *chip,
static int nand_wait(struct mtd_info *mtd, struct nand_chip *chip)
{
- unsigned long timeo = jiffies;
int status, state = chip->state;
-
- if (state == FL_ERASING)
- timeo += (HZ * 400) / 1000;
- else
- timeo += (HZ * 20) / 1000;
+ unsigned long timeo = (state == FL_ERASING ? 400 : 20);
led_trigger_event(nand_led_trigger, LED_FULL);
@@ -849,6 +844,7 @@ static int nand_wait(struct mtd_info *mtd, struct nand_chip *chip)
if (in_interrupt() || oops_in_progress)
panic_nand_wait(mtd, chip, timeo);
else {
+ timeo = jiffies + msecs_to_jiffies(timeo);
while (time_before(jiffies, timeo)) {
if (chip->dev_ready) {
if (chip->dev_ready(mtd))
diff --git a/drivers/mtd/nand/nand_ecc.c b/drivers/mtd/nand/nand_ecc.c
index b7cfe0d3712..053c9a2d47c 100644
--- a/drivers/mtd/nand/nand_ecc.c
+++ b/drivers/mtd/nand/nand_ecc.c
@@ -55,8 +55,7 @@ struct mtd_info;
#define MODULE_AUTHOR(x) /* x */
#define MODULE_DESCRIPTION(x) /* x */
-#define printk printf
-#define KERN_ERR ""
+#define pr_err printf
#endif
/*
@@ -507,7 +506,7 @@ int __nand_correct_data(unsigned char *buf,
if ((bitsperbyte[b0] + bitsperbyte[b1] + bitsperbyte[b2]) == 1)
return 1; /* error in ECC data; no action needed */
- printk(KERN_ERR "uncorrectable error : ");
+ pr_err("%s: uncorrectable ECC error", __func__);
return -1;
}
EXPORT_SYMBOL(__nand_correct_data);
diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c
index 8f30d385bfa..891c52a30e6 100644
--- a/drivers/mtd/nand/nandsim.c
+++ b/drivers/mtd/nand/nandsim.c
@@ -1468,12 +1468,12 @@ int do_read_error(struct nandsim *ns, int num)
void do_bit_flips(struct nandsim *ns, int num)
{
- if (bitflips && random32() < (1 << 22)) {
+ if (bitflips && prandom_u32() < (1 << 22)) {
int flips = 1;
if (bitflips > 1)
- flips = (random32() % (int) bitflips) + 1;
+ flips = (prandom_u32() % (int) bitflips) + 1;
while (flips--) {
- int pos = random32() % (num * 8);
+ int pos = prandom_u32() % (num * 8);
ns->buf.byte[pos / 8] ^= (1 << (pos % 8));
NS_WARN("read_page: flipping bit %d in page %d "
"reading from %d ecc: corrected=%u failed=%u\n",
diff --git a/drivers/mtd/nand/omap2.c b/drivers/mtd/nand/omap2.c
index 1d333497cfc..8e820ddf4e0 100644
--- a/drivers/mtd/nand/omap2.c
+++ b/drivers/mtd/nand/omap2.c
@@ -22,9 +22,12 @@
#include <linux/omap-dma.h>
#include <linux/io.h>
#include <linux/slab.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
#ifdef CONFIG_MTD_NAND_OMAP_BCH
#include <linux/bch.h>
+#include <linux/platform_data/elm.h>
#endif
#include <linux/platform_data/mtd-nand-omap2.h>
@@ -117,6 +120,33 @@
#define OMAP24XX_DMA_GPMC 4
+#define BCH8_MAX_ERROR 8 /* upto 8 bit correctable */
+#define BCH4_MAX_ERROR 4 /* upto 4 bit correctable */
+
+#define SECTOR_BYTES 512
+/* 4 bit padding to make byte aligned, 56 = 52 + 4 */
+#define BCH4_BIT_PAD 4
+#define BCH8_ECC_MAX ((SECTOR_BYTES + BCH8_ECC_OOB_BYTES) * 8)
+#define BCH4_ECC_MAX ((SECTOR_BYTES + BCH4_ECC_OOB_BYTES) * 8)
+
+/* GPMC ecc engine settings for read */
+#define BCH_WRAPMODE_1 1 /* BCH wrap mode 1 */
+#define BCH8R_ECC_SIZE0 0x1a /* ecc_size0 = 26 */
+#define BCH8R_ECC_SIZE1 0x2 /* ecc_size1 = 2 */
+#define BCH4R_ECC_SIZE0 0xd /* ecc_size0 = 13 */
+#define BCH4R_ECC_SIZE1 0x3 /* ecc_size1 = 3 */
+
+/* GPMC ecc engine settings for write */
+#define BCH_WRAPMODE_6 6 /* BCH wrap mode 6 */
+#define BCH_ECC_SIZE0 0x0 /* ecc_size0 = 0, no oob protection */
+#define BCH_ECC_SIZE1 0x20 /* ecc_size1 = 32 */
+
+#ifdef CONFIG_MTD_NAND_OMAP_BCH
+static u_char bch8_vector[] = {0xf3, 0xdb, 0x14, 0x16, 0x8b, 0xd2, 0xbe, 0xcc,
+ 0xac, 0x6b, 0xff, 0x99, 0x7b};
+static u_char bch4_vector[] = {0x00, 0x6b, 0x31, 0xdd, 0x41, 0xbc, 0x10};
+#endif
+
/* oob info generated runtime depending on ecc algorithm and layout selected */
static struct nand_ecclayout omap_oobinfo;
/* Define some generic bad / good block scan pattern which are used
@@ -156,6 +186,9 @@ struct omap_nand_info {
#ifdef CONFIG_MTD_NAND_OMAP_BCH
struct bch_control *bch;
struct nand_ecclayout ecclayout;
+ bool is_elm_used;
+ struct device *elm_dev;
+ struct device_node *of_node;
#endif
};
@@ -1031,6 +1064,13 @@ static int omap_dev_ready(struct mtd_info *mtd)
* omap3_enable_hwecc_bch - Program OMAP3 GPMC to perform BCH ECC correction
* @mtd: MTD device structure
* @mode: Read/Write mode
+ *
+ * When using BCH, sector size is hardcoded to 512 bytes.
+ * Using wrapping mode 6 both for reading and writing if ELM module not uses
+ * for error correction.
+ * On writing,
+ * eccsize0 = 0 (no additional protected byte in spare area)
+ * eccsize1 = 32 (skip 32 nibbles = 16 bytes per sector in spare area)
*/
static void omap3_enable_hwecc_bch(struct mtd_info *mtd, int mode)
{
@@ -1039,32 +1079,57 @@ static void omap3_enable_hwecc_bch(struct mtd_info *mtd, int mode)
struct omap_nand_info *info = container_of(mtd, struct omap_nand_info,
mtd);
struct nand_chip *chip = mtd->priv;
- u32 val;
+ u32 val, wr_mode;
+ unsigned int ecc_size1, ecc_size0;
+
+ /* Using wrapping mode 6 for writing */
+ wr_mode = BCH_WRAPMODE_6;
- nerrors = (info->nand.ecc.bytes == 13) ? 8 : 4;
- dev_width = (chip->options & NAND_BUSWIDTH_16) ? 1 : 0;
- nsectors = 1;
/*
- * Program GPMC to perform correction on one 512-byte sector at a time.
- * Using 4 sectors at a time (i.e. ecc.size = 2048) is also possible and
- * gives a slight (5%) performance gain (but requires additional code).
+ * ECC engine enabled for valid ecc_size0 nibbles
+ * and disabled for ecc_size1 nibbles.
*/
+ ecc_size0 = BCH_ECC_SIZE0;
+ ecc_size1 = BCH_ECC_SIZE1;
+
+ /* Perform ecc calculation on 512-byte sector */
+ nsectors = 1;
+
+ /* Update number of error correction */
+ nerrors = info->nand.ecc.strength;
+
+ /* Multi sector reading/writing for NAND flash with page size < 4096 */
+ if (info->is_elm_used && (mtd->writesize <= 4096)) {
+ if (mode == NAND_ECC_READ) {
+ /* Using wrapping mode 1 for reading */
+ wr_mode = BCH_WRAPMODE_1;
+
+ /*
+ * ECC engine enabled for ecc_size0 nibbles
+ * and disabled for ecc_size1 nibbles.
+ */
+ ecc_size0 = (nerrors == 8) ?
+ BCH8R_ECC_SIZE0 : BCH4R_ECC_SIZE0;
+ ecc_size1 = (nerrors == 8) ?
+ BCH8R_ECC_SIZE1 : BCH4R_ECC_SIZE1;
+ }
+
+ /* Perform ecc calculation for one page (< 4096) */
+ nsectors = info->nand.ecc.steps;
+ }
writel(ECC1, info->reg.gpmc_ecc_control);
- /*
- * When using BCH, sector size is hardcoded to 512 bytes.
- * Here we are using wrapping mode 6 both for reading and writing, with:
- * size0 = 0 (no additional protected byte in spare area)
- * size1 = 32 (skip 32 nibbles = 16 bytes per sector in spare area)
- */
- val = (32 << ECCSIZE1_SHIFT) | (0 << ECCSIZE0_SHIFT);
+ /* Configure ecc size for BCH */
+ val = (ecc_size1 << ECCSIZE1_SHIFT) | (ecc_size0 << ECCSIZE0_SHIFT);
writel(val, info->reg.gpmc_ecc_size_config);
+ dev_width = (chip->options & NAND_BUSWIDTH_16) ? 1 : 0;
+
/* BCH configuration */
val = ((1 << 16) | /* enable BCH */
(((nerrors == 8) ? 1 : 0) << 12) | /* 8 or 4 bits */
- (0x06 << 8) | /* wrap mode = 6 */
+ (wr_mode << 8) | /* wrap mode */
(dev_width << 7) | /* bus width */
(((nsectors-1) & 0x7) << 4) | /* number of sectors */
(info->gpmc_cs << 1) | /* ECC CS */
@@ -1072,7 +1137,7 @@ static void omap3_enable_hwecc_bch(struct mtd_info *mtd, int mode)
writel(val, info->reg.gpmc_ecc_config);
- /* clear ecc and enable bits */
+ /* Clear ecc and enable bits */
writel(ECCCLEAR | ECC1, info->reg.gpmc_ecc_control);
}
@@ -1162,6 +1227,298 @@ static int omap3_calculate_ecc_bch8(struct mtd_info *mtd, const u_char *dat,
}
/**
+ * omap3_calculate_ecc_bch - Generate bytes of ECC bytes
+ * @mtd: MTD device structure
+ * @dat: The pointer to data on which ecc is computed
+ * @ecc_code: The ecc_code buffer
+ *
+ * Support calculating of BCH4/8 ecc vectors for the page
+ */
+static int omap3_calculate_ecc_bch(struct mtd_info *mtd, const u_char *dat,
+ u_char *ecc_code)
+{
+ struct omap_nand_info *info = container_of(mtd, struct omap_nand_info,
+ mtd);
+ unsigned long nsectors, bch_val1, bch_val2, bch_val3, bch_val4;
+ int i, eccbchtsel;
+
+ nsectors = ((readl(info->reg.gpmc_ecc_config) >> 4) & 0x7) + 1;
+ /*
+ * find BCH scheme used
+ * 0 -> BCH4
+ * 1 -> BCH8
+ */
+ eccbchtsel = ((readl(info->reg.gpmc_ecc_config) >> 12) & 0x3);
+
+ for (i = 0; i < nsectors; i++) {
+
+ /* Read hw-computed remainder */
+ bch_val1 = readl(info->reg.gpmc_bch_result0[i]);
+ bch_val2 = readl(info->reg.gpmc_bch_result1[i]);
+ if (eccbchtsel) {
+ bch_val3 = readl(info->reg.gpmc_bch_result2[i]);
+ bch_val4 = readl(info->reg.gpmc_bch_result3[i]);
+ }
+
+ if (eccbchtsel) {
+ /* BCH8 ecc scheme */
+ *ecc_code++ = (bch_val4 & 0xFF);
+ *ecc_code++ = ((bch_val3 >> 24) & 0xFF);
+ *ecc_code++ = ((bch_val3 >> 16) & 0xFF);
+ *ecc_code++ = ((bch_val3 >> 8) & 0xFF);
+ *ecc_code++ = (bch_val3 & 0xFF);
+ *ecc_code++ = ((bch_val2 >> 24) & 0xFF);
+ *ecc_code++ = ((bch_val2 >> 16) & 0xFF);
+ *ecc_code++ = ((bch_val2 >> 8) & 0xFF);
+ *ecc_code++ = (bch_val2 & 0xFF);
+ *ecc_code++ = ((bch_val1 >> 24) & 0xFF);
+ *ecc_code++ = ((bch_val1 >> 16) & 0xFF);
+ *ecc_code++ = ((bch_val1 >> 8) & 0xFF);
+ *ecc_code++ = (bch_val1 & 0xFF);
+ /*
+ * Setting 14th byte to zero to handle
+ * erased page & maintain compatibility
+ * with RBL
+ */
+ *ecc_code++ = 0x0;
+ } else {
+ /* BCH4 ecc scheme */
+ *ecc_code++ = ((bch_val2 >> 12) & 0xFF);
+ *ecc_code++ = ((bch_val2 >> 4) & 0xFF);
+ *ecc_code++ = ((bch_val2 & 0xF) << 4) |
+ ((bch_val1 >> 28) & 0xF);
+ *ecc_code++ = ((bch_val1 >> 20) & 0xFF);
+ *ecc_code++ = ((bch_val1 >> 12) & 0xFF);
+ *ecc_code++ = ((bch_val1 >> 4) & 0xFF);
+ *ecc_code++ = ((bch_val1 & 0xF) << 4);
+ /*
+ * Setting 8th byte to zero to handle
+ * erased page
+ */
+ *ecc_code++ = 0x0;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * erased_sector_bitflips - count bit flips
+ * @data: data sector buffer
+ * @oob: oob buffer
+ * @info: omap_nand_info
+ *
+ * Check the bit flips in erased page falls below correctable level.
+ * If falls below, report the page as erased with correctable bit
+ * flip, else report as uncorrectable page.
+ */
+static int erased_sector_bitflips(u_char *data, u_char *oob,
+ struct omap_nand_info *info)
+{
+ int flip_bits = 0, i;
+
+ for (i = 0; i < info->nand.ecc.size; i++) {
+ flip_bits += hweight8(~data[i]);
+ if (flip_bits > info->nand.ecc.strength)
+ return 0;
+ }
+
+ for (i = 0; i < info->nand.ecc.bytes - 1; i++) {
+ flip_bits += hweight8(~oob[i]);
+ if (flip_bits > info->nand.ecc.strength)
+ return 0;
+ }
+
+ /*
+ * Bit flips falls in correctable level.
+ * Fill data area with 0xFF
+ */
+ if (flip_bits) {
+ memset(data, 0xFF, info->nand.ecc.size);
+ memset(oob, 0xFF, info->nand.ecc.bytes);
+ }
+
+ return flip_bits;
+}
+
+/**
+ * omap_elm_correct_data - corrects page data area in case error reported
+ * @mtd: MTD device structure
+ * @data: page data
+ * @read_ecc: ecc read from nand flash
+ * @calc_ecc: ecc read from HW ECC registers
+ *
+ * Calculated ecc vector reported as zero in case of non-error pages.
+ * In case of error/erased pages non-zero error vector is reported.
+ * In case of non-zero ecc vector, check read_ecc at fixed offset
+ * (x = 13/7 in case of BCH8/4 == 0) to find page programmed or not.
+ * To handle bit flips in this data, count the number of 0's in
+ * read_ecc[x] and check if it greater than 4. If it is less, it is
+ * programmed page, else erased page.
+ *
+ * 1. If page is erased, check with standard ecc vector (ecc vector
+ * for erased page to find any bit flip). If check fails, bit flip
+ * is present in erased page. Count the bit flips in erased page and
+ * if it falls under correctable level, report page with 0xFF and
+ * update the correctable bit information.
+ * 2. If error is reported on programmed page, update elm error
+ * vector and correct the page with ELM error correction routine.
+ *
+ */
+static int omap_elm_correct_data(struct mtd_info *mtd, u_char *data,
+ u_char *read_ecc, u_char *calc_ecc)
+{
+ struct omap_nand_info *info = container_of(mtd, struct omap_nand_info,
+ mtd);
+ int eccsteps = info->nand.ecc.steps;
+ int i , j, stat = 0;
+ int eccsize, eccflag, ecc_vector_size;
+ struct elm_errorvec err_vec[ERROR_VECTOR_MAX];
+ u_char *ecc_vec = calc_ecc;
+ u_char *spare_ecc = read_ecc;
+ u_char *erased_ecc_vec;
+ enum bch_ecc type;
+ bool is_error_reported = false;
+
+ /* Initialize elm error vector to zero */
+ memset(err_vec, 0, sizeof(err_vec));
+
+ if (info->nand.ecc.strength == BCH8_MAX_ERROR) {
+ type = BCH8_ECC;
+ erased_ecc_vec = bch8_vector;
+ } else {
+ type = BCH4_ECC;
+ erased_ecc_vec = bch4_vector;
+ }
+
+ ecc_vector_size = info->nand.ecc.bytes;
+
+ /*
+ * Remove extra byte padding for BCH8 RBL
+ * compatibility and erased page handling
+ */
+ eccsize = ecc_vector_size - 1;
+
+ for (i = 0; i < eccsteps ; i++) {
+ eccflag = 0; /* initialize eccflag */
+
+ /*
+ * Check any error reported,
+ * In case of error, non zero ecc reported.
+ */
+
+ for (j = 0; (j < eccsize); j++) {
+ if (calc_ecc[j] != 0) {
+ eccflag = 1; /* non zero ecc, error present */
+ break;
+ }
+ }
+
+ if (eccflag == 1) {
+ /*
+ * Set threshold to minimum of 4, half of ecc.strength/2
+ * to allow max bit flip in byte to 4
+ */
+ unsigned int threshold = min_t(unsigned int, 4,
+ info->nand.ecc.strength / 2);
+
+ /*
+ * Check data area is programmed by counting
+ * number of 0's at fixed offset in spare area.
+ * Checking count of 0's against threshold.
+ * In case programmed page expects at least threshold
+ * zeros in byte.
+ * If zeros are less than threshold for programmed page/
+ * zeros are more than threshold erased page, either
+ * case page reported as uncorrectable.
+ */
+ if (hweight8(~read_ecc[eccsize]) >= threshold) {
+ /*
+ * Update elm error vector as
+ * data area is programmed
+ */
+ err_vec[i].error_reported = true;
+ is_error_reported = true;
+ } else {
+ /* Error reported in erased page */
+ int bitflip_count;
+ u_char *buf = &data[info->nand.ecc.size * i];
+
+ if (memcmp(calc_ecc, erased_ecc_vec, eccsize)) {
+ bitflip_count = erased_sector_bitflips(
+ buf, read_ecc, info);
+
+ if (bitflip_count)
+ stat += bitflip_count;
+ else
+ return -EINVAL;
+ }
+ }
+ }
+
+ /* Update the ecc vector */
+ calc_ecc += ecc_vector_size;
+ read_ecc += ecc_vector_size;
+ }
+
+ /* Check if any error reported */
+ if (!is_error_reported)
+ return 0;
+
+ /* Decode BCH error using ELM module */
+ elm_decode_bch_error_page(info->elm_dev, ecc_vec, err_vec);
+
+ for (i = 0; i < eccsteps; i++) {
+ if (err_vec[i].error_reported) {
+ for (j = 0; j < err_vec[i].error_count; j++) {
+ u32 bit_pos, byte_pos, error_max, pos;
+
+ if (type == BCH8_ECC)
+ error_max = BCH8_ECC_MAX;
+ else
+ error_max = BCH4_ECC_MAX;
+
+ if (info->nand.ecc.strength == BCH8_MAX_ERROR)
+ pos = err_vec[i].error_loc[j];
+ else
+ /* Add 4 to take care 4 bit padding */
+ pos = err_vec[i].error_loc[j] +
+ BCH4_BIT_PAD;
+
+ /* Calculate bit position of error */
+ bit_pos = pos % 8;
+
+ /* Calculate byte position of error */
+ byte_pos = (error_max - pos - 1) / 8;
+
+ if (pos < error_max) {
+ if (byte_pos < 512)
+ data[byte_pos] ^= 1 << bit_pos;
+ else
+ spare_ecc[byte_pos - 512] ^=
+ 1 << bit_pos;
+ }
+ /* else, not interested to correct ecc */
+ }
+ }
+
+ /* Update number of correctable errors */
+ stat += err_vec[i].error_count;
+
+ /* Update page data with sector size */
+ data += info->nand.ecc.size;
+ spare_ecc += ecc_vector_size;
+ }
+
+ for (i = 0; i < eccsteps; i++)
+ /* Return error if uncorrectable error present */
+ if (err_vec[i].error_uncorrectable)
+ return -EINVAL;
+
+ return stat;
+}
+
+/**
* omap3_correct_data_bch - Decode received data and correct errors
* @mtd: MTD device structure
* @data: page data
@@ -1194,6 +1551,92 @@ static int omap3_correct_data_bch(struct mtd_info *mtd, u_char *data,
}
/**
+ * omap_write_page_bch - BCH ecc based write page function for entire page
+ * @mtd: mtd info structure
+ * @chip: nand chip info structure
+ * @buf: data buffer
+ * @oob_required: must write chip->oob_poi to OOB
+ *
+ * Custom write page method evolved to support multi sector writing in one shot
+ */
+static int omap_write_page_bch(struct mtd_info *mtd, struct nand_chip *chip,
+ const uint8_t *buf, int oob_required)
+{
+ int i;
+ uint8_t *ecc_calc = chip->buffers->ecccalc;
+ uint32_t *eccpos = chip->ecc.layout->eccpos;
+
+ /* Enable GPMC ecc engine */
+ chip->ecc.hwctl(mtd, NAND_ECC_WRITE);
+
+ /* Write data */
+ chip->write_buf(mtd, buf, mtd->writesize);
+
+ /* Update ecc vector from GPMC result registers */
+ chip->ecc.calculate(mtd, buf, &ecc_calc[0]);
+
+ for (i = 0; i < chip->ecc.total; i++)
+ chip->oob_poi[eccpos[i]] = ecc_calc[i];
+
+ /* Write ecc vector to OOB area */
+ chip->write_buf(mtd, chip->oob_poi, mtd->oobsize);
+ return 0;
+}
+
+/**
+ * omap_read_page_bch - BCH ecc based page read function for entire page
+ * @mtd: mtd info structure
+ * @chip: nand chip info structure
+ * @buf: buffer to store read data
+ * @oob_required: caller requires OOB data read to chip->oob_poi
+ * @page: page number to read
+ *
+ * For BCH ecc scheme, GPMC used for syndrome calculation and ELM module
+ * used for error correction.
+ * Custom method evolved to support ELM error correction & multi sector
+ * reading. On reading page data area is read along with OOB data with
+ * ecc engine enabled. ecc vector updated after read of OOB data.
+ * For non error pages ecc vector reported as zero.
+ */
+static int omap_read_page_bch(struct mtd_info *mtd, struct nand_chip *chip,
+ uint8_t *buf, int oob_required, int page)
+{
+ uint8_t *ecc_calc = chip->buffers->ecccalc;
+ uint8_t *ecc_code = chip->buffers->ecccode;
+ uint32_t *eccpos = chip->ecc.layout->eccpos;
+ uint8_t *oob = &chip->oob_poi[eccpos[0]];
+ uint32_t oob_pos = mtd->writesize + chip->ecc.layout->eccpos[0];
+ int stat;
+ unsigned int max_bitflips = 0;
+
+ /* Enable GPMC ecc engine */
+ chip->ecc.hwctl(mtd, NAND_ECC_READ);
+
+ /* Read data */
+ chip->read_buf(mtd, buf, mtd->writesize);
+
+ /* Read oob bytes */
+ chip->cmdfunc(mtd, NAND_CMD_RNDOUT, oob_pos, -1);
+ chip->read_buf(mtd, oob, chip->ecc.total);
+
+ /* Calculate ecc bytes */
+ chip->ecc.calculate(mtd, buf, ecc_calc);
+
+ memcpy(ecc_code, &chip->oob_poi[eccpos[0]], chip->ecc.total);
+
+ stat = chip->ecc.correct(mtd, buf, ecc_code, ecc_calc);
+
+ if (stat < 0) {
+ mtd->ecc_stats.failed++;
+ } else {
+ mtd->ecc_stats.corrected += stat;
+ max_bitflips = max_t(unsigned int, max_bitflips, stat);
+ }
+
+ return max_bitflips;
+}
+
+/**
* omap3_free_bch - Release BCH ecc resources
* @mtd: MTD device structure
*/
@@ -1218,43 +1661,86 @@ static int omap3_init_bch(struct mtd_info *mtd, int ecc_opt)
struct omap_nand_info *info = container_of(mtd, struct omap_nand_info,
mtd);
#ifdef CONFIG_MTD_NAND_OMAP_BCH8
- const int hw_errors = 8;
+ const int hw_errors = BCH8_MAX_ERROR;
#else
- const int hw_errors = 4;
+ const int hw_errors = BCH4_MAX_ERROR;
#endif
+ enum bch_ecc bch_type;
+ const __be32 *parp;
+ int lenp;
+ struct device_node *elm_node;
+
info->bch = NULL;
- max_errors = (ecc_opt == OMAP_ECC_BCH8_CODE_HW) ? 8 : 4;
+ max_errors = (ecc_opt == OMAP_ECC_BCH8_CODE_HW) ?
+ BCH8_MAX_ERROR : BCH4_MAX_ERROR;
if (max_errors != hw_errors) {
pr_err("cannot configure %d-bit BCH ecc, only %d-bit supported",
max_errors, hw_errors);
goto fail;
}
- /* software bch library is only used to detect and locate errors */
- info->bch = init_bch(13, max_errors, 0x201b /* hw polynomial */);
- if (!info->bch)
- goto fail;
+ info->nand.ecc.size = 512;
+ info->nand.ecc.hwctl = omap3_enable_hwecc_bch;
+ info->nand.ecc.mode = NAND_ECC_HW;
+ info->nand.ecc.strength = max_errors;
- info->nand.ecc.size = 512;
- info->nand.ecc.hwctl = omap3_enable_hwecc_bch;
- info->nand.ecc.correct = omap3_correct_data_bch;
- info->nand.ecc.mode = NAND_ECC_HW;
+ if (hw_errors == BCH8_MAX_ERROR)
+ bch_type = BCH8_ECC;
+ else
+ bch_type = BCH4_ECC;
- /*
- * The number of corrected errors in an ecc block that will trigger
- * block scrubbing defaults to the ecc strength (4 or 8).
- * Set mtd->bitflip_threshold here to define a custom threshold.
- */
+ /* Detect availability of ELM module */
+ parp = of_get_property(info->of_node, "elm_id", &lenp);
+ if ((parp == NULL) && (lenp != (sizeof(void *) * 2))) {
+ pr_err("Missing elm_id property, fall back to Software BCH\n");
+ info->is_elm_used = false;
+ } else {
+ struct platform_device *pdev;
- if (max_errors == 8) {
- info->nand.ecc.strength = 8;
- info->nand.ecc.bytes = 13;
- info->nand.ecc.calculate = omap3_calculate_ecc_bch8;
+ elm_node = of_find_node_by_phandle(be32_to_cpup(parp));
+ pdev = of_find_device_by_node(elm_node);
+ info->elm_dev = &pdev->dev;
+ elm_config(info->elm_dev, bch_type);
+ info->is_elm_used = true;
+ }
+
+ if (info->is_elm_used && (mtd->writesize <= 4096)) {
+
+ if (hw_errors == BCH8_MAX_ERROR)
+ info->nand.ecc.bytes = BCH8_SIZE;
+ else
+ info->nand.ecc.bytes = BCH4_SIZE;
+
+ info->nand.ecc.correct = omap_elm_correct_data;
+ info->nand.ecc.calculate = omap3_calculate_ecc_bch;
+ info->nand.ecc.read_page = omap_read_page_bch;
+ info->nand.ecc.write_page = omap_write_page_bch;
} else {
- info->nand.ecc.strength = 4;
- info->nand.ecc.bytes = 7;
- info->nand.ecc.calculate = omap3_calculate_ecc_bch4;
+ /*
+ * software bch library is only used to detect and
+ * locate errors
+ */
+ info->bch = init_bch(13, max_errors,
+ 0x201b /* hw polynomial */);
+ if (!info->bch)
+ goto fail;
+
+ info->nand.ecc.correct = omap3_correct_data_bch;
+
+ /*
+ * The number of corrected errors in an ecc block that will
+ * trigger block scrubbing defaults to the ecc strength (4 or 8)
+ * Set mtd->bitflip_threshold here to define a custom threshold.
+ */
+
+ if (max_errors == 8) {
+ info->nand.ecc.bytes = 13;
+ info->nand.ecc.calculate = omap3_calculate_ecc_bch8;
+ } else {
+ info->nand.ecc.bytes = 7;
+ info->nand.ecc.calculate = omap3_calculate_ecc_bch4;
+ }
}
pr_info("enabling NAND BCH ecc with %d-bit correction\n", max_errors);
@@ -1270,7 +1756,7 @@ fail:
*/
static int omap3_init_bch_tail(struct mtd_info *mtd)
{
- int i, steps;
+ int i, steps, offset;
struct omap_nand_info *info = container_of(mtd, struct omap_nand_info,
mtd);
struct nand_ecclayout *layout = &info->ecclayout;
@@ -1292,11 +1778,21 @@ static int omap3_init_bch_tail(struct mtd_info *mtd)
goto fail;
}
+ /* ECC layout compatible with RBL for BCH8 */
+ if (info->is_elm_used && (info->nand.ecc.bytes == BCH8_SIZE))
+ offset = 2;
+ else
+ offset = mtd->oobsize - layout->eccbytes;
+
/* put ecc bytes at oob tail */
for (i = 0; i < layout->eccbytes; i++)
- layout->eccpos[i] = mtd->oobsize-layout->eccbytes+i;
+ layout->eccpos[i] = offset + i;
+
+ if (info->is_elm_used && (info->nand.ecc.bytes == BCH8_SIZE))
+ layout->oobfree[0].offset = 2 + layout->eccbytes * steps;
+ else
+ layout->oobfree[0].offset = 2;
- layout->oobfree[0].offset = 2;
layout->oobfree[0].length = mtd->oobsize-2-layout->eccbytes;
info->nand.ecc.layout = layout;
@@ -1360,6 +1856,9 @@ static int omap_nand_probe(struct platform_device *pdev)
info->nand.options = pdata->devsize;
info->nand.options |= NAND_SKIP_BBTSCAN;
+#ifdef CONFIG_MTD_NAND_OMAP_BCH
+ info->of_node = pdata->of_node;
+#endif
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
if (res == NULL) {
diff --git a/drivers/mtd/ofpart.c b/drivers/mtd/ofpart.c
index dbd3aa574ea..30bd907a260 100644
--- a/drivers/mtd/ofpart.c
+++ b/drivers/mtd/ofpart.c
@@ -174,7 +174,14 @@ out:
return rc;
}
+static void __exit ofpart_parser_exit(void)
+{
+ deregister_mtd_parser(&ofpart_parser);
+ deregister_mtd_parser(&ofoldpart_parser);
+}
+
module_init(ofpart_parser_init);
+module_exit(ofpart_parser_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Parser for MTD partitioning information in device tree");
diff --git a/drivers/mtd/tests/mtd_nandecctest.c b/drivers/mtd/tests/mtd_nandecctest.c
index db2f22e7966..70106607c24 100644
--- a/drivers/mtd/tests/mtd_nandecctest.c
+++ b/drivers/mtd/tests/mtd_nandecctest.c
@@ -44,7 +44,7 @@ struct nand_ecc_test {
static void single_bit_error_data(void *error_data, void *correct_data,
size_t size)
{
- unsigned int offset = random32() % (size * BITS_PER_BYTE);
+ unsigned int offset = prandom_u32() % (size * BITS_PER_BYTE);
memcpy(error_data, correct_data, size);
__change_bit_le(offset, error_data);
@@ -55,9 +55,9 @@ static void double_bit_error_data(void *error_data, void *correct_data,
{
unsigned int offset[2];
- offset[0] = random32() % (size * BITS_PER_BYTE);
+ offset[0] = prandom_u32() % (size * BITS_PER_BYTE);
do {
- offset[1] = random32() % (size * BITS_PER_BYTE);
+ offset[1] = prandom_u32() % (size * BITS_PER_BYTE);
} while (offset[0] == offset[1]);
memcpy(error_data, correct_data, size);
@@ -68,7 +68,7 @@ static void double_bit_error_data(void *error_data, void *correct_data,
static unsigned int random_ecc_bit(size_t size)
{
- unsigned int offset = random32() % (3 * BITS_PER_BYTE);
+ unsigned int offset = prandom_u32() % (3 * BITS_PER_BYTE);
if (size == 256) {
/*
@@ -76,7 +76,7 @@ static unsigned int random_ecc_bit(size_t size)
* and 17th bit) in ECC code for 256 byte data block
*/
while (offset == 16 || offset == 17)
- offset = random32() % (3 * BITS_PER_BYTE);
+ offset = prandom_u32() % (3 * BITS_PER_BYTE);
}
return offset;
diff --git a/drivers/mtd/tests/mtd_stresstest.c b/drivers/mtd/tests/mtd_stresstest.c
index 2d7e6cffd6d..787f539d16c 100644
--- a/drivers/mtd/tests/mtd_stresstest.c
+++ b/drivers/mtd/tests/mtd_stresstest.c
@@ -55,7 +55,7 @@ static int rand_eb(void)
unsigned int eb;
again:
- eb = random32();
+ eb = prandom_u32();
/* Read or write up 2 eraseblocks at a time - hence 'ebcnt - 1' */
eb %= (ebcnt - 1);
if (bbt[eb])
@@ -67,7 +67,7 @@ static int rand_offs(void)
{
unsigned int offs;
- offs = random32();
+ offs = prandom_u32();
offs %= bufsize;
return offs;
}
@@ -76,7 +76,7 @@ static int rand_len(int offs)
{
unsigned int len;
- len = random32();
+ len = prandom_u32();
len %= (bufsize - offs);
return len;
}
@@ -191,7 +191,7 @@ static int do_write(void)
static int do_operation(void)
{
- if (random32() & 1)
+ if (prandom_u32() & 1)
return do_read();
else
return do_write();
diff --git a/drivers/mtd/tests/mtd_torturetest.c b/drivers/mtd/tests/mtd_torturetest.c
index c4cde1e9edd..3a9f6a6a79f 100644
--- a/drivers/mtd/tests/mtd_torturetest.c
+++ b/drivers/mtd/tests/mtd_torturetest.c
@@ -208,7 +208,7 @@ static inline int write_pattern(int ebnum, void *buf)
static int __init tort_init(void)
{
int err = 0, i, infinite = !cycles_count;
- int bad_ebs[ebcnt];
+ int *bad_ebs;
printk(KERN_INFO "\n");
printk(KERN_INFO "=================================================\n");
@@ -250,28 +250,24 @@ static int __init tort_init(void)
err = -ENOMEM;
patt_5A5 = kmalloc(mtd->erasesize, GFP_KERNEL);
- if (!patt_5A5) {
- pr_err("error: cannot allocate memory\n");
+ if (!patt_5A5)
goto out_mtd;
- }
patt_A5A = kmalloc(mtd->erasesize, GFP_KERNEL);
- if (!patt_A5A) {
- pr_err("error: cannot allocate memory\n");
+ if (!patt_A5A)
goto out_patt_5A5;
- }
patt_FF = kmalloc(mtd->erasesize, GFP_KERNEL);
- if (!patt_FF) {
- pr_err("error: cannot allocate memory\n");
+ if (!patt_FF)
goto out_patt_A5A;
- }
check_buf = kmalloc(mtd->erasesize, GFP_KERNEL);
- if (!check_buf) {
- pr_err("error: cannot allocate memory\n");
+ if (!check_buf)
goto out_patt_FF;
- }
+
+ bad_ebs = kcalloc(ebcnt, sizeof(*bad_ebs), GFP_KERNEL);
+ if (!bad_ebs)
+ goto out_check_buf;
err = 0;
@@ -290,7 +286,6 @@ static int __init tort_init(void)
/*
* Check if there is a bad eraseblock among those we are going to test.
*/
- memset(&bad_ebs[0], 0, sizeof(int) * ebcnt);
if (mtd_can_have_bb(mtd)) {
for (i = eb; i < eb + ebcnt; i++) {
err = mtd_block_isbad(mtd, (loff_t)i * mtd->erasesize);
@@ -394,6 +389,8 @@ out:
pr_info("finished after %u erase cycles\n",
erase_cycles);
+ kfree(bad_ebs);
+out_check_buf:
kfree(check_buf);
out_patt_FF:
kfree(patt_FF);
diff --git a/drivers/mtd/ubi/debug.h b/drivers/mtd/ubi/debug.h
index 33f8f3b2c9b..cba89fcd158 100644
--- a/drivers/mtd/ubi/debug.h
+++ b/drivers/mtd/ubi/debug.h
@@ -86,7 +86,7 @@ static inline int ubi_dbg_is_bgt_disabled(const struct ubi_device *ubi)
static inline int ubi_dbg_is_bitflip(const struct ubi_device *ubi)
{
if (ubi->dbg.emulate_bitflips)
- return !(random32() % 200);
+ return !(prandom_u32() % 200);
return 0;
}
@@ -100,7 +100,7 @@ static inline int ubi_dbg_is_bitflip(const struct ubi_device *ubi)
static inline int ubi_dbg_is_write_failure(const struct ubi_device *ubi)
{
if (ubi->dbg.emulate_io_failures)
- return !(random32() % 500);
+ return !(prandom_u32() % 500);
return 0;
}
@@ -114,7 +114,7 @@ static inline int ubi_dbg_is_write_failure(const struct ubi_device *ubi)
static inline int ubi_dbg_is_erase_failure(const struct ubi_device *ubi)
{
if (ubi->dbg.emulate_io_failures)
- return !(random32() % 400);
+ return !(prandom_u32() % 400);
return 0;
}
diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c
index a7efec29303..9b017d9c58e 100644
--- a/drivers/net/ethernet/broadcom/b44.c
+++ b/drivers/net/ethernet/broadcom/b44.c
@@ -381,7 +381,7 @@ static void b44_set_flow_ctrl(struct b44 *bp, u32 local, u32 remote)
}
#ifdef CONFIG_BCM47XX
-#include <asm/mach-bcm47xx/nvram.h>
+#include <bcm47xx_nvram.h>
static void b44_wap54g10_workaround(struct b44 *bp)
{
char buf[20];
@@ -393,7 +393,7 @@ static void b44_wap54g10_workaround(struct b44 *bp)
* see https://dev.openwrt.org/ticket/146
* check and reset bit "isolate"
*/
- if (nvram_getenv("boardnum", buf, sizeof(buf)) < 0)
+ if (bcm47xx_nvram_getenv("boardnum", buf, sizeof(buf)) < 0)
return;
if (simple_strtoul(buf, NULL, 0) == 2) {
err = __b44_readphy(bp, 0, MII_BMCR, &val);
diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c
index bf985c04524..639049d7e92 100644
--- a/drivers/net/ethernet/broadcom/bgmac.c
+++ b/drivers/net/ethernet/broadcom/bgmac.c
@@ -15,7 +15,7 @@
#include <linux/mii.h>
#include <linux/interrupt.h>
#include <linux/dma-mapping.h>
-#include <asm/mach-bcm47xx/nvram.h>
+#include <bcm47xx_nvram.h>
static const struct bcma_device_id bgmac_bcma_tbl[] = {
BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_4706_MAC_GBIT, BCMA_ANY_REV, BCMA_ANY_CLASS),
@@ -908,7 +908,7 @@ static void bgmac_chip_reset(struct bgmac *bgmac)
BGMAC_CHIPCTL_1_IF_TYPE_RMII;
char buf[2];
- if (nvram_getenv("et_swtype", buf, 1) > 0) {
+ if (bcm47xx_nvram_getenv("et_swtype", buf, 1) > 0) {
if (kstrtou8(buf, 0, &et_swtype))
bgmac_err(bgmac, "Failed to parse et_swtype (%s)\n",
buf);
@@ -1386,7 +1386,7 @@ static int bgmac_probe(struct bcma_device *core)
}
bgmac->int_mask = BGMAC_IS_ERRMASK | BGMAC_IS_RX | BGMAC_IS_TX_MASK;
- if (nvram_getenv("et0_no_txint", NULL, 0) == 0)
+ if (bcm47xx_nvram_getenv("et0_no_txint", NULL, 0) == 0)
bgmac->int_mask &= ~BGMAC_IS_TX_MASK;
/* TODO: reset the external phy. Specs are needed */
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 7ab0b2fba50..3338437b559 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -79,6 +79,17 @@ config ASUS_LAPTOP
If you have an ACPI-compatible ASUS laptop, say Y or M here.
+config CHROMEOS_LAPTOP
+ tristate "Chrome OS Laptop"
+ depends on I2C
+ depends on DMI
+ ---help---
+ This driver instantiates i2c and smbus devices such as
+ light sensors and touchpads.
+
+ If you have a supported Chromebook, choose Y or M here.
+ The module will be called chromeos_laptop.
+
config DELL_LAPTOP
tristate "Dell Laptop Extras"
depends on X86
@@ -288,9 +299,11 @@ config IDEAPAD_LAPTOP
depends on ACPI
depends on RFKILL && INPUT
depends on SERIO_I8042
+ depends on BACKLIGHT_CLASS_DEVICE
select INPUT_SPARSEKMAP
help
- This is a driver for the rfkill switches on Lenovo IdeaPad netbooks.
+ This is a driver for Lenovo IdeaPad netbooks contains drivers for
+ rfkill switch, hotkey, fan control and backlight control.
config THINKPAD_ACPI
tristate "ThinkPad ACPI Laptop Extras"
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index bf7e4f935b1..ace2b38942f 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -50,3 +50,4 @@ obj-$(CONFIG_INTEL_MID_POWER_BUTTON) += intel_mid_powerbtn.o
obj-$(CONFIG_INTEL_OAKTRAIL) += intel_oaktrail.o
obj-$(CONFIG_SAMSUNG_Q10) += samsung-q10.o
obj-$(CONFIG_APPLE_GMUX) += apple-gmux.o
+obj-$(CONFIG_CHROMEOS_LAPTOP) += chromeos_laptop.o
diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c
index afed7018a2b..c9076bdaf2c 100644
--- a/drivers/platform/x86/acer-wmi.c
+++ b/drivers/platform/x86/acer-wmi.c
@@ -511,6 +511,24 @@ static struct dmi_system_id acer_quirks[] = {
},
.driver_data = &quirk_fujitsu_amilo_li_1718,
},
+ {
+ .callback = dmi_matched,
+ .ident = "Lenovo Ideapad S205-10382JG",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "10382JG"),
+ },
+ .driver_data = &quirk_lenovo_ideapad_s205,
+ },
+ {
+ .callback = dmi_matched,
+ .ident = "Lenovo Ideapad S205-1038DPG",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "1038DPG"),
+ },
+ .driver_data = &quirk_lenovo_ideapad_s205,
+ },
{}
};
@@ -1204,6 +1222,9 @@ static acpi_status WMID_set_capabilities(void)
devices = *((u32 *) obj->buffer.pointer);
} else if (obj->type == ACPI_TYPE_INTEGER) {
devices = (u32) obj->integer.value;
+ } else {
+ kfree(out.pointer);
+ return AE_ERROR;
}
} else {
kfree(out.pointer);
diff --git a/drivers/platform/x86/asus-laptop.c b/drivers/platform/x86/asus-laptop.c
index d9f9a0dbc6f..0eea09c1c13 100644
--- a/drivers/platform/x86/asus-laptop.c
+++ b/drivers/platform/x86/asus-laptop.c
@@ -128,10 +128,12 @@ MODULE_PARM_DESC(als_status, "Set the ALS status on boot "
/*
* Some events we use, same for all Asus
*/
-#define ATKD_BR_UP 0x10 /* (event & ~ATKD_BR_UP) = brightness level */
-#define ATKD_BR_DOWN 0x20 /* (event & ~ATKD_BR_DOWN) = britghness level */
-#define ATKD_BR_MIN ATKD_BR_UP
-#define ATKD_BR_MAX (ATKD_BR_DOWN | 0xF) /* 0x2f */
+#define ATKD_BRNUP_MIN 0x10
+#define ATKD_BRNUP_MAX 0x1f
+#define ATKD_BRNDOWN_MIN 0x20
+#define ATKD_BRNDOWN_MAX 0x2f
+#define ATKD_BRNDOWN 0x20
+#define ATKD_BRNUP 0x2f
#define ATKD_LCD_ON 0x33
#define ATKD_LCD_OFF 0x34
@@ -301,40 +303,65 @@ static const struct key_entry asus_keymap[] = {
{KE_KEY, 0x17, { KEY_ZOOM } },
{KE_KEY, 0x1f, { KEY_BATTERY } },
/* End of Lenovo SL Specific keycodes */
+ {KE_KEY, ATKD_BRNDOWN, { KEY_BRIGHTNESSDOWN } },
+ {KE_KEY, ATKD_BRNUP, { KEY_BRIGHTNESSUP } },
{KE_KEY, 0x30, { KEY_VOLUMEUP } },
{KE_KEY, 0x31, { KEY_VOLUMEDOWN } },
{KE_KEY, 0x32, { KEY_MUTE } },
- {KE_KEY, 0x33, { KEY_SWITCHVIDEOMODE } },
- {KE_KEY, 0x34, { KEY_SWITCHVIDEOMODE } },
+ {KE_KEY, 0x33, { KEY_DISPLAYTOGGLE } }, /* LCD on */
+ {KE_KEY, 0x34, { KEY_DISPLAY_OFF } }, /* LCD off */
{KE_KEY, 0x40, { KEY_PREVIOUSSONG } },
{KE_KEY, 0x41, { KEY_NEXTSONG } },
- {KE_KEY, 0x43, { KEY_STOPCD } },
+ {KE_KEY, 0x43, { KEY_STOPCD } }, /* Stop/Eject */
{KE_KEY, 0x45, { KEY_PLAYPAUSE } },
- {KE_KEY, 0x4c, { KEY_MEDIA } },
+ {KE_KEY, 0x4c, { KEY_MEDIA } }, /* WMP Key */
{KE_KEY, 0x50, { KEY_EMAIL } },
{KE_KEY, 0x51, { KEY_WWW } },
{KE_KEY, 0x55, { KEY_CALC } },
+ {KE_IGNORE, 0x57, }, /* Battery mode */
+ {KE_IGNORE, 0x58, }, /* AC mode */
{KE_KEY, 0x5C, { KEY_SCREENLOCK } }, /* Screenlock */
- {KE_KEY, 0x5D, { KEY_WLAN } },
- {KE_KEY, 0x5E, { KEY_WLAN } },
- {KE_KEY, 0x5F, { KEY_WLAN } },
- {KE_KEY, 0x60, { KEY_SWITCHVIDEOMODE } },
- {KE_KEY, 0x61, { KEY_SWITCHVIDEOMODE } },
- {KE_KEY, 0x62, { KEY_SWITCHVIDEOMODE } },
- {KE_KEY, 0x63, { KEY_SWITCHVIDEOMODE } },
- {KE_KEY, 0x6B, { KEY_F13 } }, /* Lock Touchpad */
+ {KE_KEY, 0x5D, { KEY_WLAN } }, /* WLAN Toggle */
+ {KE_KEY, 0x5E, { KEY_WLAN } }, /* WLAN Enable */
+ {KE_KEY, 0x5F, { KEY_WLAN } }, /* WLAN Disable */
+ {KE_KEY, 0x60, { KEY_TOUCHPAD_ON } },
+ {KE_KEY, 0x61, { KEY_SWITCHVIDEOMODE } }, /* SDSP LCD only */
+ {KE_KEY, 0x62, { KEY_SWITCHVIDEOMODE } }, /* SDSP CRT only */
+ {KE_KEY, 0x63, { KEY_SWITCHVIDEOMODE } }, /* SDSP LCD + CRT */
+ {KE_KEY, 0x64, { KEY_SWITCHVIDEOMODE } }, /* SDSP TV */
+ {KE_KEY, 0x65, { KEY_SWITCHVIDEOMODE } }, /* SDSP LCD + TV */
+ {KE_KEY, 0x66, { KEY_SWITCHVIDEOMODE } }, /* SDSP CRT + TV */
+ {KE_KEY, 0x67, { KEY_SWITCHVIDEOMODE } }, /* SDSP LCD + CRT + TV */
+ {KE_KEY, 0x6B, { KEY_TOUCHPAD_TOGGLE } }, /* Lock Touchpad */
{KE_KEY, 0x6C, { KEY_SLEEP } }, /* Suspend */
{KE_KEY, 0x6D, { KEY_SLEEP } }, /* Hibernate */
- {KE_KEY, 0x7E, { KEY_BLUETOOTH } },
- {KE_KEY, 0x7D, { KEY_BLUETOOTH } },
+ {KE_IGNORE, 0x6E, }, /* Low Battery notification */
+ {KE_KEY, 0x7D, { KEY_BLUETOOTH } }, /* Bluetooth Enable */
+ {KE_KEY, 0x7E, { KEY_BLUETOOTH } }, /* Bluetooth Disable */
{KE_KEY, 0x82, { KEY_CAMERA } },
- {KE_KEY, 0x88, { KEY_WLAN } },
- {KE_KEY, 0x8A, { KEY_PROG1 } },
+ {KE_KEY, 0x88, { KEY_RFKILL } }, /* Radio Toggle Key */
+ {KE_KEY, 0x8A, { KEY_PROG1 } }, /* Color enhancement mode */
+ {KE_KEY, 0x8C, { KEY_SWITCHVIDEOMODE } }, /* SDSP DVI only */
+ {KE_KEY, 0x8D, { KEY_SWITCHVIDEOMODE } }, /* SDSP LCD + DVI */
+ {KE_KEY, 0x8E, { KEY_SWITCHVIDEOMODE } }, /* SDSP CRT + DVI */
+ {KE_KEY, 0x8F, { KEY_SWITCHVIDEOMODE } }, /* SDSP TV + DVI */
+ {KE_KEY, 0x90, { KEY_SWITCHVIDEOMODE } }, /* SDSP LCD + CRT + DVI */
+ {KE_KEY, 0x91, { KEY_SWITCHVIDEOMODE } }, /* SDSP LCD + TV + DVI */
+ {KE_KEY, 0x92, { KEY_SWITCHVIDEOMODE } }, /* SDSP CRT + TV + DVI */
+ {KE_KEY, 0x93, { KEY_SWITCHVIDEOMODE } }, /* SDSP LCD + CRT + TV + DVI */
{KE_KEY, 0x95, { KEY_MEDIA } },
{KE_KEY, 0x99, { KEY_PHONE } },
- {KE_KEY, 0xc4, { KEY_KBDILLUMUP } },
- {KE_KEY, 0xc5, { KEY_KBDILLUMDOWN } },
- {KE_KEY, 0xb5, { KEY_CALC } },
+ {KE_KEY, 0xA0, { KEY_SWITCHVIDEOMODE } }, /* SDSP HDMI only */
+ {KE_KEY, 0xA1, { KEY_SWITCHVIDEOMODE } }, /* SDSP LCD + HDMI */
+ {KE_KEY, 0xA2, { KEY_SWITCHVIDEOMODE } }, /* SDSP CRT + HDMI */
+ {KE_KEY, 0xA3, { KEY_SWITCHVIDEOMODE } }, /* SDSP TV + HDMI */
+ {KE_KEY, 0xA4, { KEY_SWITCHVIDEOMODE } }, /* SDSP LCD + CRT + HDMI */
+ {KE_KEY, 0xA5, { KEY_SWITCHVIDEOMODE } }, /* SDSP LCD + TV + HDMI */
+ {KE_KEY, 0xA6, { KEY_SWITCHVIDEOMODE } }, /* SDSP CRT + TV + HDMI */
+ {KE_KEY, 0xA7, { KEY_SWITCHVIDEOMODE } }, /* SDSP LCD + CRT + TV + HDMI */
+ {KE_KEY, 0xB5, { KEY_CALC } },
+ {KE_KEY, 0xC4, { KEY_KBDILLUMUP } },
+ {KE_KEY, 0xC5, { KEY_KBDILLUMDOWN } },
{KE_END, 0},
};
@@ -1521,15 +1548,19 @@ static void asus_acpi_notify(struct acpi_device *device, u32 event)
dev_name(&asus->device->dev), event,
count);
- /* Brightness events are special */
- if (event >= ATKD_BR_MIN && event <= ATKD_BR_MAX) {
+ if (event >= ATKD_BRNUP_MIN && event <= ATKD_BRNUP_MAX)
+ event = ATKD_BRNUP;
+ else if (event >= ATKD_BRNDOWN_MIN &&
+ event <= ATKD_BRNDOWN_MAX)
+ event = ATKD_BRNDOWN;
- /* Ignore them completely if the acpi video driver is used */
+ /* Brightness events are special */
+ if (event == ATKD_BRNDOWN || event == ATKD_BRNUP) {
if (asus->backlight_device != NULL) {
/* Update the backlight device. */
asus_backlight_notify(asus);
+ return ;
}
- return ;
}
/* Accelerometer "coarse orientation change" event */
diff --git a/drivers/platform/x86/asus-nb-wmi.c b/drivers/platform/x86/asus-nb-wmi.c
index be790402e0f..210b5b87212 100644
--- a/drivers/platform/x86/asus-nb-wmi.c
+++ b/drivers/platform/x86/asus-nb-wmi.c
@@ -59,6 +59,17 @@ static struct quirk_entry quirk_asus_unknown = {
.wapf = 0,
};
+/*
+ * For those machines that need software to control bt/wifi status
+ * and can't adjust brightness through ACPI interface
+ * and have duplicate events(ACPI and WMI) for display toggle
+ */
+static struct quirk_entry quirk_asus_x55u = {
+ .wapf = 4,
+ .wmi_backlight_power = true,
+ .no_display_toggle = true,
+};
+
static struct quirk_entry quirk_asus_x401u = {
.wapf = 4,
};
@@ -77,6 +88,15 @@ static struct dmi_system_id asus_quirks[] = {
DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
DMI_MATCH(DMI_PRODUCT_NAME, "X401U"),
},
+ .driver_data = &quirk_asus_x55u,
+ },
+ {
+ .callback = dmi_matched,
+ .ident = "ASUSTeK COMPUTER INC. X401A",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "X401A"),
+ },
.driver_data = &quirk_asus_x401u,
},
{
@@ -95,6 +115,15 @@ static struct dmi_system_id asus_quirks[] = {
DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
DMI_MATCH(DMI_PRODUCT_NAME, "X501U"),
},
+ .driver_data = &quirk_asus_x55u,
+ },
+ {
+ .callback = dmi_matched,
+ .ident = "ASUSTeK COMPUTER INC. X501A",