diff options
Diffstat (limited to 'fs')
413 files changed, 26822 insertions, 18881 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 58e6cbce4156..08f2e1e9a7e6 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -603,10 +603,11 @@ static int v9fs_cache_register(void) if (ret < 0) return ret; #ifdef CONFIG_9P_FSCACHE - return fscache_register_netfs(&v9fs_cache_netfs); -#else - return ret; + ret = fscache_register_netfs(&v9fs_cache_netfs); + if (ret < 0) + v9fs_destroy_inode_cache(); #endif + return ret; } static void v9fs_cache_unregister(void) diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index d384a8b77ee8..aa5ecf479a57 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -183,7 +183,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) else flock.length = fl->fl_end - fl->fl_start + 1; flock.proc_id = fl->fl_pid; - flock.client_id = utsname()->nodename; + flock.client_id = fid->clnt->name; if (IS_SETLKW(cmd)) flock.flags = P9_LOCK_FLAGS_BLOCK; @@ -260,7 +260,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) else glock.length = fl->fl_end - fl->fl_start + 1; glock.proc_id = fl->fl_pid; - glock.client_id = utsname()->nodename; + glock.client_id = fid->clnt->name; res = p9_client_getlock_dotl(fid, &glock); if (res < 0) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 25b018efb8ab..94de6d1482e2 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -146,7 +146,7 @@ static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses, char type = 0, ext[32]; int major = -1, minor = -1; - strncpy(ext, stat->extension, sizeof(ext)); + strlcpy(ext, stat->extension, sizeof(ext)); sscanf(ext, "%c %u %u", &type, &major, &minor); switch (type) { case 'c': @@ -1186,7 +1186,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, * this even with .u extension. So check * for non NULL stat->extension */ - strncpy(ext, stat->extension, sizeof(ext)); + strlcpy(ext, stat->extension, sizeof(ext)); /* HARDLINKCOUNT %u */ sscanf(ext, "%13s %u", tag_name, &i_nlink); if (!strncmp(tag_name, "HARDLINKCOUNT", 13)) diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 53687bbf2296..a7c481402c46 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -267,14 +267,8 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, } /* Only creates */ - if (!(flags & O_CREAT)) + if (!(flags & O_CREAT) || dentry->d_inode) return finish_no_open(file, res); - else if (dentry->d_inode) { - if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) - return -EEXIST; - else - return finish_no_open(file, res); - } v9ses = v9fs_inode2v9ses(dir); diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 5f95d1ed9c6d..b9acadafa4a1 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -50,7 +50,7 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); } static int adfs_write_begin(struct file *file, struct address_space *mapping, diff --git a/fs/affs/file.c b/fs/affs/file.c index af3261b78102..8669b6ecddee 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -406,7 +406,7 @@ static void affs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); affs_truncate(inode); } } @@ -836,7 +836,7 @@ affs_truncate(struct inode *inode) struct address_space *mapping = inode->i_mapping; struct page *page; void *fsdata; - u32 size = inode->i_size; + loff_t size = inode->i_size; int res; res = mapping->a_ops->write_begin(NULL, mapping, size, 0, 0, &page, &fsdata); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 34494fbead0a..646337dc5201 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -685,16 +685,12 @@ not_found: spin_unlock(&dentry->d_lock); out_bad: - if (dentry->d_inode) { - /* don't unhash if we have submounts */ - if (have_submounts(dentry)) - goto out_skip; - } + /* don't unhash if we have submounts */ + if (check_submounts_and_drop(dentry) != 0) + goto out_skip; _debug("dropping dentry %s/%s", parent->d_name.name, dentry->d_name.name); - shrink_dcache_parent(dentry); - d_drop(dentry); dput(parent); key_put(key); @@ -755,10 +751,6 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) _enter("{%x:%u},{%s},%ho", dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); - ret = -ENAMETOOLONG; - if (dentry->d_name.len >= AFSNAMEMAX) - goto error; - key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); @@ -820,10 +812,6 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) _enter("{%x:%u},{%s}", dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name); - ret = -ENAMETOOLONG; - if (dentry->d_name.len >= AFSNAMEMAX) - goto error; - key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); @@ -940,10 +928,6 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, _enter("{%x:%u},{%s},%ho,", dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); - ret = -ENAMETOOLONG; - if (dentry->d_name.len >= AFSNAMEMAX) - goto error; - key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); @@ -1009,10 +993,6 @@ static int afs_link(struct dentry *from, struct inode *dir, dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name); - ret = -ENAMETOOLONG; - if (dentry->d_name.len >= AFSNAMEMAX) - goto error; - key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); @@ -1057,10 +1037,6 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, content); - ret = -ENAMETOOLONG; - if (dentry->d_name.len >= AFSNAMEMAX) - goto error; - ret = -EINVAL; if (strlen(content) >= AFSPATHMAX) goto error; @@ -1131,10 +1107,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dvnode->fid.vid, new_dvnode->fid.vnode, new_dentry->d_name.name); - ret = -ENAMETOOLONG; - if (new_dentry->d_name.len >= AFSNAMEMAX) - goto error; - key = afs_request_key(orig_dvnode->volume->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); @@ -26,6 +26,7 @@ #include <linux/mm.h> #include <linux/mman.h> #include <linux/mmu_context.h> +#include <linux/percpu.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/aio.h> @@ -35,6 +36,10 @@ #include <linux/eventfd.h> #include <linux/blkdev.h> #include <linux/compat.h> +#include <linux/anon_inodes.h> +#include <linux/migrate.h> +#include <linux/ramfs.h> +#include <linux/percpu-refcount.h> #include <asm/kmap_types.h> #include <asm/uaccess.h> @@ -61,14 +66,29 @@ struct aio_ring { #define AIO_RING_PAGES 8 +struct kioctx_table { + struct rcu_head rcu; + unsigned nr; + struct kioctx *table[]; +}; + +struct kioctx_cpu { + unsigned reqs_available; +}; + struct kioctx { - atomic_t users; + struct percpu_ref users; atomic_t dead; - /* This needs improving */ unsigned long user_id; - struct hlist_node list; + struct __percpu kioctx_cpu *cpu; + + /* + * For percpu reqs_available, number of slots we move to/from global + * counter at a time: + */ + unsigned req_batch; /* * This is what userspace passed to io_setup(), it's not used for * anything but counting against the global max_reqs quota. @@ -88,10 +108,18 @@ struct kioctx { long nr_pages; struct rcu_head rcu_head; - struct work_struct rcu_work; + struct work_struct free_work; struct { - atomic_t reqs_active; + /* + * This counts the number of available slots in the ringbuffer, + * so we avoid overflowing it: it's decremented (if positive) + * when allocating a kiocb and incremented when the resulting + * io_event is pulled off the ringbuffer. + * + * We batch accesses to it with a percpu version. + */ + atomic_t reqs_available; } ____cacheline_aligned_in_smp; struct { @@ -110,6 +138,9 @@ struct kioctx { } ____cacheline_aligned_in_smp; struct page *internal_pages[AIO_RING_PAGES]; + struct file *aio_ring_file; + + unsigned id; }; /*------ sysctl variables----*/ @@ -138,15 +169,77 @@ __initcall(aio_setup); static void aio_free_ring(struct kioctx *ctx) { - long i; + int i; + struct file *aio_ring_file = ctx->aio_ring_file; - for (i = 0; i < ctx->nr_pages; i++) + for (i = 0; i < ctx->nr_pages; i++) { + pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, + page_count(ctx->ring_pages[i])); put_page(ctx->ring_pages[i]); + } if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) kfree(ctx->ring_pages); + + if (aio_ring_file) { + truncate_setsize(aio_ring_file->f_inode, 0); + fput(aio_ring_file); + ctx->aio_ring_file = NULL; + } +} + +static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &generic_file_vm_ops; + return 0; } +static const struct file_operations aio_ring_fops = { + .mmap = aio_ring_mmap, +}; + +static int aio_set_page_dirty(struct page *page) +{ + return 0; +} + +#if IS_ENABLED(CONFIG_MIGRATION) +static int aio_migratepage(struct address_space *mapping, struct page *new, + struct page *old, enum migrate_mode mode) +{ + struct kioctx *ctx = mapping->private_data; + unsigned long flags; + unsigned idx = old->index; + int rc; + + /* Writeback must be complete */ + BUG_ON(PageWriteback(old)); + put_page(old); + + rc = migrate_page_move_mapping(mapping, new, old, NULL, mode); + if (rc != MIGRATEPAGE_SUCCESS) { + get_page(old); + return rc; + } + + get_page(new); + + spin_lock_irqsave(&ctx->completion_lock, flags); + migrate_page_copy(new, old); + ctx->ring_pages[idx] = new; + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + return rc; +} +#endif + +static const struct address_space_operations aio_ctx_aops = { + .set_page_dirty = aio_set_page_dirty, +#if IS_ENABLED(CONFIG_MIGRATION) + .migratepage = aio_migratepage, +#endif +}; + static int aio_setup_ring(struct kioctx *ctx) { struct aio_ring *ring; @@ -154,20 +247,45 @@ static int aio_setup_ring(struct kioctx *ctx) struct mm_struct *mm = current->mm; unsigned long size, populate; int nr_pages; + int i; + struct file *file; /* Compensate for the ring buffer's head/tail overlap entry */ nr_events += 2; /* 1 is required, 2 for good luck */ size = sizeof(struct aio_ring); size += sizeof(struct io_event) * nr_events; - nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; + nr_pages = PFN_UP(size); if (nr_pages < 0) return -EINVAL; - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); + file = anon_inode_getfile_private("[aio]", &aio_ring_fops, ctx, O_RDWR); + if (IS_ERR(file)) { + ctx->aio_ring_file = NULL; + return -EAGAIN; + } + + file->f_inode->i_mapping->a_ops = &aio_ctx_aops; + file->f_inode->i_mapping->private_data = ctx; + file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages; + + for (i = 0; i < nr_pages; i++) { + struct page *page; + page = find_or_create_page(file->f_inode->i_mapping, + i, GFP_HIGHUSER | __GFP_ZERO); + if (!page) + break; + pr_debug("pid(%d) page[%d]->count=%d\n", + current->pid, i, page_count(page)); + SetPageUptodate(page); + SetPageDirty(page); + unlock_page(page); + } + ctx->aio_ring_file = file; + nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) + / sizeof(struct io_event); - ctx->nr_events = 0; ctx->ring_pages = ctx->internal_pages; if (nr_pages > AIO_RING_PAGES) { ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), @@ -178,10 +296,11 @@ static int aio_setup_ring(struct kioctx *ctx) ctx->mmap_size = nr_pages * PAGE_SIZE; pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); + down_write(&mm->mmap_sem); - ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size, - PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate); + ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, 0, &populate); if (IS_ERR((void *)ctx->mmap_base)) { up_write(&mm->mmap_sem); ctx->mmap_size = 0; @@ -190,23 +309,34 @@ static int aio_setup_ring(struct kioctx *ctx) } pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); + + /* We must do this while still holding mmap_sem for write, as we + * need to be protected against userspace attempting to mremap() + * or munmap() the ring buffer. + */ ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, 1, 0, ctx->ring_pages, NULL); + + /* Dropping the reference here is safe as the page cache will hold + * onto the pages for us. It is also required so that page migration + * can unmap the pages and get the right reference count. + */ + for (i = 0; i < ctx->nr_pages; i++) + put_page(ctx->ring_pages[i]); + up_write(&mm->mmap_sem); if (unlikely(ctx->nr_pages != nr_pages)) { aio_free_ring(ctx); return -EAGAIN; } - if (populate) - mm_populate(ctx->mmap_base, populate); ctx->user_id = ctx->mmap_base; ctx->nr_events = nr_events; /* trusted copy */ ring = kmap_atomic(ctx->ring_pages[0]); ring->nr = nr_events; /* user copy */ - ring->id = ctx->user_id; + ring->id = ~0U; ring->head = ring->tail = 0; ring->magic = AIO_RING_MAGIC; ring->compat_features = AIO_RING_COMPAT_FEATURES; @@ -238,11 +368,9 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) } EXPORT_SYMBOL(kiocb_set_cancel_fn); -static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, - struct io_event *res) +static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb) { kiocb_cancel_fn *old, *cancel; - int ret = -EINVAL; /* * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it @@ -252,28 +380,20 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, cancel = ACCESS_ONCE(kiocb->ki_cancel); do { if (!cancel || cancel == KIOCB_CANCELLED) - return ret; + return -EINVAL; old = cancel; cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); } while (cancel != old); - atomic_inc(&kiocb->ki_users); - spin_unlock_irq(&ctx->ctx_lock); - - memset(res, 0, sizeof(*res)); - res->obj = (u64)(unsigned long)kiocb->ki_obj.user; - res->data = kiocb->ki_user_data; - ret = cancel(kiocb, res); - - spin_lock_irq(&ctx->ctx_lock); - - return ret; + return cancel(kiocb); } static void free_ioctx_rcu(struct rcu_head *head) { struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); + + free_percpu(ctx->cpu); kmem_cache_free(kioctx_cachep, ctx); } @@ -282,12 +402,13 @@ static void free_ioctx_rcu(struct rcu_head *head) * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - * now it's safe to cancel any that need to be. */ -static void free_ioctx(struct kioctx *ctx) +static void free_ioctx(struct work_struct *work) { + struct kioctx *ctx = container_of(work, struct kioctx, free_work); struct aio_ring *ring; - struct io_event res; struct kiocb *req; - unsigned head, avail; + unsigned cpu, avail; + DEFINE_WAIT(wait); spin_lock_irq(&ctx->ctx_lock); @@ -296,28 +417,38 @@ static void free_ioctx(struct kioctx *ctx) struct kiocb, ki_list); list_del_init(&req->ki_list); - kiocb_cancel(ctx, req, &res); + kiocb_cancel(ctx, req); } spin_unlock_irq(&ctx->ctx_lock); - ring = kmap_atomic(ctx->ring_pages[0]); - head = ring->head; - kunmap_atomic(ring); + for_each_possible_cpu(cpu) { + struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu); - while (atomic_read(&ctx->reqs_active) > 0) { - wait_event(ctx->wait, - head != ctx->tail || - atomic_read(&ctx->reqs_active) <= 0); + atomic_add(kcpu->reqs_available, &ctx->reqs_available); + kcpu->reqs_available = 0; + } - avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; + while (1) { + prepare_to_wait(&ctx->wait, &wait, TASK_UNINTERRUPTIBLE); - atomic_sub(avail, &ctx->reqs_active); - head += avail; - head %= ctx->nr_events; + ring = kmap_atomic(ctx->ring_pages[0]); + avail = (ring->head <= ring->tail) + ? ring->tail - ring->head + : ctx->nr_events - ring->head + ring->tail; + + atomic_add(avail, &ctx->reqs_available); + ring->head = ring->tail; + kunmap_atomic(ring); + + if (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1) + break; + + schedule(); } + finish_wait(&ctx->wait, &wait); - WARN_ON(atomic_read(&ctx->reqs_active) < 0); + WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1); aio_free_ring(ctx); @@ -333,10 +464,68 @@ static void free_ioctx(struct kioctx *ctx) call_rcu(&ctx->rcu_head, free_ioctx_rcu); } -static void put_ioctx(struct kioctx *ctx) +static void free_ioctx_ref(struct percpu_ref *ref) { - if (unlikely(atomic_dec_and_test(&ctx->users))) - free_ioctx(ctx); + struct kioctx *ctx = container_of(ref, struct kioctx, users); + + INIT_WORK(&ctx->free_work, free_ioctx); + schedule_work(&ctx->free_work); +} + +static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) +{ + unsigned i, new_nr; + struct kioctx_table *table, *old; + struct aio_ring *ring; + + spin_lock(&mm->ioctx_lock); + rcu_read_lock(); + table = rcu_dereference(mm->ioctx_table); + + while (1) { + if (table) + for (i = 0; i < table->nr; i++) + if (!table->table[i]) { + ctx->id = i; + table->table[i] = ctx; + rcu_read_unlock(); + spin_unlock(&mm->ioctx_lock); + + ring = kmap_atomic(ctx->ring_pages[0]); + ring->id = ctx->id; + kunmap_atomic(ring); + return 0; + } + + new_nr = (table ? table->nr : 1) * 4; + + rcu_read_unlock(); + spin_unlock(&mm->ioctx_lock); + + table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) * + new_nr, GFP_KERNEL); + if (!table) + return -ENOMEM; + + table->nr = new_nr; + + spin_lock(&mm->ioctx_lock); + rcu_read_lock(); + old = rcu_dereference(mm->ioctx_table); + + if (!old) { + rcu_assign_pointer(mm->ioctx_table, table); + } else if (table->nr > old->nr) { + memcpy(table->table, old->table, + old->nr * sizeof(struct kioctx *)); + + rcu_assign_pointer(mm->ioctx_table, table); + kfree_rcu(old, rcu); + } else { + kfree(table); + table = old; + } + } } /* ioctx_alloc @@ -348,6 +537,18 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) struct kioctx *ctx; int err = -ENOMEM; + /* + * We keep track of the number of available ringbuffer slots, to prevent + * overflow (reqs_available), and we also use percpu counters for this. + * + * So since up to half the slots might be on other cpu's percpu counters + * and unavailable, double nr_events so userspace sees what they + * expected: additionally, we move req_batch slots to/from percpu + * counters at a time, so make sure that isn't 0: + */ + nr_events = max(nr_events, num_possible_cpus() * 4); + nr_events *= 2; + /* Prevent overflows */ if ((nr_events > (0x10000000U / sizeof(struct io_event))) || (nr_events > (0x10000000U / sizeof(struct kiocb)))) { @@ -355,7 +556,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) return ERR_PTR(-EINVAL); } - if (!nr_events || (unsigned long)nr_events > aio_max_nr) + if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL)) return ERR_PTR(-EAGAIN); ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); @@ -364,8 +565,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ctx->max_reqs = nr_events; - atomic_set(&ctx->users, 2); - atomic_set(&ctx->dead, 0); + if (percpu_ref_init(&ctx->users, free_ioctx_ref)) + goto out_freectx; + spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->completion_lock); mutex_init(&ctx->ring_lock); @@ -373,12 +575,21 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) INIT_LIST_HEAD(&ctx->active_reqs); + ctx->cpu = alloc_percpu(struct kioctx_cpu); + if (!ctx->cpu) + goto out_freeref; + if (aio_setup_ring(ctx) < 0) - goto out_freectx; + goto out_freepcpu; + + atomic_set(&ctx->reqs_available, ctx->nr_events - 1); + ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); + if (ctx->req_batch < 1) + ctx->req_batch = 1; /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); - if (aio_nr + nr_events > aio_max_nr || + if (aio_nr + nr_events > (aio_max_nr * 2UL) || aio_nr + nr_events < aio_nr) { spin_unlock(&aio_nr_lock); goto out_cleanup; @@ -386,49 +597,54 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) aio_nr += ctx->max_reqs; spin_unlock(&aio_nr_lock); - /* now link into global list. */ - spin_lock(&mm->ioctx_lock); - hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); - spin_unlock(&mm->ioctx_lock); + percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ + + err = ioctx_add_table(ctx, mm); + if (err) + goto out_cleanup_put; pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, mm, ctx->nr_events); return ctx; +out_cleanup_put: + percpu_ref_put(&ctx->users); out_cleanup: err = -EAGAIN; aio_free_ring(ctx); +out_freepcpu: + free_percpu(ctx->cpu); +out_freeref: + free_percpu(ctx->users.pcpu_count); out_freectx: + if (ctx->aio_ring_file) + fput(ctx->aio_ring_file); kmem_cache_free(kioctx_cachep, ctx); pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); } -static void kill_ioctx_work(struct work_struct *work) -{ - struct kioctx *ctx = container_of(work, struct kioctx, rcu_work); - - wake_up_all(&ctx->wait); - put_ioctx(ctx); -} - -static void kill_ioctx_rcu(struct rcu_head *head) -{ - struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); - - INIT_WORK(&ctx->rcu_work, kill_ioctx_work); - schedule_work(&ctx->rcu_work); -} - /* kill_ioctx * Cancels all outstanding aio requests on an aio context. Used * when the processes owning a context have all exited to encourage * the rapid destruction of the kioctx. */ -static void kill_ioctx(struct kioctx *ctx) +static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) { if (!atomic_xchg(&ctx->dead, 1)) { - hlist_del_rcu(&ctx->list); + struct kioctx_table *table; + + spin_lock(&mm->ioctx_lock); + rcu_read_lock(); + table = rcu_dereference(mm->ioctx_table); + + WARN_ON(ctx != table->table[ctx->id]); + table->table[ctx->id] = NULL; + rcu_read_unlock(); + spin_unlock(&mm->ioctx_lock); + + /* percpu_ref_kill() will do the necessary call_rcu() */ + wake_up_all(&ctx->wait); /* * It'd be more correct to do this in free_ioctx(), after all @@ -445,24 +661,23 @@ static void kill_ioctx(struct kioctx *ctx) if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); - /* Between hlist_del_rcu() and dropping the initial ref */ - call_rcu(&ctx->rcu_head, kill_ioctx_rcu); + percpu_ref_kill(&ctx->users); } } /* wait_on_sync_kiocb: * Waits on the given sync kiocb to complete. */ -ssize_t wait_on_sync_kiocb(struct kiocb *iocb) +ssize_t wait_on_sync_kiocb(struct kiocb *req) { - while (atomic_read(&iocb->ki_users)) { + while (!req->ki_ctx) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!atomic_read(&iocb->ki_users)) + if (req->ki_ctx) break; io_schedule(); } __set_current_state(TASK_RUNNING); - return iocb->ki_user_data; + return req->ki_user_data; } EXPORT_SYMBOL(wait_on_sync_kiocb); @@ -476,16 +691,28 @@ EXPORT_SYMBOL(wait_on_sync_kiocb); */ void exit_aio(struct mm_struct *mm) { + struct kioctx_table *table; struct kioctx *ctx; - struct hlist_node *n; - - hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) { - if (1 != atomic_read(&ctx->users)) - printk(KERN_DEBUG - "exit_aio:ioctx still alive: %d %d %d\n", - atomic_read(&ctx->users), - atomic_read(&ctx->dead), - atomic_read(&ctx->reqs_active)); + unsigned i = 0; + + while (1) { + rcu_read_lock(); + table = rcu_dereference(mm->ioctx_table); + + do { + if (!table || i >= table->nr) { + rcu_read_unlock(); + rcu_assign_pointer(mm->ioctx_table, NULL); + if (table) + kfree(table); + return; + } + + ctx = table->table[i++]; + } while (!ctx); + + rcu_read_unlock(); + /* * We don't need to bother with munmap() here - * exit_mmap(mm) is coming and it'll unmap everything. @@ -496,40 +723,75 @@ void exit_aio(struct mm_struct *mm) */ ctx->mmap_size = 0; - kill_ioctx(ctx); + kill_ioctx(mm, ctx); + } +} + +static void put_reqs_available(struct kioctx *ctx, unsigned nr) +{ + struct kioctx_cpu *kcpu; + + preempt_disable(); + kcpu = this_cpu_ptr(ctx->cpu); + + kcpu->reqs_available += nr; + while (kcpu->reqs_available >= ctx->req_batch * 2) { + kcpu->reqs_available -= ctx->req_batch; + atomic_add(ctx->req_batch, &ctx->reqs_available); + } + + preempt_enable(); +} + +static bool get_reqs_available(struct kioctx *ctx) +{ + struct kioctx_cpu *kcpu; + bool ret = false; + + preempt_disable(); + kcpu = this_cpu_ptr(ctx->cpu); + + if (!kcpu->reqs_available) { + int old, avail = atomic_read(&ctx->reqs_available); + + do { + if (avail < ctx->req_batch) + goto out; + + old = avail; + avail = atomic_cmpxchg(&ctx->reqs_available, + avail, avail - ctx->req_batch); + } while (avail != old); + + kcpu->reqs_available += ctx->req_batch; } + + ret = true; + kcpu->reqs_available--; +out: + preempt_enable(); + return ret; } /* aio_get_req - * Allocate a slot for an aio request. Increments the ki_users count - * of the kioctx so that the kioctx stays around until all requests are - * complete. Returns NULL if no requests are free. - * - * Returns with kiocb->ki_users set to 2. The io submit code path holds - * an extra reference while submitting the i/o. - * This prevents races between the aio code path referencing the - * req (after submitting it) and aio_complete() freeing the req. + * Allocate a slot for an aio request. + * Returns NULL if no requests are free. */ static inline struct kiocb *aio_get_req(struct kioctx *ctx) { struct kiocb *req; - if (atomic_read(&ctx->reqs_active) >= ctx->nr_events) + if (!get_reqs_available(ctx)) return NULL; - if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1) - goto out_put; - req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); if (unlikely(!req)) goto out_put; - atomic_set(&req->ki_users, 2); req->ki_ctx = ctx; - return req; out_put: - atomic_dec(&ctx->reqs_active); + put_reqs_available(ctx, 1); return NULL; } @@ -539,35 +801,32 @@ static void kiocb_free(struct kiocb *req) fput(req->ki_filp); if (req->ki_eventfd != NULL) eventfd_ctx_put(req->ki_eventfd); - if (req->ki_dtor) - req->ki_dtor(req); - if (req->ki_iovec != &req->ki_inline_vec) - kfree(req->ki_iovec); kmem_cache_free(kiocb_cachep, req); } -void aio_put_req(struct kiocb *req) -{ - if (atomic_dec_and_test(&req->ki_users)) - kiocb_free(req); -} -EXPORT_SYMBOL(aio_put_req); - static struct kioctx *lookup_ioctx(unsigned long ctx_id) { + struct aio_ring __user *ring = (void __user *)ctx_id; struct mm_struct *mm = current->mm; struct kioctx *ctx, *ret = NULL; + struct kioctx_table *table; + unsigned id; + + if (get_user(id, &ring->id)) + return NULL; rcu_read_lock(); + table = rcu_dereference(mm->ioctx_table); - hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { - if (ctx->user_id == ctx_id) { - atomic_inc(&ctx->users); - ret = ctx; - break; - } - } + if (!table || id >= table->nr) + goto out; + ctx = table->table[id]; + if (ctx && ctx->user_id == ctx_id) { + percpu_ref_get(&ctx->users); + ret = ctx; + } +out: rcu_read_unlock(); return ret; } @@ -591,16 +850,16 @@ void aio_complete(struct kiocb *iocb, long res, long res2) * - the sync task helpfully left a reference to itself in the iocb */ if (is_sync_kiocb(iocb)) { - BUG_ON(atomic_read(&iocb->ki_users) != 1); iocb->ki_user_data = res; - atomic_set(&iocb->ki_users, 0); + smp_wmb(); + iocb->ki_ctx = ERR_PTR(-EXDEV); wake_up_process(iocb->ki_obj.tsk); return; } /* * Take rcu_read_lock() in case the kioctx is being destroyed, as we - * need to issue a wakeup after decrementing reqs_active. + * need to issue a wakeup after incrementing reqs_available. */ rcu_read_lock(); @@ -613,17 +872,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2) } /* - * cancelled requests don't get events, userland was given one - * when the event got cancelled. - */ - if (unlikely(xchg(&iocb->ki_cancel, - KIOCB_CANCELLED) == KIOCB_CANCELLED)) { - atomic_dec(&ctx->reqs_active); - /* Still need the wake_up in case free_ioctx is waiting */ - goto put_rq; - } - - /* * Add a completion event to the ring buffer. Must be done holding * ctx->completion_lock to prevent other code from messing with the tail * pointer since we might be called from irq context. @@ -675,9 +923,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2) if (iocb->ki_eventfd != NULL) eventfd_signal(iocb->ki_eventfd, 1); -put_rq: /* everything turned out well, dispose of the aiocb. */ - aio_put_req(iocb); + kiocb_free(iocb); /* * We have to order our ring_info tail store above and test @@ -702,7 +949,7 @@ static long aio_read_events_ring(struct kioctx *ctx, struct io_event __user *event, long nr) { struct aio_ring *ring; - unsigned head, pos; + unsigned head, tail, pos; long ret = 0; int copy_ret; @@ -710,11 +957,12 @@ static long aio_read_events_ring(struct kioctx *ctx, ring = kmap_atomic(ctx->ring_pages[0]); head = ring->head; + tail = ring->tail; kunmap_atomic(ring); - pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events); + pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events); - if (head == ctx->tail) + if (head == tail) goto out; while (ret < nr) { @@ -722,8 +970,8 @@ static long aio_read_events_ring(struct kioctx *ctx, struct io_event *ev; struct page *page; - avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; - if (head == ctx->tail) + avail = (head <= tail ? tail : ctx->nr_events) - head; + if (head == tail) break; avail = min(avail, nr - ret); @@ -754,9 +1002,9 @@ static long aio_read_events_ring(struct kioctx *ctx, kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); - pr_debug("%li h%u t%u\n", ret, head, ctx->tail); + pr_debug("%li h%u t%u\n", ret, head, tail); - atomic_sub(ret, &ctx->reqs_active); + put_reqs_available(ctx, ret); out: mutex_unlock(&ctx->ring_lock); @@ -854,8 +1102,8 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) - kill_ioctx(ioctx); - put_ioctx(ioctx); + kill_ioctx(current->mm, ioctx); + percpu_ref_put(&ioctx->users); } out: @@ -872,101 +1120,37 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { - kill_ioctx(ioctx); - put_ioctx(ioctx); + kill_ioctx(current->mm, ioctx); + percpu_ref_put(&ioctx->users); return 0; } pr_debug("EINVAL: io_destroy: invalid context id\n"); return -EINVAL; } -static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) -{ - struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg]; - - BUG_ON(ret <= 0); - - while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) { - ssize_t this = min((ssize_t)iov->iov_len, ret); - iov->iov_base += this; - iov->iov_len -= this; - iocb->ki_left -= this; - ret -= this; - if (iov->iov_len == 0) { - iocb->ki_cur_seg++; - iov++; - } - } - - /* the caller should not have done more io than what fit in - * the remaining iovecs */ - BUG_ON(ret > 0 && iocb->ki_left == 0); -} - typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, unsigned long, loff_t); -static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t ret = 0; - - /* This matches the pread()/pwrite() logic */ - if (iocb->ki_pos < 0) - return -EINVAL; - - if (rw == WRITE) - file_start_write(file); - do { - ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], - iocb->ki_nr_segs - iocb->ki_cur_seg, - iocb->ki_pos); - if (ret > 0) - aio_advance_iovec(iocb, ret); - - /* retry all partial writes. retry partial reads as long as its a - * regular file. */ - } while (ret > 0 && iocb->ki_left > 0 && - (rw == WRITE || - (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); - if (rw == WRITE) - file_end_write(file); - - /* This means we must have transferred all that we could */ - /* No need to retry anymore */ - if ((ret == 0) || (iocb->ki_left == 0)) - ret = iocb->ki_nbytes - iocb->ki_left; - - /* If we managed to write some out we return that, rather than - * the eventual error. */ - if (rw == WRITE - && ret < 0 && ret != -EIOCBQUEUED - && iocb->ki_nbytes - iocb->ki_left) - ret = iocb->ki_nbytes - iocb->ki_left; - - return ret; -} - -static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat) +static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, + int rw, char __user *buf, + unsigned long *nr_segs, + struct iovec **iovec, + bool compat) { ssize_t ret; - kiocb->ki_nr_segs = kiocb->ki_nbytes; + *nr_segs = kiocb->ki_nbytes; #ifdef CONFIG_COMPAT if (compat) ret = compat_rw_copy_check_uvector(rw, - (struct compat_iovec __user *)kiocb->ki_buf, - kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec); + (struct compat_iovec __user *)buf, + *nr_segs, 1, *iovec, iovec); else #endif ret = rw_copy_check_uvector(rw, - (struct iovec __user *)kiocb->ki_buf, - kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec); + (struct iovec __user *)buf, + *nr_segs, 1, *iovec, iovec); if (ret < 0) return ret; @@ -975,15 +1159,17 @@ static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat) return 0; } -static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb) +static ssize_t aio_setup_single_vector(struct kiocb *kiocb, + int rw, char __user *buf, + unsigned long *nr_segs, + struct iovec *iovec) { - if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes))) + if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes))) return -EFAULT; - kiocb->ki_iovec = &kiocb->ki_inline_vec; - kiocb->ki_iovec->iov_base = kiocb->ki_buf; - kiocb->ki_iovec->iov_len = kiocb->ki_nbytes; - kiocb->ki_nr_segs = 1; + iovec->iov_base = buf; + iovec->iov_len = kiocb->ki_nbytes; + *nr_segs = 1; return 0; } @@ -992,15 +1178,18 @@ static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb) * Performs the initial checks and aio retry method * setup for the kiocb at the time of io submission. */ -static ssize_t aio_run_iocb(struct kiocb *req, bool compat) +static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, + char __user *buf, bool compat) { struct file *file = req->ki_filp; ssize_t ret; + unsigned long nr_segs; int rw; fmode_t mode; aio_rw_op *rw_op; + struct iovec inline_vec, *iovec = &inline_vec; - switch (req->ki_opcode) { + switch (opcode) { case IOCB_CMD_PREAD: case IOCB_CMD_PREADV: mode = FMODE_READ; @@ -1021,21 +1210,38 @@ rw_common: if (!rw_op) return -EINVAL; - ret = (req->ki_opcode == IOCB_CMD_PREADV || - req->ki_opcode == IOCB_CMD_PWRITEV) - ? aio_setup_vectored_rw(rw, req, compat) - : aio_setup_single_vector(rw, req); + ret = (opcode == IOCB_CMD_PREADV || + opcode == IOCB_CMD_PWRITEV) + ? aio_setup_vectored_rw(req, rw, buf, &nr_segs, + &iovec, compat) + : aio_setup_single_vector(req, rw, buf, &nr_segs, + iovec); if (ret) return ret; ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); - if (ret < 0) + if (ret < 0) { + if (iovec != &inline_vec) + kfree(iovec); return ret; + } req->ki_nbytes = ret; - req->ki_left = ret; - ret = aio_rw_vect_retry(req, rw, rw_op); + /* XXX: move/kill - rw_verify_area()? */ + /* This matches the pread()/pwrite() logic */ + if (req->ki_pos < 0) { + ret = -EINVAL; + break; + } + + if (rw == WRITE) + file_start_write(file); + + ret = rw_op(req, iovec, nr_segs, req->ki_pos); + + if (rw == WRITE) + file_end_write(file); break; case IOCB_CMD_FDSYNC: @@ -1057,6 +1263,9 @@ rw_common: return -EINVAL; } + if (iovec != &inline_vec) + kfree(iovec); + if (ret != -EIOCBQUEUED) { /* * There's no easy way to restart the syscall since other AIO's @@ -1128,21 +1337,18 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_obj.user = user_iocb; req->ki_user_data = iocb->aio_data; req->ki_pos = iocb->aio_offset; + req->ki_nbytes = iocb->aio_nbytes; - req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf; - req->ki_left = req->ki_nbytes = iocb->aio_nbytes; - req->ki_opcode = iocb->aio_lio_opcode; - - ret = aio_run_iocb(req, compat); + ret = aio_run_iocb(req, iocb->aio_lio_opcode, + (char __user *)(unsigned long)iocb->aio_buf, + compat); if (ret) goto out_put_req; - aio_put_req(req); /* drop extra ref to req */ return 0; out_put_req: - atomic_dec(&ctx->reqs_active); - aio_put_req(req); /* drop extra ref to req */ - aio_put_req(req); /* drop i/o ref to req */ + put_reqs_available(ctx, 1); + kiocb_free(req); return ret; } @@ -1195,7 +1401,7 @@ long do_io_submit(aio_context_t ctx_id, long nr, } blk_finish_plug(&plug); - put_ioctx(ctx); + percpu_ref_put(&ctx->users); return i ? i : ret; } @@ -1252,7 +1458,6 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct io_event __user *, result) { - struct io_event res; struct kioctx *ctx; struct kiocb *kiocb; u32 key; @@ -1270,21 +1475,22 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, kiocb = lookup_kiocb(ctx, iocb, key); if (kiocb) - ret = kiocb_cancel(ctx, kiocb, &res); + ret = kiocb_cancel(ctx, kiocb); else ret = -EINVAL; spin_unlock_irq(&ctx->ctx_lock); if (!ret) { - /* Cancellation succeeded -- copy the result - * into the user's buffer. + /* + * The result argument is no longer used - the io_event is + * always delivered via the ring buffer. -EINPROGRESS indicates + * cancellation is progress: */ - if (copy_to_user(result, &res, sizeof(res))) - ret = -EFAULT; + ret = -EINPROGRESS; } - put_ioctx(ctx); + percpu_ref_put(&ctx->users); return ret; } @@ -1313,7 +1519,7 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, if (likely(ioctx)) { if (likely(min_nr <= nr && min_nr >= 0)) ret = read_events(ioctx, min_nr, nr, events, timeout); - put_ioctx(ioctx); + percpu_ref_put(&ioctx->users); } return ret; } diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 47a65df8c871..85c961849953 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -109,6 +109,72 @@ static struct file_system_type anon_inode_fs_type = { }; /** + * anon_inode_getfile_private - creates a new file instance by hooking it up to an + * anonymous inode, and a dentry that describe the "class" + * of the file + * + * @name: [in] name of the "class" of the new file + * @fops: [in] file operations for the new file + * @priv: [in] private data for the new file (will be file's private_data) + * @flags: [in] flags + * + * + * Similar to anon_inode_getfile, but each file holds a single inode. + * + */ +struct file *anon_inode_getfile_private(const char *name, + const struct file_operations *fops, + void *priv, int flags) +{ + struct qstr this; + struct path path; + struct file *file; + struct inode *inode; + + if (fops->owner && !try_module_get(fops->owner)) + return ERR_PTR(-ENOENT); + + inode = anon_inode_mkinode(anon_inode_mnt->mnt_sb); + if (IS_ERR(inode)) { + file = ERR_PTR(-ENOMEM); + goto err_module; + } + + /* + * Link the inode to a directory entry by creating a unique name + * using the inode sequence number. + */ + file = ERR_PTR(-ENOMEM); + this.name = name; + this.len = strlen(name); + this.hash = 0; + path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this); + if (!path.dentry) + goto err_module; + + path.mnt = mntget(anon_inode_mnt); + + d_instantiate(path.dentry, inode); + + file = alloc_file(&path, OPEN_FMODE(flags), fops); + if (IS_ERR(file)) + goto err_dput; + + file->f_mapping = inode->i_mapping; + file->f_flags = flags & (O_ACCMODE | O_NONBLOCK); + file->private_data = priv; + + return file; + +err_dput: + path_put(&path); +err_module: + module_put(fops->owner); + return file; +} +EXPORT_SYMBOL_GPL(anon_inode_getfile_private); + +/** * anon_inode_getfile - creates a new file instance by hooking it up to an * anonymous inode, and a dentry that describe the "class" * of the file diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 743c7c2c949d..0f00da329e71 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -183,13 +183,14 @@ static int autofs_dev_ioctl_protosubver(struct file *fp, return 0; } +/* Find the topmost mount satisfying test() */ static int find_autofs_mount(const char *pathname, struct path *res, int test(struct path *path, void *data), void *data) { struct path path; - int err = kern_path(pathname, 0, &path); + int err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0); if (err) return err; err = -ENOENT; @@ -197,10 +198,9 @@ static int find_autofs_mount(const char *pathname, if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) { if (test(&path, data)) { path_get(&path); - if (!err) /* already found some */ - path_put(res); *res = path; err = 0; + break; } } if (!follow_up(&path)) @@ -486,12 +486,11 @@ static int autofs_dev_ioctl_askumount(struct file *fp, * mount if there is one or 0 if it isn't a mountpoint. * * If we aren't supplied with a file descriptor then we - * lookup the nameidata of the path and check if it is the - * root of a mount. If a type is given we are looking for - * a particular autofs mount and if we don't find a match - * we return fail. If the located nameidata path is the - * root of a mount we return 1 along with the super magic - * of the mount or 0 otherwise. + * lookup the path and check if it is the root of a mount. + * If a type is given we are looking for a particular autofs + * mount and if we don't find a match we return fail. If the + * located path is the root of a mount we return 1 along with + * the super magic of the mount or 0 otherwise. * * In both cases the the device number (as returned by * new_encode_dev()) is also returned. @@ -519,9 +518,11 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, if (!fp || param->ioctlfd == -1) { if (autofs_type_any(type)) - err = kern_path(name, LOOKUP_FOLLOW, &path); + err = kern_path_mountpoint(AT_FDCWD, + name, &path, LOOKUP_FOLLOW); else - err = find_autofs_mount(name, &path, test_by_type, &type); + err = find_autofs_mount(name, &path, + test_by_type, &type); if (err) goto out; devid = new_encode_dev(path.dentry->d_sb->s_dev); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 3db70dae40d3..689e40d983ad 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -109,13 +109,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pkt.hdr.proto_version = sbi->version; pkt.hdr.type = type; - mutex_lock(&sbi->wq_mutex); - /* Check if we have become catatonic */ - if (sbi->catatonic) { - mutex_unlock(&sbi->wq_mutex); - return; - } switch (type) { /* Kernel protocol v4 missing and expire packets */ case autofs_ptype_missing: @@ -427,7 +421,6 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, wq->tgid = current->tgid; wq->status = -EINTR; /* Status return if interrupted */ wq->wait_ctr = 2; - mutex_unlock(&sbi->wq_mutex); if (sbi->version < 5) { if (notify == NFY_MOUNT) @@ -449,15 +442,15 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, notify); - /* autofs4_notify_daemon() may block */ + /* autofs4_notify_daemon() may block; it will unlock ->wq_mutex */ autofs4_notify_daemon(sbi, wq, type); } else { wq->wait_ctr++; - mutex_unlock(&sbi->wq_mutex); - kfree(qstr.name); DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d", (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, notify); + mutex_unlock(&sbi->wq_mutex); + kfree(qstr.name); } /* diff --git a/fs/bfs/file.c b/fs/bfs/file.c index ad3ea1497cc3..ae2892218335 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -166,7 +166,7 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); } static int bfs_write_begin(struct file *file, struct address_space *mapping, diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 8fb42916d8a2..fc60b31453ee 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -716,13 +716,14 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size) return 0; bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab); - - bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size); - if (!bs->bvec_integrity_pool) + if (!bs->bio_integrity_pool) return -1; - if (!bs->bio_integrity_pool) + bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size); + if (!bs->bvec_integrity_pool) { + mempool_destroy(bs->bio_integrity_pool); return -1; + } return 0; } @@ -734,7 +735,7 @@ void bioset_integrity_free(struct bio_set *bs) mempool_destroy(bs->bio_integrity_pool); if (bs->bvec_integrity_pool) - mempool_destroy(bs->bio_integrity_pool); + mempool_destroy(bs->bvec_integrity_pool); } EXPORT_SYMBOL(bioset_integrity_free); @@ -917,8 +917,8 @@ void bio_copy_data(struct bio *dst, struct bio *src) src_p = kmap_atomic(src_bv->bv_page); dst_p = kmap_atomic(dst_bv->bv_page); - memcpy(dst_p + dst_bv->bv_offset, - src_p + src_bv->bv_offset, + memcpy(dst_p + dst_offset, + src_p + src_offset, bytes); kunmap_atomic(dst_p); @@ -1956,7 +1956,7 @@ int bio_associate_current(struct bio *bio) /* associate blkcg if exists */ rcu_read_lock(); - css = task_subsys_state(current, blkio_subsys_id); + css = task_css(current, blkio_subsys_id); if (css && css_tryget(css)) bio->bi_css = css; rcu_read_unlock(); diff --git a/fs/block_dev.c b/fs/block_dev.c index c7bda5cd3da7..1e86823a9cbd 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -592,7 +592,7 @@ static struct block_device *bd_acquire(struct inode *inode) return bdev; } -static inline int sb_is_blkdev_sb(struct super_block *sb) +int sb_is_blkdev_sb(struct super_block *sb) { return sb == blockdev_superblock; } @@ -1519,7 +1519,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, blk_start_plug(&plug); ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); - if (ret > 0 || ret == -EIOCBQUEUED) { + if (ret > 0) { ssize_t err; err = generic_write_sync(file, pos, ret); @@ -1542,7 +1542,7 @@ static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, return 0; size -= pos; - if (size < iocb->ki_left) + if (size < iocb->ki_nbytes) nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); return generic_file_aio_read(iocb, iov, nr_segs, pos); } diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 2a9bd5bd24c3..9efb94e95858 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -744,7 +744,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) WARN_ON(atomic_xchg( &fs_info->mutually_exclusive_operation_running, 1)); task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); - return PTR_RET(task); + return PTR_ERR_OR_ZERO(task); } static int btrfs_dev_replace_kthread(void *data) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d12107e90987..72da4df53c9a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1718,7 +1718,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, */ BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; BTRFS_I(inode)->last_sub_trans = root->log_transid; - if (num_written > 0 || num_written == -EIOCBQUEUED) { + if (num_written > 0) { err = generic_write_sync(file, pos, num_written); if (err < 0 && num_written > 0) num_written = err; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4f419bafd071..b4f9904c4c6b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -221,12 +221,10 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_path *path, struct inode *inode) { - loff_t oldsize; int ret = 0; - oldsize = i_size_read(inode); btrfs_i_size_write(inode, 0); - truncate_pagecache(inode, oldsize, 0); + truncate_pagecache(inode, 0); /* * We don't need an orphan item because truncating the free space cache diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8af6c03953ad..22ebc13b6c99 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3106,7 +3106,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); - ret = PTR_RET(inode); + ret = PTR_ERR_OR_ZERO(inode); if (ret && ret != -ESTALE) goto out; @@ -4349,7 +4349,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); if (newsize > oldsize) { - truncate_pagecache(inode, oldsize, newsize); + truncate_pagecache(inode, newsize); ret = btrfs_cont_expand(inode, oldsize, newsize); if (ret) return ret; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index b4b15467426b..e46e0ed74925 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -220,7 +220,7 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) len = PAGE_ALIGN(len); if (p->buf == p->inline_buf) { - tmp_buf = kmalloc(len, GFP_NOFS); + tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN); if (!tmp_buf) { tmp_buf = vmalloc(len); if (!tmp_buf) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b0203b1322ac..043b215769c2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3329,7 +3329,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) } tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); - return PTR_RET(tsk); + return PTR_ERR_OR_ZERO(tsk); } int btrfs_recover_balance(struct btrfs_fs_info *fs_info) diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index d4c1206af9fc..43eb5592cdea 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -378,6 +378,31 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache) } /* + * check if the backing cache is updated to FS-Cache + * - called by FS-Cache when evaluates if need to invalidate the cache + */ +static bool cachefiles_check_consistency(struct fscache_operation *op) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + int ret; + + _enter("{OBJ%x}", op->object->debug_id); + + object = container_of(op->object, struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + cachefiles_begin_secure(cache, &saved_cred); + ret = cachefiles_check_auxdata(object); + cachefiles_end_secure(cache, saved_cred); + + _leave(" = %d", ret); + return ret; +} + +/* * notification the attributes on an object have changed * - called with reads/writes excluded by FS-Cache */ @@ -522,4 +547,5 @@ const struct fscache_cache_ops cachefiles_cache_ops = { .write_page = cachefiles_write_page, .uncache_page = cachefiles_uncache_page, .dissociate_pages = cachefiles_dissociate_pages, + .check_consistency = cachefiles_check_consistency, }; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 49382519907a..5349473df1b1 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -235,6 +235,7 @@ extern int cachefiles_set_object_xattr(struct cachefiles_object *object, struct cachefiles_xattr *auxdata); extern int cachefiles_update_object_xattr(struct cachefiles_object *object, struct cachefiles_xattr *auxdata); +extern int cachefiles_check_auxdata(struct cachefiles_object *object); extern int cachefiles_check_object_xattr(struct cachefiles_object *object, struct cachefiles_xattr *auxdata); extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 25badd1aec5c..f4a08d7fa2f7 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -56,7 +56,7 @@ void __cachefiles_printk_object(struct cachefiles_object *object, object->fscache.cookie->parent, object->fscache.cookie->netfs_data, object->fscache.cookie->flags); - if (keybuf) + if (keybuf && cookie->def) keylen = cookie->def->get_key(cookie->netfs_data, keybuf, CACHEFILES_KEYBUF_SIZE); else diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 2476e5162609..12b0eef84183 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -157,6 +157,43 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object, } /* + * check the consistency between the backing cache and the FS-Cache cookie + */ +int cachefiles_check_auxdata(struct cachefiles_object *object) +{ + struct cachefiles_xattr *auxbuf; + enum fscache_checkaux validity; + struct dentry *dentry = object->dentry; + ssize_t xlen; + int ret; + + ASSERT(dentry); + ASSERT(dentry->d_inode); + ASSERT(object->fscache.cookie->def->check_aux); + + auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL); + if (!auxbuf) + return -ENOMEM; + + xlen = vfs_getxattr(dentry, cachefiles_xattr_cache, + &auxbuf->type, 512 + 1); + ret = -ESTALE; + if (xlen < 1 || + auxbuf->type != object->fscache.cookie->def->type) + goto error; + + xlen--; + validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen); + if (validity != FSCACHE_CHECKAUX_OKAY) + goto error; + + ret = 0; +error: + kfree(auxbuf); + return ret; +} + +/* * check the state xattr on a cache file * - return -ESTALE if the object should be deleted */ diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 49bc78243db9..ac9a2ef5bb9b 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -16,3 +16,12 @@ config CEPH_FS If unsure, say N. +if CEPH_FS +config CEPH_FSCACHE + bool "Enable Ceph client caching support" + depends on CEPH_FS=m && FSCACHE || CEPH_FS=y && FSCACHE=y + help + Choose Y here to enable persistent, read-only local + caching support for Ceph clients using FS-Cache + +endif diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index bd352125e829..32e30106a2f0 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -9,3 +9,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ mds_client.o mdsmap.o strings.o ceph_frag.o \ debugfs.o +ceph-$(CONFIG_CEPH_FSCACHE) += cache.o diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 5318a3b704f6..6df8bd481425 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -11,6 +11,7 @@ #include "super.h" #include "mds_client.h" +#include "cache.h" #include <linux/ceph/osd_client.h> /* @@ -70,15 +71,16 @@ static int ceph_set_page_dirty(struct page *page) struct address_space *mapping = page->mapping; struct inode *inode; struct ceph_inode_info *ci; - int undo = 0; struct ceph_snap_context *snapc; + int ret; if (unlikely(!mapping)) return !TestSetPageDirty(page); - if (TestSetPageDirty(page)) { + if (PageDirty(page)) { dout("%p set_page_dirty %p idx %lu -- already dirty\n", mapping->host, page, page->index); + BUG_ON(!PagePrivate(page)); return 0; } @@ -107,35 +109,19 @@ static int ceph_set_page_dirty(struct page *page) snapc, snapc->seq, snapc->num_snaps); spin_unlock(&ci->i_ceph_lock); - /* now adjust page */ - spin_lock_irq(&mapping->tree_lock); - if (page->mapping) { /* Race with truncate? */ - WARN_ON_ONCE(!PageUptodate(page)); - account_page_dirtied(page, page->mapping); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), PAGECACHE_TAG_DIRTY); - - /* - * Reference snap context in page->private. Also set - * PagePrivate so that we get invalidatepage callback. - */ - page->private = (unsigned long)snapc; - SetPagePrivate(page); - } else { - dout("ANON set_page_dirty %p (raced truncate?)\n", page); - undo = 1; - } - - spin_unlock_irq(&mapping->tree_lock); - - if (undo) - /* whoops, we failed to dirty the page */ - ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + /* + * Reference snap context in page->private. Also set + * PagePrivate so that we get invalidatepage callback. + */ + BUG_ON(PagePrivate(page)); + page->private = (unsigned long)snapc; + SetPagePrivate(page); - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + ret = __set_page_dirty_nobuffers(page); + WARN_ON(!PageLocked(page)); + WARN_ON(!page->mapping); - BUG_ON(!PageDirty(page)); - return 1; + return ret; } /* @@ -150,11 +136,19 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, struct ceph_inode_info *ci; struct ceph_snap_context *snapc = page_snap_context(page); - BUG_ON(!PageLocked(page)); - BUG_ON(!PagePrivate(page)); - BUG_ON(!page->mapping); - inode = page->mapping->host; + ci = ceph_inode(inode); + + if (offset != 0 || length != PAGE_CACHE_SIZE) { + dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n", + inode, page, page->index, offset, length); + return; + } + + ceph_invalidate_fscache_page(inode, page); + + if (!PagePrivate(page)) + return; /* * We can get non-dirty pages here due to races between @@ -164,31 +158,28 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, if (!PageDirty(page)) pr_err("%p invalidatepage %p page not dirty\n", inode, page); - if (offset == 0 && length == PAGE_CACHE_SIZE) - ClearPageChecked(page); + ClearPageChecked(page); - ci = ceph_inode(inode); - if (offset == 0 && length == PAGE_CACHE_SIZE) { - dout("%p invalidatepage %p idx %lu full dirty page\n", - inode, page, page->index); - ceph_put_wrbuffer_cap_refs(ci, 1, snapc); - ceph_put_snap_context(snapc); - page->private = 0; - ClearPagePrivate(page); - } else { - dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n", - inode, page, page->index, offset, length); - } + dout("%p invalidatepage %p idx %lu full dirty page\n", + inode, page, page->index); + + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + ceph_put_snap_context(snapc); + page->private = 0; + ClearPagePrivate(page); } -/* just a sanity check */ static int ceph_releasepage(struct page *page, gfp_t g) { struct inode *inode = page->mapping ? page->mapping->host : NULL; dout("%p releasepage %p idx %lu\n", inode, page, page->index); WARN_ON(PageDirty(page)); - WARN_ON(PagePrivate(page)); - return 0; + + /* Can we release the page from the cache? */ + if (!ceph_release_fscache_page(page, g)) + return 0; + + return !PagePrivate(page); } /* @@ -198,11 +189,16 @@ static int readpage_nounlock(struct file *filp, struct page *page) { struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = + struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; int err = 0; u64 len = PAGE_CACHE_SIZE; + err = ceph_readpage_from_fscache(inode, page); + + if (err == 0) + goto out; + dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, @@ -220,6 +216,9 @@ static int readpage_nounlock(struct file *filp, struct page *page) } SetPageUptodate(page); + if (err == 0) + ceph_readpage_to_fscache(inode, page); + out: return err < 0 ? err : 0; } @@ -262,6 +261,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) page->index); flush_dcache_page(page); SetPageUptodate(page); + ceph_readpage_to_fscache(inode, page); unlock_page(page); page_cache_release(page); bytes -= PAGE_CACHE_SIZE; @@ -331,11 +331,12 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) page = list_entry(page_list->prev, struct page, lru); BUG_ON(PageLocked(page)); list_del(&page->lru); - + dout("start_read %p adding %p idx %lu\n", inode, page, page->index); if (add_to_page_cache_lru(page, &inode->i_data, page->index, GFP_NOFS)) { + ceph_fscache_uncache_page(inode, page); page_cache_release(page); dout("start_read %p add_to_page_cache failed %p\n", inode, page); @@ -378,6 +379,12 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, int rc = 0; int max = 0; + rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list, + &nr_pages); + + if (rc == 0) + goto out; + if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; @@ -392,6 +399,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, BUG_ON(rc == 0); } out: + ceph_fscache_readpages_cancel(inode, page_list); + dout("readpages %p file %p ret %d\n", inode, file, rc); return rc; } @@ -497,6 +506,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); + ceph_readpage_to_fscache(inode, page); + set_page_writeback(page); err = ceph_osdc_writepages(osdc, ceph_vino(inode), &ci->i_layout, snapc, @@ -552,7 +563,6 @@ static void ceph_release_pages(struct page **pages, int num) pagevec_release(&pvec); } - /* * async writeback completion handler. * diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c new file mode 100644 index 000000000000..6bfe65e0b038 --- /dev/null +++ b/fs/ceph/cache.c @@ -0,0 +1,398 @@ +/* + * Ceph cache definitions. + * + * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved. + * Written by Milosz Tanski (milosz@adfin.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include "super.h" +#include "cache.h" + +struct ceph_aux_inode { + struct timespec mtime; + loff_t size; +}; + +struct fscache_netfs ceph_cache_netfs = { + .name = "ceph", + .version = 0, +}; + +static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct ceph_fs_client* fsc = cookie_netfs_data; + uint16_t klen; + + klen = sizeof(fsc->client->fsid); + if (klen > maxbuf) + return 0; + + memcpy(buffer, &fsc->client->fsid, klen); + return klen; +} + +static const struct fscache_cookie_def ceph_fscache_fsid_object_def = { + .name = "CEPH.fsid", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = ceph_fscache_session_get_key, +}; + +int ceph_fscache_register(void) +{ + return fscache_register_netfs(&ceph_cache_netfs); +} + +void ceph_fscache_unregister(void) +{ + fscache_unregister_netfs(&ceph_cache_netfs); +} + +int ceph_fscache_register_fs(struct ceph_fs_client* fsc) +{ + fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index, + &ceph_fscache_fsid_object_def, + fsc); + + if (fsc->fscache == NULL) { + pr_err("Unable to resgister fsid: %p fscache cookie", fsc); + return 0; + } + + fsc->revalidate_wq = alloc_workqueue("ceph-revalidate", 0, 1); + if (fsc->revalidate_wq == NULL) + return -ENOMEM; + + return 0; +} + +static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct ceph_inode_info* ci = cookie_netfs_data; + uint16_t klen; + + /* use ceph virtual inode (id + snaphot) */ + klen = sizeof(ci->i_vino); + if (klen > maxbuf) + return 0; + + memcpy(buffer, &ci->i_vino, klen); + return klen; +} + +static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + struct ceph_aux_inode aux; + const struct ceph_inode_info* ci = cookie_netfs_data; + const struct inode* inode = &ci->vfs_inode; + + memset(&aux, 0, sizeof(aux)); + aux.mtime = inode->i_mtime; + aux.size = inode->i_size; + + memcpy(buffer, &aux, sizeof(aux)); + + return sizeof(aux); +} + +static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data, + uint64_t *size) +{ + const struct ceph_inode_info* ci = cookie_netfs_data; + const struct inode* inode = &ci->vfs_inode; + + *size = inode->i_size; +} + +static enum fscache_checkaux ceph_fscache_inode_check_aux( + void *cookie_netfs_data, const void *data, uint16_t dlen) +{ + struct ceph_aux_inode aux; + struct ceph_inode_info* ci = cookie_netfs_data; + struct inode* inode = &ci->vfs_inode; + + if (dlen != sizeof(aux)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&aux, 0, sizeof(aux)); + aux.mtime = inode->i_mtime; + aux.size = inode->i_size; + + if (memcmp(data, &aux, sizeof(aux)) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + dout("ceph inode 0x%p cached okay", ci); + return FSCACHE_CHECKAUX_OKAY; +} + +static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data) +{ + struct ceph_inode_info* ci = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + dout("ceph inode 0x%p now uncached", ci); + + while (1) { + nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +static const struct fscache_cookie_def ceph_fscache_inode_object_def = { + .name = "CEPH.inode", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = ceph_fscache_inode_get_key, + .get_attr = ceph_fscache_inode_get_attr, + .get_aux = ceph_fscache_inode_get_aux, + .check_aux = ceph_fscache_inode_check_aux, + .now_uncached = ceph_fscache_inode_now_uncached, +}; + +void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, + struct ceph_inode_info* ci) +{ + struct inode* inode = &ci->vfs_inode; + + /* No caching for filesystem */ + if (fsc->fscache == NULL) + return; + + /* Only cache for regular files that are read only */ + if ((ci->vfs_inode.i_mode & S_IFREG) == 0) + return; + + /* Avoid multiple racing open requests */ + mutex_lock(&inode->i_mutex); + + if (ci->fscache) + goto done; + + ci->fscache = fscache_acquire_cookie(fsc->fscache, + &ceph_fscache_inode_object_def, + ci); +done: + mutex_unlock(&inode->i_mutex); + +} + +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +{ + struct fscache_cookie* cookie; + + if ((cookie = ci->fscache) == NULL) + return; + + ci->fscache = NULL; + + fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode); + fscache_relinquish_cookie(cookie, 0); +} + +static void ceph_vfs_readpage_complete(struct page *page, void *data, int error) +{ + if (!error) + SetPageUptodate(page); +} + +static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error) +{ + if (!error) + SetPageUptodate(page); + + unlock_page(page); +} + +static inline int cache_valid(struct ceph_inode_info *ci) +{ + return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) && + (ci->i_fscache_gen == ci->i_rdcache_gen)); +} + + +/* Atempt to read from the fscache, + * + * This function is called from the readpage_nounlock context. DO NOT attempt to + * unlock the page here (or in the callback). + */ +int ceph_readpage_from_fscache(struct inode *inode, struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!cache_valid(ci)) + return -ENOBUFS; + + ret = fscache_read_or_alloc_page(ci->fscache, page, + ceph_vfs_readpage_complete, NULL, + GFP_KERNEL); + + switch (ret) { + case 0: /* Page found */ + dout("page read submitted\n"); + return 0; + case -ENOBUFS: /* Pages were not found, and can't be */ + case -ENODATA: /* Pages were not found */ + dout("page/inode not in cache\n"); + return ret; + default: + dout("%s: unknown error ret = %i\n", __func__, ret); + return ret; + } +} + +int ceph_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!cache_valid(ci)) + return -ENOBUFS; + + ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages, + ceph_vfs_readpage_complete_unlock, + NULL, mapping_gfp_mask(mapping)); + + switch (ret) { + case 0: /* All pages found */ + dout("all-page read submitted\n"); + return 0; + case -ENOBUFS: /* Some pages were not found, and can't be */ + case -ENODATA: /* some pages were not found */ + dout("page/inode not in cache\n"); + return ret; + default: + dout("%s: unknown error ret = %i\n", __func__, ret); + return ret; + } +} + +void ceph_readpage_to_fscache(struct inode *inode, struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!PageFsCache(page)) + return; + + if (!cache_valid(ci)) + return; + + ret = fscache_write_page(ci->fscache, page, GFP_KERNEL); + if (ret) + fscache_uncache_page(ci->fscache, page); +} + +void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + fscache_wait_on_page_write(ci->fscache, page); + fscache_uncache_page(ci->fscache, page); +} + +void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) +{ + if (fsc->revalidate_wq) + destroy_workqueue(fsc->revalidate_wq); + + fscache_relinquish_cookie(fsc->fscache, 0); + fsc->fscache = NULL; +} + +static void ceph_revalidate_work(struct work_struct *work) +{ + int issued; + u32 orig_gen; + struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, + i_revalidate_work); + struct inode *inode = &ci->vfs_inode; + + spin_lock(&ci->i_ceph_lock); + issued = __ceph_caps_issued(ci, NULL); + orig_gen = ci->i_rdcache_gen; + spin_unlock(&ci->i_ceph_lock); + + if (!(issued & CEPH_CAP_FILE_CACHE)) { + dout("revalidate_work lost cache before validation %p\n", + inode); + goto out; + } + + if (!fscache_check_consistency(ci->fscache)) + fscache_invalidate(ci->fscache); + + spin_lock(&ci->i_ceph_lock); + /* Update the new valid generation (backwards sanity check too) */ + if (orig_gen > ci->i_fscache_gen) { + ci->i_fscache_gen = orig_gen; + } + spin_unlock(&ci->i_ceph_lock); + +out: + iput(&ci->vfs_inode); +} + +void ceph_queue_revalidate(struct inode *inode) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_inode_info *ci = ceph_inode(inode); + + if (fsc->revalidate_wq == NULL || ci->fscache == NULL) + return; + + ihold(inode); + + if (queue_work(ceph_sb_to_client(inode->i_sb)->revalidate_wq, + &ci->i_revalidate_work)) { + dout("ceph_queue_revalidate %p\n", inode); + } else { + dout("ceph_queue_revalidate %p failed\n)", inode); + iput(inode); + } +} + +void ceph_fscache_inode_init(struct ceph_inode_info *ci) +{ + ci->fscache = NULL; + /* The first load is verifed cookie open time */ + ci->i_fscache_gen = 1; + INIT_WORK(&ci->i_revalidate_work, ceph_revalidate_work); +} diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h new file mode 100644 index 000000000000..ba949408a336 --- /dev/null +++ b/fs/ceph/cache.h @@ -0,0 +1,159 @@ +/* + * Ceph cache definitions. + * + * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved. + * Written by Milosz Tanski (milosz@adfin.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#ifndef _CEPH_CACHE_H +#define _CEPH_CACHE_H + +#ifdef CONFIG_CEPH_FSCACHE + +extern struct fscache_netfs ceph_cache_netfs; + +int ceph_fscache_register(void); +void ceph_fscache_unregister(void); + +int ceph_fscache_register_fs(struct ceph_fs_client* fsc); +void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc); + +void ceph_fscache_inode_init(struct ceph_inode_info *ci); +void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, + struct ceph_inode_info* ci); +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci); + +int ceph_readpage_from_fscache(struct inode *inode, struct page *page); +int ceph_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages); +void ceph_readpage_to_fscache(struct inode *inode, struct page *page); +void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); +void ceph_queue_revalidate(struct inode *inode); + +static inline void ceph_fscache_invalidate(struct inode *inode) +{ + fscache_invalidate(ceph_inode(inode)->fscache); +} + +static inline void ceph_fscache_uncache_page(struct inode *inode, + struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + return fscache_uncache_page(ci->fscache, page); +} + +static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) +{ + struct inode* inode = page->mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + return fscache_maybe_release_page(ci->fscache, page, gfp); +} + +static inline void ceph_fscache_readpages_cancel(struct inode *inode, + struct list_head *pages) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + return fscache_readpages_cancel(ci->fscache, pages); +} + +#else + +static inline int ceph_fscache_register(void) +{ + return 0; +} + +static inline void ceph_fscache_unregister(void) +{ +} + +static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc) +{ + return 0; +} + +static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) +{ +} + +static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) +{ +} + +static inline void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc, + struct ceph_inode_info* ci) +{ +} + +static inline void ceph_fscache_uncache_page(struct inode *inode, + struct page *pages) +{ +} + +static inline int ceph_readpage_from_fscache(struct inode* inode, + struct page *page) +{ + return -ENOBUFS; +} + +static inline int ceph_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} + +static inline void ceph_readpage_to_fscache(struct inode *inode, + struct page *page) +{ +} + +static inline void ceph_fscache_invalidate(struct inode *inode) +{ +} + +static inline void ceph_invalidate_fscache_page(struct inode *inode, + struct page *page) +{ +} + +static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +{ +} + +static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) +{ + return 1; +} + +static inline void ceph_fscache_readpages_cancel(struct inode *inode, + struct list_head *pages) +{ +} + +static inline void ceph_queue_revalidate(struct inode *inode) +{ +} + +#endif + +#endif diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 25442b40c25a..13976c33332e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -10,6 +10,7 @@ #include "super.h" #include "mds_client.h" +#include "cache.h" #include <linux/ceph/decode.h> #include <linux/ceph/messenger.h> @@ -479,8 +480,9 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, * i_rdcache_gen. */ if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && - (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) + (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { ci->i_rdcache_gen++; + } /* * if we are newly issued FILE_SHARED, mark dir not complete; we @@ -2072,19 +2074,17 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, /* finish pending truncate */ while (ci->i_truncate_pending) { spin_unlock(&ci->i_ceph_lock); - if (!(need & CEPH_CAP_FILE_WR)) - mutex_lock(&inode->i_mutex); __ceph_do_pending_vmtruncate(inode); - if (!(need & CEPH_CAP_FILE_WR)) - mutex_unlock(&inode->i_mutex); spin_lock(&ci->i_ceph_lock); } - if (need & CEPH_CAP_FILE_WR) { + have = __ceph_caps_issued(ci, &implemented); + + if (have & need & CEPH_CAP_FILE_WR) { if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { dout("get_cap_refs %p endoff %llu > maxsize %llu\n", inode, endoff, ci->i_max_size); - if (endoff > ci->i_wanted_max_size) { + if (endoff > ci->i_requested_max_size) { *check_max = 1; ret = 1; } @@ -2099,7 +2099,6 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, goto out; } } - have = __ceph_caps_issued(ci, &implemented); if ((have & need) == need) { /* @@ -2141,14 +2140,17 @@ static void check_max_size(struct inode *inode, loff_t endoff) /* do we need to explicitly request a larger max_size? */ spin_lock(&ci->i_ceph_lock); - if ((endoff >= ci->i_max_size || - endoff > (inode->i_size << 1)) && - endoff > ci->i_wanted_max_size) { + if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) { dout("write %p at large endoff %llu, req max_size\n", inode, endoff); ci->i_wanted_max_size = endoff; - check = 1; } + /* duplicate ceph_check_caps()'s logic */ + if (ci->i_auth_cap && + (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) && + ci->i_wanted_max_size > ci->i_max_size && + ci->i_wanted_max_size > ci->i_requested_max_size) + check = 1; spin_unlock(&ci->i_ceph_lock); if (check) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); @@ -2334,6 +2336,38 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, } /* + * Invalidate unlinked inode's aliases, so we can drop the inode ASAP. + */ +static void invalidate_aliases(struct inode *inode) +{ + struct dentry *dn, *prev = NULL; + + dout("invalidate_aliases inode %p\n", inode); + d_prune_aliases(inode); + /* + * For non-directory inode, d_find_alias() only returns + * connected dentry. After calling d_invalidate(), the + * dentry become disconnected. + * + * For directory inode, d_find_alias() can return + * disconnected dentry. But directory inode should have + * one alias at most. + */ + while ((dn = d_find_alias(inode))) { + if (dn == prev) { + dput(dn); + break; + } + d_invalidate(dn); + if (prev) + dput(prev); + prev = dn; + } + if (prev) + dput(prev); +} + +/* * Handle a cap GRANT message from the MDS. (Note that a GRANT may * actually be a revocation if it specifies a smaller cap set.) * @@ -2361,8 +2395,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, int check_caps = 0; int wake = 0; int writeback = 0; - int revoked_rdcache = 0; int queue_invalidate = 0; + int deleted_inode = 0; + int queue_revalidate = 0; dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", inode, cap, mds, seq, ceph_cap_string(newcaps)); @@ -2377,9 +2412,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && !ci->i_wrbuffer_ref) { - if (try_nonblocking_invalidate(inode) == 0) { - revoked_rdcache = 1; - } else { + if (try_nonblocking_invalidate(inode)) { /* there were locked pages.. invalidate later in a separate thread. */ if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { @@ -2387,6 +2420,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ci->i_rdcache_revoking = ci->i_rdcache_gen; } } + + ceph_fscache_invalidate(inode); } /* side effects now are allowed */ @@ -2407,8 +2442,12 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, from_kgid(&init_user_ns, inode->i_gid)); } - if ((issued & CEPH_CAP_LINK_EXCL) == 0) + if ((issued & CEPH_CAP_LINK_EXCL) == 0) { set_nlink(inode, le32_to_cpu(grant->nlink)); + if (inode->i_nlink == 0 && + (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) + deleted_inode = 1; + } if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { int len = le32_to_cpu(grant->xattr_len); @@ -2424,6 +2463,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } } + /* Do we need to revalidate our fscache cookie. Don't bother on the + * first cache cap as we already validate at cookie creation time. */ + if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1) + queue_revalidate = 1; + /* size/ctime/mtime/atime? */ ceph_fill_file_size(inode, issued, le32_to_cpu(grant->truncate_seq), @@ -2508,6 +2552,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, BUG_ON(cap->issued & ~cap->implemented); spin_unlock(&ci->i_ceph_lock); + if (writeback) /* * queue inode for writeback: we can't actually call @@ -2517,6 +2562,10 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ceph_queue_writeback(inode); if (queue_invalidate) ceph_queue_invalidate(inode); + if (deleted_inode) + invalidate_aliases(inode); + if (queue_revalidate) + ceph_queue_revalidate(inode); if (wake) wake_up_all(&ci->i_cap_wq); @@ -2673,8 +2722,10 @@ static void handle_cap_trunc(struct inode *inode, truncate_seq, truncate_size, size); spin_unlock(&ci->i_ceph_lock); - if (queue_trunc) + if (queue_trunc) { ceph_queue_vmtruncate(inode); + ceph_fscache_invalidate(inode); + } } /* diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index a40ceda47a32..868b61d56cac 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -793,6 +793,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, req->r_locked_dir = dir; req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + /* release LINK_SHARED on source inode (mds will lock it) */ + req->r_old_inode_drop = CEPH_CAP_LINK_SHARED; err = ceph_mdsc_do_request(mdsc, dir, req); if (err) { d_drop(dentry); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 2ddf061c1c4a..3de89829e2a1 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -8,9 +8,11 @@ #include <linux/namei.h> #include <linux/writeback.h> #include <linux/aio.h> +#include <linux/falloc.h> #include "super.h" #include "mds_client.h" +#include "cache.h" /* * Ceph file operations @@ -68,9 +70,23 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) { struct ceph_file_info *cf; int ret = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; switch (inode->i_mode & S_IFMT) { case S_IFREG: + /* First file open request creates the cookie, we want to keep + * this cookie around for the filetime of the inode as not to + * have to worry about fscache register / revoke / operation + * races. + * + * Also, if we know the operation is going to invalidate data + * (non readonly) just nuke the cache right away. + */ + ceph_fscache_register_inode_cookie(mdsc->fsc, ci); + if ((fmode & CEPH_FILE_MODE_WR)) + ceph_fscache_invalidate(inode); case S_IFDIR: dout("init_file %p %p 0%o (regular)\n", inode, file, inode->i_mode); @@ -181,6 +197,7 @@ int ceph_open(struct inode *inode, struct file *file) spin_unlock(&ci->i_ceph_lock); return ceph_init_file(inode, file, fmode); } + spin_unlock(&ci->i_ceph_lock); dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); @@ -191,6 +208,7 @@ int ceph_open(struct inode *inode, struct file *file) } req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; if (flags & (O_CREAT|O_TRUNC)) parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); @@ -313,9 +331,9 @@ static int striped_read(struct inode *inode, { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - u64 pos, this_len; + u64 pos, this_len, left; int io_align, page_align; - int left, pages_left; + int pages_left; int read; struct page **page_pos; int ret; @@ -346,47 +364,40 @@ more: ret = 0; hit_stripe = this_len < left; was_short = ret >= 0 && ret < this_len; - dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read, + dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read, ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); - if (ret > 0) { - int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; - - if (read < pos - off) { - dout(" zero gap %llu to %llu\n", off + read, pos); - ceph_zero_page_vector_range(page_align + read, - pos - off - read, pages); + if (ret >= 0) { + int didpages; + if (was_short && (pos + ret < inode->i_size)) { + u64 tmp = min(this_len - ret, + inode->i_size - pos - ret); + dout(" zero gap %llu to %llu\n", + pos + ret, pos + ret + tmp); + ceph_zero_page_vector_range(page_align + read + ret, + tmp, pages); + ret += tmp; } + + didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; pos += ret; read = pos - off; left -= ret; page_pos += didpages; pages_left -= didpages; - /* hit stripe? */ - if (left && hit_stripe) + /* hit stripe and need continue*/ + if (left && hit_stripe && pos < inode->i_size) goto more; } - if (was_short) { + if (read > 0) { + ret = read; /* did we bounce off eof? */ if (pos + left > inode->i_size) *checkeof = 1; - - /* zero trailing bytes (inside i_size) */ - if (left > 0 && pos < inode->i_size) { - if (pos + left > inode->i_size) - left = inode->i_size - pos; - - dout("zero tail %d\n", left); - ceph_zero_page_vector_range(page_align + read, left, - pages); - read += left; - } } - if (ret >= 0) - ret = read; dout("striped_read returns %d\n", ret); return ret; } @@ -618,6 +629,8 @@ out: if (check_caps) ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); + } else if (ret != -EOLDSNAPC && written > 0) { + ret = written; } return ret; } @@ -659,7 +672,6 @@ again: if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS) || (fi->flags & CEPH_F_SYNC)) /* hmm, this isn't really async... */ ret = ceph_sync_read(filp, base, len, ppos, &checkeof); @@ -711,13 +723,11 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, &ceph_sb_to_client(inode->i_sb)->client->osdc; ssize_t count, written = 0; int err, want, got; - bool hold_mutex; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; mutex_lock(&inode->i_mutex); - hold_mutex = true; err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); if (err) @@ -763,18 +773,31 @@ retry_snap: if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS) || (fi->flags & CEPH_F_SYNC)) { mutex_unlock(&inode->i_mutex); written = ceph_sync_write(file, iov->iov_base, count, pos, &iocb->ki_pos); + if (written == -EOLDSNAPC) { + dout("aio_write %p %llx.%llx %llu~%u" + "got EOLDSNAPC, retrying\n", + inode, ceph_vinop(inode), + pos, (unsigned)iov->iov_len); + mutex_lock(&inode->i_mutex); + goto retry_snap; + } } else { + /* + * No need to acquire the i_truncate_mutex. Because + * the MDS revokes Fwb caps before sending truncate + * message to us. We can't get Fwb cap while there + * are pending vmtruncate. So write and vmtruncate + * can not run at the same time + */ written = generic_file_buffered_write(iocb, iov, nr_segs, pos, &iocb->ki_pos, count, 0); mutex_unlock(&inode->i_mutex); } - hold_mutex = false; if (written >= 0) { int dirty; @@ -798,18 +821,12 @@ retry_snap: written = err; } - if (written == -EOLDSNAPC) { - dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); - mutex_lock(&inode->i_mutex); - hold_mutex = true; - goto retry_snap; - } + goto out_unlocked; + out: - if (hold_mutex) - mutex_unlock(&inode->i_mutex); + mutex_unlock(&inode->i_mutex); +out_unlocked: current->backing_dev_info = NULL; - return written ? written : err; } @@ -822,7 +839,6 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) int ret; mutex_lock(&inode->i_mutex); - __ceph_do_pending_vmtruncate(inode); if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); @@ -871,6 +887,204 @@ out: return offset; } +static inline void ceph_zero_partial_page( + struct inode *inode, loff_t offset, unsigned size) +{ + struct page *page; + pgoff_t index = offset >> PAGE_CACHE_SHIFT; + + page = find_lock_page(inode->i_mapping, index); + if (page) { + wait_on_page_writeback(page); + zero_user(page, offset & (PAGE_CACHE_SIZE - 1), size); + unlock_page(page); + page_cache_release(page); + } +} + +static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset, + loff_t length) +{ + loff_t nearly = round_up(offset, PAGE_CACHE_SIZE); + if (offset < nearly) { + loff_t size = nearly - offset; + if (length < size) + size = length; + ceph_zero_partial_page(inode, offset, size); + offset += size; + length -= size; + } + if (length >= PAGE_CACHE_SIZE) { + loff_t size = round_down(length, PAGE_CACHE_SIZE); + truncate_pagecache_range(inode, offset, offset + size - 1); + offset += size; + length -= size; + } + if (length) + ceph_zero_partial_page(inode, offset, length); +} + +static int ceph_zero_partial_object(struct inode *inode, + loff_t offset, loff_t *length) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + int ret = 0; + loff_t zero = 0; + int op; + + if (!length) { + op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE; + length = &zero; + } else { + op = CEPH_OSD_OP_ZERO; + } + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), + offset, length, + 1, op, + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ONDISK, + NULL, 0, 0, false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } + + ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, + &inode->i_mtime); + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) { + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + if (ret == -ENOENT) + ret = 0; + } + ceph_osdc_put_request(req); + +out: + return ret; +} + +static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length) +{ + int ret = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + s32 stripe_unit = ceph_file_layout_su(ci->i_layout); + s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + s32 object_size = ceph_file_layout_object_size(ci->i_layout); + u64 object_set_size = object_size * stripe_count; + u64 nearly, t; + + /* round offset up to next period boundary */ + nearly = offset + object_set_size - 1; + t = nearly; + nearly -= do_div(t, object_set_size); + + while (length && offset < nearly) { + loff_t size = length; + ret = ceph_zero_partial_object(inode, offset, &size); + if (ret < 0) + return ret; + offset += size; + length -= size; + } + while (length >= object_set_size) { + int i; + loff_t pos = offset; + for (i = 0; i < stripe_count; ++i) { + ret = ceph_zero_partial_object(inode, pos, NULL); + if (ret < 0) + return ret; + pos += stripe_unit; + } + offset += object_set_size; + length -= object_set_size; + } + while (length) { + loff_t size = length; + ret = ceph_zero_partial_object(inode, offset, &size); + if (ret < 0) + return ret; + offset += size; + length -= size; + } + return ret; +} + +static long ceph_fallocate(struct file *file, int mode, + loff_t offset, loff_t length) +{ + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = + &ceph_inode_to_client(inode)->client->osdc; + int want, got = 0; + int dirty; + int ret = 0; + loff_t endoff = 0; + loff_t size; + + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + + mutex_lock(&inode->i_mutex); + + if (ceph_snap(inode) != CEPH_NOSNAP) { + ret = -EROFS; + goto unlock; + } + + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) && + !(mode & FALLOC_FL_PUNCH_HOLE)) { + ret = -ENOSPC; + goto unlock; + } + + size = i_size_read(inode); + if (!(mode & FALLOC_FL_KEEP_SIZE)) + endoff = offset + length; + + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); + if (ret < 0) + goto unlock; + + if (mode & FALLOC_FL_PUNCH_HOLE) { + if (offset < size) + ceph_zero_pagecache_range(inode, offset, length); + ret = ceph_zero_objects(inode, offset, length); + } else if (endoff > size) { + truncate_pagecache_range(inode, size, -1); + if (ceph_inode_set_size(inode, endoff)) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, NULL); + } + + if (!ret) { + spin_lock(&ci->i_ceph_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + } + + ceph_put_cap_refs(ci, got); +unlock: + mutex_unlock(&inode->i_mutex); + return ret; +} + const struct file_operations ceph_file_fops = { .open = ceph_open, .release = ceph_release, @@ -887,5 +1101,6 @@ const struct file_operations ceph_file_fops = { .splice_write = generic_file_splice_write, .unlocked_ioctl = ceph_ioctl, .compat_ioctl = ceph_ioctl, + .fallocate = ceph_fallocate, }; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index f3a2abf28a77..8549a48115f7 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -12,6 +12,7 @@ #include "super.h" #include "mds_client.h" +#include "cache.h" #include <linux/ceph/decode.h> /* @@ -344,6 +345,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) for (i = 0; i < CEPH_FILE_MODE_NUM; i++) ci->i_nr_by_mode[i] = 0; + mutex_init(&ci->i_truncate_mutex); ci->i_truncate_seq = 0; ci->i_truncate_size = 0; ci->i_truncate_pending = 0; @@ -377,6 +379,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work); + ceph_fscache_inode_init(ci); + return &ci->vfs_inode; } @@ -396,6 +400,8 @@ void ceph_destroy_inode(struct inode *inode) dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); + ceph_fscache_unregister_inode_cookie(ci); + ceph_queue_caps_release(inode); /* @@ -430,7 +436,6 @@ void ceph_destroy_inode(struct inode *inode) call_rcu(&inode->i_rcu, ceph_i_callback); } - /* * Helpers to fill in size, ctime, mtime, and atime. We have to be * careful because either the client or MDS may have more up to date @@ -455,16 +460,20 @@ int ceph_fill_file_size(struct inode *inode, int issued, dout("truncate_seq %u -> %u\n", ci->i_truncate_seq, truncate_seq); ci->i_truncate_seq = truncate_seq; + + /* the MDS should have revoked these caps */ + WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_WR | + CEPH_CAP_FILE_LAZYIO)); /* * If we hold relevant caps, or in the case where we're * not the only client referencing this file and we * don't hold those caps, then we need to check whether * the file is either opened or mmaped */ - if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD| - CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER| - CEPH_CAP_FILE_EXCL| - CEPH_CAP_FILE_LAZYIO)) || + if ((issued & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_BUFFER)) || mapping_mapped(inode->i_mapping) || __ceph_caps_file_wanted(ci)) { ci->i_truncate_pending++; @@ -478,6 +487,10 @@ int ceph_fill_file_size(struct inode *inode, int issued, truncate_size); ci->i_truncate_size = truncate_size; } + + if (queue_trunc) + ceph_fscache_invalidate(inode); + return queue_trunc; } @@ -1066,7 +1079,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, * complete. */ ceph_set_dentry_offset(req->r_old_dentry); - dout("dn %p gets new offset %lld\n", req->r_old_dentry, + dout("dn %p gets new offset %lld\n", req->r_old_dentry, ceph_dentry(req->r_old_dentry)->offset); dn = req->r_old_dentry; /* use old_dentry */ @@ -1419,18 +1432,20 @@ static void ceph_invalidate_work(struct work_struct *work) u32 orig_gen; int check = 0; + mutex_lock(&ci->i_truncate_mutex); spin_lock(&ci->i_ceph_lock); dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { /* nevermind! */ spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&ci->i_truncate_mutex); goto out; } orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages(inode->i_mapping, 0); spin_lock(&ci->i_ceph_lock); if (orig_gen == ci->i_rdcache_gen && @@ -1445,6 +1460,7 @@ static void ceph_invalidate_work(struct work_struct *work) ci->i_rdcache_revoking); } spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&ci->i_truncate_mutex); if (check) ceph_check_caps(ci, 0, NULL); @@ -1465,9 +1481,7 @@ static void ceph_vmtruncate_work(struct work_struct *work) struct inode *inode = &ci->vfs_inode; dout("vmtruncate_work %p\n", inode); - mutex_lock(&inode->i_mutex); __ceph_do_pending_vmtruncate(inode); - mutex_unlock(&inode->i_mutex); iput(inode); } @@ -1480,6 +1494,7 @@ void ceph_queue_vmtruncate(struct inode *inode) struct ceph_inode_info *ci = ceph_inode(inode); ihold(inode); + if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, &ci->i_vmtruncate_work)) { dout("ceph_queue_vmtruncate %p\n", inode); @@ -1500,11 +1515,13 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) u64 to; int wrbuffer_refs, finish = 0; + mutex_lock(&ci->i_truncate_mutex); retry: spin_lock(&ci->i_ceph_lock); if (ci->i_truncate_pending == 0) { dout("__do_pending_vmtruncate %p none pending\n", inode); spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&ci->i_truncate_mutex); return; } @@ -1521,6 +1538,9 @@ retry: goto retry; } + /* there should be no reader or writer */ + WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref); + to = ci->i_truncate_size; wrbuffer_refs = ci->i_wrbuffer_ref; dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, @@ -1538,13 +1558,14 @@ retry: if (!finish) goto retry; + mutex_unlock(&ci->i_truncate_mutex); + if (wrbuffer_refs == 0) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); wake_up_all(&ci->i_cap_wq); } - /* * symlinks */ @@ -1586,8 +1607,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; - __ceph_do_pending_vmtruncate(inode); - err = inode_change_ok(inode, attr); if (err != 0) return err; @@ -1768,7 +1787,8 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ceph_cap_string(dirtied), mask); ceph_mdsc_put_request(req); - __ceph_do_pending_vmtruncate(inode); + if (mask & CEPH_SETATTR_SIZE) + __ceph_do_pending_vmtruncate(inode); return err; out: spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index e0b4ef31d3c8..669622fd1ae3 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -196,8 +196,10 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, &dl.object_no, &dl.object_offset, &olen); - if (r < 0) + if (r < 0) { + up_read(&osdc->map_sem); return -EIO; + } dl.file_offset -= dl.object_offset; dl.object_size = ceph_file_layout_object_size(ci->i_layout); dl.block_size = ceph_file_layout_su(ci->i_layout); @@ -209,8 +211,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); - ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, - ceph_file_layout_pg_pool(ci->i_layout)); + r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, + ceph_file_layout_pg_pool(ci->i_layout)); + if (r < 0) { + up_read(&osdc->map_sem); + return r; + } dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); if (dl.osd >= 0) { diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 187bf214444d..b7bda5d9611d 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -414,6 +414,9 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, { struct ceph_mds_session *s; + if (mds >= mdsc->mdsmap->m_max_mds) + return ERR_PTR(-EINVAL); + s = kzalloc(sizeof(*s), GFP_NOFS); if (!s) return ERR_PTR(-ENOMEM); @@ -1028,6 +1031,37 @@ static void remove_session_caps(struct ceph_mds_session *session) { dout("remove_session_caps on %p\n", session); iterate_session_caps(session, remove_session_caps_cb, NULL); + + spin_lock(&session->s_cap_lock); + if (session->s_nr_caps > 0) { + struct super_block *sb = session->s_mdsc->fsc->sb; + struct inode *inode; + struct ceph_cap *cap, *prev = NULL; + struct ceph_vino vino; + /* + * iterate_session_caps() skips inodes that are being + * deleted, we need to wait until deletions are complete. + * __wait_on_freeing_inode() is designed for the job, + * but it is not exported, so use lookup inode function + * to access it. + */ + while (!list_empty(&session->s_caps)) { + cap = list_entry(session->s_caps.next, + struct ceph_cap, session_caps); + if (cap == prev) + break; + prev = cap; + vino = cap->ci->i_vino; + spin_unlock(&session->s_cap_lock); + + inode = ceph_find_inode(sb, vino); + iput(inode); + + spin_lock(&session->s_cap_lock); + } + } + spin_unlock(&session->s_cap_lock); + BUG_ON(session->s_nr_caps > 0); BUG_ON(!list_empty(&session->s_cap_flushing)); cleanup_cap_releases(session); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 6627b26a800c..6a0951e43044 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -17,6 +17,7 @@ #include "super.h" #include "mds_client.h" +#include "cache.h" #include <linux/ceph/ceph_features.h> #include <linux/ceph/decode.h> @@ -142,6 +143,8 @@ enum { Opt_nodcache, Opt_ino32, Opt_noino32, + Opt_fscache, + Opt_nofscache }; static match_table_t fsopt_tokens = { @@ -167,6 +170,8 @@ static match_table_t fsopt_tokens = { {Opt_nodcache, "nodcache"}, {Opt_ino32, "ino32"}, {Opt_noino32, "noino32"}, + {Opt_fscache, "fsc"}, + {Opt_nofscache, "nofsc"}, {-1, NULL} }; @@ -260,6 +265,12 @@ static int parse_fsopt_token(char *c, void *private) case Opt_noino32: fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; break; + case Opt_fscache: + fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; + break; + case Opt_nofscache: + fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; + break; default: BUG_ON(token); } @@ -422,6 +433,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",dcache"); else seq_puts(m, ",nodcache"); + if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) + seq_puts(m, ",fsc"); + else + seq_puts(m, ",nofsc"); if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); @@ -530,11 +545,18 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, if (!fsc->wb_pagevec_pool) goto fail_trunc_wq; + /* setup fscache */ + if ((fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) && + (ceph_fscache_register_fs(fsc) != 0)) + goto fail_fscache; + /* caps */ fsc->min_caps = fsopt->max_readdir; return fsc; +fail_fscache: + ceph_fscache_unregister_fs(fsc); fail_trunc_wq: destroy_workqueue(fsc->trunc_wq); fail_pg_inv_wq: @@ -554,6 +576,8 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) { dout("destroy_fs_client %p\n", fsc); + ceph_fscache_unregister_fs(fsc); + destroy_workqueue(fsc->wb_wq); destroy_workqueue(fsc->pg_inv_wq); destroy_workqueue(fsc->trunc_wq); @@ -588,6 +612,8 @@ static void ceph_inode_init_once(void *foo) static int __init init_caches(void) { + int error = -ENOMEM; + ceph_inode_cachep = kmem_cache_create("ceph_inode_info", sizeof(struct ceph_inode_info), __alignof__(struct ceph_inode_info), @@ -611,15 +637,17 @@ static int __init init_caches(void) if (ceph_file_cachep == NULL) goto bad_file; - return 0; + if ((error = ceph_fscache_register())) + goto bad_file; + return 0; bad_file: kmem_cache_destroy(ceph_dentry_cachep); bad_dentry: kmem_cache_destroy(ceph_cap_cachep); bad_cap: kmem_cache_destroy(ceph_inode_cachep); - return -ENOMEM; + return error; } static void destroy_caches(void) @@ -629,10 +657,13 @@ static void destroy_caches(void) * destroy cache. */ rcu_barrier(); + kmem_cache_destroy(ceph_inode_cachep); kmem_cache_destroy(ceph_cap_cachep); kmem_cache_destroy(ceph_dentry_cachep); kmem_cache_destroy(ceph_file_cachep); + + ceph_fscache_unregister(); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index cbded572345e..6014b0a3c405 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -16,6 +16,10 @@ #include <linux/ceph/libceph.h> +#ifdef CONFIG_CEPH_FSCACHE +#include <linux/fscache.h> +#endif + /* f_type in struct statfs */ #define CEPH_SUPER_MAGIC 0x00c36400 @@ -29,6 +33,7 @@ #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ #define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ #define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ +#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) @@ -90,6 +95,11 @@ struct ceph_fs_client { struct dentry *debugfs_bdi; struct dentry *debugfs_mdsc, *debugfs_mdsmap; #endif + +#ifdef CONFIG_CEPH_FSCACHE + struct fscache_cookie *fscache; + struct workqueue_struct *revalidate_wq; +#endif }; @@ -288,6 +298,7 @@ struct ceph_inode_info { int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ + struct mutex i_truncate_mutex; u32 i_truncate_seq; /* last truncate to smaller size */ u64 i_truncate_size; /* and the size we last truncated down to */ int i_truncate_pending; /* still need to call vmtruncate */ @@ -319,6 +330,12 @@ struct ceph_inode_info { struct work_struct i_vmtruncate_work; +#ifdef CONFIG_CEPH_FSCACHE + struct fscache_cookie *fscache; + u32 i_fscache_gen; /* sequence, for delayed fscache validate */ + struct work_struct i_revalidate_work; +#endif + struct inode vfs_inode; /* at end */ }; diff --git a/fs/cifs/AUTHORS b/fs/cifs/AUTHORS deleted file mode 100644 index ea940b1db77b..000000000000 --- a/fs/cifs/AUTHORS +++ /dev/null @@ -1,55 +0,0 @@ -Original Author -=============== -Steve French (sfrench@samba.org) - -The author wishes to express his appreciation and thanks to: -Andrew Tridgell (Samba team) for his early suggestions about smb/cifs VFS -improvements. Thanks to IBM for allowing me time and test resources to pursue -this project, to Jim McDonough from IBM (and the Samba Team) for his help, to -the IBM Linux JFS team for explaining many esoteric Linux filesystem features. -Jeremy Allison of the Samba team has done invaluable work in adding the server -side of the original CIFS Unix extensions and reviewing and implementing -portions of the newer CIFS POSIX extensions into the Samba 3 file server. Thank -Dave Boutcher of IBM Rochester (author of the OS/400 smb/cifs filesystem client) -for proving years ago that very good smb/cifs clients could be done on Unix-like -operating systems. Volker Lendecke, Andrew Tridgell, Urban Widmark, John -Newbigin and others for their work on the Linux smbfs module. Thanks to -the other members of the Storage Network Industry Association CIFS Technical -Workgroup for their work specifying this highly complex protocol and finally -thanks to the Samba team for their technical advice and encouragement. - -Patch Contributors ------------------- -Zwane Mwaikambo -Andi Kleen -Amrut Joshi -Shobhit Dayal -Sergey Vlasov -Richard Hughes -Yury Umanets -Mark Hamzy (for some of the early cifs IPv6 work) -Domen Puncer -Jesper Juhl (in particular for lots of whitespace/formatting cleanup) -Vince Negri and Dave Stahl (for finding an important caching bug) -Adrian Bunk (kcalloc cleanups) -Miklos Szeredi -Kazeon team for various fixes especially for 2.4 version. -Asser Ferno (Change Notify support) -Shaggy (Dave Kleikamp) for innumerable small fs suggestions and some good cleanup -Gunter Kukkukk (testing and suggestions for support of old servers) -Igor Mammedov (DFS support) -Jeff Layton (many, many fixes, as well as great work on the cifs Kerberos code) - -Test case and Bug Report contributors -------------------------------------- -Thanks to those in the community who have submitted detailed bug reports -and debug of problems they have found: Jochen Dolze, David Blaine, -Rene Scharfe, Martin Josefsson, Alexander Wild, Anthony Liguori, -Lars Muller, Urban Widmark, Massimiliano Ferrero, Howard Owen, -Olaf Kirch, Kieron Briggs, Nick Millington and others. Also special -mention to the Stanford Checker (SWAT) which pointed out many minor -bugs in error paths. Valuable suggestions also have come from Al Viro -and Dave Miller. - -And thanks to the IBM LTC and Power test teams and SuSE testers for -finding multiple bugs during excellent stress test runs. diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES deleted file mode 100644 index bc0025cdd1c9..000000000000 --- a/fs/cifs/CHANGES +++ /dev/null @@ -1,1065 +0,0 @@ -Version 1.62 ------------- -Add sockopt=TCP_NODELAY mount option. EA (xattr) routines hardened -to more strictly handle corrupt frames. - -Version 1.61 ------------- -Fix append problem to Samba servers (files opened with O_APPEND could -have duplicated data). Fix oops in cifs_lookup. Workaround problem -mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session. -Disable use of server inode numbers when server only -partially supports them (e.g. for one server querying inode numbers on -FindFirst fails but QPathInfo queries works). Fix oops with dfs in -cifs_put_smb_ses. Fix mmap to work on directio mounts (needed -for OpenOffice when on forcedirectio mount e.g.) - -Version 1.60 -------------- -Fix memory leak in reconnect. Fix oops in DFS mount error path. -Set s_maxbytes to smaller (the max that vfs can handle) so that -sendfile will now work over cifs mounts again. Add noforcegid -and noforceuid mount parameters. Fix small mem leak when using -ntlmv2. Fix 2nd mount to same server but with different port to -be allowed (rather than reusing the 1st port) - only when the -user explicitly overrides the port on the 2nd mount. - -Version 1.59 ------------- -Client uses server inode numbers (which are persistent) rather than -client generated ones by default (mount option "serverino" turned -on by default if server supports it). Add forceuid and forcegid -mount options (so that when negotiating unix extensions specifying -which uid mounted does not immediately force the server's reported -uids to be overridden). Add support for scope mount parm. Improve -hard link detection to use same inode for both. Do not set -read-only dos attribute on directories (for chmod) since Windows -explorer special cases this attribute bit for directories for -a different purpose. - -Version 1.58 ------------- -Guard against buffer overruns in various UCS-2 to UTF-8 string conversions -when the UTF-8 string is composed of unusually long (more than 4 byte) converted -characters. Add support for mounting root of a share which redirects immediately -to DFS target. Convert string conversion functions from Unicode to more -accurately mark string length before allocating memory (which may help the -rare cases where a UTF-8 string is much larger than the UCS2 string that -we converted from). Fix endianness of the vcnum field used during -session setup to distinguish multiple mounts to same server from different -userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental -flag to be set to 2, and mount must enable krb5 to turn on extended security). -Performance of file create to Samba improved (posix create on lookup -removes 1 of 2 network requests sent on file create) - -Version 1.57 ------------- -Improve support for multiple security contexts to the same server. We -used to use the same "vcnumber" for all connections which could cause -the server to treat subsequent connections, especially those that -are authenticated as guest, as reconnections, invalidating the earlier -user's smb session. This fix allows cifs to mount multiple times to the -same server with different userids without risking invalidating earlier -established security contexts. fsync now sends SMB Flush operation -to better ensure that we wait for server to write all of the data to -server disk (not just write it over the network). Add new mount -parameter to allow user to disable sending the (slow) SMB flush on -fsync if desired (fsync still flushes all cached write data to the server). -Posix file open support added (turned off after one attempt if server -fails to support it properly, as with Samba server versions prior to 3.3.2) -Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too -little memory for the "nativeFileSystem" field returned by the server -during mount). Endian convert inode numbers if necessary (makes it easier -to compare inode numbers on network files from big endian systems). - -Version 1.56 ------------- -Add "forcemandatorylock" mount option to allow user to use mandatory -rather than posix (advisory) byte range locks, even though server would -support posix byte range locks. Fix query of root inode when prefixpath -specified and user does not have access to query information about the -top of the share. Fix problem in 2.6.28 resolving DFS paths to -Samba servers (worked to Windows). Fix rmdir so that pending search -(readdir) requests do not get invalid results which include the now -removed directory. Fix oops in cifs_dfs_ref.c when prefixpath is not reachable -when using DFS. Add better file create support to servers which support -the CIFS POSIX protocol extensions (this adds support for new flags -on create, and improves semantics for write of locked ranges). - -Version 1.55 ------------- -Various fixes to make delete of open files behavior more predictable -(when delete of an open file fails we mark the file as "delete-on-close" -in a way that more servers accept, but only if we can first rename the -file to a temporary name). Add experimental support for more safely -handling fcntl(F_SETLEASE). Convert cifs to using blocking tcp -sends, and also let tcp autotune the socket send and receive buffers. -This reduces the number of EAGAIN errors returned by TCP/IP in -high stress workloads (and the number of retries on socket writes -when sending large SMBWriteX requests). Fix case in which a portion of -data can in some cases not get written to the file on the server before the -file is closed. Fix DFS parsing to properly handle path consumed field, -and to handle certain codepage conversions better. Fix mount and -umount race that can cause oops in mount or umount or reconnect. - -Version 1.54 ------------- -Fix premature write failure on congested networks (we would give up -on EAGAIN from the socket too quickly on large writes). -Cifs_mkdir and cifs_create now respect the setgid bit on parent dir. -Fix endian problems in acl (mode from/to cifs acl) on bigendian -architectures. Fix problems with preserving timestamps on copying open -files (e.g. "cp -a") to Windows servers. For mkdir and create honor setgid bit -on parent directory when server supports Unix Extensions but not POSIX -create. Update cifs.upcall version to handle new Kerberos sec flags -(this requires update of cifs.upcall program from Samba). Fix memory leak -on dns_upcall (resolving DFS referralls). Fix plain text password -authentication (requires setting SecurityFlags to 0x30030 to enable -lanman and plain text though). Fix writes to be at correct offset when -file is open with O_APPEND and file is on a directio (forcediretio) mount. -Fix bug in rewinding readdir directory searches. Add nodfs mount option. - -Version 1.53 ------------- -DFS support added (Microsoft Distributed File System client support needed -for referrals which enable a hierarchical name space among servers). -Disable temporary caching of mode bits to servers which do not support -storing of mode (e.g. Windows servers, when client mounts without cifsacl -mount option) and add new "dynperm" mount option to enable temporary caching -of mode (enable old behavior). Fix hang on mount caused when server crashes -tcp session during negotiate protocol. - -Version 1.52 ------------- -Fix oops on second mount to server when null auth is used. -Enable experimental Kerberos support. Return writebehind errors on flush -and sync so that events like out of disk space get reported properly on -cached files. Fix setxattr failure to certain Samba versions. Fix mount -of second share to disconnected server session (autoreconnect on this). -Add ability to modify cifs acls for handling chmod (when mounted with -cifsacl flag). Fix prefixpath path separator so we can handle mounts -with prefixpaths longer than one directory (one path component) when -mounted to Windows servers. Fix slow file open when cifsacl -enabled. Fix memory leak in FindNext when the SMB call returns -EBADF. - - -Version 1.51 ------------- -Fix memory leak in statfs when mounted to very old servers (e.g. -Windows 9x). Add new feature "POSIX open" which allows servers -which support the current POSIX Extensions to provide better semantics -(e.g. delete for open files opened with posix open). Take into -account umask on posix mkdir not just older style mkdir. Add -ability to mount to IPC$ share (which allows CIFS named pipes to be -opened, read and written as if they were files). When 1st tree -connect fails (e.g. due to signing negotiation failure) fix -leak that causes cifsd not to stop and rmmod to fail to cleanup -cifs_request_buffers pool. Fix problem with POSIX Open/Mkdir on -bigendian architectures. Fix possible memory corruption when -EAGAIN returned on kern_recvmsg. Return better error if server -requires packet signing but client has disabled it. When mounted -with cifsacl mount option - mode bits are approximated based -on the contents of the ACL of the file or directory. When cifs -mount helper is missing convert make sure that UNC name -has backslash (not forward slash) between ip address of server -and the share name. - -Version 1.50 ------------- -Fix NTLMv2 signing. NFS server mounted over cifs works (if cifs mount is -done with "serverino" mount option). Add support for POSIX Unlink -(helps with certain sharing violation cases when server such as -Samba supports newer POSIX CIFS Protocol Extensions). Add "nounix" -mount option to allow disabling the CIFS Unix Extensions for just -that mount. Fix hang on spinlock in find_writable_file (race when -reopening file after session crash). Byte range unlock request to -windows server could unlock more bytes (on server copy of file) -than intended if start of unlock request is well before start of -a previous byte range lock that we issued. - -Version 1.49 ------------- -IPv6 support. Enable ipv6 addresses to be passed on mount (put the ipv6 -address after the "ip=" mount option, at least until mount.cifs is fixed to -handle DNS host to ipv6 name translation). Accept override of uid or gid -on mount even when Unix Extensions are negotiated (it used to be ignored -when Unix Extensions were ignored). This allows users to override the -default uid and gid for files when they are certain that the uids or -gids on the server do not match those of the client. Make "sec=none" -mount override username (so that null user connection is attempted) -to match what documentation said. Support for very large reads, over 127K, -available to some newer servers (such as Samba 3.0.26 and later but -note that it also requires setting CIFSMaxBufSize at module install -time to a larger value which may hurt performance in some cases). -Make sign option force signing (or fail if server does not support it). - -Version 1.48 ------------- -Fix mtime bouncing around from local idea of last write times to remote time. -Fix hang (in i_size_read) when simultaneous size update of same remote file -on smp system corrupts sequence number. Do not reread unnecessarily partial page -(which we are about to overwrite anyway) when writing out file opened rw. -When DOS attribute of file on non-Unix server's file changes on the server side -from read-only back to read-write, reflect this change in default file mode -(we had been leaving a file's mode read-only until the inode were reloaded). -Allow setting of attribute back to ATTR_NORMAL (removing readonly dos attribute -when archive dos attribute not set and we are changing mode back to writeable -on server which does not support the Unix Extensions). Remove read only dos -attribute on chmod when adding any write permission (ie on any of -user/group/other (not all of user/group/other ie 0222) when -mounted to windows. Add support for POSIX MkDir (slight performance -enhancement and eliminates the network race between the mkdir and set -path info of the mode). - - -Version 1.47 ------------- -Fix oops in list_del during mount caused by unaligned string. -Fix file corruption which could occur on some large file -copies caused by writepages page i/o completion bug. -Seek to SEEK_END forces check for update of file size for non-cached -files. Allow file size to be updated on remote extend of locally open, -non-cached file. Fix reconnect to newer Samba servers (or other servers -which support the CIFS Unix/POSIX extensions) so that we again tell the -server the Unix/POSIX cifs capabilities which we support (SetFSInfo). -Add experimental support for new POSIX Open/Mkdir (which returns -stat information on the open, and allows setting the mode). - -Version 1.46 ------------- -Support deep tree mounts. Better support OS/2, Win9x (DOS) time stamps. -Allow null user to be specified on mount ("username="). Do not return -EINVAL on readdir when filldir fails due to overwritten blocksize -(fixes FC problem). Return error in rename 2nd attempt retry (ie report -if rename by handle also fails, after rename by path fails, we were -not reporting whether the retry worked or not). Fix NTLMv2 to -work to Windows servers (mount with option "sec=ntlmv2"). - -Version 1.45 ------------- -Do not time out lockw calls when using posix extensions. Do not -time out requests if server still responding reasonably fast -on requests on other threads. Improve POSIX locking emulation, -(lock cancel now works, and unlock of merged range works even -to Windows servers now). Fix oops on mount to lanman servers -(win9x, os/2 etc.) when null password. Do not send listxattr -(SMB to query all EAs) if nouser_xattr specified. Fix SE Linux -problem (instantiate inodes/dentries in right order for readdir). - -Version 1.44 ------------- -Rewritten sessionsetup support, including support for legacy SMB -session setup needed for OS/2 and older servers such as Windows 95 and 98. -Fix oops on ls to OS/2 servers. Add support for level 1 FindFirst -so we can do search (ls etc.) to OS/2. Do not send NTCreateX -or recent levels of FindFirst unless server says it supports NT SMBs -(instead use legacy equivalents from LANMAN dialect). Fix to allow -NTLMv2 authentication support (now can use stronger password hashing -on mount if corresponding /proc/fs/cifs/SecurityFlags is set (0x4004). -Allow override of global cifs security flags on mount via "sec=" option(s). - -Version 1.43 ------------- -POSIX locking to servers which support CIFS POSIX Extensions -(disabled by default controlled by proc/fs/cifs/Experimental). -Handle conversion of long share names (especially Asian languages) -to Unicode during mount. Fix memory leak in sess struct on reconnect. -Fix rare oops after acpi suspend. Fix O_TRUNC opens to overwrite on -cifs open which helps rare case when setpathinfo fails or server does -not support it. - -Version 1.42 ------------- -Fix slow oplock break when mounted to different servers at the same time and -the tids match and we try to find matching fid on wrong server. Fix read -looping when signing required by server (2.6.16 kernel only). Fix readdir -vs. rename race which could cause each to hang. Return . and .. even -if server does not. Allow searches to skip first three entries and -begin at any location. Fix oops in find_writeable_file. - -Version 1.41 ------------- -Fix NTLMv2 security (can be enabled in /proc/fs/cifs) so customers can -configure stronger authentication. Fix sfu symlinks so they can -be followed (not just recognized). Fix wraparound of bcc on -read responses when buffer size over 64K and also fix wrap of -max smb buffer size when CIFSMaxBufSize over 64K. Fix oops in -cifs_user_read and cifs_readpages (when EAGAIN on send of smb -on socket is returned over and over). Add POSIX (advisory) byte range -locking support (requires server with newest CIFS UNIX Extensions -to the protocol implemented). Slow down negprot slightly in port 139 -RFC1001 case to give session_init time on buggy servers. - -Version 1.40 ------------- -Use fsuid (fsgid) more consistently instead of uid (gid). Improve performance -of readpages by eliminating one extra memcpy. Allow update of file size -from remote server even if file is open for write as long as mount is -directio. Recognize share mode security and send NTLM encrypted password -on tree connect if share mode negotiated. - -Version 1.39 ------------- -Defer close of a file handle slightly if pending writes depend on that handle -(this reduces the EBADF bad file handle errors that can be logged under heavy -stress on writes). Modify cifs Kconfig options to expose CONFIG_CIFS_STATS2 -Fix SFU style symlinks and mknod needed for servers which do not support the -CIFS Unix Extensions. Fix setfacl/getfacl on bigendian. Timeout negative -dentries so files that the client sees as deleted but that later get created -on the server will be recognized. Add client side permission check on setattr. -Timeout stuck requests better (where server has never responded or sent corrupt -responses) - -Version 1.38 ------------- -Fix tcp socket retransmission timeouts (e.g. on ENOSPACE from the socket) -to be smaller at first (but increasing) so large write performance performance -over GigE is better. Do not hang thread on illegal byte range lock response -from Windows (Windows can send an RFC1001 size which does not match smb size) by -allowing an SMBs TCP length to be up to a few bytes longer than it should be. -wsize and rsize can now be larger than negotiated buffer size if server -supports large readx/writex, even when directio mount flag not specified. -Write size will in many cases now be 16K instead of 4K which greatly helps -file copy performance on lightly loaded networks. Fix oops in dnotify -when experimental config flag enabled. Make cifsFYI more granular. - -Version 1.37 ------------- -Fix readdir caching when unlink removes file in current search buffer, -and this is followed by a rewind search to just before the deleted entry. -Do not attempt to set ctime unless atime and/or mtime change requested -(most servers throw it away anyway). Fix length check of received smbs -to be more accurate. Fix big endian problem with mapchars mount option, -and with a field returned by statfs. - -Version 1.36 ------------- -Add support for mounting to older pre-CIFS servers such as Windows9x and ME. -For these older servers, add option for passing netbios name of server in -on mount (servernetbiosname). Add suspend support for power management, to -avoid cifsd thread preventing software suspend from working. -Add mount option for disabling the default behavior of sending byte range lock -requests to the server (necessary for certain applications which break with -mandatory lock behavior such as Evolution), and also mount option for -requesting case insensitive matching for path based requests (requesting -case sensitive is the default). - -Version 1.35 ------------- -Add writepage performance improvements. Fix path name conversions -for long filenames on mounts which were done with "mapchars" mount option -specified. Ensure multiplex ids do not collide. Fix case in which -rmmod can oops if done soon after last unmount. Fix truncated -search (readdir) output when resume filename was a long filename. -Fix filename conversion when mapchars mount option was specified and -filename was a long filename. - -Version 1.34 ------------- -Fix error mapping of the TOO_MANY_LINKS (hardlinks) case. -Do not oops if root user kills cifs oplock kernel thread or -kills the cifsd thread (NB: killing the cifs kernel threads is not -recommended, unmount and rmmod cifs will kill them when they are -no longer needed). Fix readdir to ASCII servers (ie older servers -which do not support Unicode) and also require asterisk. -Fix out of memory case in which data could be written one page -off in the page cache. - -Version 1.33 ------------- -Fix caching problem, in which readdir of directory containing a file -which was cached could cause the file's time stamp to be updated -without invalidating the readahead data (so we could get stale -file data on the client for that file even as the server copy changed). -Cleanup response processing so cifsd can not loop when abnormally -terminated. - - -Version 1.32 ------------- -Fix oops in ls when Transact2 FindFirst (or FindNext) returns more than one -transact response for an SMB request and search entry split across two frames. -Add support for lsattr (getting ext2/ext3/reiserfs attr flags from the server) -as new protocol extensions. Do not send Get/Set calls for POSIX ACLs -unless server explicitly claims to support them in CIFS Unix extensions -POSIX ACL capability bit. Fix packet signing when multiuser mounting with -different users from the same client to the same server. Fix oops in -cifs_close. Add mount option for remapping reserved characters in -filenames (also allow recognizing files with created by SFU which have any -of these seven reserved characters, except backslash, to be recognized). -Fix invalid transact2 message (we were sometimes trying to interpret -oplock breaks as SMB responses). Add ioctl for checking that the -current uid matches the uid of the mounter (needed by umount.cifs). -Reduce the number of large buffer allocations in cifs response processing -(significantly reduces memory pressure under heavy stress with multiple -processes accessing the same server at the same time). - -Version 1.31 ------------- -Fix updates of DOS attributes and time fields so that files on NT4 servers -do not get marked delete on close. Display sizes of cifs buffer pools in -cifs stats. Fix oops in unmount when cifsd thread being killed by -shutdown. Add generic readv/writev and aio support. Report inode numbers -consistently in readdir and lookup (when serverino mount option is -specified use the inode number that the server reports - for both lookup -and readdir, otherwise by default the locally generated inode number is used -for inodes created in either path since servers are not always able to -provide unique inode numbers when exporting multiple volumes from under one -sharename). - -Version 1.30 ------------- -Allow new nouser_xattr mount parm to disable xattr support for user namespace. -Do not flag user_xattr mount parm in dmesg. Retry failures setting file time -(mostly affects NT4 servers) by retry with handle based network operation. -Add new POSIX Query FS Info for returning statfs info more accurately. -Handle passwords with multiple commas in them. - -Version 1.29 ------------- -Fix default mode in sysfs of cifs module parms. Remove old readdir routine. -Fix capabilities flags for large readx so as to allow reads larger than 64K. - -Version 1.28 ------------- -Add module init parm for large SMB buffer size (to allow it to be changed -from its default of 16K) which is especially useful for large file copy -when mounting with the directio mount option. Fix oops after -returning from mount when experimental ExtendedSecurity enabled and -SpnegoNegotiated returning invalid error. Fix case to retry better when -peek returns from 1 to 3 bytes on socket which should have more data. -Fixed path based calls (such as cifs lookup) to handle path names -longer than 530 (now can handle PATH_MAX). Fix pass through authentication -from Samba server to DC (Samba required dummy LM password). - -Version 1.27 ------------- -Turn off DNOTIFY (directory change notification support) by default -(unless built with the experimental flag) to fix hang with KDE -file browser. Fix DNOTIFY flag mappings. Fix hang (in wait_event -waiting on an SMB response) in SendReceive when session dies but -reconnects quickly from another task. Add module init parms for -minimum number of large and small network buffers in the buffer pools, -and for the maximum number of simultaneous requests. - -Version 1.26 ------------- -Add setfacl support to allow setting of ACLs remotely to Samba 3.10 and later -and other POSIX CIFS compliant servers. Fix error mapping for getfacl -to EOPNOTSUPP when server does not support posix acls on the wire. Fix -improperly zeroed buffer in CIFS Unix extensions set times call. - -Version 1.25 ------------- -Fix internationalization problem in cifs readdir with filenames that map to -longer UTF-8 strings than the string on the wire was in Unicode. Add workaround -for readdir to netapp servers. Fix search rewind (seek into readdir to return -non-consecutive entries). Do not do readdir when server negotiates -buffer size to small to fit filename. Add support for reading POSIX ACLs from -the server (add also acl and noacl mount options). - -Version 1.24 ------------- -Optionally allow using server side inode numbers, rather than client generated -ones by specifying mount option "serverino" - this is required for some apps -to work which double check hardlinked files and have persistent inode numbers. - -Version 1.23 ------------- -Multiple bigendian fixes. On little endian systems (for reconnect after -network failure) fix tcp session reconnect code so we do not try first -to reconnect on reverse of port 445. Treat reparse points (NTFS junctions) -as directories rather than symlinks because we can do follow link on them. - -Version 1.22 ------------- -Add config option to enable XATTR (extended attribute) support, mapping -xattr names in the "user." namespace space to SMB/CIFS EAs. Lots of -minor fixes pointed out by the Stanford SWAT checker (mostly missing -or out of order NULL pointer checks in little used error paths). - -Version 1.21 ------------- -Add new mount parm to control whether mode check (generic_permission) is done -on the client. If Unix extensions are enabled and the uids on the client -and server do not match, client permission checks are meaningless on -server uids that do not exist on the client (this does not affect the -normal ACL check which occurs on the server). Fix default uid -on mknod to match create and mkdir. Add optional mount parm to allow -override of the default uid behavior (in which the server sets the uid -and gid of newly created files). Normally for network filesystem mounts -user want the server to set the uid/gid on newly created files (rather than -using uid of the client processes you would in a local filesystem). - -Version 1.20 ------------- -Make transaction counts more consistent. Merge /proc/fs/cifs/SimultaneousOps -info into /proc/fs/cifs/DebugData. Fix oops in rare oops in readdir -(in build_wildcard_path_from_dentry). Fix mknod to pass type field -(block/char/fifo) properly. Remove spurious mount warning log entry when -credentials passed as mount argument. Set major/minor device number in -inode for block and char devices when unix extensions enabled. - -Version 1.19 ------------- -Fix /proc/fs/cifs/Stats and DebugData display to handle larger -amounts of return data. Properly limit requests to MAX_REQ (50 -is the usual maximum active multiplex SMB/CIFS requests per server). -Do not kill cifsd (and thus hurt the other SMB session) when more than one -session to the same server (but with different userids) exists and one -of the two user's smb sessions is being removed while leaving the other. -Do not loop reconnecting in cifsd demultiplex thread when admin -kills the thread without going through unmount. - -Version 1.18 ------------- -Do not rename hardlinked files (since that should be a noop). Flush -cached write behind data when reopening a file after session abend, -except when already in write. Grab per socket sem during reconnect -to avoid oops in sendmsg if overlapping with reconnect. Do not -reset cached inode file size on readdir for files open for write on -client. - - -Version 1.17 ------------- -Update number of blocks in file so du command is happier (in Linux a fake -blocksize of 512 is required for calculating number of blocks in inode). -Fix prepare write of partial pages to read in data from server if possible. -Fix race on tcpStatus field between unmount and reconnection code, causing -cifsd process sometimes to hang around forever. Improve out of memory -checks in cifs_filldir - -Version 1.16 ------------- -Fix incorrect file size in file handle based setattr on big endian hardware. -Fix oops in build_path_from_dentry when out of memory. Add checks for invalid -and closing file structs in writepage/partialpagewrite. Add statistics -for each mounted share (new menuconfig option). Fix endianness problem in -volume information displayed in /proc/fs/cifs/DebugData (only affects -affects big endian architectures). Prevent renames while constructing -path names for open, mkdir and rmdir. - -Version 1.15 ------------- -Change to mempools for alloc smb request buffers and multiplex structs -to better handle low memory problems (and potential deadlocks). - -Version 1.14 ------------- -Fix incomplete listings of large directories on Samba servers when Unix -extensions enabled. Fix oops when smb_buffer can not be allocated. Fix -rename deadlock when writing out dirty pages at same time. - -Version 1.13 ------------- -Fix open of files in which O_CREATE can cause the mode to change in -some cases. Fix case in which retry of write overlaps file close. -Fix PPC64 build error. Reduce excessive stack usage in smb password -hashing. Fix overwrite of Linux user's view of file mode to Windows servers. - -Version 1.12 ------------- -Fixes for large file copy, signal handling, socket retry, buffer -allocation and low memory situations. - -Version 1.11 ------------- -Better port 139 support to Windows servers (RFC1001/RFC1002 Session_Initialize) -also now allowing support for specifying client netbiosname. NT4 support added. - -Version 1.10 ------------- -Fix reconnection (and certain failed mounts) to properly wake up the -blocked users thread so it does not seem hung (in some cases was blocked -until the cifs receive timeout expired). Fix spurious error logging -to kernel log when application with open network files killed. - -Version 1.09 ------------- -Fix /proc/fs module unload warning message (that could be logged -to the kernel log). Fix intermittent failure in connectathon -test7 (hardlink count not immediately refreshed in case in which -inode metadata can be incorrectly kept cached when time near zero) - -Version 1.08 ------------- -Allow file_mode and dir_mode (specified at mount time) to be enforced -locally (the server already enforced its own ACLs too) for servers -that do not report the correct mode (do not support the -CIFS Unix Extensions). - -Version 1.07 ------------- -Fix some small memory leaks in some unmount error paths. Fix major leak -of cache pages in readpages causing multiple read oriented stress -testcases (including fsx, and even large file copy) to fail over time. - -Version 1.06 ------------- -Send NTCreateX with ATTR_POSIX if Linux/Unix extensions negotiated with server. -This allows files that differ only in case and improves performance of file -creation and file open to such servers. Fix semaphore conflict which causes -slow delete of open file to Samba (which unfortunately can cause an oplock -break to self while vfs_unlink held i_sem) which can hang for 20 seconds. - -Version 1.05 ------------- -fixes to cifs_readpages for fsx test case - -Version 1.04 ------------- -Fix caching data integrity bug when extending file size especially when no -oplock on file. Fix spurious logging of valid already parsed mount options -that are parsed outside of the cifs vfs such as nosuid. - - -Version 1.03 ------------- -Connect to server when port number override not specified, and tcp port -unitialized. Reset search to restart at correct file when kernel routine -filldir returns error during large directory searches (readdir). - -Version 1.02 ------------- -Fix caching problem when files opened by multiple clients in which -page cache could contain stale data, and write through did -not occur often enough while file was still open when read ahead -(read oplock) not allowed. Treat "sep=" when first mount option -as an override of comma as the default separator between mount -options. - -Version 1.01 ------------- -Allow passwords longer than 16 bytes. Allow null password string. - -Version 1.00 ------------- -Gracefully clean up failed mounts when attempting to mount to servers such as -Windows 98 that terminate tcp sessions during protocol negotiation. Handle -embedded commas in mount parsing of passwords. - -Version 0.99 ------------- -Invalidate local inode cached pages on oplock break and when last file -instance is closed so that the client does not continue using stale local -copy rather than later modified server copy of file. Do not reconnect -when server drops the tcp session prematurely before negotiate -protocol response. Fix oops in reopen_file when dentry freed. Allow -the support for CIFS Unix Extensions to be disabled via proc interface. - -Version 0.98 ------------- -Fix hang in commit_write during reconnection of open files under heavy load. -Fix unload_nls oops in a mount failure path. Serialize writes to same socket -which also fixes any possible races when cifs signatures are enabled in SMBs -being sent out of signature sequence number order. - -Version 0.97 ------------- -Fix byte range locking bug (endian problem) causing bad offset and -length. - -Version 0.96 ------------- -Fix oops (in send_sig) caused by CIFS unmount code trying to -wake up the demultiplex thread after it had exited. Do not log -error on harmless oplock release of closed handle. - -Version 0.95 ------------- -Fix unsafe global variable usage and password hash failure on gcc 3.3.1 -Fix problem reconnecting secondary mounts to same server after session -failure. Fix invalid dentry - race in mkdir when directory gets created -by another client between the lookup and mkdir. - -Version 0.94 ------------- -Fix to list processing in reopen_files. Fix reconnection when server hung -but tcpip session still alive. Set proper timeout on socket read. - -Version 0.93 ------------- -Add missing mount options including iocharset. SMP fixes in write and open. -Fix errors in reconnecting after TCP session failure. Fix module unloading -of default nls codepage - -Version 0.92 ------------- -Active smb transactions should never go negative (fix double FreeXid). Fix -list processing in file routines. Check return code on kmalloc in open. -Fix spinlock usage for SMP. - -Version 0.91 ------------- -Fix oops in reopen_files when invalid dentry. drop dentry on server rename -and on revalidate errors. Fix cases where pid is now tgid. Fix return code -on create hard link when server does not support them. - -Version 0.90 ------------- -Fix scheduling while atomic error in getting inode info on newly created file. -Fix truncate of existing files opened with O_CREAT but not O_TRUNC set. - -Version 0.89 ------------- -Fix oops on write to dead tcp session. Remove error log write for case when file open -O_CREAT but not O_EXCL - -Version 0.88 ------------- -Fix non-POSIX behavior on rename of open file and delete of open file by taking -advantage of trans2 SetFileInfo rename facility if available on target server. -Retry on ENOSPC and EAGAIN socket errors. - -Version 0.87 ------------- -Fix oops on big endian readdir. Set blksize to be even power of two (2**blkbits) to fix -allocation size miscalculation. After oplock token lost do not read through -cache. - -Version 0.86 ------------- -Fix oops on empty file readahead. Fix for file size handling for locally cached files. - -Version 0.85 ------------- -Fix oops in mkdir when server fails to return inode info. Fix oops in reopen_files -during auto reconnection to server after server recovered from failure. - -Version 0.84 ------------- -Finish support for Linux 2.5 open/create changes, which removes the -redundant NTCreate/QPathInfo/close that was sent during file create. -Enable oplock by default. Enable packet signing by default (needed to -access many recent Windows servers) - -Version 0.83 ------------- -Fix oops when mounting to long server names caused by inverted parms to kmalloc. -Fix MultiuserMount (/proc/fs/cifs configuration setting) so that when enabled -we will choose a cifs user session (smb uid) that better matches the local -uid if a) the mount uid does not match the current uid and b) we have another -session to the same server (ip address) for a different mount which -matches the current local uid. - -Version 0.82 ------------- -Add support for mknod of block or character devices. Fix oplock -code (distributed caching) to properly send response to oplock -break from server. - -Version 0.81 ------------- -Finish up CIFS packet digital signing for the default -NTLM security case. This should help Windows 2003 -network interoperability since it is common for -packet signing to be required now. Fix statfs (stat -f) -which recently started returning errors due to -invalid value (-1 instead of 0) being set in the -struct kstatfs f_ffiles field. - -Version 0.80 ------------ -Fix oops on stopping oplock thread when removing cifs when -built as module. - -Version 0.79 ------------- -Fix mount options for ro (readonly), uid, gid and file and directory mode. - -Version 0.78 ------------- -Fix errors displayed on failed mounts to be more understandable. -Fixed various incorrect or misleading smb to posix error code mappings. - -Version 0.77 ------------- -Fix display of NTFS DFS junctions to display as symlinks. -They are the network equivalent. Fix oops in -cifs_partialpagewrite caused by missing spinlock protection -of openfile linked list. Allow writebehind caching errors to -be returned to the application at file close. - -Version 0.76 ------------- -Clean up options displayed in /proc/mounts by show_options to -be more consistent with other filesystems. - -Version 0.75 ------------- -Fix delete of readonly file to Windows servers. Reflect -presence or absence of read only dos attribute in mode -bits for servers that do not support CIFS Unix extensions. -Fix shortened results on readdir of large directories to -servers supporting CIFS Unix extensions (caused by -incorrect resume key). - -Version 0.74 ------------- -Fix truncate bug (set file size) that could cause hangs e.g. running fsx - -Version 0.73 ------------- -unload nls if mount fails. - -Version 0.72 ------------- -Add resume key support to search (readdir) code to workaround -Windows bug. Add /proc/fs/cifs/LookupCacheEnable which -allows disabling caching of attribute information for -lookups. - -Version 0.71 ------------- -Add more oplock handling (distributed caching code). Remove -dead code. Remove excessive stack space utilization from -symlink routines. - -Version 0.70 ------------- -Fix oops in get dfs referral (triggered when null path sent in to -mount). Add support for overriding rsize at mount time. - -Version 0.69 ------------- -Fix buffer overrun in readdir which caused intermittent kernel oopses. -Fix writepage code to release kmap on write data. Allow "-ip=" new -mount option to be passed in on parameter distinct from the first part -(server name portion of) the UNC name. Allow override of the -tcp port of the target server via new mount option "-port=" - -Version 0.68 ------------- -Fix search handle leak on rewind. Fix setuid and gid so that they are -reflected in the local inode immediately. Cleanup of whitespace -to make 2.4 and 2.5 versions more consistent. - - -Version 0.67 ------------- -Fix signal sending so that captive thread (cifsd) exits on umount -(which was causing the warning in kmem_cache_free of the request buffers -at rmmod time). This had broken as a sideeffect of the recent global -kernel change to daemonize. Fix memory leak in readdir code which -showed up in "ls -R" (and applications that did search rewinding). - -Version 0.66 ------------- -Reconnect tids and fids after session reconnection (still do not -reconnect byte range locks though). Fix problem caching -lookup information for directory inodes, improving performance, -especially in deep directory trees. Fix various build warnings. - -Version 0.65 ------------- -Finish fixes to commit write for caching/readahead consistency. fsx -now works to Samba servers. Fix oops caused when readahead -was interrupted by a signal. - -Version 0.64 ------------- -Fix data corruption (in partial page after truncate) that caused fsx to -fail to Windows servers. Cleaned up some extraneous error logging in -common error paths. Add generic sendfile support. - -Version 0.63 ------------- -Fix memory leak in AllocMidQEntry. -Finish reconnection logic, so connection with server can be dropped -(or server rebooted) and the cifs client will reconnect. - -Version 0.62 ------------- -Fix temporary socket leak when bad userid or password specified -(or other SMBSessSetup failure). Increase maximum buffer size to slightly -over 16K to allow negotiation of up to Samba and Windows server default read -sizes. Add support for readpages - -Version 0.61 ------------- -Fix oops when username not passed in on mount. Extensive fixes and improvements -to error logging (strip redundant newlines, change debug macros to ensure newline -passed in and to be more consistent). Fix writepage wrong file handle problem, -a readonly file handle could be incorrectly used to attempt to write out -file updates through the page cache to multiply open files. This could cause -the iozone benchmark to fail on the fwrite test. Fix bug mounting two different -shares to the same Windows server when using different usernames -(doing this to Samba servers worked but Windows was rejecting it) - now it is -possible to use different userids when connecting to the same server from a -Linux client. Fix oops when treeDisconnect called during unmount on -previously freed socket. - -Version 0.60 ------------- -Fix oops in readpages caused by not setting address space operations in inode in -rare code path. - -Version 0.59 ------------- -Includes support for deleting of open files and renaming over existing files (per POSIX -requirement). Add readlink support for Windows junction points (directory symlinks). - -Version 0.58 ------------- -Changed read and write to go through pagecache. Added additional address space operations. -Memory mapped operations now working. - -Version 0.57 ------------- -Added writepage code for additional memory mapping support. Fixed leak in xids causing -the simultaneous operations counter (/proc/fs/cifs/SimultaneousOps) to increase on -every stat call. Additional formatting cleanup. - -Version 0.56 ------------- -Fix bigendian bug in order of time conversion. Merge 2.5 to 2.4 version. Formatting cleanup. - -Version 0.55 ------------- -Fixes from Zwane Mwaikambo for adding missing return code checking in a few places. -Also included a modified version of his fix to protect global list manipulation of -the smb session and tree connection and mid related global variables. - -Version 0.54 ------------- -Fix problem with captive thread hanging around at unmount time. Adjust to 2.5.42-pre -changes to superblock layout. Remove wasteful allocation of smb buffers (now the send -buffer is reused for responses). Add more oplock handling. Additional minor cleanup. - -Version 0.53 ------------- -More stylistic updates to better match kernel style. Add additional statistics -for filesystem which can be viewed via /proc/fs/cifs. Add more pieces of NTLMv2 -and CIFS Packet Signing enablement. - -Version 0.52 ------------- -Replace call to sleep_on with safer wait_on_event. -Make stylistic changes to better match kernel style recommendations. -Remove most typedef usage (except for the PDUs themselves). - -Version 0.51 ------------- -Update mount so the -unc mount option is no longer required (the ip address can be specified -in a UNC style device name. Implementation of readpage/writepage started. - -Version 0.50 ------------- -Fix intermittent problem with incorrect smb header checking on badly -fragmented tcp responses - -Version 0.49 ------------- -Fixes to setting of allocation size and file size. - -Version 0.48 ------------- -Various 2.5.38 fixes. Now works on 2.5.38 - -Version 0.47 ------------- -Prepare for 2.5 kernel merge. Remove ifdefs. - -Version 0.46 ------------- -Socket buffer management fixes. Fix dual free. - -Version 0.45 ------------- -Various big endian fixes for hardlinks and symlinks and also for dfs. - -Version 0.44 ------------- -Various big endian fixes for servers with Unix extensions such as Samba - -Version 0.43 ------------- -Various FindNext fixes for incorrect filenames on large directory searches on big endian -clients. basic posix file i/o tests now work on big endian machines, not just le - -Version 0.42 ------------- -SessionSetup and NegotiateProtocol now work from Big Endian machines. -Various Big Endian fixes found during testing on the Linux on 390. Various fixes for compatibility with older -versions of 2.4 kernel (now builds and works again on kernels at least as early as 2.4.7). - -Version 0.41 ------------- -Various minor fixes for Connectathon Posix "basic" file i/o test suite. Directory caching fixed so hardlinked -files now return the correct number of links on fstat as they are repeatedly linked and unlinked. - -Version 0.40 ------------- -Implemented "Raw" (i.e. not encapsulated in SPNEGO) NTLMSSP (i.e. the Security Provider Interface used to negotiate -session advanced session authentication). Raw NTLMSSP is preferred by Windows 2000 Professional and Windows XP. -Began implementing support for SPNEGO encapsulation of NTLMSSP based session authentication blobs -(which is the mechanism preferred by Windows 2000 server in the absence of Kerberos). - -Version 0.38 ------------- -Introduced optional mount helper utility mount.cifs and made coreq changes to cifs vfs to enable -it. Fixed a few bugs in the DFS code (e.g. bcc two bytes too short and incorrect uid in PDU). - -Version 0.37 ------------- -Rewrote much of connection and mount/unmount logic to handle bugs with -multiple uses to same share, multiple users to same server etc. - -Version 0.36 ------------- -Fixed major problem with dentry corruption (missing call to dput) - -Version 0.35 ------------- -Rewrite of readdir code to fix bug. Various fixes for bigendian machines. -Begin adding oplock support. Multiusermount and oplockEnabled flags added to /proc/fs/cifs -although corresponding function not fully implemented in the vfs yet - -Version 0.34 ------------- -Fixed dentry caching bug, misc. cleanup - -Version 0.33 ------------- -Fixed 2.5 support to handle build and configure changes as well as misc. 2.5 changes. Now can build -on current 2.5 beta version (2.5.24) of the Linux kernel as well as on 2.4 Linux kernels. -Support for STATUS codes (newer 32 bit NT error codes) added. DFS support begun to be added. - -Version 0.32 ------------- -Unix extensions (symlink, readlink, hardlink, chmod and some chgrp and chown) implemented -and tested against Samba 2.2.5 - - -Version 0.31 ------------- -1) Fixed lockrange to be correct (it was one byte too short) - -2) Fixed GETLK (i.e. the fcntl call to test a range of bytes in a file to see if locked) to correctly -show range as locked when there is a conflict with an existing lock. - -3) default file perms are now 2767 (indicating support for mandatory locks) instead of 777 for directories -in most cases. Eventually will offer optional ability to query server for the correct perms. - -3) Fixed eventual trap when mounting twice to different shares on the same server when the first succeeded -but the second one was invalid and failed (the second one was incorrectly disconnecting the tcp and smb -session) - -4) Fixed error logging of valid mount options - -5) Removed logging of password field. - -6) Moved negotiate, treeDisconnect and uloggoffX (only tConx and SessSetup remain in connect.c) to cifssmb.c -and cleaned them up and made them more consistent with other cifs functions. - -7) Server support for Unix extensions is now fully detected and FindFirst is implemented both ways -(with or without Unix extensions) but FindNext and QueryPathInfo with the Unix extensions are not completed, -nor is the symlink support using the Unix extensions - -8) Started adding the readlink and follow_link code - -Version 0.3 ------------ -Initial drop - diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index aa0d68b086eb..1964d212ab08 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_CIFS) += cifs.o cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \ cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ - readdir.o ioctl.o sess.o export.o smb1ops.o + readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o cifs-$(CONFIG_CIFS_ACL) += cifsacl.o diff --git a/fs/cifs/README b/fs/cifs/README deleted file mode 100644 index 2d5622f60e11..000000000000 --- a/fs/cifs/README +++ /dev/null @@ -1,753 +0,0 @@ -The CIFS VFS support for Linux supports many advanced network filesystem -features such as hierarchical dfs like namespace, hardlinks, locking and more. -It was designed to comply with the SNIA CIFS Technical Reference (which -supersedes the 1992 X/Open SMB Standard) as well as to perform best practice -practical interoperability with Windows 2000, Windows XP, Samba and equivalent -servers. This code was developed in participation with the Protocol Freedom -Information Foundation. - -Please see - http://protocolfreedom.org/ and - http://samba.org/samba/PFIF/ -for more details. - - -For questions or bug reports please contact: - sfrench@samba.org (sfrench@us.ibm.com) - -Build instructions: -================== -For Linux 2.4: -1) Get the kernel source (e.g.from http://www.kernel.org) -and download the cifs vfs source (see the project page -at http://us1.samba.org/samba/Linux_CIFS_client.html) -and change directory into the top of the kernel directory -then patch the kernel (e.g. "patch -p1 < cifs_24.patch") -to add the cifs vfs to your kernel configure options if -it has not already been added (e.g. current SuSE and UL -users do not need to apply the cifs_24.patch since the cifs vfs is -already in the kernel configure menu) and then -mkdir linux/fs/cifs and then copy the current cifs vfs files from -the cifs download to your kernel build directory e.g. - - cp <cifs_download_dir>/fs/cifs/* to <kernel_download_dir>/fs/cifs - -2) make menuconfig (or make xconfig) -3) select cifs from within the network filesystem choices -4) save and exit -5) make dep -6) make modules (or "make" if CIFS VFS not to be built as a module) - -For Linux 2.6: -1) Download the kernel (e.g. from http://www.kernel.org) -and change directory into the top of the kernel directory tree -(e.g. /usr/src/linux-2.5.73) -2) make menuconfig (or make xconfig) -3) select cifs from within the network filesystem choices -4) save and exit -5) make - - -Installation instructions: -========================= -If you have built the CIFS vfs as module (successfully) simply -type "make modules_install" (or if you prefer, manually copy the file to -the modules directory e.g. /lib/modules/2.4.10-4GB/kernel/fs/cifs/cifs.o). - -If you have built the CIFS vfs into the kernel itself, follow the instructions -for your distribution on how to install a new kernel (usually you -would simply type "make install"). - -If you do not have the utility mount.cifs (in the Samba 3.0 source tree and on -the CIFS VFS web site) copy it to the same directory in which mount.smbfs and -similar files reside (usually /sbin). Although the helper software is not -required, mount.cifs is recommended. Eventually the Samba 3.0 utility program -"net" may also be helpful since it may someday provide easier mount syntax for -users who are used to Windows e.g. - net use <mount point> <UNC name or cifs URL> -Note that running the Winbind pam/nss module (logon service) on all of your -Linux clients is useful in mapping Uids and Gids consistently across the -domain to the proper network user. The mount.cifs mount helper can be -trivially built from Samba 3.0 or later source e.g. by executing: - - gcc samba/source/client/mount.cifs.c -o mount.cifs - -If cifs is built as a module, then the size and number of network buffers -and maximum number of simultaneous requests to one server can be configured. -Changing these from their defaults is not recommended. By executing modinfo - modinfo kernel/fs/cifs/cifs.ko -on kernel/fs/cifs/cifs.ko the list of configuration changes that can be made -at module initialization time (by running insmod cifs.ko) can be seen. - -Allowing User Mounts -==================== -To permit users to mount and unmount over directories they own is possible -with the cifs vfs. A way to enable such mounting is to mark the mount.cifs -utility as suid (e.g. "chmod +s /sbin/mount.cifs). To enable users to -umount shares they mount requires -1) mount.cifs version 1.4 or later -2) an entry for the share in /etc/fstab indicating that a user may -unmount it e.g. -//server/usersharename /mnt/username cifs user 0 0 - -Note that when the mount.cifs utility is run suid (allowing user mounts), -in order to reduce risks, the "nosuid" mount flag is passed in on mount to -disallow execution of an suid program mounted on the remote target. -When mount is executed as root, nosuid is not passed in by default, -and execution of suid programs on the remote target would be enabled -by default. This can be changed, as with nfs and other filesystems, -by simply specifying "nosuid" among the mount options. For user mounts -though to be able to pass the suid flag to mount requires rebuilding -mount.cifs with the following flag: - - gcc samba/source/client/mount.cifs.c -DCIFS_ALLOW_USR_SUID -o mount.cifs - -There is a corresponding manual page for cifs mounting in the Samba 3.0 and -later source tree in docs/manpages/mount.cifs.8 - -Allowing User Unmounts -====================== -To permit users to ummount directories that they have user mounted (see above), -the utility umount.cifs may be used. It may be invoked directly, or if -umount.cifs is placed in /sbin, umount can invoke the cifs umount helper -(at least for most versions of the umount utility) for umount of cifs -mounts, unless umount is invoked with -i (which will avoid invoking a umount -helper). As with mount.cifs, to enable user unmounts umount.cifs must be marked -as suid (e.g. "chmod +s /sbin/umount.cifs") or equivalent (some distributions -allow adding entries to a file to the /etc/permissions file to achieve the -equivalent suid effect). For this utility to succeed the target path -must be a cifs mount, and the uid of the current user must match the uid -of the user who mounted the resource. - -Also note that the customary way of allowing user mounts and unmounts is -(instead of using mount.cifs and unmount.cifs as suid) to add a line -to the file /etc/fstab for each //server/share you wish to mount, but -this can become unwieldy when potential mount targets include many -or unpredictable UNC names. - -Samba Considerations -==================== -To get the maximum benefit from the CIFS VFS, we recommend using a server that -supports the SNIA CIFS Unix Extensions standard (e.g. Samba 2.2.5 or later or -Samba 3.0) but the CIFS vfs works fine with a wide variety of CIFS servers. -Note that uid, gid and file permissions will display default values if you do -not have a server that supports the Unix extensions for CIFS (such as Samba -2.2.5 or later). To enable the Unix CIFS Extensions in the Samba server, add -the line: - - unix extensions = yes - -to your smb.conf file on the server. Note that the following smb.conf settings -are also useful (on the Samba server) when the majority of clients are Unix or -Linux: - - case sensitive = yes - delete readonly = yes - ea support = yes - -Note that server ea support is required for supporting xattrs from the Linux -cifs client, and that EA support is present in later versions of Samba (e.g. -3.0.6 and later (also EA support works in all versions of Windows, at least to -shares on NTFS filesystems). Extended Attribute (xattr) support is an optional -feature of most Linux filesystems which may require enabling via -make menuconfig. Client support for extended attributes (user xattr) can be -disabled on a per-mount basis by specifying "nouser_xattr" on mount. - -The CIFS client can get and set POSIX ACLs (getfacl, setfacl) to Samba servers -version 3.10 and later. Setting POSIX ACLs requires enabling both XATTR and -then POSIX support in the CIFS configuration options when building the cifs -module. POSIX ACL support can be disabled on a per mount basic by specifying -"noacl" on mount. - -Some administrators may want to change Samba's smb.conf "map archive" and -"create mask" parameters from the default. Unless the create mask is changed -newly created files can end up with an unnecessarily restrictive default mode, -which may not be what you want, although if the CIFS Unix extensions are -enabled on the server and client, subsequent setattr calls (e.g. chmod) can -fix the mode. Note that creating special devices (mknod) remotely -may require specifying a mkdev function to Samba if you are not using -Samba 3.0.6 or later. For more information on these see the manual pages -("man smb.conf") on the Samba server system. Note that the cifs vfs, -unlike the smbfs vfs, does not read the smb.conf on the client system -(the few optional settings are passed in on mount via -o parameters instead). -Note that Samba 2.2.7 or later includes a fix that allows the CIFS VFS to delete -open files (required for strict POSIX compliance). Windows Servers already -supported this feature. Samba server does not allow symlinks that refer to files -outside of the share, so in Samba versions prior to 3.0.6, most symlinks to -files with absolute paths (ie beginning with slash) such as: - ln -s /mnt/foo bar -would be forbidden. Samba 3.0.6 server or later includes the ability to create -such symlinks safely by converting unsafe symlinks (ie symlinks to server -files that are outside of the share) to a samba specific format on the server -that is ignored by local server applications and non-cifs clients and that will -not be traversed by the Samba server). This is opaque to the Linux client -application using the cifs vfs. Absolute symlinks will work to Samba 3.0.5 or -later, but only for remote clients using the CIFS Unix extensions, and will -be invisbile to Windows clients and typically will not affect local -applications running on the same server as Samba. - -Use instructions: -================ -Once the CIFS VFS support is built into the kernel or installed as a module -(cifs.o), you can use mount syntax like the following to access Samba or Windows -servers: - - mount -t cifs //9.53.216.11/e$ /mnt -o user=myname,pass=mypassword - -Before -o the option -v may be specified to make the mount.cifs -mount helper display the mount steps more verbosely. -After -o the following commonly used cifs vfs specific options -are supported: - - user=<username> - pass=<password> - domain=<domain name> - -Other cifs mount options are described below. Use of TCP names (in addition to -ip addresses) is available if the mount helper (mount.cifs) is installed. If -you do not trust the server to which are mounted, or if you do not have -cifs signing enabled (and the physical network is insecure), consider use -of the standard mount options "noexec" and "nosuid" to reduce the risk of -running an altered binary on your local system (downloaded from a hostile server -or altered by a hostile router). - -Although mounting using format corresponding to the CIFS URL specification is -not possible in mount.cifs yet, it is possible to use an alternate format -for the server and sharename (which is somewhat similar to NFS style mount -syntax) instead of the more widely used UNC format (i.e. \\server\share): - mount -t cifs tcp_name_of_server:share_name /mnt -o user=myname,pass=mypasswd - -When using the mount helper mount.cifs, passwords may be specified via alternate -mechanisms, instead of specifying it after -o using the normal "pass=" syntax -on the command line: -1) By including it in a credential file. Specify credentials=filename as one -of the mount options. Credential files contain two lines - username=someuser - password=your_password -2) By specifying the password in the PASSWD environment variable (similarly -the user name can be taken from the USER environment variable). -3) By specifying the password in a file by name via PASSWD_FILE -4) By specifying the password in a file by file descriptor via PASSWD_FD - -If no password is provided, mount.cifs will prompt for password entry - -Restrictions -============ -Servers must support either "pure-TCP" (port 445 TCP/IP CIFS connections) or RFC -1001/1002 support for "Netbios-Over-TCP/IP." This is not likely to be a -problem as most servers support this. - -Valid filenames differ between Windows and Linux. Windows typically restricts -filenames which contain certain reserved characters (e.g.the character : -which is used to delimit the beginning of a stream name by Windows), while -Linux allows a slightly wider set of valid characters in filenames. Windows -servers can remap such characters when an explicit mapping is specified in -the Server's registry. Samba starting with version 3.10 will allow such -filenames (ie those which contain valid Linux characters, which normally -would be forbidden for Windows/CIFS semantics) as long as the server is -configured for Unix Extensions (and the client has not disabled -/proc/fs/cifs/LinuxExtensionsEnabled). - - -CIFS VFS Mount Options -====================== -A partial list of the supported mount options follows: - user The user name to use when trying to establish - the CIFS session. - password The user password. If the mount helper is - installed, the user will be prompted for password - if not supplied. - ip The ip address of the target server - unc The target server Universal Network Name (export) to - mount. - domain Set the SMB/CIFS workgroup name prepended to the - username during CIFS session establishment - forceuid Set the default uid for inodes to the uid - passed in on mount. For mounts to servers - which do support the CIFS Unix extensions, such as a - properly configured Samba server, the server provides - the uid, gid and mode so this parameter should not be - specified unless the server and clients uid and gid - numbering differ. If the server and client are in the - same domain (e.g. running winbind or nss_ldap) and - the server supports the Unix Extensions then the uid - and gid can be retrieved from the server (and uid - and gid would not have to be specifed on the mount. - For servers which do not support the CIFS Unix - extensions, the default uid (and gid) returned on lookup - of existing files will be the uid (gid) of the person - who executed the mount (root, except when mount.cifs - is configured setuid for user mounts) unless the "uid=" - (gid) mount option is specified. Also note that permission - checks (authorization checks) on accesses to a file occur - at the server, but there are cases in which an administrator - may want to restrict at the client as well. For those - servers which do not report a uid/gid owner - (such as Windows), permissions can also be checked at the - client, and a crude form of client side permission checking - can be enabled by specifying file_mode and dir_mode on - the client. (default) - forcegid (similar to above but for the groupid instead of uid) (default) - noforceuid Fill in file owner information (uid) by requesting it from - the server if possible. With this option, the value given in - the uid= option (on mount) will only be used if the server - can not support returning uids on inodes. - noforcegid (similar to above but for the group owner, gid, instead of uid) - uid Set the default uid for inodes, and indicate to the - cifs kernel driver which local user mounted. If the server - supports the unix extensions the default uid is - not used to fill in the owner fields of inodes (files) - unless the "forceuid" parameter is specified. - gid Set the default gid for inodes (similar to above). - file_mode If CIFS Unix extensions are not supported by the server - this overrides the default mode for file inodes. - fsc Enable local disk caching using FS-Cache (off by default). This - option could be useful to improve performance on a slow link, - heavily loaded server and/or network where reading from the - disk is faster than reading from the server (over the network). - This could also impact scalability positively as the - number of calls to the server are reduced. However, local - caching is not suitable for all workloads for e.g. read-once - type workloads. So, you need to consider carefully your - workload/scenario before using this option. Currently, local - disk caching is functional for CIFS files opened as read-only. - dir_mode If CIFS Unix extensions are not supported by the server - this overrides the default mode for directory inodes. - port attempt to contact the server on this tcp port, before - trying the usual ports (port 445, then 139). - iocharset Codepage used to convert local path names to and from - Unicode. Unicode is used by default for network path - names if the server supports it. If iocharset is - not specified then the nls_default specified - during the local client kernel build will be used. - If server does not support Unicode, this parameter is - unused. - rsize default read size (usually 16K). The client currently - can not use rsize larger than CIFSMaxBufSize. CIFSMaxBufSize - defaults to 16K and may be changed (from 8K to the maximum - kmalloc size allowed by your kernel) at module install time - for cifs.ko. Setting CIFSMaxBufSize to a very large value - will cause cifs to use more memory and may reduce performance - in some cases. To use rsize greater than 127K (the original - cifs protocol maximum) also requires that the server support - a new Unix Capability flag (for very large read) which some - newer servers (e.g. Samba 3.0.26 or later) do. rsize can be - set from a minimum of 2048 to a maximum of 130048 (127K or - CIFSMaxBufSize, whichever is smaller) - wsize default write size (default 57344) - maximum wsize currently allowed by CIFS is 57344 (fourteen - 4096 byte pages) - actimeo=n attribute cache timeout in seconds (default 1 second). - After this timeout, the cifs client requests fresh attribute - information from the server. This option allows to tune the - attribute cache timeout to suit the workload needs. Shorter - timeouts mean better the cache coherency, but increased number - of calls to the server. Longer timeouts mean reduced number - of calls to the server at the expense of less stricter cache - coherency checks (i.e. incorrect attribute cache for a short - period of time). - rw mount the network share read-write (note that the - server may still consider the share read-only) - ro mount network share read-only - version used to distinguish different versions of the - mount helper utility (not typically needed) - sep if first mount option (after the -o), overrides - the comma as the separator between the mount - parms. e.g. - -o user=myname,password=mypassword,domain=mydom - could be passed instead with period as the separator by - -o sep=.user=myname.password=mypassword.domain=mydom - this might be useful when comma is contained within username - or password or domain. This option is less important - when the cifs mount helper cifs.mount (version 1.1 or later) - is used. - nosuid Do not allow remote executables with the suid bit - program to be executed. This is only meaningful for mounts - to servers such as Samba which support the CIFS Unix Extensions. - If you do not trust the servers in your network (your mount - targets) it is recommended that you specify this option for - greater security. - exec Permit execution of binaries on the mount. - noexec Do not permit execution of binaries on the mount. - dev Recognize block devices on the remote mount. - nodev Do not recognize devices on the remote mount. - suid Allow remote files on this mountpoint with suid enabled to - be executed (default for mounts when executed as root, - nosuid is default for user mounts). - credentials Although ignored by the cifs kernel component, it is used by - the mount helper, mount.cifs. When mount.cifs is installed it - opens and reads the credential file specified in order - to obtain the userid and password arguments which are passed to - the cifs vfs. - guest Although ignored by the kernel component, the mount.cifs - mount helper will not prompt the user for a password - if guest is specified on the mount options. If no - password is specified a null password will be used. - perm Client does permission checks (vfs_permission check of uid - and gid of the file against the mode and desired operation), - Note that this is in addition to the normal ACL check on the - target machine done by the server software. - Client permission checking is enabled by default. - noperm Client does not do permission checks. This can expose - files on this mount to access by other users on the local - client system. It is typically only needed when the server - supports the CIFS Unix Extensions but the UIDs/GIDs on the - client and server system do not match closely enough to allow - access by the user doing the mount, but it may be useful with - non CIFS Unix Extension mounts for cases in which the default - mode is specified on the mount but is not to be enforced on the - client (e.g. perhaps when MultiUserMount is enabled) - Note that this does not affect the normal ACL check on the - target machine done by the server software (of the server - ACL against the user name provided at mount time). - serverino Use server's inode numbers instead of generating automatically - incrementing inode numbers on the client. Although this will - make it easier to spot hardlinked files (as they will have - the same inode numbers) and inode numbers may be persistent, - note that the server does not guarantee that the inode numbers - are unique if multiple server side mounts are exported under a - single share (since inode numbers on the servers might not - be unique if multiple filesystems are mounted under the same - shared higher level directory). Note that some older - (e.g. pre-Windows 2000) do not support returning UniqueIDs - or the CIFS Unix Extensions equivalent and for those - this mount option will have no effect. Exporting cifs mounts - under nfsd requires this mount option on the cifs mount. - This is now the default if server supports the - required network operation. - noserverino Client generates inode numbers (rather than using the actual one - from the server). These inode numbers will vary after - unmount or reboot which can confuse some applications, - but not all server filesystems support unique inode - numbers. - setuids If the CIFS Unix extensions are negotiated with the server - the client will attempt to set the effective uid and gid of - the local process on newly created files, directories, and - devices (create, mkdir, mknod). If the CIFS Unix Extensions - are not negotiated, for newly created files and directories - instead of using the default uid and gid specified on - the mount, cache the new file's uid and gid locally which means - that the uid for the file can change when the inode is - reloaded (or the user remounts the share). - nosetuids The client will not attempt to set the uid and gid on - on newly created files, directories, and devices (create, - mkdir, mknod) which will result in the server setting the - uid and gid to the default (usually the server uid of the - user who mounted the share). Letting the server (rather than - the client) set the uid and gid is the default. If the CIFS - Unix Extensions are not negotiated then the uid and gid for - new files will appear to be the uid (gid) of the mounter or the - uid (gid) parameter specified on the mount. - netbiosname When mounting to servers via port 139, specifies the RFC1001 - source name to use to represent the client netbios machine - name when doing the RFC1001 netbios session initialize. - direct Do not do inode data caching on files opened on this mount. - This precludes mmapping files on this mount. In some cases - with fast networks and little or no caching benefits on the - client (e.g. when the application is doing large sequential - reads bigger than page size without rereading the same data) - this can provide better performance than the default - behavior which caches reads (readahead) and writes - (writebehind) through the local Linux client pagecache - if oplock (caching token) is granted and held. Note that - direct allows write operations larger than page size - to be sent to the server. - strictcache Use for switching on strict cache mode. In this mode the - client read from the cache all the time it has Oplock Level II, - otherwise - read from the server. All written data are stored - in the cache, but if the client doesn't have Exclusive Oplock, - it writes the data to the server. - rwpidforward Forward pid of a process who opened a file to any read or write - operation on that file. This prevent applications like WINE - from failing on read and write if we use mandatory brlock style. - acl Allow setfacl and getfacl to manage posix ACLs if server - supports them. (default) - noacl Do not allow setfacl and getfacl calls on this mount - user_xattr Allow getting and setting user xattrs (those attributes whose - name begins with "user." or "os2.") as OS/2 EAs (extended - attributes) to the server. This allows support of the - setfattr and getfattr utilities. (default) - nouser_xattr Do not allow getfattr/setfattr to get/set/list xattrs - mapchars Translate six of the seven reserved characters (not backslash) - *?<>|: - to the remap range (above 0xF000), which also - allows the CIFS client to recognize files created with - such characters by Windows's POSIX emulation. This can - also be useful when mounting to most versions of Samba - (which also forbids creating and opening files - whose names contain any of these seven characters). - This has no effect if the server does not support - Unicode on the wire. - nomapchars Do not translate any of these seven characters (default). - nocase Request case insensitive path name matching (case - sensitive is the default if the server supports it). - (mount option "ignorecase" is identical to "nocase") - posixpaths If CIFS Unix extensions are supported, attempt to - negotiate posix path name support which allows certain - characters forbidden in typical CIFS filenames, without - requiring remapping. (default) - noposixpaths If CIFS Unix extensions are supported, do not request - posix path name support (this may cause servers to - reject creatingfile with certain reserved characters). - nounix Disable the CIFS Unix Extensions for this mount (tree - connection). This is rarely needed, but it may be useful - in order to turn off multiple settings all at once (ie - posix acls, posix locks, posix paths, symlink support - and retrieving uids/gids/mode from the server) or to - work around a bug in server which implement the Unix - Extensions. - nobrl Do not send byte range lock requests to the server. - This is necessary for certain applications that break - with cifs style mandatory byte range locks (and most - cifs servers do not yet support requesting advisory - byte range locks). - forcemandatorylock Even if the server supports posix (advisory) byte range - locking, send only mandatory lock requests. For some - (presumably rare) applications, originally coded for - DOS/Windows, which require Windows style mandatory byte range - locking, they may be able to take advantage of this option, - forcing the cifs client to only send mandatory locks - even if the cifs server would support posix advisory locks. - "forcemand" is accepted as a shorter form of this mount - option. - nostrictsync If this mount option is set, when an application does an - fsync call then the cifs client does not send an SMB Flush - to the server (to force the server to write all dirty data - for this file immediately to disk), although cifs still sends - all dirty (cached) file data to the server and waits for the - server to respond to the write. Since SMB Flush can be - very slow, and some servers may be reliable enough (to risk - delaying slightly flushing the data to disk on the server), - turning on this option may be useful to improve performance for - applications that fsync too much, at a small risk of server - crash. If this mount option is not set, by default cifs will - send an SMB flush request (and wait for a response) on every - fsync call. - nodfs Disable DFS (global name space support) even if the - server claims to support it. This can help work around - a problem with parsing of DFS paths with Samba server - versions 3.0.24 and 3.0.25. - remount remount the share (often used to change from ro to rw mounts - or vice versa) - cifsacl Report mode bits (e.g. on stat) based on the Windows ACL for - the file. (EXPERIMENTAL) - servern Specify the server 's netbios name (RFC1001 name) to use - when attempting to setup a session to the server. - This is needed for mounting to some older servers (such - as OS/2 or Windows 98 and Windows ME) since they do not - support a default server name. A server name can be up - to 15 characters long and is usually uppercased. - sfu When the CIFS Unix Extensions are not negotiated, attempt to - create device files and fifos in a format compatible with - Services for Unix (SFU). In addition retrieve bits 10-12 - of the mode via the SETFILEBITS extended attribute (as - SFU does). In the future the bottom 9 bits of the - mode also will be emulated using queries of the security - descriptor (ACL). - mfsymlinks Enable support for Minshall+French symlinks - (see http://wiki.samba.org/index.php/UNIX_Extensions#Minshall.2BFrench_symlinks) - This option is ignored when specified together with the - 'sfu' option. Minshall+French symlinks are used even if - the server supports the CIFS Unix Extensions. - sign Must use packet signing (helps avoid unwanted data modification - by intermediate systems in the route). Note that signing - does not work with lanman or plaintext authentication. - seal Must seal (encrypt) all data on this mounted share before - sending on the network. Requires support for Unix Extensions. - Note that this differs from the sign mount option in that it - causes encryption of data sent over this mounted share but other - shares mounted to the same server are unaffected. - locallease This option is rarely needed. Fcntl F_SETLEASE is - used by some applications such as Samba and NFSv4 server to - check to see whether a file is cacheable. CIFS has no way - to explicitly request a lease, but can check whether a file - is cacheable (oplocked). Unfortunately, even if a file - is not oplocked, it could still be cacheable (ie cifs client - could grant fcntl leases if no other local processes are using - the file) for cases for example such as when the server does not - support oplocks and the user is sure that the only updates to - the file will be from this client. Specifying this mount option - will allow the cifs client to check for leases (only) locally - for files which are not oplocked instead of denying leases - in that case. (EXPERIMENTAL) - sec Security mode. Allowed values are: - none attempt to connection as a null user (no name) - krb5 Use Kerberos version 5 authentication - krb5i Use Kerberos authentication and packet signing - ntlm Use NTLM password hashing (default) - ntlmi Use NTLM password hashing with signing (if - /proc/fs/cifs/PacketSigningEnabled on or if - server requires signing also can be the default) - ntlmv2 Use NTLMv2 password hashing - ntlmv2i Use NTLMv2 password hashing with packet signing - lanman (if configured in kernel config) use older - lanman hash -hard Retry file operations if server is not responding -soft Limit retries to unresponsive servers (usually only - one retry) before returning an error. (default) - -The mount.cifs mount helper also accepts a few mount options before -o -including: - - -S take password from stdin (equivalent to setting the environment - variable "PASSWD_FD=0" - -V print mount.cifs version - -? display simple usage information - -With most 2.6 kernel versions of modutils, the version of the cifs kernel -module can be displayed via modinfo. - -Misc /proc/fs/cifs Flags and Debug Info -======================================= -Informational pseudo-files: -DebugData Displays information about active CIFS sessions and - shares, features enabled as well as the cifs.ko - version. -Stats Lists summary resource usage information as well as per - share statistics, if CONFIG_CIFS_STATS in enabled - in the kernel configuration. - -Configuration pseudo-files: -PacketSigningEnabled If set to one, cifs packet signing is enabled - and will be used if the server requires - it. If set to two, cifs packet signing is - required even if the server considers packet - signing optional. (default 1) -SecurityFlags Flags which control security negotiation and - also packet signing. Authentication (may/must) - flags (e.g. for NTLM and/or NTLMv2) may be combined with - the signing flags. Specifying two different password - hashing mechanisms (as "must use") on the other hand - does not make much sense. Default flags are - 0x07007 - (NTLM, NTLMv2 and packet signing allowed). The maximum - allowable flags if you want to allow mounts to servers - using weaker password hashes is 0x37037 (lanman, - plaintext, ntlm, ntlmv2, signing allowed). Some - SecurityFlags require the corresponding menuconfig - options to be enabled (lanman and plaintext require - CONFIG_CIFS_WEAK_PW_HASH for example). Enabling - plaintext authentication currently requires also - enabling lanman authentication in the security flags - because the cifs module only supports sending - laintext passwords using the older lanman dialect - form of the session setup SMB. (e.g. for authentication - using plain text passwords, set the SecurityFlags - to 0x30030): - - may use packet signing 0x00001 - must use packet signing 0x01001 - may use NTLM (most common password hash) 0x00002 - must use NTLM 0x02002 - may use NTLMv2 0x00004 - must use NTLMv2 0x04004 - may use Kerberos security 0x00008 - must use Kerberos 0x08008 - may use lanman (weak) password hash 0x00010 - must use lanman password hash 0x10010 - may use plaintext passwords 0x00020 - must use plaintext passwords 0x20020 - (reserved for future packet encryption) 0x00040 - -cifsFYI If set to non-zero value, additional debug information - will be logged to the system error log. This field - contains three flags controlling different classes of - debugging entries. The maximum value it can be set - to is 7 which enables all debugging points (default 0). - Some debugging statements are not compiled into the - cifs kernel unless CONFIG_CIFS_DEBUG2 is enabled in the - kernel configuration. cifsFYI may be set to one or - nore of the following flags (7 sets them all): - - log cifs informational messages 0x01 - log return codes from cifs entry points 0x02 - log slow responses (ie which take longer than 1 second) - CONFIG_CIFS_STATS2 must be enabled in .config 0x04 - - -traceSMB If set to one, debug information is logged to the - system error log with the start of smb requests - and responses (default 0) -LookupCacheEnable If set to one, inode information is kept cached - for one second improving performance of lookups - (default 1) -OplockEnabled If set to one, safe distributed caching enabled. - (default 1) -LinuxExtensionsEnabled If set to one then the client will attempt to - use the CIFS "UNIX" extensions which are optional - protocol enhancements that allow CIFS servers - to return accurate UID/GID information as well - as support symbolic links. If you use servers - such as Samba that support the CIFS Unix - extensions but do not want to use symbolic link - support and want to map the uid and gid fields - to values supplied at mount (rather than the - actual values, then set this to zero. (default 1) - -These experimental features and tracing can be enabled by changing flags in -/proc/fs/cifs (after the cifs module has been installed or built into the -kernel, e.g. insmod cifs). To enable a feature set it to 1 e.g. to enable -tracing to the kernel message log type: - - echo 7 > /proc/fs/cifs/cifsFYI - -cifsFYI functions as a bit mask. Setting it to 1 enables additional kernel -logging of various informational messages. 2 enables logging of non-zero -SMB return codes while 4 enables logging of requests that take longer -than one second to complete (except for byte range lock requests). -Setting it to 4 requires defining CONFIG_CIFS_STATS2 manually in the -source code (typically by setting it in the beginning of cifsglob.h), -and setting it to seven enables all three. Finally, tracing -the start of smb requests and responses can be enabled via: - - echo 1 > /proc/fs/cifs/traceSMB - -Per share (per client mount) statistics are available in /proc/fs/cifs/Stats -if the kernel was configured with cifs statistics enabled. The statistics -represent the number of successful (ie non-zero return code from the server) -SMB responses to some of the more common commands (open, delete, mkdir etc.). -Also recorded is the total bytes read and bytes written to the server for -that share. Note that due to client caching effects this can be less than the -number of bytes read and written by the application running on the client. -The statistics for the number of total SMBs and oplock breaks are different in -that they represent all for that share, not just those for which the server -returned success. - -Also note that "cat /proc/fs/cifs/DebugData" will display information about -the active sessions and the shares that are mounted. - -Enabling Kerberos (extended security) works but requires version 1.2 or later -of the helper program cifs.upcall to be present and to be configured in the -/etc/request-key.conf file. The cifs.upcall helper program is from the Samba -project(http://www.samba.org). NTLM and NTLMv2 and LANMAN support do not -require this helper. Note that NTLMv2 security (which does not require the -cifs.upcall helper program), instead of using Kerberos, is sufficient for -some use cases. - -DFS support allows transparent redirection to shares in an MS-DFS name space. -In addition, DFS support for target shares which are specified as UNC -names which begin with host names (rather than IP addresses) requires -a user space helper (such as cifs.upcall) to be present in order to -translate host names to ip address, and the user space helper must also -be configured in the file /etc/request-key.conf. Samba, Windows servers and -many NAS appliances support DFS as a way of constructing a global name -space to ease network configuration and improve reliability. - -To use cifs Kerberos and DFS support, the Linux keyutils package should be -installed and something like the following lines should be added to the -/etc/request-key.conf file: - -create cifs.spnego * * /usr/local/sbin/cifs.upcall %k -create dns_resolver * * /usr/local/sbin/cifs.upcall %k - -CIFS kernel module parameters -============================= -These module parameters can be specified or modified either during the time of -module loading or during the runtime by using the interface - /proc/module/cifs/parameters/<param> - -i.e. echo "value" > /sys/module/cifs/parameters/<param> - -1. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default. - [Y/y/1]. To disable use any of [N/n/0]. - diff --git a/fs/cifs/TODO b/fs/cifs/TODO deleted file mode 100644 index 355abcdcda98..000000000000 --- a/fs/cifs/TODO +++ /dev/null @@ -1,129 +0,0 @@ -Version 1.53 May 20, 2008 - -A Partial List of Missing Features -================================== - -Contributions are welcome. There are plenty of opportunities -for visible, important contributions to this module. Here -is a partial list of the known problems and missing features: - -a) Support for SecurityDescriptors(Windows/CIFS ACLs) for chmod/chgrp/chown -so that these operations can be supported to Windows servers - -b) Mapping POSIX ACLs (and eventually NFSv4 ACLs) to CIFS -SecurityDescriptors - -c) Better pam/winbind integration (e.g. to handle uid mapping -better) - -d) Cleanup now unneeded SessSetup code in -fs/cifs/connect.c and add back in NTLMSSP code if any servers -need it - -e) fix NTLMv2 signing when two mounts with different users to same -server. - -f) Directory entry caching relies on a 1 second timer, rather than -using FindNotify or equivalent. - (started) - -g) quota support (needs minor kernel change since quota calls -to make it to network filesystems or deviceless filesystems) - -h) investigate sync behavior (including syncpage) and check -for proper behavior of intr/nointr - -i) improve support for very old servers (OS/2 and Win9x for example) -Including support for changing the time remotely (utimes command). - -j) hook lower into the sockets api (as NFS/SunRPC does) to avoid the -extra copy in/out of the socket buffers in some cases. - -k) Better optimize open (and pathbased setfilesize) to reduce the -oplock breaks coming from windows srv. Piggyback identical file -opens on top of each other by incrementing reference count rather -than resending (helps reduce server resource utilization and avoid -spurious oplock breaks). - -l) Improve performance of readpages by sending more than one read -at a time when 8 pages or more are requested. In conjuntion -add support for async_cifs_readpages. - -m) Add support for storing symlink info to Windows servers -in the Extended Attribute format their SFU clients would recognize. - -n) Finish fcntl D_NOTIFY support so kde and gnome file list windows -will autorefresh (partially complete by Asser). Needs minor kernel -vfs change to support removing D_NOTIFY on a file. - -o) Add GUI tool to configure /proc/fs/cifs settings and for display of -the CIFS statistics (started) - -p) implement support for security and trusted categories of xattrs -(requires minor protocol extension) to enable better support for SELINUX - -q) Implement O_DIRECT flag on open (already supported on mount) - -r) Create UID mapping facility so server UIDs can be mapped on a per -mount or a per server basis to client UIDs or nobody if no mapping -exists. This is helpful when Unix extensions are negotiated to -allow better permission checking when UIDs differ on the server -and client. Add new protocol request to the CIFS protocol -standard for asking the server for the corresponding name of a -particular uid. - -s) Add support for CIFS Unix and also the newer POSIX extensions to the -server side for Samba 4. - -t) In support for OS/2 (LANMAN 1.2 and LANMAN2.1 based SMB servers) -need to add ability to set time to server (utimes command) - -u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too) - -v) mount check for unmatched uids - -w) Add support for new vfs entry point for fallocate - -x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of -processes can proceed better in parallel (on the server) - -y) Fix Samba 3 to handle reads/writes over 127K (and remove the cifs mount -restriction of wsize max being 127K) - -KNOWN BUGS (updated April 24, 2007) -==================================== -See http://bugzilla.samba.org - search on product "CifsVFS" for -current bug list. - -1) existing symbolic links (Windows reparse points) are recognized but -can not be created remotely. They are implemented for Samba and those that -support the CIFS Unix extensions, although earlier versions of Samba -overly restrict the pathnames. -2) follow_link and readdir code does not follow dfs junctions -but recognizes them -3) create of new files to FAT partitions on Windows servers can -succeed but still return access denied (appears to be Windows -server not cifs client problem) and has not been reproduced recently. -NTFS partitions do not have this problem. -4) Unix/POSIX capabilities are reset after reconnection, and affect -a few fields in the tree connection but we do do not know which -superblocks to apply these changes to. We should probably walk -the list of superblocks to set these. Also need to check the -flags on the second mount to the same share, and see if we -can do the same trick that NFS does to remount duplicate shares. - -Misc testing to do -================== -1) check out max path names and max path name components against various server -types. Try nested symlinks (8 deep). Return max path name in stat -f information - -2) Modify file portion of ltp so it can run against a mounted network -share and run it against cifs vfs in automated fashion. - -3) Additional performance testing and optimization using iozone and similar - -there are some easy changes that can be done to parallelize sequential writes, -and when signing is disabled to request larger read sizes (larger than -negotiated size) and send larger write sizes to modern servers. - -4) More exhaustively test against less common servers. More testing -against Windows 9x, Windows ME servers. - diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h index fe8d6276410a..d8eac3b6cefb 100644 --- a/fs/cifs/cifs_unicode.h +++ b/fs/cifs/cifs_unicode.h @@ -91,6 +91,8 @@ extern __le16 *cifs_strndup_to_utf16(const char *src, const int maxlen, #endif /* CONFIG_CIFS_SMB2 */ #endif +wchar_t cifs_toupper(wchar_t in); + /* * UniStrcat: Concatenate the second string to the first * diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 85ea98d139fc..a16b4e58bcc6 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -255,6 +255,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode->server_eof = 0; cifs_inode->uniqueid = 0; cifs_inode->createtime = 0; + cifs_inode->epoch = 0; #ifdef CONFIG_CIFS_SMB2 get_random_bytes(cifs_inode->lease_key, SMB2_LEASE_KEY_SIZE); #endif @@ -357,6 +358,18 @@ cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb) seq_printf(s, "loose"); } +static void +cifs_show_nls(struct seq_file *s, struct nls_table *cur) +{ + struct nls_table *def; + + /* Display iocharset= option if it's not default charset */ + def = load_nls_default(); + if (def != cur) + seq_printf(s, ",iocharset=%s", cur->charset); + unload_nls(def); +} + /* * cifs_show_options() is for displaying mount options in /proc/mounts. * Not all settable options are displayed but most of the important @@ -418,6 +431,9 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",file_mode=0%ho,dir_mode=0%ho", cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); + + cifs_show_nls(s, cifs_sb->local_nls); + if (tcon->seal) seq_printf(s, ",seal"); if (tcon->nocase) @@ -718,7 +734,7 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, written = generic_file_aio_write(iocb, iov, nr_segs, pos); - if (CIFS_I(inode)->clientCanCacheAll) + if (CIFS_CACHE_WRITE(CIFS_I(inode))) return written; rc = filemap_fdatawrite(inode->i_mapping); @@ -743,7 +759,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence) * We need to be sure that all dirty pages are written and the * server has the newest file length. */ - if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping && + if (!CIFS_CACHE_READ(CIFS_I(inode)) && inode->i_mapping && inode->i_mapping->nrpages != 0) { rc = filemap_fdatawait(inode->i_mapping); if (rc) { @@ -767,8 +783,10 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence) static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) { - /* note that this is called by vfs setlease with i_lock held - to protect *lease from going away */ + /* + * Note that this is called by vfs setlease with i_lock held to + * protect *lease from going away. + */ struct inode *inode = file_inode(file); struct cifsFileInfo *cfile = file->private_data; @@ -776,20 +794,19 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) return -EINVAL; /* check if file is oplocked */ - if (((arg == F_RDLCK) && - (CIFS_I(inode)->clientCanCacheRead)) || - ((arg == F_WRLCK) && - (CIFS_I(inode)->clientCanCacheAll))) + if (((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) || + ((arg == F_WRLCK) && CIFS_CACHE_WRITE(CIFS_I(inode)))) return generic_setlease(file, arg, lease); else if (tlink_tcon(cfile->tlink)->local_lease && - !CIFS_I(inode)->clientCanCacheRead) - /* If the server claims to support oplock on this - file, then we still need to check oplock even - if the local_lease mount option is set, but there - are servers which do not support oplock for which - this mount option may be useful if the user - knows that the file won't be changed on the server - by anyone else */ + !CIFS_CACHE_READ(CIFS_I(inode))) + /* + * If the server claims to support oplock on this file, then we + * still need to check oplock even if the local_lease mount + * option is set, but there are servers which do not support + * oplock for which this mount option may be useful if the user + * knows that the file won't be changed on the server by anyone + * else. + */ return generic_setlease(file, arg, lease); else return -EAGAIN; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 52ca861ed35e..cfa14c80ef3b 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -28,6 +28,7 @@ #include "cifsacl.h" #include <crypto/internal/hash.h> #include <linux/scatterlist.h> +#include <uapi/linux/cifs/cifs_mount.h> #ifdef CONFIG_CIFS_SMB2 #include "smb2pdu.h" #endif @@ -41,12 +42,7 @@ #define MAX_SES_INFO 2 #define MAX_TCON_INFO 4 -#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1) -#define MAX_SERVER_SIZE 15 -#define MAX_SHARE_SIZE 80 -#define CIFS_MAX_DOMAINNAME_LEN 256 /* max domain name length */ -#define MAX_USERNAME_SIZE 256 /* reasonable maximum for current servers */ -#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */ +#define MAX_TREE_SIZE (2 + CIFS_NI_MAXHOST + 1 + CIFS_MAX_SHARE_LEN + 1) #define CIFS_MIN_RCV_POOL 4 @@ -135,6 +131,7 @@ struct cifs_secmech { /* per smb session structure/fields */ struct ntlmssp_auth { + bool sesskey_per_smbsess; /* whether session key is per smb session */ __u32 client_flags; /* sent by client in type 1 ntlmsssp exchange */ __u32 server_flags; /* sent by server in type 2 ntlmssp exchange */ unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* sent to server */ @@ -308,6 +305,9 @@ struct smb_version_operations { int (*create_hardlink)(const unsigned int, struct cifs_tcon *, const char *, const char *, struct cifs_sb_info *); + /* query symlink target */ + int (*query_symlink)(const unsigned int, struct cifs_tcon *, + const char *, char **, struct cifs_sb_info *); /* open a file for non-posix mounts */ int (*open)(const unsigned int, struct cifs_open_parms *, __u32 *, FILE_ALL_INFO *); @@ -361,18 +361,24 @@ struct smb_version_operations { /* push brlocks from the cache to the server */ int (*push_mand_locks)(struct cifsFileInfo *); /* get lease key of the inode */ - void (*get_lease_key)(struct inode *, struct cifs_fid *fid); + void (*get_lease_key)(struct inode *, struct cifs_fid *); /* set lease key of the inode */ - void (*set_lease_key)(struct inode *, struct cifs_fid *fid); + void (*set_lease_key)(struct inode *, struct cifs_fid *); /* generate new lease key */ - void (*new_lease_key)(struct cifs_fid *fid); - /* The next two functions will need to be changed to per smb session */ - void (*generate_signingkey)(struct TCP_Server_Info *server); - int (*calc_signature)(struct smb_rqst *rqst, - struct TCP_Server_Info *server); - int (*query_mf_symlink)(const unsigned char *path, char *pbuf, - unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb, - unsigned int xid); + void (*new_lease_key)(struct cifs_fid *); + int (*generate_signingkey)(struct cifs_ses *); + int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *); + int (*query_mf_symlink)(const unsigned char *, char *, unsigned int *, + struct cifs_sb_info *, unsigned int); + /* if we can do cache read operations */ + bool (*is_read_op)(__u32); + /* set oplock level for the inode */ + void (*set_oplock_level)(struct cifsInodeInfo *, __u32, unsigned int, + bool *); + /* create lease context buffer for CREATE request */ + char * (*create_lease_buf)(u8 *, u8); + /* parse lease context buffer and return oplock/epoch info */ + __u8 (*parse_lease_buf)(void *, unsigned int *); }; struct smb_version_values { @@ -390,9 +396,9 @@ struct smb_version_values { unsigned int cap_unix; unsigned int cap_nt_find; unsigned int cap_large_files; - unsigned int oplock_read; __u16 signing_enabled; __u16 signing_required; + size_t create_lease_size; }; #define HEADER_SIZE(server) (server->vals->header_size) @@ -548,7 +554,6 @@ struct TCP_Server_Info { int timeAdj; /* Adjust for difference in server time zone in sec */ __u64 CurrentMid; /* multiplex id - rotating counter */ char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */ - char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */ /* 16th byte of RFC1001 workstation name is always null */ char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; __u32 sequence_number; /* for signing, protected by srv_mutex */ @@ -731,6 +736,7 @@ struct cifs_ses { bool need_reconnect:1; /* connection reset, uid now invalid */ #ifdef CONFIG_CIFS_SMB2 __u16 session_flags; + char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */ #endif /* CONFIG_CIFS_SMB2 */ }; @@ -935,6 +941,8 @@ struct cifs_fid { __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */ #endif struct cifs_pending_open *pending_open; + unsigned int epoch; + bool purge_cache; }; struct cifs_fid_locks { @@ -1032,6 +1040,17 @@ cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file) struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file); void cifsFileInfo_put(struct cifsFileInfo *cifs_file); +#define CIFS_CACHE_READ_FLG 1 +#define CIFS_CACHE_HANDLE_FLG 2 +#define CIFS_CACHE_RH_FLG (CIFS_CACHE_READ_FLG | CIFS_CACHE_HANDLE_FLG) +#define CIFS_CACHE_WRITE_FLG 4 +#define CIFS_CACHE_RW_FLG (CIFS_CACHE_READ_FLG | CIFS_CACHE_WRITE_FLG) +#define CIFS_CACHE_RHW_FLG (CIFS_CACHE_RW_FLG | CIFS_CACHE_HANDLE_FLG) + +#define CIFS_CACHE_READ(cinode) (cinode->oplock & CIFS_CACHE_READ_FLG) +#define CIFS_CACHE_HANDLE(cinode) (cinode->oplock & CIFS_CACHE_HANDLE_FLG) +#define CIFS_CACHE_WRITE(cinode) (cinode->oplock & CIFS_CACHE_WRITE_FLG) + /* * One of these for each file inode */ @@ -1043,8 +1062,8 @@ struct cifsInodeInfo { /* BB add in lists for dirty pages i.e. write caching info for oplock */ struct list_head openFileList; __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ - bool clientCanCacheRead; /* read oplock */ - bool clientCanCacheAll; /* read and writebehind oplock */ + unsigned int oplock; /* oplock/lease level we have */ + unsigned int epoch; /* used to track lease state changes */ bool delete_pending; /* DELETE_ON_CLOSE is set */ bool invalid_mapping; /* pagecache is invalid */ unsigned long time; /* jiffies of last update of inode */ @@ -1502,7 +1521,7 @@ extern mempool_t *cifs_mid_poolp; extern struct smb_version_operations smb1_operations; extern struct smb_version_values smb1_values; #define SMB20_VERSION_STRING "2.0" -/*extern struct smb_version_operations smb20_operations; */ /* not needed yet */ +extern struct smb_version_operations smb20_operations; extern struct smb_version_values smb20_values; #define SMB21_VERSION_STRING "2.1" extern struct smb_version_operations smb21_operations; diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 11ca24a8e054..948676db8e2e 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1495,11 +1495,12 @@ struct reparse_data { __u32 ReparseTag; __u16 ReparseDataLength; __u16 Reserved; - __u16 AltNameOffset; - __u16 AltNameLen; - __u16 TargetNameOffset; - __u16 TargetNameLen; - char LinkNamesBuf[1]; + __u16 SubstituteNameOffset; + __u16 SubstituteNameLength; + __u16 PrintNameOffset; + __u16 PrintNameLength; + __u32 Flags; + char PathBuffer[0]; } __attribute__((packed)); struct cifs_quota_data { diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index b29a012bed33..b5ec2a268f56 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -357,13 +357,9 @@ extern int CIFSSMBUnixQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, char **syminfo, const struct nls_table *nls_codepage); -#ifdef CONFIG_CIFS_SYMLINK_EXPERIMENTAL -extern int CIFSSMBQueryReparseLinkInfo(const unsigned int xid, - struct cifs_tcon *tcon, - const unsigned char *searchName, - char *symlinkinfo, const int buflen, __u16 fid, - const struct nls_table *nls_codepage); -#endif /* temporarily unused until cifs_symlink fixed */ +extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, + __u16 fid, char **symlinkinfo, + const struct nls_table *nls_codepage); extern int CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon, const char *fileName, const int disposition, const int access_flags, const int omode, @@ -435,7 +431,7 @@ extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *); extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); extern void cifs_crypto_shash_release(struct TCP_Server_Info *); extern int calc_seckey(struct cifs_ses *); -extern void generate_smb3signingkey(struct TCP_Server_Info *); +extern int generate_smb3signingkey(struct cifs_ses *); #ifdef CONFIG_CIFS_WEAK_PW_HASH extern int calc_lanman_hash(const char *password, const char *cryptkey, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index a89c4cb4e6cf..a3d74fea1623 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -3067,7 +3067,6 @@ querySymLinkRetry: return rc; } -#ifdef CONFIG_CIFS_SYMLINK_EXPERIMENTAL /* * Recent Windows versions now create symlinks more frequently * and they use the "reparse point" mechanism below. We can of course @@ -3079,18 +3078,22 @@ querySymLinkRetry: * it is not compiled in by default until callers fixed up and more tested. */ int -CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon, - const unsigned char *searchName, - char *symlinkinfo, const int buflen, __u16 fid, - const struct nls_table *nls_codepage) +CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, + __u16 fid, char **symlinkinfo, + const struct nls_table *nls_codepage) { int rc = 0; int bytes_returned; struct smb_com_transaction_ioctl_req *pSMB; struct smb_com_transaction_ioctl_rsp *pSMBr; + bool is_unicode; + unsigned int sub_len; + char *sub_start; + struct reparse_data *reparse_buf; + __u32 data_offset, data_count; + char *end_of_smb; - cifs_dbg(FYI, "In Windows reparse style QueryLink for path %s\n", - searchName); + cifs_dbg(FYI, "In Windows reparse style QueryLink for fid %u\n", fid); rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -3119,66 +3122,55 @@ CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { cifs_dbg(FYI, "Send error in QueryReparseLinkInfo = %d\n", rc); - } else { /* decode response */ - __u32 data_offset = le32_to_cpu(pSMBr->DataOffset); - __u32 data_count = le32_to_cpu(pSMBr->DataCount); - if (get_bcc(&pSMBr->hdr) < 2 || data_offset > 512) { - /* BB also check enough total bytes returned */ - rc = -EIO; /* bad smb */ - goto qreparse_out; - } - if (data_count && (data_count < 2048)) { - char *end_of_smb = 2 /* sizeof byte count */ + - get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount; - - struct reparse_data *reparse_buf = - (struct reparse_data *) - ((char *)&pSMBr->hdr.Protocol - + data_offset); - if ((char *)reparse_buf >= end_of_smb) { - rc = -EIO; - goto qreparse_out; - } - if ((reparse_buf->LinkNamesBuf + - reparse_buf->TargetNameOffset + - reparse_buf->TargetNameLen) > end_of_smb) { - cifs_dbg(FYI, "reparse buf beyond SMB\n"); - rc = -EIO; - goto qreparse_out; - } + goto qreparse_out; + } - if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) { - cifs_from_ucs2(symlinkinfo, (__le16 *) - (reparse_buf->LinkNamesBuf + - reparse_buf->TargetNameOffset), - buflen, - reparse_buf->TargetNameLen, - nls_codepage, 0); - } else { /* ASCII names */ - strncpy(symlinkinfo, - reparse_buf->LinkNamesBuf + - reparse_buf->TargetNameOffset, - min_t(const int, buflen, - reparse_buf->TargetNameLen)); - } - } else { - rc = -EIO; - cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n"); - } - symlinkinfo[buflen] = 0; /* just in case so the caller - does not go off the end of the buffer */ - cifs_dbg(FYI, "readlink result - %s\n", symlinkinfo); + data_offset = le32_to_cpu(pSMBr->DataOffset); + data_count = le32_to_cpu(pSMBr->DataCount); + if (get_bcc(&pSMBr->hdr) < 2 || data_offset > 512) { + /* BB also check enough total bytes returned */ + rc = -EIO; /* bad smb */ + goto qreparse_out; + } + if (!data_count || (data_count > 2048)) { + rc = -EIO; + cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n"); + goto qreparse_out; + } + end_of_smb = 2 + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount; + reparse_buf = (struct reparse_data *) + ((char *)&pSMBr->hdr.Protocol + data_offset); + if ((char *)reparse_buf >= end_of_smb) { + rc = -EIO; + goto qreparse_out; } + if ((reparse_buf->PathBuffer + reparse_buf->PrintNameOffset + + reparse_buf->PrintNameLength) > end_of_smb) { + cifs_dbg(FYI, "reparse buf beyond SMB\n"); + rc = -EIO; + goto qreparse_out; + } + sub_start = reparse_buf->SubstituteNameOffset + reparse_buf->PathBuffer; + sub_len = reparse_buf->SubstituteNameLength; + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) + is_unicode = true; + else + is_unicode = false; + /* BB FIXME investigate remapping reserved chars here */ + *symlinkinfo = cifs_strndup_from_utf16(sub_start, sub_len, is_unicode, + nls_codepage); + if (!*symlinkinfo) + rc = -ENOMEM; qreparse_out: cifs_buf_release(pSMB); - /* Note: On -EAGAIN error only caller can retry on handle based calls - since file handle passed in no longer valid */ - + /* + * Note: On -EAGAIN error only caller can retry on handle based calls + * since file handle passed in no longer valid. + */ return rc; } -#endif /* CIFS_SYMLINK_EXPERIMENTAL */ /* BB temporarily unused */ #ifdef CONFIG_CIFS_POSIX diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d67c550c4980..a279ffc0bc29 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -379,6 +379,7 @@ cifs_reconnect(struct TCP_Server_Info *server) try_to_freeze(); /* we should try only the port we connected to before */ + mutex_lock(&server->srv_mutex); rc = generic_ip_connect(server); if (rc) { cifs_dbg(FYI, "reconnect error %d\n", rc); @@ -390,6 +391,7 @@ cifs_reconnect(struct TCP_Server_Info *server) server->tcpStatus = CifsNeedNegotiate; spin_unlock(&GlobalMid_Lock); } + mutex_unlock(&server->srv_mutex); } while (server->tcpStatus == CifsNeedReconnect); return rc; @@ -1114,7 +1116,7 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol) break; #ifdef CONFIG_CIFS_SMB2 case Smb_20: - vol->ops = &smb21_operations; /* currently identical with 2.1 */ + vol->ops = &smb20_operations; vol->vals = &smb20_values; break; case Smb_21: @@ -1575,8 +1577,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (string == NULL) goto out_nomem; - if (strnlen(string, MAX_USERNAME_SIZE) > - MAX_USERNAME_SIZE) { + if (strnlen(string, CIFS_MAX_USERNAME_LEN) > + CIFS_MAX_USERNAME_LEN) { printk(KERN_WARNING "CIFS: username too long\n"); goto cifs_parse_mount_err; } @@ -2221,13 +2223,13 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol) /* anything else takes username/password */ if (strncmp(ses->user_name, vol->username ? vol->username : "", - MAX_USERNAME_SIZE)) + CIFS_MAX_USERNAME_LEN)) return 0; if (strlen(vol->username) != 0 && ses->password != NULL && strncmp(ses->password, vol->password ? vol->password : "", - MAX_PASSWORD_SIZE)) + CIFS_MAX_PASSWORD_LEN)) return 0; } return 1; @@ -2352,7 +2354,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) } len = delim - payload; - if (len > MAX_USERNAME_SIZE || len <= 0) { + if (len > CIFS_MAX_USERNAME_LEN || len <= 0) { cifs_dbg(FYI, "Bad value from username search (len=%zd)\n", len); rc = -EINVAL; @@ -2369,7 +2371,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) cifs_dbg(FYI, "%s: username=%s\n", __func__, vol->username); len = key->datalen - (len + 1); - if (len > MAX_PASSWORD_SIZE || len <= 0) { + if (len > CIFS_MAX_PASSWORD_LEN || len <= 0) { cifs_dbg(FYI, "Bad len for password search (len=%zd)\n", len); rc = -EINVAL; kfree(vol->username); @@ -3826,33 +3828,8 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, if (server->ops->sess_setup) rc = server->ops->sess_setup(xid, ses, nls_info); - if (rc) { + if (rc) cifs_dbg(VFS, "Send error in SessSetup = %d\n", rc); - } else { - mutex_lock(&server->srv_mutex); - if (!server->session_estab) { - server->session_key.response = ses->auth_key.response; - server->session_key.len = ses->auth_key.len; - server->sequence_number = 0x2; - server->session_estab = true; - ses->auth_key.response = NULL; - if (server->ops->generate_signingkey) - server->ops->generate_signingkey(server); - } - mutex_unlock(&server->srv_mutex); - - cifs_dbg(FYI, "CIFS Session Established successfully\n"); - spin_lock(&GlobalMid_Lock); - ses->status = CifsGood; - ses->need_reconnect = false; - spin_unlock(&GlobalMid_Lock); - } - - kfree(ses->auth_key.response); - ses->auth_key.response = NULL; - ses->auth_key.len = 0; - kfree(ses->ntlmssp); - ses->ntlmssp = NULL; return rc; } diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index d62ce0d48141..5384c2a640ca 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -32,6 +32,7 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifs_fs_sb.h" +#include "cifs_unicode.h" static void renew_parental_timestamps(struct dentry *direntry) @@ -499,6 +500,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, if (server->ops->close) server->ops->close(xid, tcon, &fid); cifs_del_pending_open(&open); + fput(file); rc = -ENOMEM; } @@ -834,12 +836,17 @@ static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q) { struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls; unsigned long hash; - int i; + wchar_t c; + int i, charlen; hash = init_name_hash(); - for (i = 0; i < q->len; i++) - hash = partial_name_hash(nls_tolower(codepage, q->name[i]), - hash); + for (i = 0; i < q->len; i += charlen) { + charlen = codepage->char2uni(&q->name[i], q->len - i, &c); + /* error out if we can't convert the character */ + if (unlikely(charlen < 0)) + return charlen; + hash = partial_name_hash(cifs_toupper(c), hash); + } q->hash = end_name_hash(hash); return 0; @@ -849,11 +856,47 @@ static int cifs_ci_compare(const struct dentry *parent, const struct dentry *den unsigned int len, const char *str, const struct qstr *name) { struct nls_table *codepage = CIFS_SB(parent->d_sb)->local_nls; + wchar_t c1, c2; + int i, l1, l2; - if ((name->len == len) && - (nls_strnicmp(codepage, name->name, str, len) == 0)) - return 0; - return 1; + /* + * We make the assumption here that uppercase characters in the local + * codepage are always the same length as their lowercase counterparts. + * + * If that's ever not the case, then this will fail to match it. + */ + if (name->len != len) + return 1; + + for (i = 0; i < len; i += l1) { + /* Convert characters in both strings to UTF-16. */ + l1 = codepage->char2uni(&str[i], len - i, &c1); + l2 = codepage->char2uni(&name->name[i], name->len - i, &c2); + + /* + * If we can't convert either character, just declare it to + * be 1 byte long and compare the original byte. + */ + if (unlikely(l1 < 0 && l2 < 0)) { + if (str[i] != name->name[i]) + return 1; + l1 = 1; + continue; + } + + /* + * Here, we again ass|u|me that upper/lowercase versions of + * a character are the same length in the local NLS. + */ + if (l1 != l2) + return 1; + + /* Now compare uppercase versions of these characters */ + if (cifs_toupper(c1) != cifs_toupper(c2)) + return 1; + } + + return 0; } const struct dentry_operations cifs_ci_dentry_ops = { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7e36ae34e947..eb955b525e55 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -313,8 +313,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, * If the server returned a read oplock and we have mandatory brlocks, * set oplock level to None. */ - if (oplock == server->vals->oplock_read && - cifs_has_mand_locks(cinode)) { + if (server->ops->is_read_op(oplock) && cifs_has_mand_locks(cinode)) { cifs_dbg(FYI, "Reset oplock val from read to None due to mand locks\n"); oplock = 0; } @@ -324,6 +323,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, oplock = fid->pending_open->oplock; list_del(&fid->pending_open->olist); + fid->purge_cache = false; server->ops->set_fid(cfile, fid, oplock); list_add(&cfile->tlist, &tcon->openFileList); @@ -334,6 +334,9 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, list_add_tail(&cfile->flist, &cinode->openFileList); spin_unlock(&cifs_file_list_lock); + if (fid->purge_cache) + cifs_invalidate_mapping(inode); + file->private_data = cfile; return cfile; } @@ -1524,12 +1527,12 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, * read won't conflict with non-overlapted locks due to * pagereading. */ - if (!CIFS_I(inode)->clientCanCacheAll && - CIFS_I(inode)->clientCanCacheRead) { + if (!CIFS_CACHE_WRITE(CIFS_I(inode)) && + CIFS_CACHE_READ(CIFS_I(inode))) { cifs_invalidate_mapping(inode); cifs_dbg(FYI, "Set no oplock for inode=%p due to mand locks\n", inode); - CIFS_I(inode)->clientCanCacheRead = false; + CIFS_I(inode)->oplock = 0; } rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, @@ -2213,7 +2216,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n", file->f_path.dentry->d_name.name, datasync); - if (!CIFS_I(inode)->clientCanCacheRead) { + if (!CIFS_CACHE_READ(CIFS_I(inode))) { rc = cifs_invalidate_mapping(inode); if (rc) { cifs_dbg(FYI, "rc: %d during invalidate phase\n", rc); @@ -2553,7 +2556,7 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, mutex_unlock(&inode->i_mutex); } - if (rc > 0 || rc == -EIOCBQUEUED) { + if (rc > 0) { ssize_t err; err = generic_write_sync(file, pos, rc); @@ -2577,7 +2580,7 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); ssize_t written; - if (cinode->clientCanCacheAll) { + if (CIFS_CACHE_WRITE(cinode)) { if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) @@ -2591,7 +2594,7 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, * these pages but not on the region from pos to ppos+len-1. */ written = cifs_user_writev(iocb, iov, nr_segs, pos); - if (written > 0 && cinode->clientCanCacheRead) { + if (written > 0 && CIFS_CACHE_READ(cinode)) { /* * Windows 7 server can delay breaking level2 oplock if a write * request comes - break it on the client to prevent reading @@ -2600,7 +2603,7 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, cifs_invalidate_mapping(inode); cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n", inode); - cinode->clientCanCacheRead = false; + cinode->oplock = 0; } return written; } @@ -2957,7 +2960,7 @@ cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, * on pages affected by this read but not on the region from pos to * pos+len-1. */ - if (!cinode->clientCanCacheRead) + if (!CIFS_CACHE_READ(cinode)) return cifs_user_readv(iocb, iov, nr_segs, pos); if (cap_unix(tcon->ses) && @@ -3093,7 +3096,7 @@ int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) xid = get_xid(); - if (!CIFS_I(inode)->clientCanCacheRead) { + if (!CIFS_CACHE_READ(CIFS_I(inode))) { rc = cifs_invalidate_mapping(inode); if (rc) return rc; @@ -3376,6 +3379,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, return rc; } +/* + * cifs_readpage_worker must be called with the page pinned + */ static int cifs_readpage_worker(struct file *file, struct page *page, loff_t *poffset) { @@ -3387,7 +3393,6 @@ static int cifs_readpage_worker(struct file *file, struct page *page, if (rc == 0) goto read_complete; - page_cache_get(page); read_data = kmap(page); /* for reads over a certain size could initiate async read ahead */ @@ -3414,7 +3419,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page, io_error: kunmap(page); - page_cache_release(page); + unlock_page(page); read_complete: return rc; @@ -3439,8 +3444,6 @@ static int cifs_readpage(struct file *file, struct page *page) rc = cifs_readpage_worker(file, page, &offset); - unlock_page(page); - free_xid(xid); return rc; } @@ -3494,6 +3497,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { + int oncethru = 0; pgoff_t index = pos >> PAGE_CACHE_SHIFT; loff_t offset = pos & (PAGE_CACHE_SIZE - 1); loff_t page_start = pos & PAGE_MASK; @@ -3503,6 +3507,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping, cifs_dbg(FYI, "write_begin from %lld len %d\n", (long long)pos, len); +start: page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { rc = -ENOMEM; @@ -3526,7 +3531,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping, * is, when the page lies beyond the EOF, or straddles the EOF * and the write will cover all of the existing data. */ - if (CIFS_I(mapping->host)->clientCanCacheRead) { + if (CIFS_CACHE_READ(CIFS_I(mapping->host))) { i_size = i_size_read(mapping->host); if (page_start >= i_size || (offset == 0 && (pos + len) >= i_size)) { @@ -3544,13 +3549,16 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping, } } - if ((file->f_flags & O_ACCMODE) != O_WRONLY) { + if ((file->f_flags & O_ACCMODE) != O_WRONLY && !oncethru) { /* * might as well read a page, it is fast enough. If we get * an error, we don't need to return it. cifs_write_end will * do a sync write instead since PG_uptodate isn't set. */ cifs_readpage_worker(file, page, &page_start); + page_cache_release(page); + oncethru = 1; + goto start; } else { /* we could try using another file handle if there is one - but how would we lock it to prevent close of that handle @@ -3609,20 +3617,20 @@ void cifs_oplock_break(struct work_struct *work) struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; - if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead && + if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) && cifs_has_mand_locks(cinode)) { cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n", inode); - cinode->clientCanCacheRead = false; + cinode->oplock = 0; } if (inode && S_ISREG(inode->i_mode)) { - if (cinode->clientCanCacheRead) + if (CIFS_CACHE_READ(cinode)) break_lease(inode, O_RDONLY); else break_lease(inode, O_WRONLY); rc = filemap_fdatawrite(inode->i_mapping); - if (cinode->clientCanCacheRead == 0) { + if (!CIFS_CACHE_READ(cinode)) { rc = filemap_fdatawait(inode->i_mapping); mapping_set_error(inode->i_mapping, rc); cifs_invalidate_mapping(inode); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 449b6cf09b09..f9ff9c173f78 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -101,7 +101,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) } /* don't bother with revalidation if we have an oplock */ - if (cifs_i->clientCanCacheRead) { + if (CIFS_CACHE_READ(cifs_i)) { cifs_dbg(FYI, "%s: inode %llu is oplocked\n", __func__, cifs_i->uniqueid); return; @@ -549,6 +549,10 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, * when Unix extensions are disabled - fake it. */ fattr->cf_nlink = 2; + } else if (fattr->cf_cifsattrs & ATTR_REPARSE) { + fattr->cf_mode = S_IFLNK; + fattr->cf_dtype = DT_LNK; + fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); } else { fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; fattr->cf_dtype = DT_REG; @@ -646,7 +650,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, cifs_dbg(FYI, "Getting info on %s\n", full_path); if ((data == NULL) && (*inode != NULL)) { - if (CIFS_I(*inode)->clientCanCacheRead) { + if (CIFS_CACHE_READ(CIFS_I(*inode))) { cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); goto cgii_exit; } @@ -1657,7 +1661,7 @@ cifs_inode_needs_reval(struct inode *inode) struct cifsInodeInfo *cifs_i = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - if (cifs_i->clientCanCacheRead) + if (CIFS_CACHE_READ(cifs_i)) return false; if (!lookupCacheEnabled) @@ -1800,7 +1804,7 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, * We need to be sure that all dirty pages are written and the server * has actual ctime, mtime and file length. */ - if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping && + if (!CIFS_CACHE_READ(CIFS_I(inode)) && inode->i_mapping && inode->i_mapping->nrpages != 0) { rc = filemap_fdatawait(inode->i_mapping); if (rc) { @@ -1852,14 +1856,11 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from) static void cifs_setsize(struct inode *inode, loff_t offset) { - loff_t oldsize; - spin_lock(&inode->i_lock); - oldsize = inode->i_size; i_size_write(inode, offset); spin_unlock(&inode->i_lock); - truncate_pagecache(inode, oldsize, offset); + truncate_pagecache(inode, offset); } static int diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 562044f700e5..7e36ceba0c7a 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -509,6 +509,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = NULL; struct cifs_tcon *tcon; + struct TCP_Server_Info *server; xid = get_xid(); @@ -519,25 +520,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) goto out; } tcon = tlink_tcon(tlink); - - /* - * For now, we just handle symlinks with unix extensions enabled. - * Eventually we should handle NTFS reparse points, and MacOS - * symlink support. For instance... - * - * rc = CIFSSMBQueryReparseLinkInfo(...) - * - * For now, just return -EACCES when the server doesn't support posix - * extensions. Note that we still allow querying symlinks when posix - * extensions are manually disabled. We could disable these as well - * but there doesn't seem to be any harm in allowing the client to - * read them. - */ - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) && - !cap_unix(tcon->ses)) { - rc = -EACCES; - goto out; - } + server = tcon->ses->server; full_path = build_path_from_dentry(direntry); if (!full_path) @@ -559,6 +542,9 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) if ((rc != 0) && cap_unix(tcon->ses)) rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path, cifs_sb->local_nls); + else if (rc != 0 && server->ops->query_symlink) + rc = server->ops->query_symlink(xid, tcon, full_path, + &target_path, cifs_sb); kfree(full_path); out: diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index f7d4b2285efe..138a011633fe 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -105,6 +105,7 @@ sesInfoFree(struct cifs_ses *buf_to_free) } kfree(buf_to_free->user_name); kfree(buf_to_free->domainName); + kfree(buf_to_free->auth_key.response); kfree(buf_to_free); } @@ -545,19 +546,15 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) oplock &= 0xF; if (oplock == OPLOCK_EXCLUSIVE) { - cinode->clientCanCacheAll = true; - cinode->clientCanCacheRead = true; + cinode->oplock = CIFS_CACHE_WRITE_FLG | CIFS_CACHE_READ_FLG; cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n", &cinode->vfs_inode); } else if (oplock == OPLOCK_READ) { - cinode->clientCanCacheAll = false; - cinode->clientCanCacheRead = true; + cinode->oplock = CIFS_CACHE_READ_FLG; cifs_dbg(FYI, "Level II Oplock granted on inode %p\n", &cinode->vfs_inode); - } else { - cinode->clientCanCacheAll = false; - cinode->clientCanCacheRead = false; - } + } else + cinode->oplock = 0; } bool diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 69d2c826a23b..42ef03be089f 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -172,6 +172,9 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) if (cifs_dfs_is_possible(cifs_sb) && (fattr->cf_cifsattrs & ATTR_REPARSE)) fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; + } else if (fattr->cf_cifsattrs & ATTR_REPARSE) { + fattr->cf_mode = S_IFLNK; + fattr->cf_dtype = DT_LNK; } else { fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; fattr->cf_dtype = DT_REG; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 08dd37bb23aa..5f99b7f19e78 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -226,7 +226,7 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, *(bcc_ptr+1) = 0; } else { bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->user_name, - MAX_USERNAME_SIZE, nls_cp); + CIFS_MAX_USERNAME_LEN, nls_cp); } bcc_ptr += 2 * bytes_ret; bcc_ptr += 2; /* account for null termination */ @@ -246,8 +246,8 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, /* BB what about null user mounts - check that we do this BB */ /* copy user */ if (ses->user_name != NULL) { - strncpy(bcc_ptr, ses->user_name, MAX_USERNAME_SIZE); - bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE); + strncpy(bcc_ptr, ses->user_name, CIFS_MAX_USERNAME_LEN); + bcc_ptr += strnlen(ses->user_name, CIFS_MAX_USERNAME_LEN); } /* else null user mount */ *bcc_ptr = 0; @@ -428,7 +428,8 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; if (ses->server->sign) { flags |= NTLMSSP_NEGOTIATE_SIGN; - if (!ses->server->session_estab) + if (!ses->server->session_estab || + ses->ntlmssp->sesskey_per_smbsess) flags |= NTLMSSP_NEGOTIATE_KEY_XCH; } @@ -466,7 +467,8 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; if (ses->server->sign) { flags |= NTLMSSP_NEGOTIATE_SIGN; - if (!ses->server->session_estab) + if (!ses->server->session_estab || + ses->ntlmssp->sesskey_per_smbsess) flags |= NTLMSSP_NEGOTIATE_KEY_XCH; } @@ -501,7 +503,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, } else { int len; len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName, - MAX_USERNAME_SIZE, nls_cp); + CIFS_MAX_USERNAME_LEN, nls_cp); len *= 2; /* unicode is 2 bytes each */ sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); sec_blob->DomainName.Length = cpu_to_le16(len); @@ -517,7 +519,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, } else { int len; len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name, - MAX_USERNAME_SIZE, nls_cp); + CIFS_MAX_USERNAME_LEN, nls_cp); len *= 2; /* unicode is 2 bytes each */ sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); sec_blob->UserName.Length = cpu_to_le16(len); @@ -629,7 +631,8 @@ CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, type = select_sectype(ses->server, ses->sectype); cifs_dbg(FYI, "sess setup type %d\n", type); if (type == Unspecified) { - cifs_dbg(VFS, "Unable to select appropriate authentication method!"); + cifs_dbg(VFS, + "Unable to select appropriate authentication method!"); return -EINVAL; } @@ -640,6 +643,8 @@ CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); if (!ses->ntlmssp) return -ENOMEM; + ses->ntlmssp->sesskey_per_smbsess = false; + } ssetup_ntlmssp_authenticate: @@ -815,8 +820,9 @@ ssetup_ntlmssp_authenticate: ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, GFP_KERNEL); if (!ses->auth_key.response) { - cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory", - msg->sesskey_len); + cifs_dbg(VFS, + "Kerberos can't allocate (%u bytes) memory", + msg->sesskey_len); rc = -ENOMEM; goto ssetup_exit; } @@ -1005,5 +1011,37 @@ ssetup_exit: if ((phase == NtLmChallenge) && (rc == 0)) goto ssetup_ntlmssp_authenticate; + if (!rc) { + mutex_lock(&ses->server->srv_mutex); + if (!ses->server->session_estab) { + if (ses->server->sign) { + ses->server->session_key.response = + kmemdup(ses->auth_key.response, + ses->auth_key.len, GFP_KERNEL); + if (!ses->server->session_key.response) { + rc = -ENOMEM; + mutex_unlock(&ses->server->srv_mutex); + goto keycp_exit; + } + ses->server->session_key.len = + ses->auth_key.len; + } + ses->server->sequence_number = 0x2; + ses->server->session_estab = true; + } + mutex_unlock(&ses->server->srv_mutex); + + cifs_dbg(FYI, "CIFS session established successfully\n"); + spin_lock(&GlobalMid_Lock); + ses->status = CifsGood; + ses->need_reconnect = false; + spin_unlock(&GlobalMid_Lock); + } + +keycp_exit: + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + kfree(ses->ntlmssp); + return rc; } diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 60943978aec3..8233b174de3d 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -700,7 +700,7 @@ cifs_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); cfile->fid.netfid = fid->netfid; cifs_set_oplock_level(cinode, oplock); - cinode->can_cache_brlcks = cinode->clientCanCacheAll; + cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode); } static void @@ -837,7 +837,7 @@ cifs_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, { return CIFSSMBLock(0, tcon, fid->netfid, current->tgid, 0, 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false, - cinode->clientCanCacheRead ? 1 : 0); + CIFS_CACHE_READ(cinode) ? 1 : 0); } static int @@ -881,6 +881,43 @@ cifs_mand_lock(const unsigned int xid, struct cifsFileInfo *cfile, __u64 offset, (__u8)type, wait, 0); } +static int +cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, + const char *full_path, char **target_path, + struct cifs_sb_info *cifs_sb) +{ + int rc; + int oplock = 0; + __u16 netfid; + + cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); + + rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, + FILE_READ_ATTRIBUTES, OPEN_REPARSE_POINT, &netfid, + &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) + return rc; + + rc = CIFSSMBQuerySymLink(xid, tcon, netfid, target_path, + cifs_sb->local_nls); + if (rc) { + CIFSSMBClose(xid, tcon, netfid); + return rc; + } + + convert_delimiter(*target_path, '/'); + CIFSSMBClose(xid, tcon, netfid); + cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); + return rc; +} + +static bool +cifs_is_read_op(__u32 oplock) +{ + return oplock == OPLOCK_READ; +} + struct smb_version_operations smb1_operations = { .send_cancel = send_nt_cancel, .compare_fids = cifs_compare_fids, @@ -927,6 +964,7 @@ struct smb_version_operations smb1_operations = { .rename_pending_delete = cifs_rename_pending_delete, .rename = CIFSSMBRename, .create_hardlink = CIFSCreateHardLink, + .query_symlink = cifs_query_symlink, .open = cifs_open_file, .set_fid = cifs_set_fid, .close = cifs_close_file, @@ -945,6 +983,7 @@ struct smb_version_operations smb1_operations = { .mand_unlock_range = cifs_unlock_range, .push_mand_locks = cifs_push_mandatory_locks, .query_mf_symlink = open_query_close_cifs_symlink, + .is_read_op = cifs_is_read_op, }; struct smb_version_values smb1_values = { @@ -960,7 +999,6 @@ struct smb_version_values smb1_values = { .cap_unix = CAP_UNIX, .cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND, .cap_large_files = CAP_LARGE_FILES, - .oplock_read = OPLOCK_READ, .signing_enabled = SECMODE_SIGN_ENABLED, .signing_required = SECMODE_SIGN_REQUIRED, }; diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 04a81a4142c3..3f17b4550831 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -34,29 +34,6 @@ #include "fscache.h" #include "smb2proto.h" -void -smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) -{ - oplock &= 0xFF; - if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) - return; - if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE || - oplock == SMB2_OPLOCK_LEVEL_BATCH) { - cinode->clientCanCacheAll = true; - cinode->clientCanCacheRead = true; - cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n", - &cinode->vfs_inode); - } else if (oplock == SMB2_OPLOCK_LEVEL_II) { - cinode->clientCanCacheAll = false; - cinode->clientCanCacheRead = true; - cifs_dbg(FYI, "Level II Oplock granted on inode %p\n", - &cinode->vfs_inode); - } else { - cinode->clientCanCacheAll = false; - cinode->clientCanCacheRead = false; - } -} - int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, FILE_ALL_INFO *buf) @@ -86,7 +63,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, if (oparms->tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) memcpy(smb2_oplock + 1, fid->lease_key, SMB2_LEASE_KEY_SIZE); - rc = SMB2_open(xid, oparms, smb2_path, smb2_oplock, smb2_data); + rc = SMB2_open(xid, oparms, smb2_path, smb2_oplock, smb2_data, NULL); if (rc) goto out; diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index c6ec1633309a..78ff88c467b9 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -60,7 +60,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL); + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); if (rc) { kfree(utf16_path); return rc; @@ -136,7 +136,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, return -ENOMEM; rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, 0, smb2_data, + FILE_READ_ATTRIBUTES, FILE_OPEN, + OPEN_REPARSE_POINT, smb2_data, SMB2_OP_QUERY_INFO); if (rc) goto out; @@ -191,8 +192,8 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, - CREATE_DELETE_ON_CLOSE, NULL, - SMB2_OP_DELETE); + CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, + NULL, SMB2_OP_DELETE); } static int diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index b0c43345cd98..fb3966265b6e 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -171,6 +171,10 @@ smb2_check_message(char *buf, unsigned int length) if (4 + len != clc_len) { cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n", clc_len, 4 + len, mid); + /* create failed on symlink */ + if (command == SMB2_CREATE_HE && + hdr->Status == STATUS_STOPPED_ON_SYMLINK) + return 0; /* Windows 7 server returns 24 bytes more */ if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE) return 0; @@ -376,23 +380,15 @@ cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb) __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode) { - if (cinode->clientCanCacheAll) - return SMB2_LEASE_WRITE_CACHING | SMB2_LEASE_READ_CACHING; - else if (cinode->clientCanCacheRead) - return SMB2_LEASE_READ_CACHING; - return 0; -} - -__u8 smb2_map_lease_to_oplock(__le32 lease_state) -{ - if (lease_state & SMB2_LEASE_WRITE_CACHING) { - if (lease_state & SMB2_LEASE_HANDLE_CACHING) - return SMB2_OPLOCK_LEVEL_BATCH; - else - return SMB2_OPLOCK_LEVEL_EXCLUSIVE; - } else if (lease_state & SMB2_LEASE_READ_CACHING) - return SMB2_OPLOCK_LEVEL_II; - return 0; + __le32 lease = 0; + + if (CIFS_CACHE_WRITE(cinode)) + lease |= SMB2_LEASE_WRITE_CACHING; + if (CIFS_CACHE_HANDLE(cinode)) + lease |= SMB2_LEASE_HANDLE_CACHING; + if (CIFS_CACHE_READ(cinode)) + lease |= SMB2_LEASE_READ_CACHING; + return lease; } struct smb2_lease_break_work { @@ -417,96 +413,109 @@ cifs_ses_oplock_break(struct work_struct *work) } static bool -smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) +smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, + struct smb2_lease_break_work *lw) { - struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer; - struct list_head *tmp, *tmp1, *tmp2; - struct cifs_ses *ses; - struct cifs_tcon *tcon; - struct cifsInodeInfo *cinode; + bool found; + __u8 lease_state; + struct list_head *tmp; struct cifsFileInfo *cfile; + struct TCP_Server_Info *server = tcon->ses->server; struct cifs_pending_open *open; - struct smb2_lease_break_work *lw; - bool found; + struct cifsInodeInfo *cinode; int ack_req = le32_to_cpu(rsp->Flags & SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED); - lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL); - if (!lw) - return false; + lease_state = le32_to_cpu(rsp->NewLeaseState); - INIT_WORK(&lw->lease_break, cifs_ses_oplock_break); - lw->lease_state = rsp->NewLeaseState; + list_for_each(tmp, &tcon->openFileList) { + cfile = list_entry(tmp, struct cifsFileInfo, tlist); + cinode = CIFS_I(cfile->dentry->d_inode); - cifs_dbg(FYI, "Checking for lease break\n"); + if (memcmp(cinode->lease_key, rsp->LeaseKey, + SMB2_LEASE_KEY_SIZE)) + continue; - /* look up tcon based on tid & uid */ - spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &server->smb_ses_list) { - ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + cifs_dbg(FYI, "found in the open list\n"); + cifs_dbg(FYI, "lease key match, lease break 0x%d\n", + le32_to_cpu(rsp->NewLeaseState)); - spin_lock(&cifs_file_list_lock); - list_for_each(tmp1, &ses->tcon_list) { - tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); + server->ops->set_oplock_level(cinode, lease_state, 0, NULL); - cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks); - list_for_each(tmp2, &tcon->openFileList) { - cfile = list_entry(tmp2, struct cifsFileInfo, - tlist); - cinode = CIFS_I(cfile->dentry->d_inode); + if (ack_req) + cfile->oplock_break_cancelled = false; + else + cfile->oplock_break_cancelled = true; - if (memcmp(cinode->lease_key, rsp->LeaseKey, - SMB2_LEASE_KEY_SIZE)) - continue; + queue_work(cifsiod_wq, &cfile->oplock_break); + kfree(lw); + return true; + } - cifs_dbg(FYI, "found in the open list\n"); - cifs_dbg(FYI, "lease key match, lease break 0x%d\n", - le32_to_cpu(rsp->NewLeaseState)); + found = false; + list_for_each_entry(open, &tcon->pending_opens, olist) { + if (memcmp(open->lease_key, rsp->LeaseKey, + SMB2_LEASE_KEY_SIZE)) + continue; + + if (!found && ack_req) { + found = true; + memcpy(lw->lease_key, open->lease_key, + SMB2_LEASE_KEY_SIZE); + lw->tlink = cifs_get_tlink(open->tlink); + queue_work(cifsiod_wq, &lw->lease_break); + } - smb2_set_oplock_level(cinode, - smb2_map_lease_to_oplock(rsp->NewLeaseState)); + cifs_dbg(FYI, "found in the pending open list\n"); + cifs_dbg(FYI, "lease key match, lease break 0x%d\n", + le32_to_cpu(rsp->NewLeaseState)); - if (ack_req) - cfile->oplock_break_cancelled = false; - else - cfile->oplock_break_cancelled = true; + open->oplock = lease_state; + } + return found; +} - queue_work(cifsiod_wq, &cfile->oplock_break); +static bool +smb2_is_valid_lease_break(char *buffer) +{ + struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer; + struct list_head *tmp, *tmp1, *tmp2; + struct TCP_Server_Info *server; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + struct smb2_lease_break_work *lw; - spin_unlock(&cifs_file_list_lock); - spin_unlock(&cifs_tcp_ses_lock); - return true; - } + lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL); + if (!lw) + return false; - found = false; - list_for_each_entry(open, &tcon->pending_opens, olist) { - if (memcmp(open->lease_key, rsp->LeaseKey, - SMB2_LEASE_KEY_SIZE)) - continue; + INIT_WORK(&lw->lease_break, cifs_ses_oplock_break); + lw->lease_state = rsp->NewLeaseState; - if (!found && ack_req) { - found = true; - memcpy(lw->lease_key, open->lease_key, - SMB2_LEASE_KEY_SIZE); - lw->tlink = cifs_get_tlink(open->tlink); - queue_work(cifsiod_wq, - &lw->lease_break); - } + cifs_dbg(FYI, "Checking for lease break\n"); - cifs_dbg(FYI, "found in the pending open list\n"); - cifs_dbg(FYI, "lease key match, lease break 0x%d\n", - le32_to_cpu(rsp->NewLeaseState)); + /* look up tcon based on tid & uid */ + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &cifs_tcp_ses_list) { + server = list_entry(tmp, struct TCP_Server_Info, tcp_ses_list); - open->oplock = - smb2_map_lease_to_oplock(rsp->NewLeaseState); - } - if (found) { - spin_unlock(&cifs_file_list_lock); - spin_unlock(&cifs_tcp_ses_lock); - return true; + list_for_each(tmp1, &server->smb_ses_list) { + ses = list_entry(tmp1, struct cifs_ses, smb_ses_list); + + spin_lock(&cifs_file_list_lock); + list_for_each(tmp2, &ses->tcon_list) { + tcon = list_entry(tmp2, struct cifs_tcon, + tcon_list); + cifs_stats_inc( + &tcon->stats.cifs_stats.num_oplock_brks); + if (smb2_tcon_has_lease(tcon, rsp, lw)) { + spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_tcp_ses_lock); + return true; + } } + spin_unlock(&cifs_file_list_lock); } - spin_unlock(&cifs_file_list_lock); } spin_unlock(&cifs_tcp_ses_lock); kfree(lw); @@ -532,7 +541,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) if (rsp->StructureSize != smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) { if (le16_to_cpu(rsp->StructureSize) == 44) - return smb2_is_valid_lease_break(buffer, server); + return smb2_is_valid_lease_break(buffer); else return false; } @@ -560,14 +569,15 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) cifs_dbg(FYI, "file id match, oplock break\n"); cinode = CIFS_I(cfile->dentry->d_inode); - if (!cinode->clientCanCacheAll && + if (!CIFS_CACHE_WRITE(cinode) && rsp->OplockLevel == SMB2_OPLOCK_LEVEL_NONE) cfile->oplock_break_cancelled = true; else cfile->oplock_break_cancelled = false; - smb2_set_oplock_level(cinode, - rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0); + server->ops->set_oplock_level(cinode, + rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0, + 0, NULL); queue_work(cifsiod_wq, &cfile->oplock_break); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f259e6cc8357..861b33214144 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -24,6 +24,7 @@ #include "smb2proto.h" #include "cifsproto.h" #include "cifs_debug.h" +#include "cifs_unicode.h" #include "smb2status.h" #include "smb2glob.h" @@ -229,7 +230,7 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL); + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); if (rc) { kfree(utf16_path); return rc; @@ -376,10 +377,13 @@ static void smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) { struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; + cfile->fid.persistent_fid = fid->persistent_fid; cfile->fid.volatile_fid = fid->volatile_fid; - smb2_set_oplock_level(cinode, oplock); - cinode->can_cache_brlcks = cinode->clientCanCacheAll; + server->ops->set_oplock_level(cinode, oplock, fid->epoch, + &fid->purge_cache); + cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode); } static void @@ -463,7 +467,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL); + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); kfree(utf16_path); if (rc) { cifs_dbg(VFS, "open dir failed\n"); @@ -530,7 +534,7 @@ smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, return SMB2_oplock_break(0, tcon, fid->persistent_fid, fid->volatile_fid, - cinode->clientCanCacheRead ? 1 : 0); + CIFS_CACHE_READ(cinode) ? 1 : 0); } static int @@ -550,7 +554,7 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL); + rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL); if (rc) return rc; buf->f_type = SMB2_MAGIC_NUMBER; @@ -596,7 +600,245 @@ smb2_new_lease_key(struct cifs_fid *fid) get_random_bytes(fid->lease_key, SMB2_LEASE_KEY_SIZE); } -struct smb_version_operations smb21_operations = { +static int +smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, + const char *full_path, char **target_path, + struct cifs_sb_info *cifs_sb) +{ + int rc; + __le16 *utf16_path; + __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_open_parms oparms; + struct cifs_fid fid; + struct smb2_err_rsp *err_buf = NULL; + struct smb2_symlink_err_rsp *symlink; + unsigned int sub_len, sub_offset; + + cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); + + utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); + if (!utf16_path) + return -ENOMEM; + + oparms.tcon = tcon; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.disposition = FILE_OPEN; + oparms.create_options = 0; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_buf); + + if (!rc || !err_buf) { + kfree(utf16_path); + return -ENOENT; + } + /* open must fail on symlink - reset rc */ + rc = 0; + symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData; + sub_len = le16_to_cpu(symlink->SubstituteNameLength); + sub_offset = le16_to_cpu(symlink->SubstituteNameOffset); + *target_path = cifs_strndup_from_utf16( + (char *)symlink->PathBuffer + sub_offset, + sub_len, true, cifs_sb->local_nls); + if (!(*target_path)) { + kfree(utf16_path); + return -ENOMEM; + } + convert_delimiter(*target_path, '/'); + cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); + kfree(utf16_path); + return rc; +} + +static void +smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, + unsigned int epoch, bool *purge_cache) +{ + oplock &= 0xFF; + if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) + return; + if (oplock == SMB2_OPLOCK_LEVEL_BATCH) { + cinode->oplock = CIFS_CACHE_RHW_FLG; + cifs_dbg(FYI, "Batch Oplock granted on inode %p\n", + &cinode->vfs_inode); + } else if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) { + cinode->oplock = CIFS_CACHE_RW_FLG; + cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n", + &cinode->vfs_inode); + } else if (oplock == SMB2_OPLOCK_LEVEL_II) { + cinode->oplock = CIFS_CACHE_READ_FLG; + cifs_dbg(FYI, "Level II Oplock granted on inode %p\n", + &cinode->vfs_inode); + } else + cinode->oplock = 0; +} + +static void +smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, + unsigned int epoch, bool *purge_cache) +{ + char message[5] = {0}; + + oplock &= 0xFF; + if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) + return; + + cinode->oplock = 0; + if (oplock & SMB2_LEASE_READ_CACHING_HE) { + cinode->oplock |= CIFS_CACHE_READ_FLG; + strcat(message, "R"); + } + if (oplock & SMB2_LEASE_HANDLE_CACHING_HE) { + cinode->oplock |= CIFS_CACHE_HANDLE_FLG; + strcat(message, "H"); + } + if (oplock & SMB2_LEASE_WRITE_CACHING_HE) { + cinode->oplock |= CIFS_CACHE_WRITE_FLG; + strcat(message, "W"); + } + if (!cinode->oplock) + strcat(message, "None"); + cifs_dbg(FYI, "%s Lease granted on inode %p\n", message, + &cinode->vfs_inode); +} + +static void +smb3_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, + unsigned int epoch, bool *purge_cache) +{ + unsigned int old_oplock = cinode->oplock; + + smb21_set_oplock_level(cinode, oplock, epoch, purge_cache); + + if (purge_cache) { + *purge_cache = false; + if (old_oplock == CIFS_CACHE_READ_FLG) { + if (cinode->oplock == CIFS_CACHE_READ_FLG && + (epoch - cinode->epoch > 0)) + *purge_cache = true; + else if (cinode->oplock == CIFS_CACHE_RH_FLG && + (epoch - cinode->epoch > 1)) + *purge_cache = true; + else if (cinode->oplock == CIFS_CACHE_RHW_FLG && + (epoch - cinode->epoch > 1)) + *purge_cache = true; + else if (cinode->oplock == 0 && + (epoch - cinode->epoch > 0)) + *purge_cache = true; + } else if (old_oplock == CIFS_CACHE_RH_FLG) { + if (cinode->oplock == CIFS_CACHE_RH_FLG && + (epoch - cinode->epoch > 0)) + *purge_cache = true; + else if (cinode->oplock == CIFS_CACHE_RHW_FLG && + (epoch - cinode->epoch > 1)) + *purge_cache = true; + } + cinode->epoch = epoch; + } +} + +static bool +smb2_is_read_op(__u32 oplock) +{ + return oplock == SMB2_OPLOCK_LEVEL_II; +} + +static bool +smb21_is_read_op(__u32 oplock) +{ + return (oplock & SMB2_LEASE_READ_CACHING_HE) && + !(oplock & SMB2_LEASE_WRITE_CACHING_HE); +} + +static __le32 +map_oplock_to_lease(u8 oplock) +{ + if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) + return SMB2_LEASE_WRITE_CACHING | SMB2_LEASE_READ_CACHING; + else if (oplock == SMB2_OPLOCK_LEVEL_II) + return SMB2_LEASE_READ_CACHING; + else if (oplock == SMB2_OPLOCK_LEVEL_BATCH) + return SMB2_LEASE_HANDLE_CACHING | SMB2_LEASE_READ_CACHING | + SMB2_LEASE_WRITE_CACHING; + return 0; +} + +static char * +smb2_create_lease_buf(u8 *lease_key, u8 oplock) +{ + struct create_lease *buf; + + buf = kzalloc(sizeof(struct create_lease), GFP_KERNEL); + if (!buf) + return NULL; + + buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key)); + buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8))); + buf->lcontext.LeaseState = map_oplock_to_lease(oplock); + + buf->ccontext.DataOffset = cpu_to_le16(offsetof + (struct create_lease, lcontext)); + buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context)); + buf->ccontext.NameOffset = cpu_to_le16(offsetof + (struct create_lease, Name)); + buf->ccontext.NameLength = cpu_to_le16(4); + buf->Name[0] = 'R'; + buf->Name[1] = 'q'; + buf->Name[2] = 'L'; + buf->Name[3] = 's'; + return (char *)buf; +} + +static char * +smb3_create_lease_buf(u8 *lease_key, u8 oplock) +{ + struct create_lease_v2 *buf; + + buf = kzalloc(sizeof(struct create_lease_v2), GFP_KERNEL); + if (!buf) + return NULL; + + buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key)); + buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8))); + buf->lcontext.LeaseState = map_oplock_to_lease(oplock); + + buf->ccontext.DataOffset = cpu_to_le16(offsetof + (struct create_lease_v2, lcontext)); + buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context_v2)); + buf->ccontext.NameOffset = cpu_to_le16(offsetof + (struct create_lease_v2, Name)); + buf->ccontext.NameLength = cpu_to_le16(4); + buf->Name[0] = 'R'; + buf->Name[1] = 'q'; + buf->Name[2] = 'L'; + buf->Name[3] = 's'; + return (char *)buf; +} + +static __u8 +smb2_parse_lease_buf(void *buf, unsigned int *epoch) +{ + struct create_lease *lc = (struct create_lease *)buf; + + *epoch = 0; /* not used */ + if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS) + return SMB2_OPLOCK_LEVEL_NOCHANGE; + return le32_to_cpu(lc->lcontext.LeaseState); +} + +static __u8 +smb3_parse_lease_buf(void *buf, unsigned int *epoch) +{ + struct create_lease_v2 *lc = (struct create_lease_v2 *)buf; + + *epoch = le16_to_cpu(lc->lcontext.Epoch); + if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS) + return SMB2_OPLOCK_LEVEL_NOCHANGE; + return le32_to_cpu(lc->lcontext.LeaseState); +} + +struct smb_version_operations smb20_operations = { .compare_fids = smb2_compare_fids, .setup_request = smb2_setup_request, .setup_async_request = smb2_setup_async_request, @@ -638,6 +880,7 @@ struct smb_version_operations smb21_operations = { .unlink = smb2_unlink, .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, + .query_symlink = smb2_query_symlink, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, @@ -660,8 +903,82 @@ struct smb_version_operations smb21_operations = { .set_lease_key = smb2_set_lease_key, .new_lease_key = smb2_new_lease_key, .calc_signature = smb2_calc_signature, + .is_read_op = smb2_is_read_op, + .set_oplock_level = smb2_set_oplock_level, + .create_lease_buf = smb2_create_lease_buf, + .parse_lease_buf = smb2_parse_lease_buf, }; +struct smb_version_operations smb21_operations = { + .compare_fids = smb2_compare_fids, + .setup_request = smb2_setup_request, + .setup_async_request = smb2_setup_async_request, + .check_receive = smb2_check_receive, + .add_credits = smb2_add_credits, + .set_credits = smb2_set_credits, + .get_credits_field = smb2_get_credits_field, + .get_credits = smb2_get_credits, + .get_next_mid = smb2_get_next_mid, + .read_data_offset = smb2_read_data_offset, + .read_data_length = smb2_read_data_length, + .map_error = map_smb2_to_linux_error, + .find_mid = smb2_find_mid, + .check_message = smb2_check_message, + .dump_detail = smb2_dump_detail, + .clear_stats = smb2_clear_stats, + .print_stats = smb2_print_stats, + .is_oplock_break = smb2_is_valid_oplock_break, + .need_neg = smb2_need_neg, + .negotiate = smb2_negotiate, + .negotiate_wsize = smb2_negotiate_wsize, + .negotiate_rsize = smb2_negotiate_rsize, + .sess_setup = SMB2_sess_setup, + .logoff = SMB2_logoff, + .tree_connect = SMB2_tcon, + .tree_disconnect = SMB2_tdis, + .is_path_accessible = smb2_is_path_accessible, + .can_echo = smb2_can_echo, + .echo = SMB2_echo, + .query_path_info = smb2_query_path_info, + .get_srv_inum = smb2_get_srv_inum, + .query_file_info = smb2_query_file_info, + .set_path_size = smb2_set_path_size, + .set_file_size = smb2_set_file_size, + .set_file_info = smb2_set_file_info, + .mkdir = smb2_mkdir, + .mkdir_setinfo = smb2_mkdir_setinfo, + .rmdir = smb2_rmdir, + .unlink = smb2_unlink, + .rename = smb2_rename_path, + .create_hardlink = smb2_create_hardlink, + .query_symlink = smb2_query_symlink, + .open = smb2_open_file, + .set_fid = smb2_set_fid, + .close = smb2_close_file, + .flush = smb2_flush_file, + .async_readv = smb2_async_readv, + .async_writev = smb2_async_writev, + .sync_read = smb2_sync_read, + .sync_write = smb2_sync_write, + .query_dir_first = smb2_query_dir_first, + .query_dir_next = smb2_query_dir_next, + .close_dir = smb2_close_dir, + .calc_smb_size = smb2_calc_size, + .is_status_pending = smb2_is_status_pending, + .oplock_response = smb2_oplock_response, + .queryfs = smb2_queryfs, + .mand_lock = smb2_mand_lock, + .mand_unlock_range = smb2_unlock_range, + .push_mand_locks = smb2_push_mandatory_locks, + .get_lease_key = smb2_get_lease_key, + .set_lease_key = smb2_set_lease_key, + .new_lease_key = smb2_new_lease_key, + .calc_signature = smb2_calc_signature, + .is_read_op = smb21_is_read_op, + .set_oplock_level = smb21_set_oplock_level, + .create_lease_buf = smb2_create_lease_buf, + .parse_lease_buf = smb2_parse_lease_buf, +}; struct smb_version_operations smb30_operations = { .compare_fids = smb2_compare_fids, @@ -706,6 +1023,7 @@ struct smb_version_operations smb30_operations = { .unlink = smb2_unlink, .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, + .query_symlink = smb2_query_symlink, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, @@ -729,6 +1047,10 @@ struct smb_version_operations smb30_operations = { .new_lease_key = smb2_new_lease_key, .generate_signingkey = generate_smb3signingkey, .calc_signature = smb3_calc_signature, + .is_read_op = smb21_is_read_op, + .set_oplock_level = smb3_set_oplock_level, + .create_lease_buf = smb3_create_lease_buf, + .parse_lease_buf = smb3_parse_lease_buf, }; struct smb_version_values smb20_values = { @@ -746,9 +1068,9 @@ struct smb_version_values smb20_values = { .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, - .oplock_read = SMB2_OPLOCK_LEVEL_II, .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, + .create_lease_size = sizeof(struct create_lease), }; struct smb_version_values smb21_values = { @@ -766,9 +1088,9 @@ struct smb_version_values smb21_values = { .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, - .oplock_read = SMB2_OPLOCK_LEVEL_II, .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, + .create_lease_size = sizeof(struct create_lease), }; struct smb_version_values smb30_values = { @@ -786,9 +1108,9 @@ struct smb_version_values smb30_values = { .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, - .oplock_read = SMB2_OPLOCK_LEVEL_II, .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, + .create_lease_size = sizeof(struct create_lease_v2), }; struct smb_version_values smb302_values = { @@ -806,7 +1128,7 @@ struct smb_version_values smb302_values = { .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, - .oplock_read = SMB2_OPLOCK_LEVEL_II, .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, + .create_lease_size = sizeof(struct create_lease_v2), }; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index abc9c2809b51..eba0efde66d7 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -478,12 +478,20 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, } /* + * If we are here due to reconnect, free per-smb session key + * in case signing was required. + */ + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + + /* * If memory allocation is successful, caller of this function * frees it. */ ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); if (!ses->ntlmssp) return -ENOMEM; + ses->ntlmssp->sesskey_per_smbsess = true; /* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */ ses->sectype = RawNTLMSSP; @@ -628,6 +636,40 @@ ssetup_exit: /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */ if ((phase == NtLmChallenge) && (rc == 0)) goto ssetup_ntlmssp_authenticate; + + if (!rc) { + mutex_lock(&server->srv_mutex); + if (server->sign && server->ops->generate_signingkey) { + rc = server->ops->generate_signingkey(ses); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + if (rc) { + cifs_dbg(FYI, + "SMB3 session key generation failed\n"); + mutex_unlock(&server->srv_mutex); + goto keygen_exit; + } + } + if (!server->session_estab) { + server->sequence_number = 0x2; + server->session_estab = true; + } + mutex_unlock(&server->srv_mutex); + + cifs_dbg(FYI, "SMB2/3 session established successfully\n"); + spin_lock(&GlobalMid_Lock); + ses->status = CifsGood; + ses->need_reconnect = false; + spin_unlock(&GlobalMid_Lock); + } + +keygen_exit: + if (!server->sign) { + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + } + kfree(ses->ntlmssp); + return rc; } @@ -813,39 +855,6 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) return rc; } -static struct create_lease * -create_lease_buf(u8 *lease_key, u8 oplock) -{ - struct create_lease *buf; - - buf = kzalloc(sizeof(struct create_lease), GFP_KERNEL); - if (!buf) - return NULL; - - buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key)); - buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8))); - if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) - buf->lcontext.LeaseState = SMB2_LEASE_WRITE_CACHING | - SMB2_LEASE_READ_CACHING; - else if (oplock == SMB2_OPLOCK_LEVEL_II) - buf->lcontext.LeaseState = SMB2_LEASE_READ_CACHING; - else if (oplock == SMB2_OPLOCK_LEVEL_BATCH) - buf->lcontext.LeaseState = SMB2_LEASE_HANDLE_CACHING | - SMB2_LEASE_READ_CACHING | - SMB2_LEASE_WRITE_CACHING; - - buf->ccontext.DataOffset = cpu_to_le16(offsetof - (struct create_lease, lcontext)); - buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context)); - buf->ccontext.NameOffset = cpu_to_le16(offsetof - (struct create_lease, Name)); - buf->ccontext.NameLength = cpu_to_le16(4); - buf->Name[0] = 'R'; - buf->Name[1] = 'q'; - buf->Name[2] = 'L'; - buf->Name[3] = 's'; - return buf; -} static struct create_durable * create_durable_buf(void) @@ -894,55 +903,49 @@ create_reconnect_durable_buf(struct cifs_fid *fid) } static __u8 -parse_lease_state(struct smb2_create_rsp *rsp) +parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp, + unsigned int *epoch) { char *data_offset; - struct create_lease *lc; - bool found = false; + struct create_context *cc; unsigned int next = 0; char *name; data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset); - lc = (struct create_lease *)data_offset; + cc = (struct create_context *)data_offset; do { - lc = (struct create_lease *)((char *)lc + next); - name = le16_to_cpu(lc->ccontext.NameOffset) + (char *)lc; - if (le16_to_cpu(lc->ccontext.NameLength) != 4 || + cc = (struct create_context *)((char *)cc + next); + name = le16_to_cpu(cc->NameOffset) + (char *)cc; + if (le16_to_cpu(cc->NameLength) != 4 || strncmp(name, "RqLs", 4)) { - next = le32_to_cpu(lc->ccontext.Next); + next = le32_to_cpu(cc->Next); continue; } - if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS) - return SMB2_OPLOCK_LEVEL_NOCHANGE; - found = true; - break; + return server->ops->parse_lease_buf(cc, epoch); } while (next != 0); - if (!found) - return 0; - - return smb2_map_lease_to_oplock(lc->lcontext.LeaseState); + return 0; } static int -add_lease_context(struct kvec *iov, unsigned int *num_iovec, __u8 *oplock) +add_lease_context(struct TCP_Server_Info *server, struct kvec *iov, + unsigned int *num_iovec, __u8 *oplock) { struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; - iov[num].iov_base = create_lease_buf(oplock+1, *oplock); + iov[num].iov_base = server->ops->create_lease_buf(oplock+1, *oplock); if (iov[num].iov_base == NULL) return -ENOMEM; - iov[num].iov_len = sizeof(struct create_lease); + iov[num].iov_len = server->vals->create_lease_size; req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE; if (!req->CreateContextsOffset) req->CreateContextsOffset = cpu_to_le32( sizeof(struct smb2_create_req) - 4 + iov[num - 1].iov_len); - req->CreateContextsLength = cpu_to_le32( - le32_to_cpu(req->CreateContextsLength) + - sizeof(struct create_lease)); - inc_rfc1001_len(&req->hdr, sizeof(struct create_lease)); + le32_add_cpu(&req->CreateContextsLength, + server->vals->create_lease_size); + inc_rfc1001_len(&req->hdr, server->vals->create_lease_size); *num_iovec = num + 1; return 0; } @@ -967,9 +970,7 @@ add_durable_context(struct kvec *iov, unsigned int *num_iovec, req->CreateContextsOffset = cpu_to_le32(sizeof(struct smb2_create_req) - 4 + iov[1].iov_len); - req->CreateContextsLength = - cpu_to_le32(le32_to_cpu(req->CreateContextsLength) + - sizeof(struct create_durable)); + le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable)); inc_rfc1001_len(&req->hdr, sizeof(struct create_durable)); *num_iovec = num + 1; return 0; @@ -977,7 +978,8 @@ add_durable_context(struct kvec *iov, unsigned int *num_iovec, int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, - __u8 *oplock, struct smb2_file_all_info *buf) + __u8 *oplock, struct smb2_file_all_info *buf, + struct smb2_err_rsp **err_buf) { struct smb2_create_req *req; struct smb2_create_rsp *rsp; @@ -1048,11 +1050,11 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, if (!server->oplocks) *oplock = SMB2_OPLOCK_LEVEL_NONE; - if (!(tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) || + if (!(server->capabilities & SMB2_GLOBAL_CAP_LEASING) || *oplock == SMB2_OPLOCK_LEVEL_NONE) req->RequestedOplockLevel = *oplock; else { - rc = add_lease_context(iov, &num_iovecs, oplock); + rc = add_lease_context(server, iov, &num_iovecs, oplock); if (rc) { cifs_small_buf_release(req); kfree(copy_path); @@ -1062,11 +1064,11 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) { /* need to set Next field of lease context if we request it */ - if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) { + if (server->capabilities & SMB2_GLOBAL_CAP_LEASING) { struct create_context *ccontext = (struct create_context *)iov[num_iovecs-1].iov_base; ccontext->Next = - cpu_to_le32(sizeof(struct create_lease)); + cpu_to_le32(server->vals->create_lease_size); } rc = add_durable_context(iov, &num_iovecs, oparms); if (rc) { @@ -1082,6 +1084,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, if (rc != 0) { cifs_stats_fail_inc(tcon, SMB2_CREATE_HE); + if (err_buf) + *err_buf = kmemdup(rsp, get_rfc1002_length(rsp) + 4, + GFP_KERNEL); goto creat_exit; } @@ -1098,7 +1103,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, } if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) - *oplock = parse_lease_state(rsp); + *oplock = parse_lease_state(server, rsp, &oparms->fid->epoch); else *oplock = rsp->OplockLevel; creat_exit: diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 36b0d37ea69b..b83d0118a757 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -150,6 +150,20 @@ struct smb2_err_rsp { __u8 ErrorData[1]; /* variable length */ } __packed; +struct smb2_symlink_err_rsp { + __le32 SymLinkLength; + __le32 SymLinkErrorTag; + __le32 ReparseTag; + __le16 ReparseDataLength; + __le16 UnparsedPathLength; + __le16 SubstituteNameOffset; + __le16 SubstituteNameLength; + __le16 PrintNameOffset; + __le16 PrintNameLength; + __le32 Flags; + __u8 PathBuffer[0]; +} __packed; + #define SMB2_CLIENT_GUID_SIZE 16 extern __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE]; @@ -462,6 +476,10 @@ struct create_context { __u8 Buffer[0]; } __packed; +#define SMB2_LEASE_READ_CACHING_HE 0x01 +#define SMB2_LEASE_HANDLE_CACHING_HE 0x02 +#define SMB2_LEASE_WRITE_CACHING_HE 0x04 + #define SMB2_LEASE_NONE __constant_cpu_to_le32(0x00) #define SMB2_LEASE_READ_CACHING __constant_cpu_to_le32(0x01) #define SMB2_LEASE_HANDLE_CACHING __constant_cpu_to_le32(0x02) @@ -479,12 +497,31 @@ struct lease_context { __le64 LeaseDuration; } __packed; +struct lease_context_v2 { + __le64 LeaseKeyLow; + __le64 LeaseKeyHigh; + __le32 LeaseState; + __le32 LeaseFlags; + __le64 LeaseDuration; + __le64 ParentLeaseKeyLow; + __le64 ParentLeaseKeyHigh; + __le16 Epoch; + __le16 Reserved; +} __packed; + struct create_lease { struct create_context ccontext; __u8 Name[8]; struct lease_context lcontext; } __packed; +struct create_lease_v2 { + struct create_context ccontext; + __u8 Name[8]; + struct lease_context_v2 lcontext; + __u8 Pad[4]; +} __packed; + struct create_durable { struct create_context ccontext; __u8 Name[8]; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 1a5ecbed40ed..e3fb4801ee96 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -53,7 +53,6 @@ extern int smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server); extern void smb2_echo_request(struct work_struct *work); extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode); -extern __u8 smb2_map_lease_to_oplock(__le32 lease_state); extern bool smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv); @@ -87,7 +86,6 @@ extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon, extern int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, FILE_ALL_INFO *buf); -extern void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); extern int smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile); @@ -106,7 +104,8 @@ extern int SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon); extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, __u8 *oplock, - struct smb2_file_all_info *buf); + struct smb2_file_all_info *buf, + struct smb2_err_rsp **err_buf); extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data, u32 indatalen, diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 4f2300d020c7..340abca3aa52 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -114,6 +114,23 @@ smb3_crypto_shash_allocate(struct TCP_Server_Info *server) return 0; } +static struct cifs_ses * +smb2_find_smb_ses(struct smb2_hdr *smb2hdr, struct TCP_Server_Info *server) +{ + struct cifs_ses *ses; + + spin_lock(&cifs_tcp_ses_lock); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + if (ses->Suid != smb2hdr->SessionId) + continue; + spin_unlock(&cifs_tcp_ses_lock); + return ses; + } + spin_unlock(&cifs_tcp_ses_lock); + + return NULL; +} + int smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) @@ -124,6 +141,13 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) struct kvec *iov = rqst->rq_iov; int n_vec = rqst->rq_nvec; struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base; + struct cifs_ses *ses; + + ses = smb2_find_smb_ses(smb2_pdu, server); + if (!ses) { + cifs_dbg(VFS, "%s: Could not find session\n", __func__); + return 0; + } memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE); memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE); @@ -135,7 +159,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) } rc = crypto_shash_setkey(server->secmech.hmacsha256, - server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE); + ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not update with response\n", __func__); return rc; @@ -198,8 +222,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) return rc; } -void -generate_smb3signingkey(struct TCP_Server_Info *server) +int +generate_smb3signingkey(struct cifs_ses *ses) { unsigned char zero = 0x0; __u8 i[4] = {0, 0, 0, 1}; @@ -209,90 +233,99 @@ generate_smb3signingkey(struct TCP_Server_Info *server) unsigned char *hashptr = prfhash; memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE); - memset(server->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE); + memset(ses->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE); - rc = smb3_crypto_shash_allocate(server); + rc = smb3_crypto_shash_allocate(ses->server); if (rc) { cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_setkey(server->secmech.hmacsha256, - server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE); + rc = crypto_shash_setkey(ses->server->secmech.hmacsha256, + ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not set with session key\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash); + rc = crypto_shash_init(&ses->server->secmech.sdeschmacsha256->shash); if (rc) { cifs_dbg(VFS, "%s: Could not init sign hmac\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, i, 4); if (rc) { cifs_dbg(VFS, "%s: Could not update with n\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, "SMB2AESCMAC", 12); if (rc) { cifs_dbg(VFS, "%s: Could not update with label\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, &zero, 1); if (rc) { cifs_dbg(VFS, "%s: Could not update with zero\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, "SmbSign", 8); if (rc) { cifs_dbg(VFS, "%s: Could not update with context\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, L, 4); if (rc) { cifs_dbg(VFS, "%s: Could not update with L\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash, + rc = crypto_shash_final(&ses->server->secmech.sdeschmacsha256->shash, hashptr); if (rc) { cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__); goto smb3signkey_ret; } - memcpy(server->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE); + memcpy(ses->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE); smb3signkey_ret: - return; + return rc; } int smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { - int i, rc; + int i; + int rc = 0; unsigned char smb3_signature[SMB2_CMACAES_SIZE]; unsigned char *sigptr = smb3_signature; struct kvec *iov = rqst->rq_iov; int n_vec = rqst->rq_nvec; struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base; + struct cifs_ses *ses; + + ses = smb2_find_smb_ses(smb2_pdu, server); + if (!ses) { + cifs_dbg(VFS, "%s: Could not find session\n", __func__); + return 0; + } memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE); memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE); rc = crypto_shash_setkey(server->secmech.cmacaes, - server->smb3signingkey, SMB2_CMACAES_SIZE); + ses->smb3signingkey, SMB2_CMACAES_SIZE); + if (rc) { cifs_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__); return rc; @@ -389,6 +422,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)rqst->rq_iov[0].iov_base; if ((smb2_pdu->Command == SMB2_NEGOTIATE) || + (smb2_pdu->Command == SMB2_SESSION_SETUP) || (smb2_pdu->Command == SMB2_OPLOCK_BREAK) || (!server->session_estab)) return 0; diff --git a/fs/cifs/winucase.c b/fs/cifs/winucase.c new file mode 100644 index 000000000000..1506d4fddb2c --- /dev/null +++ b/fs/cifs/winucase.c @@ -0,0 +1,663 @@ +/* + * fs/cifs/winucase.c + * + * Copyright (c) Jeffrey Layton <jlayton@redhat.com>, 2013 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * The const tables in this file were converted from the following info + * provided by Microsoft: + * + * 3.1.5.3 Mapping UTF-16 Strings to Upper Case: + * + * http://msdn.microsoft.com/en-us/library/hh877830.aspx + * http://www.microsoft.com/en-us/download/details.aspx?displaylang=en&id=10921 + * + * In particular, the table in "Windows 8 Upper Case Mapping Table.txt" was + * post-processed using the winucase_convert.pl script. + */ + +#include <linux/nls.h> + +wchar_t cifs_toupper(wchar_t in); /* quiet sparse */ + +static const wchar_t t2_00[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, + 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, + 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x0000, + 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x0178, +}; + +static const wchar_t t2_01[256] = { + 0x0000, 0x0100, 0x0000, 0x0102, 0x0000, 0x0104, 0x0000, 0x0106, + 0x0000, 0x0108, 0x0000, 0x010a, 0x0000, 0x010c, 0x0000, 0x010e, + 0x0000, 0x0110, 0x0000, 0x0112, 0x0000, 0x0114, 0x0000, 0x0116, + 0x0000, 0x0118, 0x0000, 0x011a, 0x0000, 0x011c, 0x0000, 0x011e, + 0x0000, 0x0120, 0x0000, 0x0122, 0x0000, 0x0124, 0x0000, 0x0126, + 0x0000, 0x0128, 0x0000, 0x012a, 0x0000, 0x012c, 0x0000, 0x012e, + 0x0000, 0x0000, 0x0000, 0x0132, 0x0000, 0x0134, 0x0000, 0x0136, + 0x0000, 0x0000, 0x0139, 0x0000, 0x013b, 0x0000, 0x013d, 0x0000, + 0x013f, 0x0000, 0x0141, 0x0000, 0x0143, 0x0000, 0x0145, 0x0000, + 0x0147, 0x0000, 0x0000, 0x014a, 0x0000, 0x014c, 0x0000, 0x014e, + 0x0000, 0x0150, 0x0000, 0x0152, 0x0000, 0x0154, 0x0000, 0x0156, + 0x0000, 0x0158, 0x0000, 0x015a, 0x0000, 0x015c, 0x0000, 0x015e, + 0x0000, 0x0160, 0x0000, 0x0162, 0x0000, 0x0164, 0x0000, 0x0166, + 0x0000, 0x0168, 0x0000, 0x016a, 0x0000, 0x016c, 0x0000, 0x016e, + 0x0000, 0x0170, 0x0000, 0x0172, 0x0000, 0x0174, 0x0000, 0x0176, + 0x0000, 0x0000, 0x0179, 0x0000, 0x017b, 0x0000, 0x017d, 0x0000, + 0x0243, 0x0000, 0x0000, 0x0182, 0x0000, 0x0184, 0x0000, 0x0000, + 0x0187, 0x0000, 0x0000, 0x0000, 0x018b, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0191, 0x0000, 0x0000, 0x01f6, 0x0000, 0x0000, + 0x0000, 0x0198, 0x023d, 0x0000, 0x0000, 0x0000, 0x0220, 0x0000, + 0x0000, 0x01a0, 0x0000, 0x01a2, 0x0000, 0x01a4, 0x0000, 0x0000, + 0x01a7, 0x0000, 0x0000, 0x0000, 0x0000, 0x01ac, 0x0000, 0x0000, + 0x01af, 0x0000, 0x0000, 0x0000, 0x01b3, 0x0000, 0x01b5, 0x0000, + 0x0000, 0x01b8, 0x0000, 0x0000, 0x0000, 0x01bc, 0x0000, 0x01f7, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x01c4, 0x0000, + 0x0000, 0x01c7, 0x0000, 0x0000, 0x01ca, 0x0000, 0x01cd, 0x0000, + 0x01cf, 0x0000, 0x01d1, 0x0000, 0x01d3, 0x0000, 0x01d5, 0x0000, + 0x01d7, 0x0000, 0x01d9, 0x0000, 0x01db, 0x018e, 0x0000, 0x01de, + 0x0000, 0x01e0, 0x0000, 0x01e2, 0x0000, 0x01e4, 0x0000, 0x01e6, + 0x0000, 0x01e8, 0x0000, 0x01ea, 0x0000, 0x01ec, 0x0000, 0x01ee, + 0x0000, 0x0000, 0x0000, 0x01f1, 0x0000, 0x01f4, 0x0000, 0x0000, + 0x0000, 0x01f8, 0x0000, 0x01fa, 0x0000, 0x01fc, 0x0000, 0x01fe, +}; + +static const wchar_t t2_02[256] = { + 0x0000, 0x0200, 0x0000, 0x0202, 0x0000, 0x0204, 0x0000, 0x0206, + 0x0000, 0x0208, 0x0000, 0x020a, 0x0000, 0x020c, 0x0000, 0x020e, + 0x0000, 0x0210, 0x0000, 0x0212, 0x0000, 0x0214, 0x0000, 0x0216, + 0x0000, 0x0218, 0x0000, 0x021a, 0x0000, 0x021c, 0x0000, 0x021e, + 0x0000, 0x0000, 0x0000, 0x0222, 0x0000, 0x0224, 0x0000, 0x0226, + 0x0000, 0x0228, 0x0000, 0x022a, 0x0000, 0x022c, 0x0000, 0x022e, + 0x0000, 0x0230, 0x0000, 0x0232, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x023b, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0241, 0x0000, 0x0000, 0x0000, 0x0000, 0x0246, + 0x0000, 0x0248, 0x0000, 0x024a, 0x0000, 0x024c, 0x0000, 0x024e, + 0x2c6f, 0x2c6d, 0x0000, 0x0181, 0x0186, 0x0000, 0x0189, 0x018a, + 0x0000, 0x018f, 0x0000, 0x0190, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0193, 0x0000, 0x0000, 0x0194, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0197, 0x0196, 0x0000, 0x2c62, 0x0000, 0x0000, 0x0000, 0x019c, + 0x0000, 0x2c6e, 0x019d, 0x0000, 0x0000, 0x019f, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2c64, 0x0000, 0x0000, + 0x01a6, 0x0000, 0x0000, 0x01a9, 0x0000, 0x0000, 0x0000, 0x0000, + 0x01ae, 0x0244, 0x01b1, 0x01b2, 0x0245, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x01b7, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_03[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0370, 0x0000, 0x0372, 0x0000, 0x0000, 0x0000, 0x0376, + 0x0000, 0x0000, 0x0000, 0x03fd, 0x03fe, 0x03ff, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0386, 0x0388, 0x0389, 0x038a, + 0x0000, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, + 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f, + 0x03a0, 0x03a1, 0x0000, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, + 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x038c, 0x038e, 0x038f, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x03cf, + 0x0000, 0x03d8, 0x0000, 0x03da, 0x0000, 0x03dc, 0x0000, 0x03de, + 0x0000, 0x03e0, 0x0000, 0x03e2, 0x0000, 0x03e4, 0x0000, 0x03e6, + 0x0000, 0x03e8, 0x0000, 0x03ea, 0x0000, 0x03ec, 0x0000, 0x03ee, + 0x0000, 0x0000, 0x03f9, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x03f7, 0x0000, 0x0000, 0x03fa, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_04[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, + 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, + 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, + 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, + 0x0400, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, + 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x040d, 0x040e, 0x040f, + 0x0000, 0x0460, 0x0000, 0x0462, 0x0000, 0x0464, 0x0000, 0x0466, + 0x0000, 0x0468, 0x0000, 0x046a, 0x0000, 0x046c, 0x0000, 0x046e, + 0x0000, 0x0470, 0x0000, 0x0472, 0x0000, 0x0474, 0x0000, 0x0476, + 0x0000, 0x0478, 0x0000, 0x047a, 0x0000, 0x047c, 0x0000, 0x047e, + 0x0000, 0x0480, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x048a, 0x0000, 0x048c, 0x0000, 0x048e, + 0x0000, 0x0490, 0x0000, 0x0492, 0x0000, 0x0494, 0x0000, 0x0496, + 0x0000, 0x0498, 0x0000, 0x049a, 0x0000, 0x049c, 0x0000, 0x049e, + 0x0000, 0x04a0, 0x0000, 0x04a2, 0x0000, 0x04a4, 0x0000, 0x04a6, + 0x0000, 0x04a8, 0x0000, 0x04aa, 0x0000, 0x04ac, 0x0000, 0x04ae, + 0x0000, 0x04b0, 0x0000, 0x04b2, 0x0000, 0x04b4, 0x0000, 0x04b6, + 0x0000, 0x04b8, 0x0000, 0x04ba, 0x0000, 0x04bc, 0x0000, 0x04be, + 0x0000, 0x0000, 0x04c1, 0x0000, 0x04c3, 0x0000, 0x04c5, 0x0000, + 0x04c7, 0x0000, 0x04c9, 0x0000, 0x04cb, 0x0000, 0x04cd, 0x04c0, + 0x0000, 0x04d0, 0x0000, 0x04d2, 0x0000, 0x04d4, 0x0000, 0x04d6, + 0x0000, 0x04d8, 0x0000, 0x04da, 0x0000, 0x04dc, 0x0000, 0x04de, + 0x0000, 0x04e0, 0x0000, 0x04e2, 0x0000, 0x04e4, 0x0000, 0x04e6, + 0x0000, 0x04e8, 0x0000, 0x04ea, 0x0000, 0x04ec, 0x0000, 0x04ee, + 0x0000, 0x04f0, 0x0000, 0x04f2, 0x0000, 0x04f4, 0x0000, 0x04f6, + 0x0000, 0x04f8, 0x0000, 0x04fa, 0x0000, 0x04fc, 0x0000, 0x04fe, +}; + +static const wchar_t t2_05[256] = { + 0x0000, 0x0500, 0x0000, 0x0502, 0x0000, 0x0504, 0x0000, 0x0506, + 0x0000, 0x0508, 0x0000, 0x050a, 0x0000, 0x050c, 0x0000, 0x050e, + 0x0000, 0x0510, 0x0000, 0x0512, 0x0000, 0x0514, 0x0000, 0x0516, + 0x0000, 0x0518, 0x0000, 0x051a, 0x0000, 0x051c, 0x0000, 0x051e, + 0x0000, 0x0520, 0x0000, 0x0522, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0531, 0x0532, 0x0533, 0x0534, 0x0535, 0x0536, 0x0537, + 0x0538, 0x0539, 0x053a, 0x053b, 0x053c, 0x053d, 0x053e, 0x053f, + 0x0540, 0x0541, 0x0542, 0x0543, 0x0544, 0x0545, 0x0546, 0x0547, + 0x0548, 0x0549, 0x054a, 0x054b, 0x054c, 0x054d, 0x054e, 0x054f, + 0x0550, 0x0551, 0x0552, 0x0553, 0x0554, 0x0555, 0x0556, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_1d[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0xa77d, 0x0000, 0x0000, 0x0000, 0x2c63, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_1e[256] = { + 0x0000, 0x1e00, 0x0000, 0x1e02, 0x0000, 0x1e04, 0x0000, 0x1e06, + 0x0000, 0x1e08, 0x0000, 0x1e0a, 0x0000, 0x1e0c, 0x0000, 0x1e0e, + 0x0000, 0x1e10, 0x0000, 0x1e12, 0x0000, 0x1e14, 0x0000, 0x1e16, + 0x0000, 0x1e18, 0x0000, 0x1e1a, 0x0000, 0x1e1c, 0x0000, 0x1e1e, + 0x0000, 0x1e20, 0x0000, 0x1e22, 0x0000, 0x1e24, 0x0000, 0x1e26, + 0x0000, 0x1e28, 0x0000, 0x1e2a, 0x0000, 0x1e2c, 0x0000, 0x1e2e, + 0x0000, 0x1e30, 0x0000, 0x1e32, 0x0000, 0x1e34, 0x0000, 0x1e36, + 0x0000, 0x1e38, 0x0000, 0x1e3a, 0x0000, 0x1e3c, 0x0000, 0x1e3e, + 0x0000, 0x1e40, 0x0000, 0x1e42, 0x0000, 0x1e44, 0x0000, 0x1e46, + 0x0000, 0x1e48, 0x0000, 0x1e4a, 0x0000, 0x1e4c, 0x0000, 0x1e4e, + 0x0000, 0x1e50, 0x0000, 0x1e52, 0x0000, 0x1e54, 0x0000, 0x1e56, + 0x0000, 0x1e58, 0x0000, 0x1e5a, 0x0000, 0x1e5c, 0x0000, 0x1e5e, + 0x0000, 0x1e60, 0x0000, 0x1e62, 0x0000, 0x1e64, 0x0000, 0x1e66, + 0x0000, 0x1e68, 0x0000, 0x1e6a, 0x0000, 0x1e6c, 0x0000, 0x1e6e, + 0x0000, 0x1e70, 0x0000, 0x1e72, 0x0000, 0x1e74, 0x0000, 0x1e76, + 0x0000, 0x1e78, 0x0000, 0x1e7a, 0x0000, 0x1e7c, 0x0000, 0x1e7e, + 0x0000, 0x1e80, 0x0000, 0x1e82, 0x0000, 0x1e84, 0x0000, 0x1e86, + 0x0000, 0x1e88, 0x0000, 0x1e8a, 0x0000, 0x1e8c, 0x0000, 0x1e8e, + 0x0000, 0x1e90, 0x0000, 0x1e92, 0x0000, 0x1e94, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x1ea0, 0x0000, 0x1ea2, 0x0000, 0x1ea4, 0x0000, 0x1ea6, + 0x0000, 0x1ea8, 0x0000, 0x1eaa, 0x0000, 0x1eac, 0x0000, 0x1eae, + 0x0000, 0x1eb0, 0x0000, 0x1eb2, 0x0000, 0x1eb4, 0x0000, 0x1eb6, + 0x0000, 0x1eb8, 0x0000, 0x1eba, 0x0000, 0x1ebc, 0x0000, 0x1ebe, + 0x0000, 0x1ec0, 0x0000, 0x1ec2, 0x0000, 0x1ec4, 0x0000, 0x1ec6, + 0x0000, 0x1ec8, 0x0000, 0x1eca, 0x0000, 0x1ecc, 0x0000, 0x1ece, + 0x0000, 0x1ed0, 0x0000, 0x1ed2, 0x0000, 0x1ed4, 0x0000, 0x1ed6, + 0x0000, 0x1ed8, 0x0000, 0x1eda, 0x0000, 0x1edc, 0x0000, 0x1ede, + 0x0000, 0x1ee0, 0x0000, 0x1ee2, 0x0000, 0x1ee4, 0x0000, 0x1ee6, + 0x0000, 0x1ee8, 0x0000, 0x1eea, 0x0000, 0x1eec, 0x0000, 0x1eee, + 0x0000, 0x1ef0, 0x0000, 0x1ef2, 0x0000, 0x1ef4, 0x0000, 0x1ef6, + 0x0000, 0x1ef8, 0x0000, 0x1efa, 0x0000, 0x1efc, 0x0000, 0x1efe, +}; + +static const wchar_t t2_1f[256] = { + 0x1f08, 0x1f09, 0x1f0a, 0x1f0b, 0x1f0c, 0x1f0d, 0x1f0e, 0x1f0f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1f18, 0x1f19, 0x1f1a, 0x1f1b, 0x1f1c, 0x1f1d, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1f28, 0x1f29, 0x1f2a, 0x1f2b, 0x1f2c, 0x1f2d, 0x1f2e, 0x1f2f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1f38, 0x1f39, 0x1f3a, 0x1f3b, 0x1f3c, 0x1f3d, 0x1f3e, 0x1f3f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1f48, 0x1f49, 0x1f4a, 0x1f4b, 0x1f4c, 0x1f4d, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x1f59, 0x0000, 0x1f5b, 0x0000, 0x1f5d, 0x0000, 0x1f5f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1f68, 0x1f69, 0x1f6a, 0x1f6b, 0x1f6c, 0x1f6d, 0x1f6e, 0x1f6f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1fba, 0x1fbb, 0x1fc8, 0x1fc9, 0x1fca, 0x1fcb, 0x1fda, 0x1fdb, + 0x1ff8, 0x1ff9, 0x1fea, 0x1feb, 0x1ffa, 0x1ffb, 0x0000, 0x0000, + 0x1f88, 0x1f89, 0x1f8a, 0x1f8b, 0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1f98, 0x1f99, 0x1f9a, 0x1f9b, 0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1fa8, 0x1fa9, 0x1faa, 0x1fab, 0x1fac, 0x1fad, 0x1fae, 0x1faf, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1fb8, 0x1fb9, 0x0000, 0x1fbc, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x1fcc, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1fd8, 0x1fd9, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1fe8, 0x1fe9, 0x0000, 0x0000, 0x0000, 0x1fec, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x1ffc, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_21[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2132, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x2165, 0x2166, 0x2167, + 0x2168, 0x2169, 0x216a, 0x216b, 0x216c, 0x216d, 0x216e, 0x216f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x2183, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_24[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x24b6, 0x24b7, 0x24b8, 0x24b9, 0x24ba, 0x24bb, 0x24bc, 0x24bd, + 0x24be, 0x24bf, 0x24c0, 0x24c1, 0x24c2, 0x24c3, 0x24c4, 0x24c5, + 0x24c6, 0x24c7, 0x24c8, 0x24c9, 0x24ca, 0x24cb, 0x24cc, 0x24cd, + 0x24ce, 0x24cf, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_2c[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x2c00, 0x2c01, 0x2c02, 0x2c03, 0x2c04, 0x2c05, 0x2c06, 0x2c07, + 0x2c08, 0x2c09, 0x2c0a, 0x2c0b, 0x2c0c, 0x2c0d, 0x2c0e, 0x2c0f, + 0x2c10, 0x2c11, 0x2c12, 0x2c13, 0x2c14, 0x2c15, 0x2c16, 0x2c17, + 0x2c18, 0x2c19, 0x2c1a, 0x2c1b, 0x2c1c, 0x2c1d, 0x2c1e, 0x2c1f, + 0x2c20, 0x2c21, 0x2c22, 0x2c23, 0x2c24, 0x2c25, 0x2c26, 0x2c27, + 0x2c28, 0x2c29, 0x2c2a, 0x2c2b, 0x2c2c, 0x2c2d, 0x2c2e, 0x0000, + 0x0000, 0x2c60, 0x0000, 0x0000, 0x0000, 0x023a, 0x023e, 0x0000, + 0x2c67, 0x0000, 0x2c69, 0x0000, 0x2c6b, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x2c72, 0x0000, 0x0000, 0x2c75, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x2c80, 0x0000, 0x2c82, 0x0000, 0x2c84, 0x0000, 0x2c86, + 0x0000, 0x2c88, 0x0000, 0x2c8a, 0x0000, 0x2c8c, 0x0000, 0x2c8e, + 0x0000, 0x2c90, 0x0000, 0x2c92, 0x0000, 0x2c94, 0x0000, 0x2c96, + 0x0000, 0x2c98, 0x0000, 0x2c9a, 0x0000, 0x2c9c, 0x0000, 0x2c9e, + 0x0000, 0x2ca0, 0x0000, 0x2ca2, 0x0000, 0x2ca4, 0x0000, 0x2ca6, + 0x0000, 0x2ca8, 0x0000, 0x2caa, 0x0000, 0x2cac, 0x0000, 0x2cae, + 0x0000, 0x2cb0, 0x0000, 0x2cb2, 0x0000, 0x2cb4, 0x0000, 0x2cb6, + 0x0000, 0x2cb8, 0x0000, 0x2cba, 0x0000, 0x2cbc, 0x0000, 0x2cbe, + 0x0000, 0x2cc0, 0x0000, 0x2cc2, 0x0000, 0x2cc4, 0x0000, 0x2cc6, + 0x0000, 0x2cc8, 0x0000, 0x2cca, 0x0000, 0x2ccc, 0x0000, 0x2cce, + 0x0000, 0x2cd0, 0x0000, 0x2cd2, 0x0000, 0x2cd4, 0x0000, 0x2cd6, + 0x0000, 0x2cd8, 0x0000, 0x2cda, 0x0000, 0x2cdc, 0x0000, 0x2cde, + 0x0000, 0x2ce0, 0x0000, 0x2ce2, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_2d[256] = { + 0x10a0, 0x10a1, 0x10a2, 0x10a3, 0x10a4, 0x10a5, 0x10a6, 0x10a7, + 0x10a8, 0x10a9, 0x10aa, 0x10ab, 0x10ac, 0x10ad, 0x10ae, 0x10af, + 0x10b0, 0x10b1, 0x10b2, 0x10b3, 0x10b4, 0x10b5, 0x10b6, 0x10b7, + 0x10b8, 0x10b9, 0x10ba, 0x10bb, 0x10bc, 0x10bd, 0x10be, 0x10bf, + 0x10c0, 0x10c1, 0x10c2, 0x10c3, 0x10c4, 0x10c5, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_a6[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0xa640, 0x0000, 0xa642, 0x0000, 0xa644, 0x0000, 0xa646, + 0x0000, 0xa648, 0x0000, 0xa64a, 0x0000, 0xa64c, 0x0000, 0xa64e, + 0x0000, 0xa650, 0x0000, 0xa652, 0x0000, 0xa654, 0x0000, 0xa656, + 0x0000, 0xa658, 0x0000, 0xa65a, 0x0000, 0xa65c, 0x0000, 0xa65e, + 0x0000, 0x0000, 0x0000, 0xa662, 0x0000, 0xa664, 0x0000, 0xa666, + 0x0000, 0xa668, 0x0000, 0xa66a, 0x0000, 0xa66c, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0xa680, 0x0000, 0xa682, 0x0000, 0xa684, 0x0000, 0xa686, + 0x0000, 0xa688, 0x0000, 0xa68a, 0x0000, 0xa68c, 0x0000, 0xa68e, + 0x0000, 0xa690, 0x0000, 0xa692, 0x0000, 0xa694, 0x0000, 0xa696, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_a7[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0xa722, 0x0000, 0xa724, 0x0000, 0xa726, + 0x0000, 0xa728, 0x0000, 0xa72a, 0x0000, 0xa72c, 0x0000, 0xa72e, + 0x0000, 0x0000, 0x0000, 0xa732, 0x0000, 0xa734, 0x0000, 0xa736, + 0x0000, 0xa738, 0x0000, 0xa73a, 0x0000, 0xa73c, 0x0000, 0xa73e, + 0x0000, 0xa740, 0x0000, 0xa742, 0x0000, 0xa744, 0x0000, 0xa746, + 0x0000, 0xa748, 0x0000, 0xa74a, 0x0000, 0xa74c, 0x0000, 0xa74e, + 0x0000, 0xa750, 0x0000, 0xa752, 0x0000, 0xa754, 0x0000, 0xa756, + 0x0000, 0xa758, 0x0000, 0xa75a, 0x0000, 0xa75c, 0x0000, 0xa75e, + 0x0000, 0xa760, 0x0000, 0xa762, 0x0000, 0xa764, 0x0000, 0xa766, + 0x0000, 0xa768, 0x0000, 0xa76a, 0x0000, 0xa76c, 0x0000, 0xa76e, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0xa779, 0x0000, 0xa77b, 0x0000, 0x0000, 0xa77e, + 0x0000, 0xa780, 0x0000, 0xa782, 0x0000, 0xa784, 0x0000, 0xa786, + 0x0000, 0x0000, 0x0000, 0x0000, 0xa78b, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_ff[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0xff21, 0xff22, 0xff23, 0xff24, 0xff25, 0xff26, 0xff27, + 0xff28, 0xff29, 0xff2a, 0xff2b, 0xff2c, 0xff2d, 0xff2e, 0xff2f, + 0xff30, 0xff31, 0xff32, 0xff33, 0xff34, 0xff35, 0xff36, 0xff37, + 0xff38, 0xff39, 0xff3a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t *const toplevel[256] = { + t2_00, t2_01, t2_02, t2_03, t2_04, t2_05, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, t2_1d, t2_1e, t2_1f, + NULL, t2_21, NULL, NULL, t2_24, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, t2_2c, t2_2d, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, t2_a6, t2_a7, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, t2_ff, +}; + +/** + * cifs_toupper - convert a wchar_t from lower to uppercase + * @in: character to convert from lower to uppercase + * + * This function consults the static tables above to convert a wchar_t from + * lower to uppercase. In the event that there is no mapping, the original + * "in" character is returned. + */ +wchar_t +cifs_toupper(wchar_t in) +{ + unsigned char idx; + const wchar_t *tbl; + wchar_t out; + + /* grab upper byte */ + idx = (in & 0xff00) >> 8; + + /* find pointer to 2nd layer table */ + tbl = toplevel[idx]; + if (!tbl) + return in; + + /* grab lower byte */ + idx = in & 0xff; + + /* look up character in table */ + out = tbl[idx]; + if (out) + return out; + + return in; +} diff --git a/fs/coredump.c b/fs/coredump.c index 72f816d6cad9..9bdeca12ae0e 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -190,6 +190,11 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) err = cn_printf(cn, "%d", task_tgid_vnr(current)); break; + /* global pid */ + case 'P': + err = cn_printf(cn, "%d", + task_tgid_nr(current)); + break; /* uid */ case 'u': err = cn_printf(cn, "%d", cred->uid); diff --git a/fs/dcache.c b/fs/dcache.c index b949af850cd6..41000305d716 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -37,6 +37,7 @@ #include <linux/rculist_bl.h> #include <linux/prefetch.h> #include <linux/ratelimit.h> +#include <linux/list_lru.h> #include "internal.h" #include "mount.h" @@ -48,7 +49,7 @@ * - the dcache hash table * s_anon bl list spinlock protects: * - the s_anon list (see __d_drop) - * dcache_lru_lock protects: + * dentry->d_sb->s_dentry_lru_lock protects: * - the dcache lru lists and counters * d_lock protects: * - d_flags @@ -63,7 +64,7 @@ * Ordering: * dentry->d_inode->i_lock * dentry->d_lock - * dcache_lru_lock + * dentry->d_sb->s_dentry_lru_lock * dcache_hash_bucket lock * s_anon lock * @@ -81,13 +82,41 @@ int sysctl_vfs_cache_pressure __read_mostly = 100; EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock); __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); static struct kmem_cache *dentry_cache __read_mostly; +/** + * read_seqbegin_or_lock - begin a sequence number check or locking block + * @lock: sequence lock + * @seq : sequence number to be checked + * + * First try it once optimistically without taking the lock. If that fails, + * take the lock. The sequence number is also used as a marker for deciding + * whether to be a reader (even) or writer (odd). + * N.B. seq must be initialized to an even number to begin with. + */ +static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) +{ + if (!(*seq & 1)) /* Even */ + *seq = read_seqbegin(lock); + else /* Odd */ + read_seqlock_excl(lock); +} + +static inline int need_seqretry(seqlock_t *lock, int seq) +{ + return !(seq & 1) && read_seqretry(lock, seq); +} + +static inline void done_seqretry(seqlock_t *lock, int seq) +{ + if (seq & 1) + read_sequnlock_excl(lock); +} + /* * This is the single most critical data structure when it comes * to the dcache: the hashtable for lookups. Somebody should try @@ -117,23 +146,47 @@ struct dentry_stat_t dentry_stat = { .age_limit = 45, }; -static DEFINE_PER_CPU(unsigned int, nr_dentry); +static DEFINE_PER_CPU(long, nr_dentry); +static DEFINE_PER_CPU(long, nr_dentry_unused); #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) -static int get_nr_dentry(void) + +/* + * Here we resort to our own counters instead of using generic per-cpu counters + * for consistency with what the vfs inode code does. We are expected to harvest + * better code and performance by having our own specialized counters. + * + * Please note that the loop is done over all possible CPUs, not over all online + * CPUs. The reason for this is that we don't want to play games with CPUs going + * on and off. If one of them goes off, we will just keep their counters. + * + * glommer: See cffbc8a for details, and if you ever intend to change this, + * please update all vfs counters to match. + */ +static long get_nr_dentry(void) { int i; - int sum = 0; + long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry, i); return sum < 0 ? 0 : sum; } +static long get_nr_dentry_unused(void) +{ + int i; + long sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nr_dentry_unused, i); + return sum < 0 ? 0 : sum; +} + int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); - return proc_dointvec(table, write, buffer, lenp, ppos); + dentry_stat.nr_unused = get_nr_dentry_unused(); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif @@ -229,7 +282,7 @@ static void __d_free(struct rcu_head *head) */ static void d_free(struct dentry *dentry) { - BUG_ON(dentry->d_lockref.count); + BUG_ON((int)dentry->d_lockref.count > 0); this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); @@ -304,50 +357,96 @@ static void dentry_unlink_inode(struct dentry * dentry) } /* - * dentry_lru_(add|del|prune|move_tail) must be called with d_lock held. + * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry + * is in use - which includes both the "real" per-superblock + * LRU list _and_ the DCACHE_SHRINK_LIST use. + * + * The DCACHE_SHRINK_LIST bit is set whenever the dentry is + * on the shrink list (ie not on the superblock LRU list). + * + * The per-cpu "nr_dentry_unused" counters are updated with + * the DCACHE_LRU_LIST bit. + * + * These helper functions make sure we always follow the + * rules. d_lock must be held by the caller. */ -static void dentry_lru_add(struct dentry *dentry) +#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x)) +static void d_lru_add(struct dentry *dentry) { - if (list_empty(&dentry->d_lru)) { - spin_lock(&dcache_lru_lock); - list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); - dentry->d_sb->s_nr_dentry_unused++; - dentry_stat.nr_unused++; - spin_unlock(&dcache_lru_lock); - } + D_FLAG_VERIFY(dentry, 0); + dentry->d_flags |= DCACHE_LRU_LIST; + this_cpu_inc(nr_dentry_unused); + WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } -static void __dentry_lru_del(struct dentry *dentry) +static void d_lru_del(struct dentry *dentry) { + D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); + dentry->d_flags &= ~DCACHE_LRU_LIST; + this_cpu_dec(nr_dentry_unused); + WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); +} + +static void d_shrink_del(struct dentry *dentry) +{ + D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); list_del_init(&dentry->d_lru); - dentry->d_flags &= ~DCACHE_SHRINK_LIST; - dentry->d_sb->s_nr_dentry_unused--; - dentry_stat.nr_unused--; + dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); + this_cpu_dec(nr_dentry_unused); +} + +static void d_shrink_add(struct dentry *dentry, struct list_head *list) +{ + D_FLAG_VERIFY(dentry, 0); + list_add(&dentry->d_lru, list); + dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST; + this_cpu_inc(nr_dentry_unused); } /* - * Remove a dentry with references from the LRU. + * These can only be called under the global LRU lock, ie during the + * callback for freeing the LRU list. "isolate" removes it from the + * LRU lists entirely, while shrink_move moves it to the indicated + * private list. */ -static void dentry_lru_del(struct dentry *dentry) +static void d_lru_isolate(struct dentry *dentry) { - if (!list_empty(&dentry->d_lru)) { - spin_lock(&dcache_lru_lock); - __dentry_lru_del(dentry); - spin_unlock(&dcache_lru_lock); - } + D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); + dentry->d_flags &= ~DCACHE_LRU_LIST; + this_cpu_dec(nr_dentry_unused); + list_del_init(&dentry->d_lru); } -static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list) +static void d_lru_shrink_move(struct dentry *dentry, struct list_head *list) { - spin_lock(&dcache_lru_lock); - if (list_empty(&dentry->d_lru)) { - list_add_tail(&dentry->d_lru, list); - dentry->d_sb->s_nr_dentry_unused++; - dentry_stat.nr_unused++; - } else { - list_move_tail(&dentry->d_lru, list); + D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); + dentry->d_flags |= DCACHE_SHRINK_LIST; + list_move_tail(&dentry->d_lru, list); +} + +/* + * dentry_lru_(add|del)_list) must be called with d_lock held. + */ +static void dentry_lru_add(struct dentry *dentry) +{ + if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) + d_lru_add(dentry); +} + +/* + * Remove a dentry with references from the LRU. + * + * If we are on the shrink list, then we can get to try_prune_one_dentry() and + * lose our last reference through the parent walk. In this case, we need to + * remove ourselves from the shrink list, not the LRU. + */ +static void dentry_lru_del(struct dentry *dentry) +{ + if (dentry->d_flags & DCACHE_LRU_LIST) { + if (dentry->d_flags & DCACHE_SHRINK_LIST) + return d_shrink_del(dentry); + d_lru_del(dentry); } - spin_unlock(&dcache_lru_lock); } /** @@ -443,7 +542,8 @@ EXPORT_SYMBOL(d_drop); * If ref is non-zero, then decrement the refcount too. * Returns dentry requiring refcount drop, or NULL if we're done. */ -static inline struct dentry *dentry_kill(struct dentry *dentry, int ref) +static inline struct dentry * +dentry_kill(struct dentry *dentry, int unlock_on_failure) __releases(dentry->d_lock) { struct inode *inode; @@ -452,8 +552,10 @@ static inline struct dentry *dentry_kill(struct dentry *dentry, int ref) inode = dentry->d_inode; if (inode && !spin_trylock(&inode->i_lock)) { relock: - spin_unlock(&dentry->d_lock); - cpu_relax(); + if (unlock_on_failure) { + spin_unlock(&dentry->d_lock); + cpu_relax(); + } return dentry; /* try again with same dentry */ } if (IS_ROOT(dentry)) @@ -466,13 +568,16 @@ relock: goto relock; } - if (ref) - dentry->d_lockref.count--; + /* + * The dentry is now unrecoverably dead to the world. + */ + lockref_mark_dead(&dentry->d_lockref); + /* * inform the fs via d_prune that this dentry is about to be * unhashed and destroyed. */ - if (dentry->d_flags & DCACHE_OP_PRUNE) + if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry)) dentry->d_op->d_prune(dentry); dentry_lru_del(dentry); @@ -509,24 +614,22 @@ relock: */ void dput(struct dentry *dentry) { - if (!dentry) + if (unlikely(!dentry)) return; repeat: - if (dentry->d_lockref.count == 1) - might_sleep(); if (lockref_put_or_lock(&dentry->d_lockref)) return; - if (dentry->d_flags & DCACHE_OP_DELETE) { + /* Unreachable? Get rid of it */ + if (unlikely(d_unhashed(dentry))) + goto kill_it; + + if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) { if (dentry->d_op->d_delete(dentry)) goto kill_it; } - /* Unreachable? Get rid of it */ - if (d_unhashed(dentry)) - goto kill_it; - dentry->d_flags |= DCACHE_REFERENCED; dentry_lru_add(dentry); @@ -611,8 +714,23 @@ static inline void __dget(struct dentry *dentry) struct dentry *dget_parent(struct dentry *dentry) { + int gotref; struct dentry *ret; + /* + * Do optimistic parent lookup without any + * locking. + */ + rcu_read_lock(); + ret = ACCESS_ONCE(dentry->d_parent); + gotref = lockref_get_not_zero(&ret->d_lockref); + rcu_read_unlock(); + if (likely(gotref)) { + if (likely(ret == ACCESS_ONCE(dentry->d_parent))) + return ret; + dput(ret); + } + repeat: /* * Don't need rcu_dereference because we re-check it was correct under @@ -712,6 +830,14 @@ restart: hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { spin_lock(&dentry->d_lock); if (!dentry->d_lockref.count) { + /* + * inform the fs via d_prune that this dentry + * is about to be unhashed and destroyed. + */ + if ((dentry->d_flags & DCACHE_OP_PRUNE) && + !d_unhashed(dentry)) + dentry->d_op->d_prune(dentry); + __dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); @@ -732,7 +858,7 @@ EXPORT_SYMBOL(d_prune_aliases); * * This may fail if locks cannot be acquired no problem, just try again. */ -static void try_prune_one_dentry(struct dentry *dentry) +static struct dentry * try_prune_one_dentry(struct dentry *dentry) __releases(dentry->d_lock) { struct dentry *parent; @@ -749,17 +875,18 @@ static void try_prune_one_dentry(struct dentry *dentry) * fragmentation. */ if (!parent) - return; + return NULL; if (parent == dentry) - return; + return dentry; /* Prune ancestors. */ dentry = parent; while (dentry) { if (lockref_put_or_lock(&dentry->d_lockref)) - return; + return NULL; dentry = dentry_kill(dentry, 1); } + return NULL; } static void shrink_dentry_list(struct list_head *list) @@ -771,6 +898,12 @@ static void shrink_dentry_list(struct list_head *list) dentry = list_entry_rcu(list->prev, struct dentry, d_lru); if (&dentry->d_lru == list) break; /* empty */ + + /* + * Get the dentry lock, and re-verify that the dentry is + * this on the shrinking list. If it is, we know that + * DCACHE_SHRINK_LIST and DCACHE_LRU_LIST are set. + */ spin_lock(&dentry->d_lock); if (dentry != list_entry(list->prev, struct dentry, d_lru)) { spin_unlock(&dentry->d_lock); @@ -778,76 +911,146 @@ static void shrink_dentry_list(struct list_head *list) } /* + * The dispose list is isolated and dentries are not accounted + * to the LRU here, so we can simply remove it from the list + * here regardless of whether it is referenced or not. + */ + d_shrink_del(dentry); + + /* * We found an inuse dentry which was not removed from - * the LRU because of laziness during lookup. Do not free - * it - just keep it off the LRU list. + * the LRU because of laziness during lookup. Do not free it. */ if (dentry->d_lockref.count) { - dentry_lru_del(dentry); spin_unlock(&dentry->d_lock); continue; } - rcu_read_unlock(); - try_prune_one_dentry(dentry); + /* + * If 'try_to_prune()' returns a dentry, it will + * be the same one we passed in, and d_lock will + * have been held the whole time, so it will not + * have been added to any other lists. We failed + * to get the inode lock. + * + * We just add it back to the shrink list. + */ + dentry = try_prune_one_dentry(dentry); rcu_read_lock(); + if (dentry) { + d_shrink_add(dentry, list); + spin_unlock(&dentry->d_lock); + } } rcu_read_unlock(); } +static enum lru_status +dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) +{ + struct list_head *freeable = arg; + struct dentry *dentry = container_of(item, struct dentry, d_lru); + + + /* + * we are inverting the lru lock/dentry->d_lock here, + * so use a trylock. If we fail to get the lock, just skip + * it + */ + if (!spin_trylock(&dentry->d_lock)) + return LRU_SKIP; + + /* + * Referenced dentries are still in use. If they have active + * counts, just remove them from the LRU. Otherwise give them + * another pass through the LRU. + */ + if (dentry->d_lockref.count) { + d_lru_isolate(dentry); + spin_unlock(&dentry->d_lock); + return LRU_REMOVED; + } + + if (dentry->d_flags & DCACHE_REFERENCED) { + dentry->d_flags &= ~DCACHE_REFERENCED; + spin_unlock(&dentry->d_lock); + + /* + * The list move itself will be made by the common LRU code. At + * this point, we've dropped the dentry->d_lock but keep the + * lru lock. This is safe to do, since every list movement is + * protected by the lru lock even if both locks are held. + * + * This is guaranteed by the fact that all LRU management + * functions are intermediated by the LRU API calls like + * list_lru_add and list_lru_del. List movement in this file + * only ever occur through this functions or through callbacks + * like this one, that are called from the LRU API. + * + * The only exceptions to this are functions like + * shrink_dentry_list, and code that first checks for the + * DCACHE_SHRINK_LIST flag. Those are guaranteed to be + * operating only with stack provided lists after they are + * properly isolated from the main list. It is thus, always a + * local access. + */ + return LRU_ROTATE; + } + + d_lru_shrink_move(dentry, freeable); + spin_unlock(&dentry->d_lock); + + return LRU_REMOVED; +} + /** * prune_dcache_sb - shrink the dcache * @sb: superblock - * @count: number of entries to try to free + * @nr_to_scan : number of entries to try to free + * @nid: which node to scan for freeable entities * - * Attempt to shrink the superblock dcache LRU by @count entries. This is + * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is * done when we need more memory an called from the superblock shrinker * function. * * This function may fail to free any resources if all the dentries are in * use. */ -void prune_dcache_sb(struct super_block *sb, int count) +long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, + int nid) { - struct dentry *dentry; - LIST_HEAD(referenced); - LIST_HEAD(tmp); + LIST_HEAD(dispose); + long freed; -relock: - spin_lock(&dcache_lru_lock); - while (!list_empty(&sb->s_dentry_lru)) { - dentry = list_entry(sb->s_dentry_lru.prev, - struct dentry, d_lru); - BUG_ON(dentry->d_sb != sb); - - if (!spin_trylock(&dentry->d_lock)) { - spin_unlock(&dcache_lru_lock); - cpu_relax(); - goto relock; - } + freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate, + &dispose, &nr_to_scan); + shrink_dentry_list(&dispose); + return freed; +} - if (dentry->d_flags & DCACHE_REFERENCED) { - dentry->d_flags &= ~DCACHE_REFERENCED; - list_move(&dentry->d_lru, &referenced); - spin_unlock(&dentry->d_lock); - } else { - list_move_tail(&dentry->d_lru, &tmp); - dentry->d_flags |= DCACHE_SHRINK_LIST; - spin_unlock(&dentry->d_lock); - if (!--count) - break; - } - cond_resched_lock(&dcache_lru_lock); - } - if (!list_empty(&referenced)) - list_splice(&referenced, &sb->s_dentry_lru); - spin_unlock(&dcache_lru_lock); +static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, + spinlock_t *lru_lock, void *arg) +{ + struct list_head *freeable = arg; + struct dentry *dentry = container_of(item, struct dentry, d_lru); + + /* + * we are inverting the lru lock/dentry->d_lock here, + * so use a trylock. If we fail to get the lock, just skip + * it + */ + if (!spin_trylock(&dentry->d_lock)) + return LRU_SKIP; + + d_lru_shrink_move(dentry, freeable); + spin_unlock(&dentry->d_lock); - shrink_dentry_list(&tmp); + return LRU_REMOVED; } + /** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock @@ -857,16 +1060,17 @@ relock: */ void shrink_dcache_sb(struct super_block *sb) { - LIST_HEAD(tmp); + long freed; - spin_lock(&dcache_lru_lock); - while (!list_empty(&sb->s_dentry_lru)) { - list_splice_init(&sb->s_dentry_lru, &tmp); - spin_unlock(&dcache_lru_lock); - shrink_dentry_list(&tmp); - spin_lock(&dcache_lru_lock); - } - spin_unlock(&dcache_lru_lock); + do { + LIST_HEAD(dispose); + + freed = list_lru_walk(&sb->s_dentry_lru, + dentry_lru_isolate_shrink, &dispose, UINT_MAX); + + this_cpu_sub(nr_dentry_unused, freed); + shrink_dentry_list(&dispose); + } while (freed > 0); } EXPORT_SYMBOL(shrink_dcache_sb); @@ -896,7 +1100,8 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) * inform the fs that this dentry is about to be * unhashed and destroyed. */ - if (dentry->d_flags & DCACHE_OP_PRUNE) + if ((dentry->d_flags & DCACHE_OP_PRUNE) && + !d_unhashed(dentry)) dentry->d_op->d_prune(dentry); dentry_lru_del(dentry); @@ -985,7 +1190,7 @@ void shrink_dcache_for_umount(struct super_block *sb) * the parenthood after dropping the lock and check * that the sequence number still matches. */ -static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq) +static struct dentry *try_to_ascend(struct dentry *old, unsigned seq) { struct dentry *new = old->d_parent; @@ -999,7 +1204,7 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq */ if (new != old->d_parent || (old->d_flags & DCACHE_DENTRY_KILLED) || - (!locked && read_seqretry(&rename_lock, seq))) { + need_seqretry(&rename_lock, seq)) { spin_unlock(&new->d_lock); new = NULL; } @@ -1007,34 +1212,55 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq return new; } +/** + * enum d_walk_ret - action to talke during tree walk + * @D_WALK_CONTINUE: contrinue walk + * @D_WALK_QUIT: quit walk + * @D_WALK_NORETRY: quit when retry is needed + * @D_WALK_SKIP: skip this dentry and its children + */ +enum d_walk_ret { + D_WALK_CONTINUE, + D_WALK_QUIT, + D_WALK_NORETRY, + D_WALK_SKIP, +}; -/* - * Search for at least 1 mount point in the dentry's subdirs. - * We descend to the next level whenever the d_subdirs - * list is non-empty and continue searching. - */ - /** - * have_submounts - check for mounts over a dentry - * @parent: dentry to check. + * d_walk - walk the dentry tree + * @parent: start of walk + * @data: data passed to @enter() and @finish() + * @enter: callback when first entering the dentry + * @finish: callback when successfully finished the walk * - * Return true if the parent or its subdirectories contain - * a mount point + * The @enter() and @finish() callbacks are called with d_lock held. */ -int have_submounts(struct dentry *parent) +static void d_walk(struct dentry *parent, void *data, + enum d_walk_ret (*enter)(void *, struct dentry *), + void (*finish)(void *)) { struct dentry *this_parent; struct list_head *next; - unsigned seq; - int locked = 0; + unsigned seq = 0; + enum d_walk_ret ret; + bool retry = true; - seq = read_seqbegin(&rename_lock); again: + read_seqbegin_or_lock(&rename_lock, &seq); this_parent = parent; - - if (d_mountpoint(parent)) - goto positive; spin_lock(&this_parent->d_lock); + + ret = enter(data, this_parent); + switch (ret) { + case D_WALK_CONTINUE: + break; + case D_WALK_QUIT: + case D_WALK_SKIP: + goto out_unlock; + case D_WALK_NORETRY: + retry = false; + break; + } repeat: next = this_parent->d_subdirs.next; resume: @@ -1044,12 +1270,22 @@ resume: next = tmp->next; spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - /* Have we found a mount point ? */ - if (d_mountpoint(dentry)) { + + ret = enter(data, dentry); + switch (ret) { + case D_WALK_CONTINUE: + break; + case D_WALK_QUIT: spin_unlock(&dentry->d_lock); - spin_unlock(&this_parent->d_lock); - goto positive; + goto out_unlock; + case D_WALK_NORETRY: + retry = false; + break; + case D_WALK_SKIP: + spin_unlock(&dentry->d_lock); + continue; } + if (!list_empty(&dentry->d_subdirs)) { spin_unlock(&this_parent->d_lock); spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); @@ -1064,35 +1300,99 @@ resume: */ if (this_parent != parent) { struct dentry *child = this_parent; - this_parent = try_to_ascend(this_parent, locked, seq); + this_parent = try_to_ascend(this_parent, seq); if (!this_parent) goto rename_retry; next = child->d_u.d_child.next; goto resume; } - spin_unlock(&this_parent->d_lock); - if (!locked && read_seqretry(&rename_lock, seq)) + if (need_seqretry(&rename_lock, seq)) { + spin_unlock(&this_parent->d_lock); goto rename_retry; - if (locked) - write_sequnlock(&rename_lock); - return 0; /* No mount points found in tree */ -positive: - if (!locked && read_seqretry(&rename_lock, seq)) - goto rename_retry; - if (locked) - write_sequnlock(&rename_lock); - return 1; + } + if (finish) + finish(data); + +out_unlock: + spin_unlock(&this_parent->d_lock); + done_seqretry(&rename_lock, seq); + return; rename_retry: - if (locked) - goto again; - locked = 1; - write_seqlock(&rename_lock); + if (!retry) + return; + seq = 1; goto again; } + +/* + * Search for at least 1 mount point in the dentry's subdirs. + * We descend to the next level whenever the d_subdirs + * list is non-empty and continue searching. + */ + +/** + * have_submounts - check for mounts over a dentry + * @parent: dentry to check. + * + * Return true if the parent or its subdirectories contain + * a mount point + */ + +static enum d_walk_ret check_mount(void *data, struct dentry *dentry) +{ + int *ret = data; + if (d_mountpoint(dentry)) { + *ret = 1; + return D_WALK_QUIT; + } + return D_WALK_CONTINUE; +} + +int have_submounts(struct dentry *parent) +{ + int ret = 0; + + d_walk(parent, &ret, check_mount, NULL); + + return ret; +} EXPORT_SYMBOL(have_submounts); /* + * Called by mount code to set a mountpoint and check if the mountpoint is + * reachable (e.g. NFS can unhash a directory dentry and then the complete + * subtree can become unreachable). + * + * Only one of check_submounts_and_drop() and d_set_mounted() must succeed. For + * this reason take rename_lock and d_lock on dentry and ancestors. + */ +int d_set_mounted(struct dentry *dentry) +{ + struct dentry *p; + int ret = -ENOENT; + write_seqlock(&rename_lock); + for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) { + /* Need exclusion wrt. check_submounts_and_drop() */ + spin_lock(&p->d_lock); + if (unlikely(d_unhashed(p))) { + spin_unlock(&p->d_lock); + goto out; + } + spin_unlock(&p->d_lock); + } + spin_lock(&dentry->d_lock); + if (!d_unlinked(dentry)) { + dentry->d_flags |= DCACHE_MOUNTED; + ret = 0; + } + spin_unlock(&dentry->d_lock); +out: + write_sequnlock(&rename_lock); + return ret; +} + +/* * Search the dentry child list of the specified parent, * and move any unused dentries to the end of the unused * list for prune_dcache(). We descend to the next level @@ -1106,93 +1406,51 @@ EXPORT_SYMBOL(have_submounts); * drop the lock and return early due to latency * constraints. */ -static int select_parent(struct dentry *parent, struct list_head *dispose) -{ - struct dentry *this_parent; - struct list_head *next; - unsigned seq; - int found = 0; - int locked = 0; - seq = read_seqbegin(&rename_lock); -again: - this_parent = parent; - spin_lock(&this_parent->d_lock); -repeat: - next = this_parent->d_subdirs.next; -resume: - while (next != &this_parent->d_subdirs) { - struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); - next = tmp->next; +struct select_data { + struct dentry *start; + struct list_head dispose; + int found; +}; - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); +static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) +{ + struct select_data *data = _data; + enum d_walk_ret ret = D_WALK_CONTINUE; - /* - * move only zero ref count dentries to the dispose list. - * - * Those which are presently on the shrink list, being processed - * by shrink_dentry_list(), shouldn't be moved. Otherwise the - * loop in shrink_dcache_parent() might not make any progress - * and loop forever. - */ - if (dentry->d_lockref.count) { - dentry_lru_del(dentry); - } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { - dentry_lru_move_list(dentry, dispose); - dentry->d_flags |= DCACHE_SHRINK_LIST; - found++; - } - /* - * We can return to the caller if we have found some (this - * ensures forward progress). We'll be coming back to find - * the rest. - */ - if (found && need_resched()) { - spin_unlock(&dentry->d_lock); - goto out; - } + if (data->start == dentry) + goto out; + /* + * move only zero ref count dentries to the dispose list. + * + * Those which are presently on the shrink list, being processed + * by shrink_dentry_list(), shouldn't be moved. Otherwise the + * loop in shrink_dcache_parent() might not make any progress + * and loop forever. + */ + if (dentry->d_lockref.count) { + dentry_lru_del(dentry); + } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { /* - * Descend a level if the d_subdirs list is non-empty. + * We can't use d_lru_shrink_move() because we + * need to get the global LRU lock and do the + * LRU accounting. */ - if (!list_empty(&dentry->d_subdirs)) { - spin_unlock(&this_parent->d_lock); - spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); - this_parent = dentry; - spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); - goto repeat; - } - - spin_unlock(&dentry->d_lock); + d_lru_del(dentry); + d_shrink_add(dentry, &data->dispose); + data->found++; + ret = D_WALK_NORETRY; } /* - * All done at this level ... ascend and resume the search. + * We can return to the caller if we have found some (this + * ensures forward progress). We'll be coming back to find + * the rest. */ - if (this_parent != parent) { - struct dentry *child = this_parent; - this_parent = try_to_ascend(this_parent, locked, seq); - if (!this_parent) - goto rename_retry; - next = child->d_u.d_child.next; - goto resume; - } + if (data->found && need_resched()) + ret = D_WALK_QUIT; out: - spin_unlock(&this_parent->d_lock); - if (!locked && read_seqretry(&rename_lock, seq)) - goto rename_retry; - if (locked) - write_sequnlock(&rename_lock); - return found; - -rename_retry: - if (found) - return found; - if (locked) - goto again; - locked = 1; - write_seqlock(&rename_lock); - goto again; + return ret; } /** @@ -1201,18 +1459,90 @@ rename_retry: * * Prune the dcache to remove unused children of the parent dentry. */ -void shrink_dcache_parent(struct dentry * parent) +void shrink_dcache_parent(struct dentry *parent) { - LIST_HEAD(dispose); - int found; + for (;;) { + struct select_data data; - while ((found = select_parent(parent, &dispose)) != 0) { - shrink_dentry_list(&dispose); + INIT_LIST_HEAD(&data.dispose); + data.start = parent; + data.found = 0; + + d_walk(parent, &data, select_collect, NULL); + if (!data.found) + break; + + shrink_dentry_list(&data.dispose); cond_resched(); } } EXPORT_SYMBOL(shrink_dcache_parent); +static enum d_walk_ret check_and_collect(void *_data, struct dentry *dentry) +{ + struct select_data *data = _data; + + if (d_mountpoint(dentry)) { + data->found = -EBUSY; + return D_WALK_QUIT; + } + + return select_collect(_data, dentry); +} + +static void check_and_drop(void *_data) +{ + struct select_data *data = _data; + + if (d_mountpoint(data->start)) + data->found = -EBUSY; + if (!data->found) + __d_drop(data->start); +} + +/** + * check_submounts_and_drop - prune dcache, check for submounts and drop + * + * All done as a single atomic operation relative to has_unlinked_ancestor(). + * Returns 0 if successfully unhashed @parent. If there were submounts then + * return -EBUSY. + * + * @dentry: dentry to prune and drop + */ +int check_submounts_and_drop(struct dentry *dentry) +{ + int ret = 0; + + /* Negative dentries can be dropped without further checks */ + if (!dentry->d_inode) { + d_drop(dentry); + goto out; + } + + for (;;) { + struct select_data data; + + INIT_LIST_HEAD(&data.dispose); + data.start = dentry; + data.found = 0; + + d_walk(dentry, &data, check_and_collect, check_and_drop); + ret = data.found; + + if (!list_empty(&data.dispose)) + shrink_dentry_list(&data.dispose); + + if (ret <= 0) + break; + + cond_resched(); + } + +out: + return ret; +} +EXPORT_SYMBOL(check_submounts_and_drop); + /** * __d_alloc - allocate a dcache entry * @sb: filesystem it will belong to @@ -1771,7 +2101,7 @@ static noinline enum slow_d_compare slow_dentry_cmp( * without taking d_lock and checking d_seq sequence count against @seq * returned here. * - * A refcount may be taken on the found dentry with the __d_rcu_to_refcount + * A refcount may be taken on the found dentry with the d_rcu_to_refcount * function. * * Alternatively, __d_lookup_rcu may be called again to look up the child of @@ -2495,9 +2825,39 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen) return 0; } +/** + * prepend_name - prepend a pathname in front of current buffer pointer + * @buffer: buffer pointer + * @buflen: allocated length of the buffer + * @name: name string and length qstr structure + * + * With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to + * make sure that either the old or the new name pointer and length are + * fetched. However, there may be mismatch between length and pointer. + * The length cannot be trusted, we need to copy it byte-by-byte until + * the length is reached or a null byte is found. It also prepends "/" at + * the beginning of the name. The sequence number check at the caller will + * retry it again when a d_move() does happen. So any garbage in the buffer + * due to mismatched pointer and length will be discarded. + */ static int prepend_name(char **buffer, int *buflen, struct qstr *name) { - return prepend(buffer, buflen, name->name, name->len); + const char *dname = ACCESS_ONCE(name->name); + u32 dlen = ACCESS_ONCE(name->len); + char *p; + + if (*buflen < dlen + 1) + return -ENAMETOOLONG; + *buflen -= dlen + 1; + p = *buffer -= dlen + 1; + *p++ = '/'; + while (dlen--) { + char c = *dname++; + if (!c) + break; + *p++ = c; + } + return 0; } /** @@ -2507,7 +2867,15 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) * @buffer: pointer to the end of the buffer * @buflen: pointer to buffer length * - * Caller holds the rename_lock. + * The function will first try to write out the pathname without taking any + * lock other than the RCU read lock to make sure that dentries won't go away. + * It only checks the sequence number of the global rename_lock as any change + * in the dentry's d_seq will be preceded by changes in the rename_lock + * sequence number. If the sequence number had been changed, it will restart + * the whole pathname back-tracing sequence again by taking the rename_lock. + * In this case, there is no need to take the RCU read lock as the recursive + * parent pointer references will keep the dentry chain alive as long as no + * rename operation is performed. */ static int prepend_path(const struct path *path, const struct path *root, @@ -2516,54 +2884,66 @@ static int prepend_path(const struct path *path, struct dentry *dentry = path->dentry; struct vfsmount *vfsmnt = path->mnt; struct mount *mnt = real_mount(vfsmnt); - bool slash = false; int error = 0; + unsigned seq = 0; + char *bptr; + int blen; + rcu_read_lock(); +restart: + bptr = *buffer; + blen = *buflen; + read_seqbegin_or_lock(&rename_lock, &seq); while (dentry != root->dentry || vfsmnt != root->mnt) { struct dentry * parent; if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { /* Global root? */ - if (!mnt_has_parent(mnt)) - goto global_root; - dentry = mnt->mnt_mountpoint; - mnt = mnt->mnt_parent; - vfsmnt = &mnt->mnt; - continue; + if (mnt_has_parent(mnt)) { + dentry = mnt->mnt_mountpoint; + mnt = mnt->mnt_parent; + vfsmnt = &mnt->mnt; + continue; + } + /* + * Filesystems needing to implement special "root names" + * should do so with ->d_dname() + */ + if (IS_ROOT(dentry) && + (dentry->d_name.len != 1 || + dentry->d_name.name[0] != '/')) { + WARN(1, "Root dentry has weird name <%.*s>\n", + (int) dentry->d_name.len, + dentry->d_name.name); + } + if (!error) + error = is_mounted(vfsmnt) ? 1 : 2; + break; } parent = dentry->d_parent; prefetch(parent); - spin_lock(&dentry->d_lock); - error = prepend_name(buffer, buflen, &dentry->d_name); - spin_unlock(&dentry->d_lock); - if (!error) - error = prepend(buffer, buflen, "/", 1); + error = prepend_name(&bptr, &blen, &dentry->d_name); if (error) break; - slash = true; dentry = parent; } + if (!(seq & 1)) + rcu_read_unlock(); + if (need_seqretry(&rename_lock, seq)) { + seq = 1; + goto restart; + } + done_seqretry(&rename_lock, seq); - if (!error && !slash) - error = prepend(buffer, buflen, "/", 1); - - return error; - -global_root: - /* - * Filesystems needing to implement special "root names" - * should do so with ->d_dname() - */ - if (IS_ROOT(dentry) && - (dentry->d_name.len != 1 || dentry->d_name.name[0] != '/')) { - WARN(1, "Root dentry has weird name <%.*s>\n", - (int) dentry->d_name.len, dentry->d_name.name); - } - if (!slash) - error = prepend(buffer, buflen, "/", 1); - if (!error) - error = is_mounted(vfsmnt) ? 1 : 2; + if (error >= 0 && bptr == *buffer) { + if (--blen < 0) + error = -ENAMETOOLONG; + else + *--bptr = '/'; + } + *buffer = bptr; + *buflen = blen; return error; } @@ -2592,9 +2972,7 @@ char *__d_path(const struct path *path, prepend(&res, &buflen, "\0", 1); br_read_lock(&vfsmount_lock); - write_seqlock(&rename_lock); error = prepend_path(path, root, &res, &buflen); - write_sequnlock(&rename_lock); br_read_unlock(&vfsmount_lock); if (error < 0) @@ -2613,9 +2991,7 @@ char *d_absolute_path(const struct path *path, prepend(&res, &buflen, "\0", 1); br_read_lock(&vfsmount_lock); - write_seqlock(&rename_lock); error = prepend_path(path, &root, &res, &buflen); - write_sequnlock(&rename_lock); br_read_unlock(&vfsmount_lock); if (error > 1) @@ -2647,6 +3023,16 @@ static int prepend_unreachable(char **buffer, int *buflen) return prepend(buffer, buflen, "(unreachable)", 13); } +static void get_fs_root_rcu(struct fs_struct *fs, struct path *root) +{ + unsigned seq; + + do { + seq = read_seqcount_begin(&fs->seq); + *root = fs->root; + } while (read_seqcount_retry(&fs->seq, seq)); +} + /** * d_path - return the path of a dentry * @path: path to report @@ -2679,15 +3065,15 @@ char *d_path(const struct path *path, char *buf, int buflen) if (path->dentry->d_op && path->dentry->d_op->d_dname) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); - get_fs_root(current->fs, &root); + rcu_read_lock(); + get_fs_root_rcu(current->fs, &root); br_read_lock(&vfsmount_lock); - write_seqlock(&rename_lock); error = path_with_deleted(path, &root, &res, &buflen); - write_sequnlock(&rename_lock); br_read_unlock(&vfsmount_lock); + rcu_read_unlock(); + if (error < 0) res = ERR_PTR(error); - path_put(&root); return res; } EXPORT_SYMBOL(d_path); @@ -2718,10 +3104,10 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen) char *end = buffer + buflen; /* these dentries are never renamed, so d_lock is not needed */ if (prepend(&end, &buflen, " (deleted)", 11) || - prepend_name(&end, &buflen, &dentry->d_name) || + prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len) || prepend(&end, &buflen, "/", 1)) end = ERR_PTR(-ENAMETOOLONG); - return end; + return end; } /* @@ -2729,30 +3115,42 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen) */ static char *__dentry_path(struct dentry *dentry, char *buf, int buflen) { - char *end = buf + buflen; - char *retval; + char *end, *retval; + int len, seq = 0; + int error = 0; - prepend(&end, &buflen, "\0", 1); + rcu_read_lock(); +restart: + end = buf + buflen; + len = buflen; + prepend(&end, &len, "\0", 1); if (buflen < 1) goto Elong; /* Get '/' right */ retval = end-1; *retval = '/'; - + read_seqbegin_or_lock(&rename_lock, &seq); while (!IS_ROOT(dentry)) { struct dentry *parent = dentry->d_parent; int error; prefetch(parent); - spin_lock(&dentry->d_lock); - error = prepend_name(&end, &buflen, &dentry->d_name); - spin_unlock(&dentry->d_lock); - if (error != 0 || prepend(&end, &buflen, "/", 1) != 0) - goto Elong; + error = prepend_name(&end, &len, &dentry->d_name); + if (error) + break; retval = end; dentry = parent; } + if (!(seq & 1)) + rcu_read_unlock(); + if (need_seqretry(&rename_lock, seq)) { + seq = 1; + goto restart; + } + done_seqretry(&rename_lock, seq); + if (error) + goto Elong; return retval; Elong: return ERR_PTR(-ENAMETOOLONG); @@ -2760,13 +3158,7 @@ Elong: char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen) { - char *retval; - - write_seqlock(&rename_lock); - retval = __dentry_path(dentry, buf, buflen); - write_sequnlock(&rename_lock); - - return retval; + return __dentry_path(dentry, buf, buflen); } EXPORT_SYMBOL(dentry_path_raw); @@ -2775,7 +3167,6 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen) char *p = NULL; char *retval; - write_seqlock(&rename_lock); if (d_unlinked(dentry)) { p = buf + buflen; if (prepend(&p, &buflen, "//deleted", 10) != 0) @@ -2783,7 +3174,6 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen) buflen++; } retval = __dentry_path(dentry, buf, buflen); - write_sequnlock(&rename_lock); if (!IS_ERR(retval) && p) *p = '/'; /* restore '/' overriden with '\0' */ return retval; @@ -2791,6 +3181,18 @@ Elong: return ERR_PTR(-ENAMETOOLONG); } +static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root, + struct path *pwd) +{ + unsigned seq; + + do { + seq = read_seqcount_begin(&fs->seq); + *root = fs->root; + *pwd = fs->pwd; + } while (read_seqcount_retry(&fs->seq, seq)); +} + /* * NOTE! The user-level library version returns a * character pointer. The kernel system call just @@ -2813,25 +3215,25 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) { int error; struct path pwd, root; - char *page = (char *) __get_free_page(GFP_USER); + char *page = __getname(); if (!page) return -ENOMEM; - get_fs_root_and_pwd(current->fs, &root, &pwd); + rcu_read_lock(); + get_fs_root_and_pwd_rcu(current->fs, &root, &pwd); error = -ENOENT; br_read_lock(&vfsmount_lock); - write_seqlock(&rename_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; - char *cwd = page + PAGE_SIZE; - int buflen = PAGE_SIZE; + char *cwd = page + PATH_MAX; + int buflen = PATH_MAX; prepend(&cwd, &buflen, "\0", 1); error = prepend_path(&pwd, &root, &cwd, &buflen); - write_sequnlock(&rename_lock); br_read_unlock(&vfsmount_lock); + rcu_read_unlock(); if (error < 0) goto out; @@ -2844,21 +3246,19 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) } error = -ERANGE; - len = PAGE_SIZE + page - cwd; + len = PATH_MAX + page - cwd; if (len <= size) { error = len; if (copy_to_user(buf, cwd, len)) error = -EFAULT; } } else { - write_sequnlock(&rename_lock); br_read_unlock(&vfsmount_lock); + rcu_read_unlock(); } out: - path_put(&pwd); - path_put(&root); - free_page((unsigned long) page); + __putname(page); return error; } @@ -2904,68 +3304,24 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) return result; } -void d_genocide(struct dentry *root) +static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) { - struct dentry *this_parent; - struct list_head *next; - unsigned seq; - int locked = 0; + struct dentry *root = data; + if (dentry != root) { + if (d_unhashed(dentry) || !dentry->d_inode) + return D_WALK_SKIP; - seq = read_seqbegin(&rename_lock); -again: - this_parent = root; - spin_lock(&this_parent->d_lock); -repeat: - next = this_parent->d_subdirs.next; -resume: - while (next != &this_parent->d_subdirs) { - struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); - next = tmp->next; - - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (d_unhashed(dentry) || !dentry->d_inode) { - spin_unlock(&dentry->d_lock); - continue; - } - if (!list_empty(&dentry->d_subdirs)) { - spin_unlock(&this_parent->d_lock); - spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); - this_parent = dentry; - spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); - goto repeat; - } if (!(dentry->d_flags & DCACHE_GENOCIDE)) { dentry->d_flags |= DCACHE_GENOCIDE; dentry->d_lockref.count--; } - spin_unlock(&dentry->d_lock); } - if (this_parent != root) { - struct dentry *child = this_parent; - if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { - this_parent->d_flags |= DCACHE_GENOCIDE; - this_parent->d_lockref.count--; - } - this_parent = try_to_ascend(this_parent, locked, seq); - if (!this_parent) - goto rename_retry; - next = child->d_u.d_child.next; - goto resume; - } - spin_unlock(&this_parent->d_lock); - if (!locked && read_seqretry(&rename_lock, seq)) - goto rename_retry; - if (locked) - write_sequnlock(&rename_lock); - return; + return D_WALK_CONTINUE; +} -rename_retry: - if (locked) - goto again; - locked = 1; - write_seqlock(&rename_lock); - goto again; +void d_genocide(struct dentry *parent) +{ + d_walk(parent, parent, d_genocide_kill, NULL); } void d_tmpfile(struct dentry *dentry, struct inode *inode) diff --git a/fs/direct-io.c b/fs/direct-io.c index 7ab90f5081ee..0e04142d5962 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -127,6 +127,7 @@ struct dio { spinlock_t bio_lock; /* protects BIO fields below */ int page_errors; /* errno from get_user_pages() */ int is_async; /* is IO async ? */ + bool defer_completion; /* defer AIO completion to workqueue? */ int io_error; /* IO error in completion path */ unsigned long refcount; /* direct_io_worker() and bios */ struct bio *bio_list; /* singly linked via bi_private */ @@ -141,7 +142,10 @@ struct dio { * allocation time. Don't add new fields after pages[] unless you * wish that they not be zeroed. */ - struct page *pages[DIO_PAGES]; /* page buffer */ + union { + struct page *pages[DIO_PAGES]; /* page buffer */ + struct work_struct complete_work;/* deferred AIO completion */ + }; } ____cacheline_aligned_in_smp; static struct kmem_cache *dio_cache __read_mostly; @@ -221,16 +225,16 @@ static inline struct page *dio_get_page(struct dio *dio, * dio_complete() - called when all DIO BIO I/O has been completed * @offset: the byte offset in the file of the completed operation * - * This releases locks as dictated by the locking type, lets interested parties - * know that a DIO operation has completed, and calculates the resulting return - * code for the operation. + * This drops i_dio_count, lets interested parties know that a DIO operation + * has completed, and calculates the resulting return code for the operation. * * It lets the filesystem know if it registered an interest earlier via * get_block. Pass the private field of the map buffer_head so that * filesystems can use it to hold additional state between get_block calls and * dio_complete. */ -static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async) +static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, + bool is_async) { ssize_t transferred = 0; @@ -258,19 +262,36 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is if (ret == 0) ret = transferred; - if (dio->end_io && dio->result) { - dio->end_io(dio->iocb, offset, transferred, - dio->private, ret, is_async); - } else { - inode_dio_done(dio->inode); - if (is_async) - aio_complete(dio->iocb, ret, 0); + if (dio->end_io && dio->result) + dio->end_io(dio->iocb, offset, transferred, dio->private); + + inode_dio_done(dio->inode); + if (is_async) { + if (dio->rw & WRITE) { + int err; + + err = generic_write_sync(dio->iocb->ki_filp, offset, + transferred); + if (err < 0 && ret > 0) + ret = err; + } + + aio_complete(dio->iocb, ret, 0); } + kmem_cache_free(dio_cache, dio); return ret; } +static void dio_aio_complete_work(struct work_struct *work) +{ + struct dio *dio = container_of(work, struct dio, complete_work); + + dio_complete(dio, dio->iocb->ki_pos, 0, true); +} + static int dio_bio_complete(struct dio *dio, struct bio *bio); + /* * Asynchronous IO callback. */ @@ -290,8 +311,13 @@ static void dio_bio_end_aio(struct bio *bio, int error) spin_unlock_irqrestore(&dio->bio_lock, flags); if (remaining == 0) { - dio_complete(dio, dio->iocb->ki_pos, 0, true); - kmem_cache_free(dio_cache, dio); + if (dio->result && dio->defer_completion) { + INIT_WORK(&dio->complete_work, dio_aio_complete_work); + queue_work(dio->inode->i_sb->s_dio_done_wq, + &dio->complete_work); + } else { + dio_complete(dio, dio->iocb->ki_pos, 0, true); + } } } @@ -511,6 +537,42 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) } /* + * Create workqueue for deferred direct IO completions. We allocate the + * workqueue when it's first needed. This avoids creating workqueue for + * filesystems that don't need it and also allows us to create the workqueue + * late enough so the we can include s_id in the name of the workqueue. + */ +static int sb_init_dio_done_wq(struct super_block *sb) +{ + struct workqueue_struct *old; + struct workqueue_struct *wq = alloc_workqueue("dio/%s", + WQ_MEM_RECLAIM, 0, + sb->s_id); + if (!wq) + return -ENOMEM; + /* + * This has to be atomic as more DIOs can race to create the workqueue + */ + old = cmpxchg(&sb->s_dio_done_wq, NULL, wq); + /* Someone created workqueue before us? Free ours... */ + if (old) + destroy_workqueue(wq); + return 0; +} + +static int dio_set_defer_completion(struct dio *dio) +{ + struct super_block *sb = dio->inode->i_sb; + + if (dio->defer_completion) + return 0; + dio->defer_completion = true; + if (!sb->s_dio_done_wq) + return sb_init_dio_done_wq(sb); + return 0; +} + +/* * Call into the fs to map some more disk blocks. We record the current number * of available blocks at sdio->blocks_available. These are in units of the * fs blocksize, (1 << inode->i_blkbits). @@ -581,6 +643,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, /* Store for completion */ dio->private = map_bh->b_private; + + if (ret == 0 && buffer_defer_completion(map_bh)) + ret = dio_set_defer_completion(dio); } return ret; } @@ -1129,11 +1194,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, } /* - * Will be decremented at I/O completion time. - */ - atomic_inc(&inode->i_dio_count); - - /* * For file extending writes updating i_size before data * writeouts complete can expose uninitialized blocks. So * even for AIO, we need to wait for i/o to complete before @@ -1141,11 +1201,33 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, */ dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && (end > i_size_read(inode))); - - retval = 0; - dio->inode = inode; dio->rw = rw; + + /* + * For AIO O_(D)SYNC writes we need to defer completions to a workqueue + * so that we can call ->fsync. + */ + if (dio->is_async && (rw & WRITE) && + ((iocb->ki_filp->f_flags & O_DSYNC) || + IS_SYNC(iocb->ki_filp->f_mapping->host))) { + retval = dio_set_defer_completion(dio); + if (retval) { + /* + * We grab i_mutex only for reads so we don't have + * to release it here + */ + kmem_cache_free(dio_cache, dio); + goto out; + } + } + + /* + * Will be decremented at I/O completion time. + */ + atomic_inc(&inode->i_dio_count); + + retval = 0; sdio.blkbits = blkbits; sdio.blkfactor = i_blkbits - blkbits; sdio.block_in_file = offset >> blkbits; @@ -1269,7 +1351,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (drop_refcount(dio) == 0) { retval = dio_complete(dio, offset, retval, false); - kmem_cache_free(dio_cache, dio); } else BUG_ON(retval != -EIOCBQUEUED); diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 27a6ba9aaeec..0e90f0c91b93 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -267,10 +267,7 @@ void dlm_callback_work(struct work_struct *work) int dlm_callback_start(struct dlm_ls *ls) { ls->ls_callback_wq = alloc_workqueue("dlm_callback", - WQ_UNBOUND | - WQ_MEM_RECLAIM | - WQ_NON_REENTRANT, - 0); + WQ_UNBOUND | WQ_MEM_RECLAIM, 0); if (!ls->ls_callback_wq) { log_print("can't start dlm_callback workqueue"); return -ENOMEM; diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 812149119fa3..142e21655eed 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -493,7 +493,6 @@ static ssize_t device_write(struct file *file, const char __user *buf, { struct dlm_user_proc *proc = file->private_data; struct dlm_write_request *kbuf; - sigset_t tmpsig, allsigs; int error; #ifdef CONFIG_COMPAT @@ -557,9 +556,6 @@ static ssize_t device_write(struct file *file, const char __user *buf, goto out_free; } - sigfillset(&allsigs); - sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); - error = -EINVAL; switch (kbuf->cmd) @@ -567,7 +563,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, case DLM_USER_LOCK: if (!proc) { log_print("no locking on control device"); - goto out_sig; + goto out_free; } error = device_user_lock(proc, &kbuf->i.lock); break; @@ -575,7 +571,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, case DLM_USER_UNLOCK: if (!proc) { log_print("no locking on control device"); - goto out_sig; + goto out_free; } error = device_user_unlock(proc, &kbuf->i.lock); break; @@ -583,7 +579,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, case DLM_USER_DEADLOCK: if (!proc) { log_print("no locking on control device"); - goto out_sig; + goto out_free; } error = device_user_deadlock(proc, &kbuf->i.lock); break; @@ -591,7 +587,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, case DLM_USER_CREATE_LOCKSPACE: if (proc) { log_print("create/remove only on control device"); - goto out_sig; + goto out_free; } error = device_create_lockspace(&kbuf->i.lspace); break; @@ -599,7 +595,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, case DLM_USER_REMOVE_LOCKSPACE: if (proc) { log_print("create/remove only on control device"); - goto out_sig; + goto out_free; } error = device_remove_lockspace(&kbuf->i.lspace); break; @@ -607,7 +603,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, case DLM_USER_PURGE: if (!proc) { log_print("no locking on control device"); - goto out_sig; + goto out_free; } error = device_user_purge(proc, &kbuf->i.purge); break; @@ -617,8 +613,6 @@ static ssize_t device_write(struct file *file, const char __user *buf, kbuf->cmd); } - out_sig: - sigprocmask(SIG_SETMASK, &tmpsig, NULL); out_free: kfree(kbuf); return error; @@ -659,15 +653,11 @@ static int device_close(struct inode *inode, struct file *file) { struct dlm_user_proc *proc = file->private_data; struct dlm_ls *ls; - sigset_t tmpsig, allsigs; ls = dlm_find_lockspace_local(proc->lockspace); if (!ls) return -ENOENT; - sigfillset(&allsigs); - sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); - set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags); dlm_clear_proc_locks(ls, proc); @@ -685,8 +675,6 @@ static int device_close(struct inode *inode, struct file *file) /* FIXME: AUTOFREE: if this ls is no longer used do device_remove_lockspace() */ - sigprocmask(SIG_SETMASK, &tmpsig, NULL); - return 0; } diff --git a/fs/drop_caches.c b/fs/drop_caches.c index c00e055b6282..9fd702f5bfb2 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -44,6 +44,7 @@ static void drop_slab(void) .gfp_mask = GFP_KERNEL, }; + nodes_setall(shrink.nodes_to_scan); do { nr_objects = shrink_slab(&shrink, 1000, 1000); } while (nr_objects > 10); diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index d10757635b9c..c88e355f7635 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -609,39 +609,35 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat) char *full_alg_name; int rc = -EINVAL; - if (!crypt_stat->cipher) { - ecryptfs_printk(KERN_ERR, "No cipher specified\n"); - goto out; - } ecryptfs_printk(KERN_DEBUG, "Initializing cipher [%s]; strlen = [%d]; " "key_size_bits = [%zd]\n", crypt_stat->cipher, (int)strlen(crypt_stat->cipher), crypt_stat->key_size << 3); + mutex_lock(&crypt_stat->cs_tfm_mutex); if (crypt_stat->tfm) { rc = 0; - goto out; + goto out_unlock; } - mutex_lock(&crypt_stat->cs_tfm_mutex); rc = ecryptfs_crypto_api_algify_cipher_name(&full_alg_name, crypt_stat->cipher, "cbc"); if (rc) goto out_unlock; crypt_stat->tfm = crypto_alloc_ablkcipher(full_alg_name, 0, 0); - kfree(full_alg_name); if (IS_ERR(crypt_stat->tfm)) { rc = PTR_ERR(crypt_stat->tfm); crypt_stat->tfm = NULL; ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): " "Error initializing cipher [%s]\n", - crypt_stat->cipher); - goto out_unlock; + full_alg_name); + goto out_free; } crypto_ablkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY); rc = 0; +out_free: + kfree(full_alg_name); out_unlock: mutex_unlock(&crypt_stat->cs_tfm_mutex); -out: return rc; } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 9ad17b15b454..473e09da7d02 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -740,6 +740,7 @@ static void ep_free(struct eventpoll *ep) epi = rb_entry(rbp, struct epitem, rbn); ep_unregister_pollwait(ep, epi); + cond_resched(); } /* @@ -754,6 +755,7 @@ static void ep_free(struct eventpoll *ep) while ((rbp = rb_first(&ep->rbr)) != NULL) { epi = rb_entry(rbp, struct epitem, rbn); ep_remove(ep, epi); + cond_resched(); } mutex_unlock(&ep->mtx); @@ -1792,7 +1794,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, { int error; int did_lock_epmutex = 0; - struct file *file, *tfile; + struct fd f, tf; struct eventpoll *ep; struct epitem *epi; struct epoll_event epds; @@ -1802,20 +1804,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, copy_from_user(&epds, event, sizeof(struct epoll_event))) goto error_return; - /* Get the "struct file *" for the eventpoll file */ error = -EBADF; - file = fget(epfd); - if (!file) + f = fdget(epfd); + if (!f.file) goto error_return; /* Get the "struct file *" for the target file */ - tfile = fget(fd); - if (!tfile) + tf = fdget(fd); + if (!tf.file) goto error_fput; /* The target file descriptor must support poll */ error = -EPERM; - if (!tfile->f_op || !tfile->f_op->poll) + if (!tf.file->f_op || !tf.file->f_op->poll) goto error_tgt_fput; /* Check if EPOLLWAKEUP is allowed */ @@ -1828,14 +1829,14 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, * adding an epoll file descriptor inside itself. */ error = -EINVAL; - if (file == tfile || !is_file_epoll(file)) + if (f.file == tf.file || !is_file_epoll(f.file)) goto error_tgt_fput; /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ - ep = file->private_data; + ep = f.file->private_data; /* * When we insert an epoll file descriptor, inside another epoll file @@ -1854,14 +1855,14 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, did_lock_epmutex = 1; } if (op == EPOLL_CTL_ADD) { - if (is_file_epoll(tfile)) { + if (is_file_epoll(tf.file)) { error = -ELOOP; - if (ep_loop_check(ep, tfile) != 0) { + if (ep_loop_check(ep, tf.file) != 0) { clear_tfile_check_list(); goto error_tgt_fput; } } else - list_add(&tfile->f_tfile_llink, &tfile_check_list); + list_add(&tf.file->f_tfile_llink, &tfile_check_list); } mutex_lock_nested(&ep->mtx, 0); @@ -1871,14 +1872,14 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, * above, we can be sure to be able to use the item looked up by * ep_find() till we release the mutex. */ - epi = ep_find(ep, tfile, fd); + epi = ep_find(ep, tf.file, fd); error = -EINVAL; switch (op) { case EPOLL_CTL_ADD: if (!epi) { epds.events |= POLLERR | POLLHUP; - error = ep_insert(ep, &epds, tfile, fd); + error = ep_insert(ep, &epds, tf.file, fd); } else error = -EEXIST; clear_tfile_check_list(); @@ -1903,9 +1904,9 @@ error_tgt_fput: if (did_lock_epmutex) mutex_unlock(&epmutex); - fput(tfile); + fdput(tf); error_fput: - fput(file); + fdput(f); error_return: return error; diff --git a/fs/exec.c b/fs/exec.c index fd774c7cb483..8875dd10ae7a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -74,6 +74,8 @@ static DEFINE_RWLOCK(binfmt_lock); void __register_binfmt(struct linux_binfmt * fmt, int insert) { BUG_ON(!fmt); + if (WARN_ON(!fmt->load_binary)) + return; write_lock(&binfmt_lock); insert ? list_add(&fmt->lh, &formats) : list_add_tail(&fmt->lh, &formats); @@ -266,7 +268,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; - vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; + vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); INIT_LIST_HEAD(&vma->anon_vma_chain); @@ -1365,18 +1367,18 @@ out: } EXPORT_SYMBOL(remove_arg_zero); +#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) /* * cycle the list of binary formats handler, until one recognizes the image */ int search_binary_handler(struct linux_binprm *bprm) { - unsigned int depth = bprm->recursion_depth; - int try,retval; + bool need_retry = IS_ENABLED(CONFIG_MODULES); struct linux_binfmt *fmt; - pid_t old_pid, old_vpid; + int retval; /* This allows 4 levels of binfmt rewrites before failing hard. */ - if (depth > 5) + if (bprm->recursion_depth > 5) return -ELOOP; retval = security_bprm_check(bprm); @@ -1387,71 +1389,67 @@ int search_binary_handler(struct linux_binprm *bprm) if (retval) return retval; + retval = -ENOENT; + retry: + read_lock(&binfmt_lock); + list_for_each_entry(fmt, &formats, lh) { + if (!try_module_get(fmt->module)) + continue; + read_unlock(&binfmt_lock); + bprm->recursion_depth++; + retval = fmt->load_binary(bprm); + bprm->recursion_depth--; + if (retval >= 0 || retval != -ENOEXEC || + bprm->mm == NULL || bprm->file == NULL) { + put_binfmt(fmt); + return retval; + } + read_lock(&binfmt_lock); + put_binfmt(fmt); + } + read_unlock(&binfmt_lock); + + if (need_retry && retval == -ENOEXEC) { + if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && + printable(bprm->buf[2]) && printable(bprm->buf[3])) + return retval; + if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0) + return retval; + need_retry = false; + goto retry; + } + + return retval; +} +EXPORT_SYMBOL(search_binary_handler); + +static int exec_binprm(struct linux_binprm *bprm) +{ + pid_t old_pid, old_vpid; + int ret; + /* Need to fetch pid before load_binary changes it */ old_pid = current->pid; rcu_read_lock(); old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); rcu_read_unlock(); - retval = -ENOENT; - for (try=0; try<2; try++) { - read_lock(&binfmt_lock); - list_for_each_entry(fmt, &formats, lh) { - int (*fn)(struct linux_binprm *) = fmt->load_binary; - if (!fn) - continue; - if (!try_module_get(fmt->module)) - continue; - read_unlock(&binfmt_lock); - bprm->recursion_depth = depth + 1; - retval = fn(bprm); - bprm->recursion_depth = depth; - if (retval >= 0) { - if (depth == 0) { - trace_sched_process_exec(current, old_pid, bprm); - ptrace_event(PTRACE_EVENT_EXEC, old_vpid); - } - put_binfmt(fmt); - allow_write_access(bprm->file); - if (bprm->file) - fput(bprm->file); - bprm->file = NULL; - current->did_exec = 1; - proc_exec_connector(current); - return retval; - } - read_lock(&binfmt_lock); - put_binfmt(fmt); - if (retval != -ENOEXEC || bprm->mm == NULL) - break; - if (!bprm->file) { - read_unlock(&binfmt_lock); - return retval; - } + ret = search_binary_handler(bprm); + if (ret >= 0) { + trace_sched_process_exec(current, old_pid, bprm); + ptrace_event(PTRACE_EVENT_EXEC, old_vpid); + current->did_exec = 1; + proc_exec_connector(current); + + if (bprm->file) { + allow_write_access(bprm->file); + fput(bprm->file); + bprm->file = NULL; /* to catch use-after-free */ } - read_unlock(&binfmt_lock); -#ifdef CONFIG_MODULES - if (retval != -ENOEXEC || bprm->mm == NULL) { - break; - } else { -#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) - if (printable(bprm->buf[0]) && - printable(bprm->buf[1]) && - printable(bprm->buf[2]) && - printable(bprm->buf[3])) - break; /* -ENOEXEC */ - if (try) - break; /* -ENOEXEC */ - request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2])); - } -#else - break; -#endif } - return retval; -} -EXPORT_SYMBOL(search_binary_handler); + return ret; +} /* * sys_execve() executes a new program. @@ -1541,7 +1539,7 @@ static int do_execve_common(const char *filename, if (retval < 0) goto out; - retval = search_binary_handler(bprm); + retval = exec_binprm(bprm); if (retval < 0) goto out; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 2ec8eb1ab269..a52a5d23c30b 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -861,7 +861,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc) static void _write_failed(struct inode *inode, loff_t to) { if (to > inode->i_size) - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); } int exofs_write_begin(struct file *file, struct address_space *mapping, diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 293bc2e47a73..a235f0016889 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -231,7 +231,7 @@ static int filldir_one(void * __buf, const char * name, int len, int result = 0; buf->sequence++; - if (buf->ino == ino) { + if (buf->ino == ino && len <= NAME_MAX) { memcpy(buf->name, name, len); buf->name[len] = '\0'; buf->found = 1; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 0a87bb10998d..c260de6d7b6d 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -58,7 +58,7 @@ static void ext2_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); ext2_truncate_blocks(inode, inode->i_size); } } diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index f522425aaa24..bafdd48eefde 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -41,7 +41,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) /** * Check if the given dir-inode refers to an htree-indexed directory - * (or a directory which chould potentially get coverted to use htree + * (or a directory which could potentially get converted to use htree * indexing). * * Return 1 if it is a dx dir, 0 if not diff --git a/fs/ext3/super.c b/fs/ext3/super.c index c47f14750722..c50c76190373 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -27,6 +27,7 @@ #include <linux/seq_file.h> #include <linux/log2.h> #include <linux/cleancache.h> +#include <linux/namei.h> #include <asm/uaccess.h> @@ -819,6 +820,7 @@ enum { Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_journal_path, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, @@ -860,6 +862,7 @@ static const match_table_t tokens = { {Opt_journal_update, "journal=update"}, {Opt_journal_inum, "journal=%u"}, {Opt_journal_dev, "journal_dev=%u"}, + {Opt_journal_path, "journal_path=%s"}, {Opt_abort, "abort"}, {Opt_data_journal, "data=journal"}, {Opt_data_ordered, "data=ordered"}, @@ -975,6 +978,11 @@ static int parse_options (char *options, struct super_block *sb, int option; kuid_t uid; kgid_t gid; + char *journal_path; + struct inode *journal_inode; + struct path path; + int error; + #ifdef CONFIG_QUOTA int qfmt; #endif @@ -1129,6 +1137,41 @@ static int parse_options (char *options, struct super_block *sb, return 0; *journal_devnum = option; break; + case Opt_journal_path: + if (is_remount) { + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); + return 0; + } + + journal_path = match_strdup(&args[0]); + if (!journal_path) { + ext3_msg(sb, KERN_ERR, "error: could not dup " + "journal device string"); + return 0; + } + + error = kern_path(journal_path, LOOKUP_FOLLOW, &path); + if (error) { + ext3_msg(sb, KERN_ERR, "error: could not find " + "journal device path: error %d", error); + kfree(journal_path); + return 0; + } + + journal_inode = path.dentry->d_inode; + if (!S_ISBLK(journal_inode->i_mode)) { + ext3_msg(sb, KERN_ERR, "error: journal path %s " + "is not a block device", journal_path); + path_put(&path); + kfree(journal_path); + return 0; + } + + *journal_devnum = new_encode_dev(journal_inode->i_rdev); + path_put(&path); + kfree(journal_path); + break; case Opt_noload: set_opt (sbi->s_mount_opt, NOLOAD); break; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index ddd715e42a5c..dc5d572ebd6a 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -184,6 +184,7 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start, tmp; int flex_bg = 0; + struct ext4_group_info *grp; J_ASSERT_BH(bh, buffer_locked(bh)); @@ -191,11 +192,9 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); - ext4_free_group_clusters_set(sb, gdp, 0); - ext4_free_inodes_set(sb, gdp, 0); - ext4_itable_unused_set(sb, gdp, 0); - memset(bh->b_data, 0xff, sb->s_blocksize); - ext4_block_bitmap_csum_set(sb, block_group, gdp, bh); + grp = ext4_get_group_info(sb, block_group); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return; } memset(bh->b_data, 0, sb->s_blocksize); @@ -305,7 +304,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, */ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, - unsigned int block_group, + ext4_group_t block_group, struct buffer_head *bh) { ext4_grpblk_t offset; @@ -352,10 +351,11 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, void ext4_validate_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, - unsigned int block_group, + ext4_group_t block_group, struct buffer_head *bh) { ext4_fsblk_t blk; + struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); if (buffer_verified(bh)) return; @@ -366,12 +366,14 @@ void ext4_validate_block_bitmap(struct super_block *sb, ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: invalid block bitmap", block_group, blk); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); return; } if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, desc, bh))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); return; } set_buffer_verified(bh); @@ -445,7 +447,10 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) return bh; verify: ext4_validate_block_bitmap(sb, desc, block_group, bh); - return bh; + if (buffer_verified(bh)) + return bh; + put_bh(bh); + return NULL; } /* Returns 0 on success, 1 on error */ @@ -469,7 +474,8 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, clear_buffer_new(bh); /* Panic or remount fs read-only if block bitmap is invalid */ ext4_validate_block_bitmap(sb, desc, block_group, bh); - return 0; + /* ...but check for error just in case errors=continue. */ + return !buffer_verified(bh); } struct buffer_head * diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 3c7d288ae94c..680bb3388919 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -33,7 +33,7 @@ static int ext4_dx_readdir(struct file *, struct dir_context *); /** * Check if the given dir-inode refers to an htree-indexed directory - * (or a directory which chould potentially get coverted to use htree + * (or a directory which could potentially get converted to use htree * indexing). * * Return 1 if it is a dx dir, 0 if not diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0ab26fbf3380..af815ea9d7cc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -180,7 +180,6 @@ struct ext4_map_blocks { * Flags for ext4_io_end->flags */ #define EXT4_IO_END_UNWRITTEN 0x0001 -#define EXT4_IO_END_DIRECT 0x0002 /* * For converting uninitialized extents on a work queue. 'handle' is used for @@ -196,8 +195,6 @@ typedef struct ext4_io_end { unsigned int flag; /* unwritten or not */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ - struct kiocb *iocb; /* iocb struct for AIO */ - int result; /* error value for AIO */ atomic_t count; /* reference counter */ } ext4_io_end_t; @@ -561,6 +558,18 @@ enum { #define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 /* + * The bit position of these flags must not overlap with any of the + * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(), + * read_extent_tree_block(), ext4_split_extent_at(), + * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). + * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be + * caching the extents when reading from the extent tree while a + * truncate or punch hole operation is in progress. + */ +#define EXT4_EX_NOCACHE 0x0400 +#define EXT4_EX_FORCE_CACHE 0x0800 + +/* * Flags used by ext4_free_blocks */ #define EXT4_FREE_BLOCKS_METADATA 0x0001 @@ -569,6 +578,7 @@ enum { #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 +#define EXT4_FREE_BLOCKS_RESERVE 0x0040 /* * ioctl commands @@ -590,6 +600,7 @@ enum { #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) #define EXT4_IOC_SWAP_BOOT _IO('f', 17) +#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -900,11 +911,9 @@ struct ext4_inode_info { * Completed IOs that need unwritten extents handling and don't have * transaction reserved */ - struct list_head i_unrsv_conversion_list; atomic_t i_ioend_count; /* Number of outstanding io_end structs */ atomic_t i_unwritten; /* Nr. of inflight conversions pending */ struct work_struct i_rsv_conversion_work; - struct work_struct i_unrsv_conversion_work; spinlock_t i_block_reservation_lock; @@ -1276,8 +1285,6 @@ struct ext4_sb_info { struct flex_groups *s_flex_groups; ext4_group_t s_flex_groups_allocated; - /* workqueue for unreserved extent convertions (dio) */ - struct workqueue_struct *unrsv_conversion_wq; /* workqueue for reserved extent conversions (buffered io) */ struct workqueue_struct *rsv_conversion_wq; @@ -1340,9 +1347,6 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode, struct ext4_io_end *io_end) { if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - /* Writeback has to have coversion transaction reserved */ - WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle && - !(io_end->flag & EXT4_IO_END_DIRECT)); io_end->flag |= EXT4_IO_END_UNWRITTEN; atomic_inc(&EXT4_I(inode)->i_unwritten); } @@ -1375,6 +1379,7 @@ enum { nolocking */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ + EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1915,7 +1920,7 @@ extern ext4_group_t ext4_get_group_number(struct super_block *sb, extern void ext4_validate_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, - unsigned int block_group, + ext4_group_t block_group, struct buffer_head *bh); extern unsigned int ext4_block_group(struct super_block *sb, ext4_fsblk_t blocknr); @@ -2417,16 +2422,32 @@ do { \ #define EXT4_FREECLUSTERS_WATERMARK 0 #endif +/* Update i_disksize. Requires i_mutex to avoid races with truncate */ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) { - /* - * XXX: replace with spinlock if seen contended -bzzz - */ + WARN_ON_ONCE(S_ISREG(inode->i_mode) && + !mutex_is_locked(&inode->i_mutex)); down_write(&EXT4_I(inode)->i_data_sem); if (newsize > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = newsize; up_write(&EXT4_I(inode)->i_data_sem); - return ; +} + +/* + * Update i_disksize after writeback has been started. Races with truncate + * are avoided by checking i_size under i_data_sem. + */ +static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize) +{ + loff_t i_size; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = i_size_read(inode); + if (newsize > i_size) + newsize = i_size; + if (newsize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = newsize; + up_write(&EXT4_I(inode)->i_data_sem); } struct ext4_group_info { @@ -2449,9 +2470,15 @@ struct ext4_group_info { #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 #define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_WAS_TRIMMED(grp) \ (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) @@ -2655,6 +2682,12 @@ extern int ext4_check_blockref(const char *, unsigned int, struct ext4_ext_path; struct ext4_extent; +/* + * Maximum number of logical blocks in a file; ext4_extent's ee_block is + * __le32. + */ +#define EXT_MAX_BLOCKS 0xffffffff + extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); @@ -2684,7 +2717,8 @@ extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, - struct ext4_ext_path *); + struct ext4_ext_path *, + int flags); extern void ext4_ext_drop_refs(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); extern int ext4_find_delalloc_range(struct inode *inode, @@ -2693,7 +2727,7 @@ extern int ext4_find_delalloc_range(struct inode *inode, extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); - +extern int ext4_ext_precache(struct inode *inode); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, @@ -2716,7 +2750,6 @@ extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); extern void ext4_io_submit_init(struct ext4_io_submit *io, struct writeback_control *wbc); extern void ext4_end_io_rsv_work(struct work_struct *work); -extern void ext4_end_io_unrsv_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 51bc821ade90..5074fe23f19e 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -134,12 +134,6 @@ struct ext4_ext_path { */ /* - * Maximum number of logical blocks in a file; ext4_extent's ee_block is - * __le32. - */ -#define EXT_MAX_BLOCKS 0xffffffff - -/* * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an * initialized extent. This is 2^15 and not (2^16 - 1), since we use the * MSB of ee_len field in the extent datastructure to signify if this diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 2877258d9497..81cfefa9dc0c 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -197,7 +197,7 @@ static inline void ext4_journal_callback_add(handle_t *handle, * ext4_journal_callback_del: delete a registered callback * @handle: active journal transaction handle on which callback was registered * @jce: registered journal callback entry to unregister - * Return true if object was sucessfully removed + * Return true if object was successfully removed */ static inline bool ext4_journal_callback_try_del(handle_t *handle, struct ext4_journal_cb_entry *jce) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 72ba4705d4fa..54d52afcdb19 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -407,7 +407,7 @@ static int ext4_valid_extent_entries(struct inode *inode, static int __ext4_ext_check(const char *function, unsigned int line, struct inode *inode, struct ext4_extent_header *eh, - int depth) + int depth, ext4_fsblk_t pblk) { const char *error_msg; int max = 0; @@ -447,42 +447,149 @@ static int __ext4_ext_check(const char *function, unsigned int line, corrupted: ext4_error_inode(inode, function, line, 0, - "bad header/extent: %s - magic %x, " - "entries %u, max %u(%u), depth %u(%u)", - error_msg, le16_to_cpu(eh->eh_magic), - le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), - max, le16_to_cpu(eh->eh_depth), depth); - + "pblk %llu bad header/extent: %s - magic %x, " + "entries %u, max %u(%u), depth %u(%u)", + (unsigned long long) pblk, error_msg, + le16_to_cpu(eh->eh_magic), + le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), + max, le16_to_cpu(eh->eh_depth), depth); return -EIO; } -#define ext4_ext_check(inode, eh, depth) \ - __ext4_ext_check(__func__, __LINE__, inode, eh, depth) +#define ext4_ext_check(inode, eh, depth, pblk) \ + __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk)) int ext4_ext_check_inode(struct inode *inode) { - return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); + return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0); } -static int __ext4_ext_check_block(const char *function, unsigned int line, - struct inode *inode, - struct ext4_extent_header *eh, - int depth, - struct buffer_head *bh) +static struct buffer_head * +__read_extent_tree_block(const char *function, unsigned int line, + struct inode *inode, ext4_fsblk_t pblk, int depth, + int flags) { - int ret; + struct buffer_head *bh; + int err; - if (buffer_verified(bh)) - return 0; - ret = ext4_ext_check(inode, eh, depth); - if (ret) - return ret; + bh = sb_getblk(inode->i_sb, pblk); + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); + + if (!bh_uptodate_or_lock(bh)) { + trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); + err = bh_submit_read(bh); + if (err < 0) + goto errout; + } + if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) + return bh; + err = __ext4_ext_check(function, line, inode, + ext_block_hdr(bh), depth, pblk); + if (err) + goto errout; set_buffer_verified(bh); - return ret; + /* + * If this is a leaf block, cache all of its entries + */ + if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { + struct ext4_extent_header *eh = ext_block_hdr(bh); + struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); + ext4_lblk_t prev = 0; + int i; + + for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { + unsigned int status = EXTENT_STATUS_WRITTEN; + ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); + int len = ext4_ext_get_actual_len(ex); + + if (prev && (prev != lblk)) + ext4_es_cache_extent(inode, prev, + lblk - prev, ~0, + EXTENT_STATUS_HOLE); + + if (ext4_ext_is_uninitialized(ex)) + status = EXTENT_STATUS_UNWRITTEN; + ext4_es_cache_extent(inode, lblk, len, + ext4_ext_pblock(ex), status); + prev = lblk + len; + } + } + return bh; +errout: + put_bh(bh); + return ERR_PTR(err); + } -#define ext4_ext_check_block(inode, eh, depth, bh) \ - __ext4_ext_check_block(__func__, __LINE__, inode, eh, depth, bh) +#define read_extent_tree_block(inode, pblk, depth, flags) \ + __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \ + (depth), (flags)) + +/* + * This function is called to cache a file's extent information in the + * extent status tree + */ +int ext4_ext_precache(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_ext_path *path = NULL; + struct buffer_head *bh; + int i = 0, depth, ret = 0; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return 0; /* not an extent-mapped inode */ + + down_read(&ei->i_data_sem); + depth = ext_depth(inode); + + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), + GFP_NOFS); + if (path == NULL) { + up_read(&ei->i_data_sem); + return -ENOMEM; + } + + /* Don't cache anything if there are no external extent blocks */ + if (depth == 0) + goto out; + path[0].p_hdr = ext_inode_hdr(inode); + ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); + if (ret) + goto out; + path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr); + while (i >= 0) { + /* + * If this is a leaf block or we've reached the end of + * the index block, go up + */ + if ((i == depth) || + path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { + brelse(path[i].p_bh); + path[i].p_bh = NULL; + i--; + continue; + } + bh = read_extent_tree_block(inode, + ext4_idx_pblock(path[i].p_idx++), + depth - i - 1, + EXT4_EX_FORCE_CACHE); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); + break; + } + i++; + path[i].p_bh = bh; + path[i].p_hdr = ext_block_hdr(bh); + path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr); + } + ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); +out: + up_read(&ei->i_data_sem); + ext4_ext_drop_refs(path); + kfree(path); + return ret; +} #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) @@ -716,7 +823,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode) struct ext4_ext_path * ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, - struct ext4_ext_path *path) + struct ext4_ext_path *path, int flags) { struct ext4_extent_header *eh; struct buffer_head *bh; @@ -748,20 +855,13 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_depth = i; path[ppos].p_ext = NULL; - bh = sb_getblk(inode->i_sb, path[ppos].p_block); - if (unlikely(!bh)) { - ret = -ENOMEM; + bh = read_extent_tree_block(inode, path[ppos].p_block, --i, + flags); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); goto err; } - if (!bh_uptodate_or_lock(bh)) { - trace_ext4_ext_load_extent(inode, block, - path[ppos].p_block); - ret = bh_submit_read(bh); - if (ret < 0) { - put_bh(bh); - goto err; - } - } + eh = ext_block_hdr(bh); ppos++; if (unlikely(ppos > depth)) { @@ -773,11 +873,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, } path[ppos].p_bh = bh; path[ppos].p_hdr = eh; - i--; - - ret = ext4_ext_check_block(inode, eh, i, bh); - if (ret < 0) - goto err; } path[ppos].p_depth = i; @@ -1198,7 +1293,8 @@ out: * if no free index is found, then it requests in-depth growing. */ static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, - unsigned int flags, + unsigned int mb_flags, + unsigned int gb_flags, struct ext4_ext_path *path, struct ext4_extent *newext) { @@ -1220,7 +1316,7 @@ repeat: if (EXT_HAS_FREE_INDEX(curp)) { /* if we found index with free entry, then use that * entry: create all needed subtree and add new leaf */ - err = ext4_ext_split(handle, inode, flags, path, newext, i); + err = ext4_ext_split(handle, inode, mb_flags, path, newext, i); if (err) goto out; @@ -1228,12 +1324,12 @@ repeat: ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path); + path, gb_flags); if (IS_ERR(path)) err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, flags, newext); + err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext); if (err) goto out; @@ -1241,7 +1337,7 @@ repeat: ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path); + path, gb_flags); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -1412,29 +1508,21 @@ got_index: ix++; block = ext4_idx_pblock(ix); while (++depth < path->p_depth) { - bh = sb_bread(inode->i_sb, block); - if (bh == NULL) - return -EIO; - eh = ext_block_hdr(bh); /* subtract from p_depth to get proper eh_depth */ - if (ext4_ext_check_block(inode, eh, - path->p_depth - depth, bh)) { - put_bh(bh); - return -EIO; - } + bh = read_extent_tree_block(inode, block, + path->p_depth - depth, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + eh = ext_block_hdr(bh); ix = EXT_FIRST_INDEX(eh); block = ext4_idx_pblock(ix); put_bh(bh); } - bh = sb_bread(inode->i_sb, block); - if (bh == NULL) - return -EIO; + bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); eh = ext_block_hdr(bh); - if (ext4_ext_check_block(inode, eh, path->p_depth - depth, bh)) { - put_bh(bh); - return -EIO; - } ex = EXT_FIRST_EXTENT(eh); found_extent: *logical = le32_to_cpu(ex->ee_block); @@ -1705,7 +1793,8 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, brelse(path[1].p_bh); ext4_free_blocks(handle, inode, NULL, blk, 1, - EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET | + EXT4_FREE_BLOCKS_RESERVE); } /* @@ -1793,7 +1882,7 @@ out: */ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *newext, int flag) + struct ext4_extent *newext, int gb_flags) { struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; @@ -1802,7 +1891,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, int depth, len, err; ext4_lblk_t next; unsigned uninitialized = 0; - int flags = 0; + int mb_flags = 0; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); @@ -1817,7 +1906,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, } /* try to insert block into found extent and return */ - if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) { + if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) { /* * Try to see whether we should rather test the extent on @@ -1920,7 +2009,7 @@ prepend: if (next != EXT_MAX_BLOCKS) { ext_debug("next leaf block - %u\n", next); BUG_ON(npath != NULL); - npath = ext4_ext_find_extent(inode, next, NULL); + npath = ext4_ext_find_extent(inode, next, NULL, 0); if (IS_ERR(npath)) return PTR_ERR(npath); BUG_ON(npath->p_depth != path->p_depth); @@ -1939,9 +2028,10 @@ prepend: * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. */ - if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL) - flags = EXT4_MB_USE_RESERVED; - err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext); + if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) + mb_flags = EXT4_MB_USE_RESERVED; + err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, + path, newext); if (err) goto cleanup; depth = ext_depth(inode); @@ -2007,7 +2097,7 @@ has_space: merge: /* try to merge extents */ - if (!(flag & EXT4_GET_BLOCKS_PRE_IO)) + if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) ext4_ext_try_to_merge(handle, inode, path, nearex); @@ -2050,7 +2140,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, path = NULL; } - path = ext4_ext_find_extent(inode, block, path); + path = ext4_ext_find_extent(inode, block, path, 0); if (IS_ERR(path)) { up_read(&EXT4_I(inode)->i_data_sem); err = PTR_ERR(path); @@ -2195,8 +2285,8 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { int depth = ext_depth(inode); - unsigned long len; - ext4_lblk_t lblock; + unsigned long len = 0; + ext4_lblk_t lblock = 0; struct ext4_extent *ex; ex = path[depth].p_ext; @@ -2233,7 +2323,6 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE); } else { - lblock = len = 0; BUG(); } @@ -2712,7 +2801,7 @@ again: ext4_lblk_t ee_block; /* find extent for this block */ - path = ext4_ext_find_extent(inode, end, NULL); + path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) { ext4_journal_stop(handle); return PTR_ERR(path); @@ -2754,6 +2843,7 @@ again: */ err = ext4_split_extent_at(handle, inode, path, end + 1, split_flag, + EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_METADATA_NOFAIL); @@ -2782,7 +2872,7 @@ again: path[0].p_hdr = ext_inode_hdr(inode); i = 0; - if (ext4_ext_check(inode, path[0].p_hdr, depth)) { + if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) { err = -EIO; goto out; } @@ -2829,10 +2919,12 @@ again: ext_debug("move to level %d (block %llu)\n", i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); - bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx)); - if (!bh) { + bh = read_extent_tree_block(inode, + ext4_idx_pblock(path[i].p_idx), depth - i - 1, + EXT4_EX_NOCACHE); + if (IS_ERR(bh)) { /* should we reset i_size? */ - err = -EIO; + err = PTR_ERR(bh); break; } /* Yield here to deal with large extent trees. @@ -2842,11 +2934,6 @@ again: err = -EIO; break; } - if (ext4_ext_check_block(inode, ext_block_hdr(bh), - depth - i - 1, bh)) { - err = -EIO; - break; - } path[i + 1].p_bh = bh; /* save actual number of indexes since this @@ -2961,6 +3048,23 @@ void ext4_ext_release(struct super_block *sb) #endif } +static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) +{ + ext4_lblk_t ee_block; + ext4_fsblk_t ee_pblock; + unsigned int ee_len; + + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext4_ext_pblock(ex); + + if (ee_len == 0) + return 0; + + return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, + EXTENT_STATUS_WRITTEN); +} + /* FIXME!! we need to try to merge to left or right after zero-out */ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { @@ -3113,7 +3217,7 @@ static int ext4_split_extent_at(handle_t *handle, goto fix_extent_len; /* update extent status tree */ - err = ext4_es_zeroout(inode, &zero_ex); + err = ext4_zeroout_es(inode, &zero_ex); goto out; } else if (err) @@ -3133,7 +3237,7 @@ fix_extent_len: * ext4_split_extents() splits an extent and mark extent which is covered * by @map as split_flags indicates * - * It may result in splitting the extent into multiple extents (upto three) + * It may result in splitting the extent into multiple extents (up to three) * There are three possibilities: * a> There is no split required * b> Splits in two extents: Split is happening at either end of the extent @@ -3181,7 +3285,7 @@ static int ext4_split_extent(handle_t *handle, * result in split of original leaf or extent zeroout. */ ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path); + path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); @@ -3464,7 +3568,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, out: /* If we have gotten a failure, don't zero out status tree */ if (!err) - err = ext4_es_zeroout(inode, &zero_ex); + err = ext4_zeroout_es(inode, &zero_ex); return err ? err : allocated; } @@ -3565,7 +3669,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, if (err < 0) goto out; ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path); + path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -4052,7 +4156,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ - path = ext4_ext_find_extent(inode, map->m_lblk, NULL); + path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0); if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; @@ -4744,6 +4848,12 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return error; } + if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + error = ext4_ext_precache(inode); + if (error) + return error; + } + /* fallback to generic here if not in extents fmt */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return generic_block_fiemap(inode, fieinfo, start, len, @@ -4771,6 +4881,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, error = ext4_fill_fiemap_extents(inode, start_blk, len_blks, fieinfo); } - + ext4_es_lru_add(inode); return error; } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 91cb110da1b4..3981ff783950 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -13,7 +13,6 @@ #include <linux/list_sort.h> #include "ext4.h" #include "extents_status.h" -#include "ext4_extents.h" #include <trace/events/ext4.h> @@ -263,7 +262,7 @@ void ext4_es_find_delayed_extent_range(struct inode *inode, if (tree->cache_es) { es1 = tree->cache_es; if (in_range(lblk, es1->es_lblk, es1->es_len)) { - es_debug("%u cached by [%u/%u) %llu %llx\n", + es_debug("%u cached by [%u/%u) %llu %x\n", lblk, es1->es_lblk, es1->es_len, ext4_es_pblock(es1), ext4_es_status(es1)); goto out; @@ -409,6 +408,8 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) } #ifdef ES_AGGRESSIVE_TEST +#include "ext4_extents.h" /* Needed when ES_AGGRESSIVE_TEST is defined */ + static void ext4_es_insert_extent_ext_check(struct inode *inode, struct extent_status *es) { @@ -419,7 +420,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, unsigned short ee_len; int depth, ee_status, es_status; - path = ext4_ext_find_extent(inode, es->es_lblk, NULL); + path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return; @@ -641,13 +642,13 @@ out: */ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned long long status) + unsigned int status) { struct extent_status newes; ext4_lblk_t end = lblk + len - 1; int err = 0; - es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n", + es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", lblk, len, pblk, status, inode->i_ino); if (!len) @@ -684,6 +685,38 @@ error: } /* + * ext4_es_cache_extent() inserts information into the extent status + * tree if and only if there isn't information about the range in + * question already. + */ +void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status) +{ + struct extent_status *es; + struct extent_status newes; + ext4_lblk_t end = lblk + len - 1; + + newes.es_lblk = lblk; + newes.es_len = len; + ext4_es_store_pblock(&newes, pblk); + ext4_es_store_status(&newes, status); + trace_ext4_es_cache_extent(inode, &newes); + + if (!len) + return; + + BUG_ON(end < lblk); + + write_lock(&EXT4_I(inode)->i_es_lock); + + es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk); + if (!es || es->es_lblk > end) + __es_insert_extent(inode, &newes); + write_unlock(&EXT4_I(inode)->i_es_lock); +} + +/* * ext4_es_lookup_extent() looks up an extent in extent status tree. * * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks. @@ -871,23 +904,6 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, return err; } -int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex) -{ - ext4_lblk_t ee_block; - ext4_fsblk_t ee_pblock; - unsigned int ee_len; - - ee_block = le32_to_cpu(ex->ee_block); - ee_len = ext4_ext_get_actual_len(ex); - ee_pblock = ext4_ext_pblock(ex); - - if (ee_len == 0) - return 0; - - return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, - EXTENT_STATUS_WRITTEN); -} - static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, struct list_head *b) { @@ -895,6 +911,12 @@ static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, eia = list_entry(a, struct ext4_inode_info, i_es_lru); eib = list_entry(b, struct ext4_inode_info, i_es_lru); + if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && + !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) + return 1; + if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && + ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) + return -1; if (eia->i_touch_when == eib->i_touch_when) return 0; if (time_after(eia->i_touch_when, eib->i_touch_when)) @@ -908,22 +930,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, { struct ext4_inode_info *ei; struct list_head *cur, *tmp; - LIST_HEAD(skiped); - int ret, nr_shrunk = 0; + LIST_HEAD(skipped); + int nr_shrunk = 0; + int retried = 0, skip_precached = 1, nr_skipped = 0; spin_lock(&sbi->s_es_lru_lock); - /* - * If the inode that is at the head of LRU list is newer than - * last_sorted time, that means that we need to sort this list. - */ - ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru); - if (sbi->s_es_last_sorted < ei->i_touch_when) { - list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); - sbi->s_es_last_sorted = jiffies; - } - +retry: list_for_each_safe(cur, tmp, &sbi->s_es_lru) { + int shrunk; + /* * If we have already reclaimed all extents from extent * status tree, just stop the loop immediately. @@ -933,9 +949,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, ei = list_entry(cur, struct ext4_inode_info, i_es_lru); - /* Skip the inode that is newer than the last_sorted time */ - if (sbi->s_es_last_sorted < ei->i_touch_when) { - list_move_tail(cur, &skiped); + /* + * Skip the inode that is newer than the last_sorted + * time. Normally we try hard to avoid shrinking + * precached inodes, but we will as a last resort. + */ + if ((sbi->s_es_last_sorted < ei->i_touch_when) || + (skip_precached && ext4_test_inode_state(&ei->vfs_inode, + EXT4_STATE_EXT_PRECACHED))) { + nr_skipped++; + list_move_tail(cur, &skipped); continue; } @@ -943,28 +966,63 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, continue; write_lock(&ei->i_es_lock); - ret = __es_try_to_reclaim_extents(ei, nr_to_scan); + shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); if (ei->i_es_lru_nr == 0) list_del_init(&ei->i_es_lru); write_unlock(&ei->i_es_lock); - nr_shrunk += ret; - nr_to_scan -= ret; + nr_shrunk += shrunk; + nr_to_scan -= shrunk; if (nr_to_scan == 0) break; } /* Move the newer inodes into the tail of the LRU list. */ - list_splice_tail(&skiped, &sbi->s_es_lru); + list_splice_tail(&skipped, &sbi->s_es_lru); + INIT_LIST_HEAD(&skipped); + + /* + * If we skipped any inodes, and we weren't able to make any + * forward progress, sort the list and try again. + */ + if ((nr_shrunk == 0) && nr_skipped && !retried) { + retried++; + list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); + sbi->s_es_last_sorted = jiffies; + ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, + i_es_lru); + /* + * If there are no non-precached inodes left on the + * list, start releasing precached extents. + */ + if (ext4_test_inode_state(&ei->vfs_inode, + EXT4_STATE_EXT_PRECACHED)) + skip_precached = 0; + goto retry; + } + spin_unlock(&sbi->s_es_lru_lock); if (locked_ei && nr_shrunk == 0) - nr_shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); + nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); return nr_shrunk; } -static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) +static unsigned long ext4_es_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + unsigned long nr; + struct ext4_sb_info *sbi; + + sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); + nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); + trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr); + return nr; +} + +static unsigned long ext4_es_scan(struct shrinker *shrink, + struct shrink_control *sc) { struct ext4_sb_info *sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); @@ -979,9 +1037,8 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); - ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); - return ret; + return nr_shrunk; } void ext4_es_register_shrinker(struct ext4_sb_info *sbi) @@ -989,7 +1046,8 @@ void ext4_es_register_shrinker(struct ext4_sb_info *sbi) INIT_LIST_HEAD(&sbi->s_es_lru); spin_lock_init(&sbi->s_es_lru_lock); sbi->s_es_last_sorted = 0; - sbi->s_es_shrinker.shrink = ext4_es_shrink; + sbi->s_es_shrinker.scan_objects = ext4_es_scan; + sbi->s_es_shrinker.count_objects = ext4_es_count; sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; register_shrinker(&sbi->s_es_shrinker); } @@ -1033,11 +1091,17 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, struct ext4_es_tree *tree = &ei->i_es_tree; struct rb_node *node; struct extent_status *es; - int nr_shrunk = 0; + unsigned long nr_shrunk = 0; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); if (ei->i_es_lru_nr == 0) return 0; + if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && + __ratelimit(&_rs)) + ext4_warning(inode->i_sb, "forced shrink of precached extents"); + node = rb_first(&tree->root); while (node != NULL) { es = rb_entry(node, struct extent_status, rb_node); diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index e936730cc5b0..167f4ab8ecc3 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -29,16 +29,26 @@ /* * These flags live in the high bits of extent_status.es_pblk */ -#define EXTENT_STATUS_WRITTEN (1ULL << 63) -#define EXTENT_STATUS_UNWRITTEN (1ULL << 62) -#define EXTENT_STATUS_DELAYED (1ULL << 61) -#define EXTENT_STATUS_HOLE (1ULL << 60) +#define ES_SHIFT 60 + +#define EXTENT_STATUS_WRITTEN (1 << 3) +#define EXTENT_STATUS_UNWRITTEN (1 << 2) +#define EXTENT_STATUS_DELAYED (1 << 1) +#define EXTENT_STATUS_HOLE (1 << 0) #define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \ EXTENT_STATUS_UNWRITTEN | \ EXTENT_STATUS_DELAYED | \ EXTENT_STATUS_HOLE) +#define ES_WRITTEN (1ULL << 63) +#define ES_UNWRITTEN (1ULL << 62) +#define ES_DELAYED (1ULL << 61) +#define ES_HOLE (1ULL << 60) + +#define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \ + ES_DELAYED | ES_HOLE) + struct ext4_sb_info; struct ext4_extent; @@ -60,7 +70,10 @@ extern void ext4_es_init_tree(struct ext4_es_tree *tree); extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned long long status); + unsigned int status); +extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_es_find_delayed_extent_range(struct inode *inode, @@ -68,36 +81,35 @@ extern void ext4_es_find_delayed_extent_range(struct inode *inode, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); -extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex); static inline int ext4_es_is_written(struct extent_status *es) { - return (es->es_pblk & EXTENT_STATUS_WRITTEN) != 0; + return (es->es_pblk & ES_WRITTEN) != 0; } static inline int ext4_es_is_unwritten(struct extent_status *es) { - return (es->es_pblk & EXTENT_STATUS_UNWRITTEN) != 0; + return (es->es_pblk & ES_UNWRITTEN) != 0; } static inline int ext4_es_is_delayed(struct extent_status *es) { - return (es->es_pblk & EXTENT_STATUS_DELAYED) != 0; + return (es->es_pblk & ES_DELAYED) != 0; } static inline int ext4_es_is_hole(struct extent_status *es) { - return (es->es_pblk & EXTENT_STATUS_HOLE) != 0; + return (es->es_pblk & ES_HOLE) != 0; } -static inline ext4_fsblk_t ext4_es_status(struct extent_status *es) +static inline unsigned int ext4_es_status(struct extent_status *es) { - return (es->es_pblk & EXTENT_STATUS_FLAGS); + return es->es_pblk >> ES_SHIFT; } static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) { - return (es->es_pblk & ~EXTENT_STATUS_FLAGS); + return es->es_pblk & ~ES_MASK; } static inline void ext4_es_store_pblock(struct extent_status *es, @@ -105,19 +117,16 @@ static inline void ext4_es_store_pblock(struct extent_status *es, { ext4_fsblk_t block; - block = (pb & ~EXTENT_STATUS_FLAGS) | - (es->es_pblk & EXTENT_STATUS_FLAGS); + block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); es->es_pblk = block; } static inline void ext4_es_store_status(struct extent_status *es, - unsigned long long status) + unsigned int status) { - ext4_fsblk_t block; - - block = (status & EXTENT_STATUS_FLAGS) | - (es->es_pblk & ~EXTENT_STATUS_FLAGS); - es->es_pblk = block; + es->es_pblk = (((ext4_fsblk_t) + (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | + (es->es_pblk & ~ES_MASK)); } extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 319c9d26279a..3da21945ff1f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -149,7 +149,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); - if (ret > 0 || ret == -EIOCBQUEUED) { + if (ret > 0) { ssize_t err; err = generic_write_sync(file, pos, ret); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 8bf5999875ee..137193ff389b 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -70,18 +70,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { + struct ext4_group_info *grp; J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); - ext4_free_group_clusters_set(sb, gdp, 0); - ext4_free_inodes_set(sb, gdp, 0); - ext4_itable_unused_set(sb, gdp, 0); - memset(bh->b_data, 0xff, sb->s_blocksize); - ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh, - EXT4_INODES_PER_GROUP(sb) / 8); + grp = ext4_get_group_info(sb, block_group); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return 0; } @@ -117,6 +115,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) struct ext4_group_desc *desc; struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; + struct ext4_group_info *grp; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) @@ -185,6 +184,8 @@ verify: put_bh(bh); ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, bitmap_blk); + grp = ext4_get_group_info(sb, block_group); + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return NULL; } ext4_unlock_group(sb, block_group); @@ -221,6 +222,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_super_block *es; struct ext4_sb_info *sbi; int fatal = 0, err, count, cleared; + struct ext4_group_info *grp; if (!sb) { printk(KERN_ERR "EXT4-fs: %s:%d: inode on " @@ -266,7 +268,9 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); - if (!bitmap_bh) + /* Don't bother if the inode bitmap is corrupt. */ + grp = ext4_get_group_info(sb, block_group); + if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh) goto error_return; BUFFER_TRACE(bitmap_bh, "get_write_access"); @@ -315,8 +319,10 @@ out: err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!fatal) fatal = err; - } else + } else { ext4_error(sb, "bit already cleared for inode %lu", ino); + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + } error_return: brelse(bitmap_bh); @@ -625,6 +631,51 @@ static int find_group_other(struct super_block *sb, struct inode *parent, } /* + * In no journal mode, if an inode has recently been deleted, we want + * to avoid reusing it until we're reasonably sure the inode table + * block has been written back to disk. (Yes, these values are + * somewhat arbitrary...) + */ +#define RECENTCY_MIN 5 +#define RECENTCY_DIRTY 30 + +static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) +{ + struct ext4_group_desc *gdp; + struct ext4_inode *raw_inode; + struct buffer_head *bh; + unsigned long dtime, now; + int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + int offset, ret = 0, recentcy = RECENTCY_MIN; + + gdp = ext4_get_group_desc(sb, group, NULL); + if (unlikely(!gdp)) + return 0; + + bh = sb_getblk(sb, ext4_inode_table(sb, gdp) + + (ino / inodes_per_block)); + if (unlikely(!bh) || !buffer_uptodate(bh)) + /* + * If the block is not in the buffer cache, then it + * must have been written out. + */ + goto out; + + offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb); + raw_inode = (struct ext4_inode *) (bh->b_data + offset); + dtime = le32_to_cpu(raw_inode->i_dtime); + now = get_seconds(); + if (buffer_dirty(bh)) + recentcy += RECENTCY_DIRTY; + + if (dtime && (dtime < now) && (now < dtime + recentcy)) + ret = 1; +out: + brelse(bh); + return ret; +} + +/* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both * free space and a low directory-to-inode ratio; if that fails, then of @@ -652,6 +703,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, struct inode *ret; ext4_group_t i; ext4_group_t flex_group; + struct ext4_group_info *grp; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) @@ -725,10 +777,22 @@ got_group: continue; } + grp = ext4_get_group_info(sb, group); + /* Skip groups with already-known suspicious inode tables */ + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + if (++group == ngroups) + group = 0; + continue; + } + brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); - if (!inode_bitmap_bh) - goto out; + /* Skip groups with suspicious inode tables */ + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) { + if (++group == ngroups) + group = 0; + continue; + } repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) @@ -741,6 +805,11 @@ repeat_in_this_group: "inode=%lu", ino + 1); continue; } + if ((EXT4_SB(sb)->s_journal == NULL) && + recently_deleted(sb, group, ino)) { + ino++; + goto next_inode; + } if (!handle) { BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(dir->i_sb, line_no, @@ -764,6 +833,7 @@ repeat_in_this_group: ino++; /* the inode bitmap is zero-based */ if (!ret2) goto got; /* we grabbed the inode! */ +next_inode: if (ino < EXT4_INODES_PER_GROUP(sb)) goto repeat_in_this_group; next_group: diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 87b30cd357e7..594009f5f523 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -23,7 +23,6 @@ #include <linux/aio.h> #include "ext4_jbd2.h" #include "truncate.h" -#include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */ #include <trace/events/ext4.h> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c2ca04e67a4f..0d424d7ac02b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -553,7 +553,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, } if (retval > 0) { int ret; - unsigned long long status; + unsigned int status; if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, @@ -653,7 +653,7 @@ found: if (retval > 0) { int ret; - unsigned long long status; + unsigned int status; if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, @@ -727,8 +727,12 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, ret = ext4_map_blocks(handle, inode, &map, flags); if (ret > 0) { + ext4_io_end_t *io_end = ext4_inode_aio(inode); + map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) + set_buffer_defer_completion(bh); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; } @@ -969,7 +973,8 @@ retry_journal: ext4_journal_stop(handle); goto retry_grab; } - wait_on_page_writeback(page); + /* In case writeback began while the page was unlocked */ + wait_for_stable_page(page); if (ext4_should_dioread_nolock(inode)) ret = __block_write_begin(page, pos, len, ext4_get_block_write); @@ -1633,7 +1638,7 @@ add_delayed: set_buffer_delay(bh); } else if (retval > 0) { int ret; - unsigned long long status; + unsigned int status; if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, @@ -1890,12 +1895,32 @@ static int ext4_writepage(struct page *page, return ret; } +static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) +{ + int len; + loff_t size = i_size_read(mpd->inode); + int err; + + BUG_ON(page->index != mpd->first_page); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + clear_page_dirty_for_io(page); + err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc); + if (!err) + mpd->wbc->nr_to_write--; + mpd->first_page++; + + return err; +} + #define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) /* * mballoc gives us at most this number of blocks... * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). - * The rest of mballoc seems to handle chunks upto full group size. + * The rest of mballoc seems to handle chunks up to full group size. */ #define MAX_WRITEPAGES_EXTENT_LEN 2048 @@ -1904,82 +1929,94 @@ static int ext4_writepage(struct page *page, * * @mpd - extent of blocks * @lblk - logical number of the block in the file - * @b_state - b_state of the buffer head added + * @bh - buffer head we want to add to the extent * - * the function is used to collect contig. blocks in same state + * The function is used to collect contig. blocks in the same state. If the + * buffer doesn't require mapping for writeback and we haven't started the + * extent of buffers to map yet, the function returns 'true' immediately - the + * caller can write the buffer right away. Otherwise the function returns true + * if the block has been added to the extent, false if the block couldn't be + * added. */ -static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, - unsigned long b_state) +static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, + struct buffer_head *bh) { struct ext4_map_blocks *map = &mpd->map; - /* Don't go larger than mballoc is willing to allocate */ - if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) - return 0; + /* Buffer that doesn't need mapping for writeback? */ + if (!buffer_dirty(bh) || !buffer_mapped(bh) || + (!buffer_delay(bh) && !buffer_unwritten(bh))) { + /* So far no extent to map => we write the buffer right away */ + if (map->m_len == 0) + return true; + return false; + } /* First block in the extent? */ if (map->m_len == 0) { map->m_lblk = lblk; map->m_len = 1; - map->m_flags = b_state & BH_FLAGS; - return 1; + map->m_flags = bh->b_state & BH_FLAGS; + return true; } + /* Don't go larger than mballoc is willing to allocate */ + if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) + return false; + /* Can we merge the block to our big extent? */ if (lblk == map->m_lblk + map->m_len && - (b_state & BH_FLAGS) == map->m_flags) { + (bh->b_state & BH_FLAGS) == map->m_flags) { map->m_len++; - return 1; + return true; } - return 0; + return false; } -static bool add_page_bufs_to_extent(struct mpage_da_data *mpd, - struct buffer_head *head, - struct buffer_head *bh, - ext4_lblk_t lblk) +/* + * mpage_process_page_bufs - submit page buffers for IO or add them to extent + * + * @mpd - extent of blocks for mapping + * @head - the first buffer in the page + * @bh - buffer we should start processing from + * @lblk - logical number of the block in the file corresponding to @bh + * + * Walk through page buffers from @bh upto @head (exclusive) and either submit + * the page for IO if all buffers in this page were mapped and there's no + * accumulated extent of buffers to map or add buffers in the page to the + * extent of buffers to map. The function returns 1 if the caller can continue + * by processing the next page, 0 if it should stop adding buffers to the + * extent to map because we cannot extend it anymore. It can also return value + * < 0 in case of error during IO submission. + */ +static int mpage_process_page_bufs(struct mpage_da_data *mpd, + struct buffer_head *head, + struct buffer_head *bh, + ext4_lblk_t lblk) { struct inode *inode = mpd->inode; + int err; ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) >> inode->i_blkbits; do { BUG_ON(buffer_locked(bh)); - if (!buffer_dirty(bh) || !buffer_mapped(bh) || - (!buffer_delay(bh) && !buffer_unwritten(bh)) || - lblk >= blocks) { + if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { /* Found extent to map? */ if (mpd->map.m_len) - return false; - if (lblk >= blocks) - return true; - continue; + return 0; + /* Everything mapped so far and we hit EOF */ + break; } - if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state)) - return false; } while (lblk++, (bh = bh->b_this_page) != head); - return true; -} - -static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) -{ - int len; - loff_t size = i_size_read(mpd->inode); - int err; - - BUG_ON(page->index != mpd->first_page); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - clear_page_dirty_for_io(page); - err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc); - if (!err) - mpd->wbc->nr_to_write--; - mpd->first_page++; - - return err; + /* So far everything mapped? Submit the page for IO. */ + if (mpd->map.m_len == 0) { + err = mpage_submit_page(mpd, head->b_page); + if (err < 0) + return err; + } + return lblk < blocks; } /* @@ -2003,8 +2040,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) struct inode *inode = mpd->inode; struct buffer_head *head, *bh; int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits; - ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) - >> inode->i_blkbits; pgoff_t start, end; ext4_lblk_t lblk; sector_t pblock; @@ -2026,7 +2061,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) if (page->index > end) break; - /* Upto 'end' pages must be contiguous */ + /* Up to 'end' pages must be contiguous */ BUG_ON(page->index != start); bh = head = page_buffers(page); do { @@ -2039,18 +2074,26 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) */ mpd->map.m_len = 0; mpd->map.m_flags = 0; - add_page_bufs_to_extent(mpd, head, bh, - lblk); + /* + * FIXME: If dioread_nolock supports + * blocksize < pagesize, we need to make + * sure we add size mapped so far to + * io_end->size as the following call + * can submit the page for IO. + */ + err = mpage_process_page_bufs(mpd, head, + bh, lblk); pagevec_release(&pvec); - return 0; + if (err > 0) + err = 0; + return err; } if (buffer_delay(bh)) { clear_buffer_delay(bh); bh->b_blocknr = pblock++; } clear_buffer_unwritten(bh); - } while (++lblk < blocks && - (bh = bh->b_this_page) != head); + } while (lblk++, (bh = bh->b_this_page) != head); /* * FIXME: This is going to break if dioread_nolock @@ -2199,12 +2242,10 @@ static int mpage_map_and_submit_extent(handle_t *handle, /* Update on-disk size after IO is submitted */ disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; - if (disksize > i_size_read(inode)) - disksize = i_size_read(inode); if (disksize > EXT4_I(inode)->i_disksize) { int err2; - ext4_update_i_disksize(inode, disksize); + ext4_wb_update_i_disksize(inode, disksize); err2 = ext4_mark_inode_dirty(handle, inode); if (err2) ext4_error(inode->i_sb, @@ -2219,7 +2260,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, /* * Calculate the total number of credits to reserve for one writepages * iteration. This is called from ext4_writepages(). We map an extent of - * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping + * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + * bpp - 1 blocks in bpp different extents. */ @@ -2319,14 +2360,10 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) lblk = ((ext4_lblk_t)page->index) << (PAGE_CACHE_SHIFT - blkbits); head = page_buffers(page); - if (!add_page_bufs_to_extent(mpd, head, head, lblk)) + err = mpage_process_page_bufs(mpd, head, head, lblk); + if (err <= 0) goto out; - /* So far everything mapped? Submit the page for IO. */ - if (mpd->map.m_len == 0) { - err = mpage_submit_page(mpd, page); - if (err < 0) - goto out; - } + err = 0; /* * Accumulated enough dirty pages? This doesn't apply @@ -2410,7 +2447,7 @@ static int ext4_writepages(struct address_space *mapping, if (ext4_should_dioread_nolock(inode)) { /* - * We may need to convert upto one extent per block in + * We may need to convert up to one extent per block in * the page and we may dirty the inode. */ rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits); @@ -2646,7 +2683,7 @@ retry_journal: goto retry_grab; } /* In case writeback began while the page was unlocked */ - wait_on_page_writeback(page); + wait_for_stable_page(page); ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); if (ret < 0) { @@ -2991,19 +3028,13 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, } static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private, int ret, - bool is_async) + ssize_t size, void *private) { - struct inode *inode = file_inode(iocb->ki_filp); ext4_io_end_t *io_end = iocb->private; /* if not async direct IO just return */ - if (!io_end) { - inode_dio_done(inode); - if (is_async) - aio_complete(iocb, ret, 0); + if (!io_end) return; - } ext_debug("ext4_end_io_dio(): io_end 0x%p " "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", @@ -3013,11 +3044,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, iocb->private = NULL; io_end->offset = offset; io_end->size = size; - if (is_async) { - io_end->iocb = iocb; - io_end->result = ret; - } - ext4_put_io_end_defer(io_end); + ext4_put_io_end(io_end); } /* @@ -3102,7 +3129,6 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, ret = -ENOMEM; goto retake_lock; } - io_end->flag |= EXT4_IO_END_DIRECT; /* * Grab reference for DIO. Will be dropped in ext4_end_io_dio() */ @@ -3147,13 +3173,6 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { WARN_ON(iocb->private != io_end); WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); - WARN_ON(io_end->iocb); - /* - * Generic code already did inode_dio_done() so we - * have to clear EXT4_IO_END_DIRECT to not do it for - * the second time. - */ - io_end->flag = 0; ext4_put_io_end(io_end); iocb->private = NULL; } @@ -4566,7 +4585,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ext4_journal_stop(handle); } - if (attr->ia_valid & ATTR_SIZE) { + if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { + handle_t *handle; if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -4574,73 +4594,69 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_size > sbi->s_bitmap_maxbytes) return -EFBIG; } - } - - if (S_ISREG(inode->i_mode) && - attr->ia_valid & ATTR_SIZE && - (attr->ia_size < inode->i_size)) { - handle_t *handle; - - handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto err_out; - } - if (ext4_handle_valid(handle)) { - error = ext4_orphan_add(handle, inode); - orphan = 1; - } - EXT4_I(inode)->i_disksize = attr->ia_size; - rc = ext4_mark_inode_dirty(handle, inode); - if (!error) - error = rc; - ext4_journal_stop(handle); - - if (ext4_should_order_data(inode)) { - error = ext4_begin_ordered_truncate(inode, + if (S_ISREG(inode->i_mode) && + (attr->ia_size < inode->i_size)) { + if (ext4_should_order_data(inode)) { + error = ext4_begin_ordered_truncate(inode, attr->ia_size); - if (error) { - /* Do as much error cleanup as possible */ - handle = ext4_journal_start(inode, - EXT4_HT_INODE, 3); - if (IS_ERR(handle)) { - ext4_orphan_del(NULL, inode); + if (error) goto err_out; - } - ext4_orphan_del(handle, inode); - orphan = 0; - ext4_journal_stop(handle); + } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); goto err_out; } - } - } - - if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size != inode->i_size) { - loff_t oldsize = inode->i_size; - - i_size_write(inode, attr->ia_size); - /* - * Blocks are going to be removed from the inode. Wait - * for dio in flight. Temporarily disable - * dioread_nolock to prevent livelock. - */ - if (orphan) { - if (!ext4_should_journal_data(inode)) { - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); - ext4_inode_resume_unlocked_dio(inode); - } else - ext4_wait_for_tail_page_commit(inode); + if (ext4_handle_valid(handle)) { + error = ext4_orphan_add(handle, inode); + orphan = 1; } + down_write(&EXT4_I(inode)->i_data_sem); + EXT4_I(inode)->i_disksize = attr->ia_size; + rc = ext4_mark_inode_dirty(handle, inode); + if (!error) + error = rc; /* - * Truncate pagecache after we've waited for commit - * in data=journal mode to make pages freeable. + * We have to update i_size under i_data_sem together + * with i_disksize to avoid races with writeback code + * running ext4_wb_update_i_disksize(). */ - truncate_pagecache(inode, oldsize, inode->i_size); + if (!error) + i_size_write(inode, attr->ia_size); + up_write(&EXT4_I(inode)->i_data_sem); + ext4_journal_stop(handle); + if (error) { + ext4_orphan_del(NULL, inode); + goto err_out; + } + } else + i_size_write(inode, attr->ia_size); + + /* + * Blocks are going to be removed from the inode. Wait + * for dio in flight. Temporarily disable + * dioread_nolock to prevent livelock. + */ + if (orphan) { + if (!ext4_should_journal_data(inode)) { + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + ext4_inode_resume_unlocked_dio(inode); + } else + ext4_wait_for_tail_page_commit(inode); } - ext4_truncate(inode); + /* + * Truncate pagecache after we've waited for commit + * in data=journal mode to make pages freeable. + */ + truncate_pagecache(inode, inode->i_size); } + /* + * We want to call ext4_truncate() even if attr->ia_size == + * inode->i_size for cases like truncation of fallocated space + */ + if (attr->ia_valid & ATTR_SIZE) + ext4_truncate(inode); if (!rc) { setattr_copy(inode, attr); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index c0427e2f6648..a569d335f804 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -17,7 +17,6 @@ #include <asm/uaccess.h> #include "ext4_jbd2.h" #include "ext4.h" -#include "ext4_extents.h" #define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) @@ -624,6 +623,8 @@ resizefs_out: return 0; } + case EXT4_IOC_PRECACHE_EXTENTS: + return ext4_ext_precache(inode); default: return -ENOTTY; @@ -688,6 +689,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_MOVE_EXT: case FITRIM: case EXT4_IOC_RESIZE_FS: + case EXT4_IOC_PRECACHE_EXTENTS: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4bbbf13bd743..a41e3ba8cfaa 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -751,13 +751,15 @@ void ext4_mb_generate_buddy(struct super_block *sb, if (free != grp->bb_free) { ext4_grp_locked_error(sb, group, 0, 0, - "%u clusters in bitmap, %u in gd", + "%u clusters in bitmap, %u in gd; " + "block bitmap corrupt.", free, grp->bb_free); /* - * If we intent to continue, we consider group descritor + * If we intend to continue, we consider group descriptor * corrupt and update bb_free using bitmap value */ grp->bb_free = free; + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); } mb_set_largest_free_order(sb, grp); @@ -1398,6 +1400,10 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, BUG_ON(last >= (sb->s_blocksize << 3)); assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); + /* Don't bother if the block group is corrupt. */ + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + return; + mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); @@ -1423,7 +1429,11 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, inode ? inode->i_ino : 0, blocknr, "freeing already freed block " - "(bit %u)", block); + "(bit %u); block bitmap corrupt.", + block); + /* Mark the block group as corrupt. */ + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, + &e4b->bd_info->bb_state); mb_regenerate_buddy(e4b); goto done; } @@ -1790,6 +1800,11 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, if (err) return err; + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) { + ext4_mb_unload_buddy(e4b); + return 0; + } + ext4_lock_group(ac->ac_sb, group); max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); @@ -1987,6 +2002,9 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, if (cr <= 2 && free < ac->ac_g_ex.fe_len) return 0; + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) + return 0; + /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { int ret = ext4_mb_init_group(ac->ac_sb, group); @@ -4585,6 +4603,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *gd_bh; ext4_group_t block_group; struct ext4_sb_info *sbi; + struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_buddy e4b; unsigned int count_clusters; int err = 0; @@ -4673,6 +4692,10 @@ do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT( + ext4_get_group_info(sb, block_group)))) + return; + /* * Check to see if we are freeing blocks across a group * boundary. @@ -4784,7 +4807,6 @@ do_more: ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); - percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); @@ -4792,10 +4814,23 @@ do_more: &sbi->s_flex_groups[flex_group].free_clusters); } - ext4_mb_unload_buddy(&e4b); - - if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + if (flags & EXT4_FREE_BLOCKS_RESERVE && ei->i_reserved_data_blocks) { + percpu_counter_add(&sbi->s_dirtyclusters_counter, + count_clusters); + spin_lock(&ei->i_block_reservation_lock); + if (flags & EXT4_FREE_BLOCKS_METADATA) + ei->i_reserved_meta_blocks += count_clusters; + else + ei->i_reserved_data_blocks += count_clusters; + spin_unlock(&ei->i_block_reservation_lock); + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_reclaim_block(inode, + EXT4_C2B(sbi, count_clusters)); + } else if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); + percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); + + ext4_mb_unload_buddy(&e4b); /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 49e8bdff9163..2ae73a80c19b 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -39,7 +39,7 @@ static int finish_range(handle_t *handle, struct inode *inode, newext.ee_block = cpu_to_le32(lb->first_block); newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); ext4_ext_store_pblock(&newext, lb->first_pblock); - path = ext4_ext_find_extent(inode, lb->first_block, NULL); + path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0); if (IS_ERR(path)) { retval = PTR_ERR(path); @@ -494,7 +494,7 @@ int ext4_ext_migrate(struct inode *inode) * superblock modification. * * For the tmp_inode we already have committed the - * trascation that created the inode. Later as and + * transaction that created the inode. Later as and * when we add extents we extent the journal */ /* diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index e86dddbd8296..7fa4d855dbd5 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -37,7 +37,7 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock, int ret = 0; struct ext4_ext_path *path; - path = ext4_ext_find_extent(inode, lblock, *orig_path); + path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE); if (IS_ERR(path)) ret = PTR_ERR(path); else if (path[ext_depth(inode)].p_ext == NULL) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 35f55a0dbc4b..1bec5a5c1e45 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3005,15 +3005,19 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, /* * Anybody can rename anything with this: the permission checks are left to the * higher-level routines. + * + * n.b. old_{dentry,inode) refers to the source dentry/inode + * while new_{dentry,inode) refers to the destination dentry/inode + * This comes from rename(const char *oldpath, const char *newpath) */ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - handle_t *handle; + handle_t *handle = NULL; struct inode *old_inode, *new_inode; struct buffer_head *old_bh, *new_bh, *dir_bh; struct ext4_dir_entry_2 *old_de, *new_de; - int retval, force_da_alloc = 0; + int retval; int inlined = 0, new_inlined = 0; struct ext4_dir_entry_2 *parent_de; @@ -3026,14 +3030,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, * in separate transaction */ if (new_dentry->d_inode) dquot_initialize(new_dentry->d_inode); - handle = ext4_journal_start(old_dir, EXT4_HT_DIR, - (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) - ext4_handle_sync(handle); old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL); /* @@ -3056,6 +3052,18 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, new_bh = NULL; } } + if (new_inode && !test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC)) + ext4_alloc_da_blocks(old_inode); + + handle = ext4_journal_start(old_dir, EXT4_HT_DIR, + (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + ext4_handle_sync(handle); + if (S_ISDIR(old_inode->i_mode)) { if (new_inode) { retval = -ENOTEMPTY; @@ -3186,8 +3194,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_mark_inode_dirty(handle, new_inode); if (!new_inode->i_nlink) ext4_orphan_add(handle, new_inode); - if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC)) - force_da_alloc = 1; } retval = 0; @@ -3195,9 +3201,8 @@ end_rename: brelse(dir_bh); brelse(old_bh); brelse(new_bh); - ext4_journal_stop(handle); - if (retval == 0 && force_da_alloc) - ext4_alloc_da_blocks(old_inode); + if (handle) + ext4_journal_stop(handle); return retval; } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 6625d210fb45..d7d0c7b46ed4 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -123,10 +123,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) ext4_finish_bio(bio); bio_put(bio); } - if (io_end->flag & EXT4_IO_END_DIRECT) - inode_dio_done(io_end->inode); - if (io_end->iocb) - aio_complete(io_end->iocb, io_end->result, 0); kmem_cache_free(io_end_cachep, io_end); } @@ -204,19 +200,14 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) struct workqueue_struct *wq; unsigned long flags; - BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + /* Only reserved conversions from writeback should enter here */ + WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + WARN_ON(!io_end->handle); spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (io_end->handle) { - wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq; - if (list_empty(&ei->i_rsv_conversion_list)) - queue_work(wq, &ei->i_rsv_conversion_work); - list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); - } else { - wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq; - if (list_empty(&ei->i_unrsv_conversion_list)) - queue_work(wq, &ei->i_unrsv_conversion_work); - list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list); - } + wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq; + if (list_empty(&ei->i_rsv_conversion_list)) + queue_work(wq, &ei->i_rsv_conversion_work); + list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); } @@ -256,13 +247,6 @@ void ext4_end_io_rsv_work(struct work_struct *work) ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list); } -void ext4_end_io_unrsv_work(struct work_struct *work) -{ - struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, - i_unrsv_conversion_work); - ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list); -} - ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) { ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b59373b625e9..2c2e6cbc6bed 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -162,7 +162,7 @@ void *ext4_kvmalloc(size_t size, gfp_t flags) { void *ret; - ret = kmalloc(size, flags); + ret = kmalloc(size, flags | __GFP_NOWARN); if (!ret) ret = __vmalloc(size, flags, PAGE_KERNEL); return ret; @@ -172,7 +172,7 @@ void *ext4_kvzalloc(size_t size, gfp_t flags) { void *ret; - ret = kzalloc(size, flags); + ret = kzalloc(size, flags | __GFP_NOWARN); if (!ret) ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); return ret; @@ -762,9 +762,7 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_li_request(sb); dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - flush_workqueue(sbi->unrsv_conversion_wq); flush_workqueue(sbi->rsv_conversion_wq); - destroy_workqueue(sbi->unrsv_conversion_wq); destroy_workqueue(sbi->rsv_conversion_wq); if (sbi->s_journal) { @@ -875,14 +873,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) #endif ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_rsv_conversion_list); - INIT_LIST_HEAD(&ei->i_unrsv_conversion_list); spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; atomic_set(&ei->i_ioend_count, 0); atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); - INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work); return &ei->vfs_inode; } @@ -1134,8 +1130,8 @@ enum { Opt_nouid32, Opt_debug, Opt_removed, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, - Opt_commit, Opt_min_batch_time, Opt_max_batch_time, - Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, + Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, + Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, @@ -1179,6 +1175,7 @@ static const match_table_t tokens = { {Opt_min_batch_time, "min_batch_time=%u"}, {Opt_max_batch_time, "max_batch_time=%u"}, {Opt_journal_dev, "journal_dev=%u"}, + {Opt_journal_path, "journal_path=%s"}, {Opt_journal_checksum, "journal_checksum"}, {Opt_journal_async_commit, "journal_async_commit"}, {Opt_abort, "abort"}, @@ -1338,6 +1335,7 @@ static int clear_qf_name(struct super_block *sb, int qtype) #define MOPT_NO_EXT2 0x0100 #define MOPT_NO_EXT3 0x0200 #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) +#define MOPT_STRING 0x0400 static const struct mount_opts { int token; @@ -1387,6 +1385,7 @@ static const struct mount_opts { {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, {Opt_journal_dev, 0, MOPT_GTE0}, + {Opt_journal_path, 0, MOPT_STRING}, {Opt_journal_ioprio, 0, MOPT_GTE0}, {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, @@ -1480,7 +1479,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, return -1; } - if (args->from && match_int(args, &arg)) + if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg)) return -1; if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) return -1; @@ -1544,6 +1543,44 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, return -1; } *journal_devnum = arg; + } else if (token == Opt_journal_path) { + char *journal_path; + struct inode *journal_inode; + struct path path; + int error; + + if (is_remount) { + ext4_msg(sb, KERN_ERR, + "Cannot specify journal on remount"); + return -1; + } + journal_path = match_strdup(&args[0]); + if (!journal_path) { + ext4_msg(sb, KERN_ERR, "error: could not dup " + "journal device string"); + return -1; + } + + error = kern_path(journal_path, LOOKUP_FOLLOW, &path); + if (error) { + ext4_msg(sb, KERN_ERR, "error: could not find " + "journal device path: error %d", error); + kfree(journal_path); + return -1; + } + + journal_inode = path.dentry->d_inode; + if (!S_ISBLK(journal_inode->i_mode)) { + ext4_msg(sb, KERN_ERR, "error: journal path %s " + "is not a block device", journal_path); + path_put(&path); + kfree(journal_path); + return -1; + } + + *journal_devnum = new_encode_dev(journal_inode->i_rdev); + path_put(&path); + kfree(journal_path); } else if (token == Opt_journal_ioprio) { if (arg > 7) { ext4_msg(sb, KERN_ERR, "Invalid journal IO priority" @@ -3954,14 +3991,6 @@ no_journal: goto failed_mount4; } - EXT4_SB(sb)->unrsv_conversion_wq = - alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); - if (!EXT4_SB(sb)->unrsv_conversion_wq) { - printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); - ret = -ENOMEM; - goto failed_mount4; - } - /* * The jbd2_journal_load will have done any necessary log recovery, * so we can safely mount the rest of the filesystem now. @@ -4115,8 +4144,6 @@ failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); - if (EXT4_SB(sb)->unrsv_conversion_wq) - destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq); failed_mount_wq: if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); @@ -4564,7 +4591,6 @@ static int ext4_sync_fs(struct super_block *sb, int wait) trace_ext4_sync_fs(sb, wait); flush_workqueue(sbi->rsv_conversion_wq); - flush_workqueue(sbi->unrsv_conversion_wq); /* * Writeback quota in non-journalled quota case - journalled quota has * no dirty dquots @@ -4600,7 +4626,6 @@ static int ext4_sync_fs_nojournal(struct super_block *sb, int wait) trace_ext4_sync_fs(sb, wait); flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq); - flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq); dquot_writeback_dquots(sb, -1); if (wait && test_opt(sb, BARRIER)) ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 66a6b85a51d8..bb312201ca95 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -182,7 +182,7 @@ const struct address_space_operations f2fs_meta_aops = { .set_page_dirty = f2fs_set_meta_page_dirty, }; -int check_orphan_space(struct f2fs_sb_info *sbi) +int acquire_orphan_inode(struct f2fs_sb_info *sbi) { unsigned int max_orphans; int err = 0; @@ -197,10 +197,19 @@ int check_orphan_space(struct f2fs_sb_info *sbi) mutex_lock(&sbi->orphan_inode_mutex); if (sbi->n_orphans >= max_orphans) err = -ENOSPC; + else + sbi->n_orphans++; mutex_unlock(&sbi->orphan_inode_mutex); return err; } +void release_orphan_inode(struct f2fs_sb_info *sbi) +{ + mutex_lock(&sbi->orphan_inode_mutex); + sbi->n_orphans--; + mutex_unlock(&sbi->orphan_inode_mutex); +} + void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct list_head *head, *this; @@ -229,21 +238,18 @@ retry: list_add(&new->list, this->prev); else list_add_tail(&new->list, head); - - sbi->n_orphans++; out: mutex_unlock(&sbi->orphan_inode_mutex); } void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *this, *next, *head; + struct list_head *head; struct orphan_inode_entry *orphan; mutex_lock(&sbi->orphan_inode_mutex); head = &sbi->orphan_inode_list; - list_for_each_safe(this, next, head) { - orphan = list_entry(this, struct orphan_inode_entry, list); + list_for_each_entry(orphan, head, list) { if (orphan->ino == ino) { list_del(&orphan->list); kmem_cache_free(orphan_entry_slab, orphan); @@ -373,7 +379,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, if (!f2fs_crc_valid(crc, cp_block, crc_offset)) goto invalid_cp1; - pre_version = le64_to_cpu(cp_block->checkpoint_ver); + pre_version = cur_cp_version(cp_block); /* Read the 2nd cp block in this CP pack */ cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; @@ -388,7 +394,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, if (!f2fs_crc_valid(crc, cp_block, crc_offset)) goto invalid_cp2; - cur_version = le64_to_cpu(cp_block->checkpoint_ver); + cur_version = cur_cp_version(cp_block); if (cur_version == pre_version) { *version = cur_version; @@ -793,7 +799,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) * Increase the version number so that * SIT entries and seg summaries are written at correct place */ - ckpt_ver = le64_to_cpu(ckpt->checkpoint_ver); + ckpt_ver = cur_cp_version(ckpt); ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver); /* write cached NAT/SIT entries to NAT/SIT area */ diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 035f9a345cdf..941f9b9ca3a5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -37,9 +37,9 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr) struct page *node_page = dn->node_page; unsigned int ofs_in_node = dn->ofs_in_node; - wait_on_page_writeback(node_page); + f2fs_wait_on_page_writeback(node_page, NODE, false); - rn = (struct f2fs_node *)page_address(node_page); + rn = F2FS_NODE(node_page); /* Get physical address of data block */ addr_array = blkaddr_in_node(rn); @@ -117,7 +117,8 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) block_t start_blkaddr, end_blkaddr; BUG_ON(blk_addr == NEW_ADDR); - fofs = start_bidx_of_node(ofs_of_node(dn->node_page)) + dn->ofs_in_node; + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + + dn->ofs_in_node; /* Update the page address in the parent node */ __set_data_blkaddr(dn, blk_addr); @@ -176,7 +177,6 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) end_update: write_unlock(&fi->ext.ext_lock); sync_inode_page(dn); - return; } struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) @@ -260,8 +260,17 @@ repeat: if (PageUptodate(page)) return page; - BUG_ON(dn.data_blkaddr == NEW_ADDR); - BUG_ON(dn.data_blkaddr == NULL_ADDR); + /* + * A new dentry page is allocated but not able to be written, since its + * new inode page couldn't be allocated due to -ENOSPC. + * In such the case, its blkaddr can be remained as NEW_ADDR. + * see, f2fs_add_link -> get_new_data_page -> init_inode_metadata. + */ + if (dn.data_blkaddr == NEW_ADDR) { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + return page; + } err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); if (err) @@ -365,7 +374,6 @@ static void read_end_io(struct bio *bio, int err) } unlock_page(page); } while (bvec >= bio->bi_io_vec); - kfree(bio->bi_private); bio_put(bio); } @@ -391,7 +399,6 @@ int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, bio->bi_end_io = read_end_io; if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { - kfree(bio->bi_private); bio_put(bio); up_read(&sbi->bio_sem); f2fs_put_page(page, 1); @@ -442,7 +449,7 @@ static int get_data_block_ro(struct inode *inode, sector_t iblock, unsigned int end_offset; end_offset = IS_INODE(dn.node_page) ? - ADDRS_PER_INODE : + ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_ |