您的位置：首页 > Web前端 > Node.js

inode缓存与dentry缓存

2014-01-30 10:45 639 查看

1. inode缓存

/*
 * In-memory inode object as quoted by this article.
 *
 * Trailing notes marked "see ..." describe only how the functions quoted
 * later in this article use the field; notes marked "presumably" are
 * assumptions to confirm against the kernel tree this excerpt was taken from.
 */
struct inode{

/* RCU path lookup touches following: */

umode_t            i_mode;

uid_t            i_uid;

gid_t            i_gid;

const struct inode_operations*i_op;

struct super_block*i_sb; /* iput_final() reads i_sb->s_op and i_sb->s_flags */

spinlock_t        i_lock;/* i_blocks, i_bytes, maybe i_size */

unsigned int        i_flags;

unsigned long        i_state; /* I_NEW, I_DIRTY, I_SYNC, I_REFERENCED, I_WILL_FREE, I_FREEING bits; see iput_final()/prune_icache() */

#ifdef CONFIG_SECURITY

void*i_security;

#endif

struct mutex        i_mutex;

unsigned long        dirtied_when;/* jiffies of first dirtying */

struct hlist_node    i_hash; /* presumably the link tested by inode_unhashed() - confirm */

struct list_head    i_wb_list;/* backing dev IO list */

struct list_head    i_lru;/*inode LRU list */

struct list_head    i_sb_list; /* presumably the link on super_block->s_inodes - confirm */

union{

struct list_head    i_dentry;

struct rcu_head        i_rcu;

};

unsigned long        i_ino;

atomic_t        i_count; /* prune_icache() skips inodes whose count is non-zero */

unsigned int        i_nlink; /* generic_drop_inode() asks for a drop when this is zero */

dev_t            i_rdev;

unsigned int        i_blkbits;

u64            i_version;

loff_t            i_size;

#ifdef __NEED_I_SIZE_ORDERED

seqcount_t        i_size_seqcount;

#endif

struct timespec        i_atime;

struct timespec        i_mtime;

struct timespec        i_ctime;

blkcnt_t        i_blocks;

unsigned short          i_bytes;

struct rw_semaphore    i_alloc_sem;

const struct file_operations*i_fop;/* former ->i_op->default_file_ops */

struct file_lock*i_flock;

struct address_space*i_mapping;

struct address_space    i_data; /* prune_icache() checks i_data.nrpages and invalidates its pages */

#ifdef CONFIG_QUOTA

struct dquot*i_dquot[MAXQUOTAS];

#endif

struct list_head    i_devices;

union{

struct pipe_inode_info*i_pipe;

struct block_device*i_bdev;

struct cdev*i_cdev;

};

__u32            i_generation;

#ifdef CONFIG_FSNOTIFY

__u32            i_fsnotify_mask;/* all events this inode cares about */

struct hlist_head    i_fsnotify_marks;

#endif

#ifdef CONFIG_IMA

atomic_t        i_readcount;/* struct files open RO */

#endif

atomic_t        i_writecount;

#ifdef CONFIG_FS_POSIX_ACL

struct posix_acl*i_acl;

struct posix_acl*i_default_acl;

#endif

void*i_private;/* fs or device private pointer */

};

inode可能处于三种状态：

1）unused，里面没有保存有效的内容，可以被复用为新的用途；

2）in use，正在被使用，其成员i_count以及i_nlink一定大于0，此时inode与文件系统或者说设备上的文件相关联，但是自从上次与设备同步后，内容没有发生改变，即不是dirty的；

3）dirty，inode里面的内容已经与文件系统中的文件内容不一致了，即脏了，需要进行文件同步操作。

前两种状态的inode都各自位于一个全局的链表中，而第三种的inode位于super_block结构体中的一个链表中。

先看inode结构体中的一个成员：

struct list_head i_lru; /* inode LRU list */

对应着一个全局的链表：

static LIST_HEAD(inode_lru);

static DEFINE_SPINLOCK(inode_lru_lock);

/*

* Called when we're dropping the last reference

* to an inode.

* Call the FS "drop_inode()" function, defaulting to

* the legacy UNIX filesystem behaviour.  If it tells

* us to evict inode, do so.  Otherwise, retain inode

* in cache if fs is alive, sync and evict if fs is

* shutting down.

*/

static void iput_final(struct inode *inode)

struct super_block *sb = inode->i_sb;

const struct super_operations *op = inode->i_sb->s_op;

int drop;

WARN_ON(inode->i_state & I_NEW);

if (op && op->drop_inode)

drop = op->drop_inode(inode);

else

drop = generic_drop_inode(inode);

     if (!drop && (sb->s_flags & MS_ACTIVE)){

inode->i_state |= I_REFERENCED;

if (!(inode->i_state & (I_DIRTY|I_SYNC)))

inode_lru_list_add(inode);

spin_unlock(&inode->i_lock);

return;

     if (!drop){

inode->i_state |= I_WILL_FREE;

spin_unlock(&inode->i_lock);

write_inode_now(inode, 1);

spin_lock(&inode->i_lock);

    WARN_ON(inode->i_state & I_NEW);

inode->i_state &= ~I_WILL_FREE;

inode->i_state |= I_FREEING;

inode_lru_list_del(inode);

spin_unlock(&inode->i_lock);

evict(inode);

函数iput_final是在当inode没有被任何地方引用后，即变成了unused状态后，回收inode的机制。

if (op && op->drop_inode)

       drop = op->drop_inode(inode);

   else

       drop = generic_drop_inode(inode);

drop非0时，表示i_nlink为0，或者inode已经不在inode_hashtable的哈希链表中（两个条件满足其一即可），即这个inode可以被释放掉；drop为0则表示inode应当继续保留。

/*

* Normal UNIX filesystem behaviour: delete the

* inode when the usage count drops to zero, and

* i_nlink is zero.

*/

int generic_drop_inode(struct inode *inode)

return !inode->i_nlink || inode_unhashed(inode);

EXPORT_SYMBOL_GPL(generic_drop_inode);

if (!drop && (sb->s_flags & MS_ACTIVE)){

       inode->i_state |= I_REFERENCED;

       if (!(inode->i_state & (I_DIRTY|I_SYNC)))

          inode_lru_list_add(inode);

       spin_unlock(&inode->i_lock);

       return;

   }

如果drop为0（inode不需要立即释放），并且superblock仍处于活动状态（s_flags中设置了MS_ACTIVE），就先给inode打上I_REFERENCED标志；只要inode不处于I_DIRTY或I_SYNC状态，就调用inode_lru_list_add将inode添加到unused列表中，即将inode缓存起来。

否则，如果drop为0（此时superblock已不再活动，即文件系统正在关闭），就先调用write_inode_now将inode写回到磁盘上；随后无论drop取何值，都会设置I_FREEING标志，调用inode_lru_list_del将inode从inode_lru缓存列表中摘除，最后调用evict函数将inode彻底删除。

static void inode_lru_list_add(struct inode *inode)

spin_lock(&inode_lru_lock);

     if (list_empty(&inode->i_lru)){

list_add(&inode->i_lru, &inode_lru);

inodes_stat.nr_unused++;

spin_unlock(&inode_lru_lock);

因此inode_lru就是全局的unused inode列表，通过“Least Recently Used”的顺序保存。

此外，操作inode_lru的函数还有prune_icache

/*

* Scan `goal' inodes on the unused list for freeable ones. They are moved to a

* temporary list and then are freed outside inode_lru_lock by dispose_list().

* Any inodes which are pinned purely because of attached pagecache have their

* pagecache removed.  If the inode has metadata buffers attached to

* mapping->private_list then try to remove them.

* If the inode has the I_REFERENCED flag set, then it means that it has been

* used recently - the flag is set in iput_final(). When we encounter such an

* inode, clear the flag and move it to the back of the LRU so it gets another

* pass through the LRU before it gets reclaimed. This is necessary because of

* the fact we are doing lazy LRU updates to minimise lock contention so the

* LRU does not have strict ordering. Hence we don't want to reclaim inodes

* with this flag set because they are the inodes that are out of order.

*/

static void prune_icache(int nr_to_scan)

LIST_HEAD(freeable);

int nr_scanned;

unsigned long reap = 0;

down_read(&iprune_sem);

spin_lock(&inode_lru_lock);

     for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++){

struct inode *inode;

if (list_empty(&inode_lru))

break;

inode = list_entry(inode_lru.prev, struct inode, i_lru);

/*

* we are inverting the inode_lru_lock/inode->i_lock here,

* so use a trylock. If we fail to get the lock, just move the

* inode to the back of the list so we don't spin on it.

*/

         if (!spin_trylock(&inode->i_lock)){

list_move(&inode->i_lru, &inode_lru);

continue;

/*

* Referenced or dirty inodes are still in use. Give them

* another pass through the LRU as we canot reclaim them now.

*/

if (atomic_read(&inode->i_count) ||

             (inode->i_state & ~I_REFERENCED)){

list_del_init(&inode->i_lru);

    spin_unlock(&inode->i_lock);

inodes_stat.nr_unused--;

continue;

 /*recently referenced inodes get one more pass */

         if (inode->i_state & I_REFERENCED){

inode->i_state &= ~I_REFERENCED;

list_move(&inode->i_lru, &inode_lru);

    spin_unlock(&inode->i_lock);

continue;

         if (inode_has_buffers(inode) || inode->i_data.nrpages){

__iget(inode);

    spin_unlock(&inode->i_lock);

   spin_unlock(&inode_lru_lock);

if (remove_inode_buffers(inode))

reap += invalidate_mapping_pages(&inode->i_data,

0, -1);

iput(inode);

   spin_lock(&inode_lru_lock);

if (inode != list_entry(inode_lru.next,

struct inode, i_lru))

    continue;/* wrong inode or list_empty */

/* avoid lock inversions with trylock */

if (!spin_trylock(&inode->i_lock))

    continue;

             if (!can_unuse(inode)){

   spin_unlock(&inode->i_lock);

    continue;

    WARN_ON(inode->i_state & I_NEW);

    inode->i_state |= I_FREEING;

spin_unlock(&inode->i_lock);

list_move(&inode->i_lru, &freeable);

inodes_stat.nr_unused--;

if (current_is_kswapd())

__count_vm_events(KSWAPD_INODESTEAL, reap);

else

__count_vm_events(PGINODESTEAL, reap);

spin_unlock(&inode_lru_lock);

dispose_list(&freeable);

up_read(&iprune_sem);

该函数的作用是在内存压力较大时，通过缩减缓存的inode列表inode_lru以释放出更多的内存。

该函数每次从inode_lru链表的尾部（inode_lru.prev，即最久未被使用的一端）取出一个inode，做一些简单检查；如果inode还有一些原因需要继续存在在缓存中，就用list_move将该inode移回链表的头部（内核注释中称之为LRU的“back”），然后检查下一个inode。

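使得inode在本次扫描中不被回收的原因包括：无法获取到操作inode中数据的锁i_lock（移回链表头部，稍后重试）；inode的使用计数非0，或者i_state中除I_REFERENCED之外还设置了其他标志（例如inode中的数据是脏的）——这种情况下inode会被直接从inode_lru中摘除，而不是继续留在链表里；inode刚刚被引用过（设置了I_REFERENCED，清除该标志后再给它一次机会）等等。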

还有一个比较实用的问题，我们看到在调用iput_final时，只有当i_nlink不为0、并且inode仍然挂在哈希链表中（即drop为0），同时文件系统仍处于活动状态时，才会将其放到缓存inode_lru中；而在prune_icache时，还会检查i_count引用计数是否为0。

这也就是说，如果一个inode对应的磁盘文件已经被删除了，但是还有进程对其进行操作的话，那么它不会被直接删除，而是会保存在缓存中，也就是说对其操作的进程仍然可以对已经缓存下来的数据页面page进行操作。

直到没有进程再对其进行操作了，才有可能被清除出缓存。

inode中有两个链表头元素，分别是i_sb_list和i_wb_list，其中i_sb_list是super_block->s_inodes列表的元素，而i_wb_list是用于维护设备的后备inode列表。

2. dentry缓存

dentry缓存的目的，为了减少对慢速磁盘的访问，每当VFS文件系统对底层的数据进行访问时，都会将访问的结果缓存下来，保存成一个dentry对象。

而且dentry对象的组织与管理，是和inode缓存极其相似的，也有一个hash表，和一个lru队列。

而且当内存压力较大时，也会调用prune_dcache来企图释放lru中优先级较低的dentry项目。

区别在于，inode是不需要维护目录的关系的，但是dentry需要，因此dentry的组织比inode要复杂。

static struct hlist_bl_head *dentry_hashtable __read_mostly;

在super_block中

/* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */

     struct list_head    s_dentry_lru;/* unused dentry lru */

因此，保存dentry全局hash表的数据结构是全局的，而保存dentry缓存的数据结构是存在于super_block数据结构中。

/*

* dentry_lru_(add|del|move_tail) must be called with d_lock held.

*/

static void dentry_lru_add(struct dentry *dentry)

     if (list_empty(&dentry->d_lru)){

spin_lock(&dcache_lru_lock);

list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);

dentry->d_sb->s_nr_dentry_unused++;

dentry_stat.nr_unused++;

spin_unlock(&dcache_lru_lock);

dentry_lru_add函数用于向dentry缓存中添加一个释放的dentry，它被函数dput调用。

/*

* This is dput

* This is complicated by the fact that we do not want to put

* dentries that are no longer on any hash chain on the unused

* list: we'd much rather just get rid of them immediately.

* However, that implies that we have to traverse the dentry

* tree upwards to the parents which might _also_ now be

* scheduled for deletion (it may have been only waiting for

* its last child to go away).

* This tail recursion is done by hand as we don't want to depend

* on the compiler to always get this right (gcc generally doesn't).

* Real recursion would eat up our stack space.

*/

/*

* dput - release a dentry

* @dentry: dentry to release

* Release a dentry. This will drop the usage count and if appropriate

* call the dentry unlink method as well as removing it from the queues and

* releasing its resources. If the parent dentries were scheduled for release

* they too may now get deleted.

*/

void dput(struct dentry *dentry)

if (!dentry)

return;

repeat:

if (dentry->d_count == 1)

might_sleep();

spin_lock(&dentry->d_lock);

BUG_ON(!dentry->d_count);

     if (dentry->d_count > 1){

dentry->d_count--;

spin_unlock(&dentry->d_lock);

return;

     if (dentry->d_flags & DCACHE_OP_DELETE){

if (dentry->d_op->d_delete(dentry))

goto kill_it;

/*Unreachable? Get rid of it */

if (d_unhashed(dentry))

goto kill_it;

/*Otherwise leave it cached and ensure it's on the LRU */

dentry->d_flags |= DCACHE_REFERENCED;

dentry_lru_add(dentry);

dentry->d_count--;

spin_unlock(&dentry->d_lock);

return;

kill_it:

dentry = dentry_kill(dentry, 1);

if (dentry)

goto repeat;

EXPORT_SYMBOL(dput);

所有的dentry实例会形成一个网络，用于反映文件系统的结构。

d_subdirs成员，里面保存着所有的子目录以及该目录下的文件组成的列表。

d_child成员，是该dentry链接到其父目录的dentry节点的锚点。

这两个成员，是构成文件系统的层次结构的基本设施。

if (dentry->d_count == 1)

might_sleep();

参考：http://yuxu9710108.blog.163.com/blog/static/23751534201011715413404/

用于调试时，提示atomic context的可能睡眠情况。

分析dput函数的逻辑：

如果dentry的引用计数大于1，那么代表还有其他的地方在使用这个dentry，因此只减少引用计数，直接返回；

如果dentry->d_flags里面设置了DCACHE_OP_DELETE标志，那么先调用d_op->d_delete函数指针进行判断；只有当它返回非0值时，才跳转到kill_it，调用dentry_kill来处理；

【d_op->d_delete与dentry_kill在功能上有什么不同？】

如果在全局的hash表中也已经找不到该dentry了，那么直接调用dentry_kill来处理；

如果dentry的引用计数为1，而且也不属于上面两种需要调用dentry_kill的情况，那么就将其缓存在super_block的LRU队列中。

我们看一种可能的d_delete的实现

/*

* This is called from dput() when d_count is going to 0.

*/

static int nfs_dentry_delete(const struct dentry *dentry)

dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n",

dentry->d_parent->d_name.name, dentry->d_name.name,

dentry->d_flags);

/*Unhash any dentry with a stale inode */

if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode))

return 1;

     if (dentry->d_flags & DCACHE_NFSFS_RENAMED){

 /*Unhash it, so that ->d_iput() would be called */

return 1;

     if (!(dentry->d_sb->s_flags & MS_ACTIVE)){

 /*Unhash it, so that ancestors of killed async unlink

* files will be cleaned up during umount */

return 1;

return 0;

可见，该函数是进行一些内部的判断，决定是否需要将该dentry从全局的hash表中删除掉。

if (dentry->d_flags & DCACHE_OP_DELETE){

       if (dentry->d_op->d_delete(dentry))

           goto kill_it;

   }

/*

* Finish off a dentry we've decided to kill.

* dentry->d_lock must be held, returns with it unlocked.

* If ref is non-zero, then decrement the refcount too.

* Returns dentry requiring refcount drop, or NULL if we're done.

*/

static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)

__releases(dentry->d_lock)

struct inode *inode;

struct dentry *parent;

inode = dentry->d_inode;

     if (inode && !spin_trylock(&inode->i_lock)){

relock:

spin_unlock(&dentry->d_lock);

cpu_relax();

         return dentry;/* try again with same dentry */

if (IS_ROOT(dentry))

parent = NULL;

else

parent = dentry->d_parent;

     if (parent && !spin_trylock(&parent->d_lock)){

if (inode)

    spin_unlock(&inode->i_lock);

goto relock;

if (ref)

dentry->d_count--;

/*if dentry was on the d_lru list delete it from there */

dentry_lru_del(dentry);

/*if it was on the hash then remove it */

__d_drop(dentry);

return d_kill(dentry, parent);

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航