您的位置:首页 > Web前端 > Node.js

inode缓存与dentry缓存

2014-01-30 10:45 639 查看

1. inode缓存

/*
 * struct inode - in-memory inode object discussed in this article.
 *
 * Members are listed in their original order; optional members remain
 * guarded by the same configuration symbols.
 */
struct inode {
	/* RCU path lookup touches following: */
	umode_t				i_mode;
	uid_t				i_uid;
	gid_t				i_gid;
	const struct inode_operations	*i_op;
	struct super_block		*i_sb;

	spinlock_t			i_lock;	/* i_blocks, i_bytes, maybe i_size */
	unsigned int			i_flags;
	unsigned long			i_state;	/* I_NEW, I_DIRTY, I_FREEING, ... bits */
#ifdef CONFIG_SECURITY
	void				*i_security;
#endif
	struct mutex			i_mutex;

	unsigned long			dirtied_when;	/* jiffies of first dirtying */

	struct hlist_node		i_hash;
	struct list_head		i_wb_list;	/* backing dev IO list */
	struct list_head		i_lru;		/* inode LRU list */
	struct list_head		i_sb_list;
	union {
		struct list_head	i_dentry;
		struct rcu_head		i_rcu;
	};
	unsigned long			i_ino;
	atomic_t			i_count;
	unsigned int			i_nlink;
	dev_t				i_rdev;
	unsigned int			i_blkbits;
	u64				i_version;
	loff_t				i_size;
#ifdef __NEED_I_SIZE_ORDERED
	seqcount_t			i_size_seqcount;
#endif
	struct timespec			i_atime;
	struct timespec			i_mtime;
	struct timespec			i_ctime;
	blkcnt_t			i_blocks;
	unsigned short			i_bytes;
	struct rw_semaphore		i_alloc_sem;
	const struct file_operations	*i_fop;	/* former ->i_op->default_file_ops */
	struct file_lock		*i_flock;
	struct address_space		*i_mapping;
	struct address_space		i_data;
#ifdef CONFIG_QUOTA
	struct dquot			*i_dquot[MAXQUOTAS];
#endif
	struct list_head		i_devices;
	union {
		struct pipe_inode_info	*i_pipe;
		struct block_device	*i_bdev;
		struct cdev		*i_cdev;
	};

	__u32				i_generation;

#ifdef CONFIG_FSNOTIFY
	__u32				i_fsnotify_mask;	/* all events this inode cares about */
	struct hlist_head		i_fsnotify_marks;
#endif

#ifdef CONFIG_IMA
	atomic_t			i_readcount;	/* struct files open RO */
#endif
	atomic_t			i_writecount;
#ifdef CONFIG_FS_POSIX_ACL
	struct posix_acl		*i_acl;
	struct posix_acl		*i_default_acl;
#endif
	void				*i_private;	/* fs or device private pointer */
};


inode可能处于三种状态:

1)unused,里面没有保存有效的内容,可以被复用为新的用途;

2)in use,正在被使用,其成员i_count以及i_nlink一定大于0,此时inode与文件系统或者说设备上的文件相关联,但是自从上次与设备同步后,内容没有发生改变,即不是dirty的;

3)dirty,inode里面的内容已经与文件系统中的文件内容不一致了,即脏了,需要进行文件同步操作。

 

前两种状态的inode都各自位于一个全局的链表中,而第三种的inode位于super_block结构体中的一个链表中。

 

先看inode结构体中的一个成员:


struct list_head   i_lru;       /* inode LRU list */

 



对应着一个全局的链表:


static LIST_HEAD(inode_lru);

static DEFINE_SPINLOCK(inode_lru_lock);

 



/*
 * iput_final - handle an inode whose last reference is being dropped.
 * @inode: the inode being released
 *
 * The filesystem's ->drop_inode() hook decides whether the inode must be
 * evicted; when the hook is missing, generic_drop_inode() (legacy UNIX
 * behaviour) is used instead.
 *
 *  - Keep, and the superblock is still MS_ACTIVE: mark the inode
 *    I_REFERENCED, put it on the LRU unless it is dirty or under sync,
 *    and return with the inode retained in the cache.
 *  - Keep, but the filesystem is shutting down: sync the inode, then evict.
 *  - Drop: evict.
 *
 * Every path releases inode->i_lock and nothing here acquires it first, so
 * the function is entered with inode->i_lock held.
 */
static void iput_final(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	const struct super_operations *op = sb->s_op;
	int drop;

	WARN_ON(inode->i_state & I_NEW);

	drop = (op && op->drop_inode) ? op->drop_inode(inode)
				      : generic_drop_inode(inode);

	if (!drop) {
		if (sb->s_flags & MS_ACTIVE) {
			/* Filesystem is alive: keep the inode cached. */
			inode->i_state |= I_REFERENCED;
			if (!(inode->i_state & (I_DIRTY|I_SYNC)))
				inode_lru_list_add(inode);
			spin_unlock(&inode->i_lock);
			return;
		}

		/*
		 * Filesystem is going away: write the inode back first.
		 * i_lock is dropped around the writeback and I_WILL_FREE
		 * is set for its duration.
		 */
		inode->i_state |= I_WILL_FREE;
		spin_unlock(&inode->i_lock);
		write_inode_now(inode, 1);
		spin_lock(&inode->i_lock);
		WARN_ON(inode->i_state & I_NEW);
		inode->i_state &= ~I_WILL_FREE;
	}

	/* Take the inode off the LRU and tear it down outside i_lock. */
	inode->i_state |= I_FREEING;
	inode_lru_list_del(inode);
	spin_unlock(&inode->i_lock);

	evict(inode);
}


函数iput_final是在当inode没有被任何地方引用后,即变成了unused状态后,回收inode的机制。


if (op && op->drop_inode)

       drop = op->drop_inode(inode);

   else

       drop = generic_drop_inode(inode);



drop非0时,表示i_nlink为0,或者inode已经不在inode_hashtable的拉链表中(inode_unhashed),即这个inode可以被释放掉;drop为0时,表示这个inode还应当被保留。

/*
 * generic_drop_inode - default ->drop_inode() policy.
 * @inode: inode whose usage count has dropped to zero
 *
 * Normal UNIX filesystem behaviour: the inode is to be deleted when
 * i_nlink is zero.  An inode that is no longer hashed is dropped as well.
 *
 * Returns 1 when the inode should be dropped, 0 when it should be kept.
 */
int generic_drop_inode(struct inode *inode)
{
	if (inode->i_nlink == 0)
		return 1;

	return inode_unhashed(inode) ? 1 : 0;
}


EXPORT_SYMBOL_GPL(generic_drop_inode);



if (!drop && (sb->s_flags & MS_ACTIVE)){

       inode->i_state |= I_REFERENCED;

       if (!(inode->i_state & (I_DIRTY|I_SYNC)))

          inode_lru_list_add(inode);

       spin_unlock(&inode->i_lock);

       return;

   }



如果drop为0,并且superblock还处于活动状态(s_flags中设置了MS_ACTIVE),就给inode打上I_REFERENCED标记;只要inode不是脏的、也没有正在同步(没有I_DIRTY|I_SYNC),就调用inode_lru_list_add将inode添加到unused列表中,即将inode缓存起来。

否则,如果drop为0(此时文件系统正在被关闭),就先调用write_inode_now写回到磁盘上;然后无论drop是否为0,都会设置I_FREEING,调用inode_lru_list_del将已经缓存下来的inode从LRU中删除掉,最后调用evict函数将inode彻底删除。

/*
 * inode_lru_list_add - put an inode on the global unused-inode LRU.
 * @inode: inode to cache
 *
 * An inode that is already linked on a list through i_lru is left alone.
 * Otherwise it is inserted at the head of inode_lru and the unused-inode
 * counter is incremented.  All of this happens under inode_lru_lock.
 */
static void inode_lru_list_add(struct inode *inode)
{
	spin_lock(&inode_lru_lock);
	if (!list_empty(&inode->i_lru))
		goto out_unlock;

	list_add(&inode->i_lru, &inode_lru);
	inodes_stat.nr_unused++;

out_unlock:
	spin_unlock(&inode_lru_lock);
}


因此inode_lru就是全局的unused inode列表,通过“Least Recently Used”的顺序保存。

 

此外,操作inode_lru的函数还有prune_icache

/*
 * prune_icache - shrink the unused-inode LRU.
 * @nr_to_scan: upper bound on the number of loop iterations (inodes examined)
 *
 * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
 * temporary list and then are freed outside inode_lru_lock by dispose_list().
 *
 * Any inodes which are pinned purely because of attached pagecache have their
 * pagecache removed.  If the inode has metadata buffers attached to
 * mapping->private_list then try to remove them.
 *
 * If the inode has the I_REFERENCED flag set, then it means that it has been
 * used recently - the flag is set in iput_final(). When we encounter such an
 * inode, clear the flag and move it to the back of the LRU so it gets another
 * pass through the LRU before it gets reclaimed. This is necessary because of
 * the fact we are doing lazy LRU updates to minimise lock contention so the
 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
 * with this flag set because they are the inodes that are out of order.
 *
 * Locking as visible here: iprune_sem is held for read across the whole
 * function; inode_lru_lock is held while walking the list and is dropped
 * only around the buffer/pagecache invalidation step.
 */
static void prune_icache(int nr_to_scan)
{
	LIST_HEAD(freeable);
	int nr_scanned;
	unsigned long reap = 0;

	down_read(&iprune_sem);
	spin_lock(&inode_lru_lock);
	for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++){
		struct inode *inode;

		if (list_empty(&inode_lru))
			break;

		/* Candidates are taken from the tail end of the list. */
		inode = list_entry(inode_lru.prev, struct inode, i_lru);

		/*
		 * we are inverting the inode_lru_lock/inode->i_lock here,
		 * so use a trylock. If we fail to get the lock, just move the
		 * inode to the back of the list so we don't spin on it.
		 */
		if (!spin_trylock(&inode->i_lock)){
			list_move(&inode->i_lru, &inode_lru);
			continue;
		}

		/*
		 * Referenced or dirty inodes are still in use. Give them
		 * another pass through the LRU as we cannot reclaim them now.
		 *
		 * NOTE(review): the code below takes such an inode off the
		 * LRU (list_del_init) and decrements nr_unused; it does not
		 * requeue it here.
		 */
		if (atomic_read(&inode->i_count) ||
		    (inode->i_state & ~I_REFERENCED)){
			list_del_init(&inode->i_lru);
			spin_unlock(&inode->i_lock);
			inodes_stat.nr_unused--;
			continue;
		}

		/* recently referenced inodes get one more pass */
		if (inode->i_state & I_REFERENCED){
			inode->i_state &= ~I_REFERENCED;
			list_move(&inode->i_lru, &inode_lru);
			spin_unlock(&inode->i_lock);
			continue;
		}
		if (inode_has_buffers(inode) || inode->i_data.nrpages){
			/*
			 * Hold a reference across the window in which both
			 * locks are dropped for the invalidation work.
			 */
			__iget(inode);
			spin_unlock(&inode->i_lock);
			spin_unlock(&inode_lru_lock);
			if (remove_inode_buffers(inode))
				reap += invalidate_mapping_pages(&inode->i_data,
								0, -1);
			iput(inode);
			spin_lock(&inode_lru_lock);

			/*
			 * The list may have changed while inode_lru_lock was
			 * released; only carry on if this inode is now the
			 * first entry of inode_lru.
			 */
			if (inode != list_entry(inode_lru.next,
						struct inode, i_lru))
				continue;	/* wrong inode or list_empty */
			/* avoid lock inversions with trylock */
			if (!spin_trylock(&inode->i_lock))
				continue;
			if (!can_unuse(inode)){
				spin_unlock(&inode->i_lock);
				continue;
			}
		}
		/* Reclaimable: mark it as being freed and queue it for disposal. */
		WARN_ON(inode->i_state & I_NEW);
		inode->i_state |= I_FREEING;
		spin_unlock(&inode->i_lock);

		list_move(&inode->i_lru, &freeable);
		inodes_stat.nr_unused--;
	}
	/* Account the invalidated pages to the matching VM event counter. */
	if (current_is_kswapd())
		__count_vm_events(KSWAPD_INODESTEAL, reap);
	else
		__count_vm_events(PGINODESTEAL, reap);
	spin_unlock(&inode_lru_lock);

	/* Free the collected inodes outside inode_lru_lock. */
	dispose_list(&freeable);
	up_read(&iprune_sem);
}


该函数的作用是在内存压力较大时,通过缩减缓存的inode列表inode_lru以释放出更多的内存。

该函数就是从inode_lru的尾部(inode_lru.prev)开始取inode出来(新缓存的inode由list_add插入到链表头部,因此尾部是最久未使用的inode),做一些简单检查,如果inode还有一些原因需要继续存在在缓存中,就通过list_move将该inode移回链表的头部,然后检查下一个inode。

使得inode继续保留的原因包括:无法获取到操作inode中数据的锁i_lock;inode中的数据是脏的;inode的使用计数非0;inode刚刚被引用过等等。

 

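还有一个比较实用的问题:iput_final只有在inode的最后一个引用被释放时才会被调用。此时如果i_nlink不为0、inode仍然在inode_hashtable的拉链表中(即drop为0),并且文件系统仍处于活动状态,就将其放到缓存inode_lru中;如果i_nlink为0或者inode已经被unhash(即drop非0),则直接调用evict将其回收。而在prune_icache时,还会再检查一次i_count引用计数是否为0。

这也就是说,如果一个inode对应的磁盘文件已经被删除了(i_nlink为0),但是还有进程对其进行操作(i_count不为0)的话,那么它不会被直接删除,iput_final此时也不会被调用,也就是说对其操作的进程仍然可以对已经缓存下来的数据页面page进行操作。

直到没有进程再对其进行操作了,最后一个引用被释放时,才会由iput_final将其回收。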

 

inode中有两个链表头元素,分别是i_sb_list和i_wb_list,其中i_sb_list是super_block->s_inodes列表的元素,而i_wb_list是用于维护设备的后备inode列表。

 

2. dentry缓存

dentry缓存的目的,为了减少对慢速磁盘的访问,每当VFS文件系统对底层的数据进行访问时,都会将访问的结果缓存下来,保存成一个dentry对象。

 

而且dentry对象的组织与管理,是和inode缓存极其相似的,也有一个hash表,和一个lru队列。

而且当内存压力较大时,也会调用prune_dcache来企图释放lru中优先级较低的dentry项目。

 

区别在于,inode是不需要维护目录的关系的,但是dentry需要,因此dentry的组织比inode要复杂。

 

static struct hlist_bl_head *dentry_hashtable __read_mostly;


 


在super_block中

/* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */


     struct list_head    s_dentry_lru;/* unused dentry lru */


 


因此,保存dentry全局hash表的数据结构是全局的,而保存dentry缓存的数据结构是存在于super_block数据结构中。

/*


* dentry_lru_(add|del|move_tail) must be called with d_lock held.


*/


static void dentry_lru_add(struct dentry *dentry)


{


     if (list_empty(&dentry->d_lru)){


spin_lock(&dcache_lru_lock);


list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);


dentry->d_sb->s_nr_dentry_unused++;


dentry_stat.nr_unused++;


spin_unlock(&dcache_lru_lock);


}


}


dentry_lru_add函数用于向dentry缓存中添加一个被释放的dentry,它被函数dput调用。

/*
 * dput - release a dentry
 * @dentry: dentry to release (NULL is accepted and ignored)
 *
 * Release a dentry. This drops the usage count and, if appropriate, calls
 * the dentry's d_delete method, removes the dentry from the queues and
 * releases its resources. If the parent dentries were scheduled for
 * release they too may now get deleted.
 *
 * Dentries that are no longer on any hash chain are not parked on the
 * unused list; they are killed immediately.  Killing one may leave its
 * parent (which might only have been waiting for its last child to go
 * away) as the next dentry to release, so the walk towards the root is
 * written as an explicit loop instead of recursion: real recursion would
 * eat up stack space, and the compiler cannot be relied on to turn it
 * into a tail call.
 */
void dput(struct dentry *dentry)
{
	while (dentry) {
		/* Dropping the last reference may sleep. */
		if (dentry->d_count == 1)
			might_sleep();

		spin_lock(&dentry->d_lock);
		BUG_ON(!dentry->d_count);

		/* Other users remain: just drop our reference. */
		if (dentry->d_count > 1) {
			dentry->d_count--;
			spin_unlock(&dentry->d_lock);
			return;
		}

		/*
		 * Kill the dentry when the filesystem's d_delete() asks for
		 * it, or when it is unreachable (unhashed).  dentry_kill()
		 * returns with d_lock released and hands back the next
		 * dentry that needs its refcount dropped, or NULL when done.
		 */
		if (((dentry->d_flags & DCACHE_OP_DELETE) &&
		     dentry->d_op->d_delete(dentry)) ||
		    d_unhashed(dentry)) {
			dentry = dentry_kill(dentry, 1);
			continue;
		}

		/* Otherwise leave it cached and ensure it's on the LRU. */
		dentry->d_flags |= DCACHE_REFERENCED;
		dentry_lru_add(dentry);

		dentry->d_count--;
		spin_unlock(&dentry->d_lock);
		return;
	}
}


EXPORT_SYMBOL(dput);


 

所有的dentry实例会形成一个网络,用于反映文件系统的结构。

d_subdirs成员,里面保存着所有的子目录以及该目录下的文件组成的列表。

d_child成员,是该dentry链接到其父目录的dentry节点的锚点。

这两个成员,是构成文件系统的层次结构的基本设施。

 


if (dentry->d_count == 1)

       might_sleep();



参考:http://yuxu9710108.blog.163.com/blog/static/23751534201011715413404/

用于调试时,提示atomic context的可能睡眠情况。

 

分析dput函数的逻辑:

如果dentry的引用计数大于1,那么代表还有其他的地方在使用这个dentry,因此只减少引用计数,直接返回;

如果dentry->d_flags里面设置了DCACHE_OP_DELETE标志,那么先调用d_op->d_delete函数指针,由具体的文件系统判断该dentry是否应该被删除;如果它返回非0,再调用dentry_kill来处理;

【d_op->d_delete与dentry_kill在功能上有什么不同?】

如果在全局的hash表中也已经找不到该dentry了(d_unhashed),那么直接调用dentry_kill来处理;

如果dentry的引用计数为1,而且也不属于上面两种需要调用dentry_kill的情况,那么就将其缓存在super_block的LRU队列中。

 

我们看一种可能的d_delete的实现

/*
 * nfs_dentry_delete - NFS d_delete hook.
 * @dentry: dentry whose d_count is about to reach 0
 *
 * This is called from dput() when d_count is going to 0.
 *
 * Returns 1 when the dentry should be unhashed instead of cached, 0 when
 * it may stay in the dcache.
 */
static int nfs_dentry_delete(const struct dentry *dentry)
{
	dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n",
		dentry->d_parent->d_name.name, dentry->d_name.name,
		dentry->d_flags);

	/* Unhash any dentry with a stale inode */
	if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode))
		return 1;

	/* Unhash it, so that ->d_iput() would be called */
	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
		return 1;

	/*
	 * When the superblock is no longer active, unhash it, so that
	 * ancestors of killed async unlink files will be cleaned up
	 * during umount.
	 */
	return !(dentry->d_sb->s_flags & MS_ACTIVE);
}


可见,该函数是进行一些内部的判断,决定是否需要将该dentry从全局的hash表中删除掉。


if (dentry->d_flags & DCACHE_OP_DELETE){

       if (dentry->d_op->d_delete(dentry))

           goto kill_it;

   }



/*
 * dentry_kill - finish off a dentry we've decided to kill.
 * @dentry: dentry to kill; dentry->d_lock must be held on entry
 * @ref:    if non-zero, decrement the dentry's refcount too
 *
 * Returns with dentry->d_lock unlocked.
 *
 * The inode's i_lock and the parent's d_lock are only ever tried, never
 * waited for.  If either cannot be taken, everything acquired so far is
 * released and @dentry itself is returned so that the caller tries again
 * with the same dentry.
 *
 * Returns dentry requiring refcount drop, or NULL if we're done.
 */
static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
	__releases(dentry->d_lock)
{
	struct inode *inode = dentry->d_inode;
	struct dentry *parent;

	if (inode && !spin_trylock(&inode->i_lock))
		goto relock;

	parent = IS_ROOT(dentry) ? NULL : dentry->d_parent;
	if (parent && !spin_trylock(&parent->d_lock)) {
		if (inode)
			spin_unlock(&inode->i_lock);
		goto relock;
	}

	if (ref)
		dentry->d_count--;
	/* if dentry was on the d_lru list delete it from there */
	dentry_lru_del(dentry);
	/* if it was on the hash then remove it */
	__d_drop(dentry);
	return d_kill(dentry, parent);

relock:
	spin_unlock(&dentry->d_lock);
	cpu_relax();
	return dentry;	/* try again with same dentry */
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: