文件系统(二)--buffer.c namei.c truncate.c open.c源码分析
2015-06-14 21:47
495 查看
buffer.c
1 /*
2 * linux/fs/buffer.c
3 *
4 * (C) 1991 Linus Torvalds
5 */
6
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting a interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it. NOTE! As interrupts
11 * can wake up a caller, some cli-sti sequences are needed to check for
12 * sleep-on-calls. These should be extremely quick, though (I hope).
13 */
15 /*
16 * NOTE! There is one discordant note here: checking floppies for
17 * disk change. This is where it fits best, I think, as it should
18 * invalidate changed floppy-disk-caches.
19 */
20
21 #include <stdarg.h>
22
23 #include <linux/config.h>
24 #include <linux/sched.h>
25 #include <linux/kernel.h>
26 #include <asm/system.h>
27 #include <asm/io.h>
29 extern int end; //由链接器生成的,指向内核空间末端后一个字节
30 struct buffer_head * start_buffer = (struct buffer_head *) &end;
31 struct buffer_head * hash_table[NR_HASH];
32 static struct buffer_head * free_list; //空闲链表头
33 static struct task_struct * buffer_wait = NULL; //等待空闲缓冲块而睡眠的任务队列
34 int NR_BUFFERS = 0; //缓冲块个数
36 static inline void wait_on_buffer(struct buffer_head * bh)
37 {
38 cli();
39 while (bh->b_lock)
40 sleep_on(&bh->b_wait);
41 sti();
42 }
等待指定缓冲块解锁
44 int sys_sync(void)
45 {
46 int i;
47 struct buffer_head *bh;
49 sync_inodes(); /* write out inodes into buffers */
50 bh = start_buffer;
51 for (i=0 ; i<NR_BUFFERS ; i++,bh++) {
52 wait_on_buffer(bh);
53 if (bh->b_dirt)
54 ll_rw_block(WRITE,bh);
55 }
56 return 0;
57 }
49行先进行inode的同步(具体过程下面分析),随后遍历所有的buffer_head,如果标记为脏,就进行写操作将其写入到磁盘中。
现在看一下sync_inodes:
fs/inode.c
59 void sync_inodes(void)
60 {
61 int i;
62 struct m_inode * inode;
63
64 inode = 0+inode_table;
65 for(i=0 ; i<NR_INODE ; i++,inode++) {
66 wait_on_inode(inode);
67 if (inode->i_dirt && !inode->i_pipe)
68 write_inode(inode);
69 }
70 }
之前的文章中提到过,内核把所有存在于内核中的inode保存在一个数组inode_table中,现在就遍历这个数组,对每个inode先等待其解锁(wait_on_inode),如果它是脏的,同时不是pipe,就执行write_inode。
write_inode(inode):
314 static void write_inode(struct m_inode * inode)
315 {
316 struct super_block * sb;
317 struct buffer_head * bh;
318 int block;
319
320 lock_inode(inode);
321 if (!inode->i_dirt || !inode->i_dev) {
322 unlock_inode(inode);
323 return;
324 }
如果是干净的,直接返回
325 if (!(sb=get_super(inode->i_dev)))
326 panic("trying to write inode without device");
获取分区超级块
327 block = 2 + sb->s_imap_blocks + sb->s_zmap_blocks +
328 (inode->i_num-1)/INODES_PER_BLOCK;
这里计算的是该inode节点所在的磁盘块号,目的是先把这个块读入缓冲区,再用内存中的inode覆盖其中对应的那一项(并不是与内存中的inode进行比对)。
我们再来看一下为什么这么计算,2在这里分别代表了引导块与超级块,然后是imap所占的块号,然后是zmap(逻辑块map)所占的块号。inode->i_num是inode的编号,除以INODES_PER_BLOCK表示它对应的块号偏移。
329 if (!(bh=bread(inode->i_dev,block)))
330 panic("unable to read i-node block");
读取参数inode所在块的内容。
331 ((struct d_inode *)bh->b_data)
332 [(inode->i_num-1)%INODES_PER_BLOCK] =
333 *(struct d_inode *)inode;
这里的目的是把参数中的inode写入磁盘中。但是这里还是会经过缓冲区。
334 bh->b_dirt=1;
335 inode->i_dirt=0;
这时把bh设为脏,inode设为干净就可以了。
336 brelse(bh);
释放bh:递减它的引用计数,并唤醒等待空闲缓冲块(buffer_wait)的任务
337 unlock_inode(inode);
解锁inode
338 }
可以看到这里只是把inode放回到了缓冲区,等待写入磁盘。
我们继续回到前面,sync_inodes就是遍历inode table,把所有的标记为脏的inode并且不是pipe类型的写入磁盘。当然它必须先写入高速缓冲区,注意它是怎么计算对应磁盘位置的。
继续回到sys_sync:
44 int sys_sync(void)
45 {
46 int i;
47 struct buffer_head * bh;
48
49 sync_inodes(); /* write out inodes into buffers */
50 bh = start_buffer;
51 for (i=0 ; i<NR_BUFFERS ; i++,bh++) {
52 wait_on_buffer(bh);
53 if (bh->b_dirt)
54 ll_rw_block(WRITE,bh);
55 }
56 return 0;
57 }
这里50行开始会遍历所有的buffer_head,先等待该buffer_head解锁,如果buffer是脏的,就在54行调用ll_rw_block(块设备驱动接口)把缓冲区内容写入到硬盘中。
59 int sync_dev(int dev)
60 {
61 int i;
62 struct buffer_head * bh;
63
64 bh = start_buffer;
65 for (i=0 ; i<NR_BUFFERS ; i++,bh++) {
66 if (bh->b_dev != dev)
67 continue;
68 wait_on_buffer(bh);
69 if (bh->b_dev == dev && bh->b_dirt)
70 ll_rw_block(WRITE,bh);
71 }
72 sync_inodes();
73 bh = start_buffer;
74 for (i=0 ; i<NR_BUFFERS ; i++,bh++) {
75 if (bh->b_dev != dev)
76 continue;
77 wait_on_buffer(bh);
78 if (bh->b_dev == dev && bh->b_dirt)
79 ll_rw_block(WRITE,bh);
80 }
81 return 0;
82 }
这里分三步执行是从效率的角度上考虑的:第一步先把脏的buffer写入到磁盘,第二步同步inode节点,第三步把因为同步inode节点而变脏的buffer再写入磁盘。
===========================================
我们平时编程时涉及到文件写操作的,如果仅仅是把数据写入到应用程序缓冲区中,这时高速缓冲区中并没有它的记录,这时如果应用程序退出,就会造成数据丢失。如果执行了flush就会把数据刷到高速缓冲区中。
===========================================
84 void inline invalidate_buffers(int dev)
85 {
86 int i;
87 struct buffer_head * bh;
88
89 bh = start_buffer;
90 for (i=0 ; i<NR_BUFFERS ; i++,bh++) {
91 if (bh->b_dev != dev)
92 continue;
93 wait_on_buffer(bh);
94 if (bh->b_dev == dev)
95 bh->b_uptodate = bh->b_dirt = 0;
96 }
97 }
很简单,93行首先等待bh解锁,然后只需要把b_uptodate和b_dirt两个标记清零即可。
99 /*
100 * This routine checks whether a floppy has been changed, and
101 * invalidates all buffer-cache-entries in that case. This
102 * is a relatively slow routine, so we have to try to minimize using
103 * it. Thus it is called only upon a 'mount' or 'open'. This
104 * is the best way of combining speed and utility, I think.
105 * People changing diskettes in the middle of an operation deserve
106 * to loose :-)
107 *
108 * NOTE! Although currently this is only for floppies, the idea is
109 * that any additional removable block-device will use this routine,
110 * and that mount/open needn't know that floppies/whatever are
111 * special.
112 */
113 void check_disk_change(int dev)
114 {
115 int i;
116
117 if (MAJOR(dev) != 2)
118 return;
119 if (!floppy_change(dev & 0x03))
120 return;
121 for (i=0 ; i<NR_SUPER ; i++)
122 if (super_block[i].s_dev == dev)
123 put_super(super_block[i].s_dev);
124 invalidate_inodes(dev);
125 invalidate_buffers(dev);
126 }
128 #define _hashfn(dev,block) (((unsigned)(dev^block))%NR_HASH)
129 #define hash(dev,block) hash_table[_hashfn(dev,block)]
131 static inline void remove_from_queues(struct buffer_head * bh)
132 {
133 /* remove from hash-queue */
134 if (bh->b_next)
135 bh->b_next->b_prev = bh->b_prev;
136 if (bh->b_prev)
137 bh->b_prev->b_next = bh->b_next;
buffer_head通过b_prev和b_next来链接成双向链表
138 if (hash(bh->b_dev,bh->b_blocknr) == bh)
139 hash(bh->b_dev,bh->b_blocknr) = bh->b_next;
hash table中相应的slot指向hash值相同的链表
140 /* remove from free list */
141 if (!(bh->b_prev_free) || !(bh->b_next_free))
142 panic("Free block list corrupted");
143 bh->b_prev_free->b_next_free = bh->b_next_free;
144 bh->b_next_free->b_prev_free = bh->b_prev_free;
通过b_next_free和b_prev_free连接成空闲双向链表
145 if (free_list == bh)
146 free_list = bh->b_next_free;
free_list作为空闲链表表头
147 }
149 static inline void insert_into_queues(struct buffer_head * bh)
150 {
151 /* put at end of free list */
152 bh->b_next_free = free_list;
153 bh->b_prev_free = free_list->b_prev_free;
154 free_list->b_prev_free->b_next_free = bh;
155 free_list->b_prev_free = bh;
可见free_list链表尾部是最近使用的,首部则是最不常使用的
156 /* put the buffer in new hash-queue if it has a device */
157 bh->b_prev = NULL;
158 bh->b_next = NULL;
159 if (!bh->b_dev)
160 return;
161 bh->b_next = hash(bh->b_dev,bh->b_blocknr);
162 hash(bh->b_dev,bh->b_blocknr) = bh;
163 bh->b_next->b_prev = bh;
添加到hash表中
164 }
166 static struct buffer_head * find_buffer(int dev, int block)
167 {
168 struct buffer_head * tmp;
169
170 for (tmp = hash(dev,block) ; tmp != NULL ; tmp = tmp->b_next)
171 if (tmp->b_dev==dev && tmp->b_blocknr==block)
172 return tmp;
173 return NULL;
174 }
非常简单。
176 /*
177 * Why like this, I hear you say... The reason is race-conditions.
178 * As we don't lock buffers (unless we are readint them, that is),
179 * something might happen to it while we sleep (ie a read-error
180 * will force it bad). This shouldn't really happen currently, but
181 * the code is ready.
182 */
183 struct buffer_head * get_hash_table(int dev, int block)
184 {
185 struct buffer_head * bh;
186
187 for (;;) {
188 if (!(bh=find_buffer(dev,block)))
189 return NULL;
190 bh->b_count++;
191 wait_on_buffer(bh);
192 if (bh->b_dev == dev && bh->b_blocknr == block)
193 return bh;
194 bh->b_count--;
195 }
196 }
返回对应设备和块号的buffer_head.192行重新判断是因为在睡眠过程中,可能整个世界都变了
205 #define BADNESS(bh) (((bh)->b_dirt<<1)+(bh)->b_lock)
206 struct buffer_head * getblk(int dev,int block)
207 {
208 struct buffer_head * tmp, * bh;
210 repeat:
211 if (bh = get_hash_table(dev,block))
212 return bh;
如果没有找到
213 tmp = free_list;
214 do {
215 if (tmp->b_count)
216 continue;
217 if (!bh || BADNESS(tmp)<BADNESS(bh)) {
218 bh = tmp;
219 if (!BADNESS(tmp))
220 break;
221 }
222 /* and repeat until we find something good */
223 } while ((tmp = tmp->b_next_free) != free_list);
224 if (!bh) {
225 sleep_on(&buffer_wait);
226 goto repeat;
227 }
228 wait_on_buffer(bh);
229 if (bh->b_count)
230 goto repeat;
231 while (bh->b_dirt) {
232 sync_dev(bh->b_dev);
233 wait_on_buffer(bh);
234 if (bh->b_count)
235 goto repeat;
236 }
237 /* NOTE!! While we slept waiting for this block, somebody else might */
238 /* already have added "this" block to the cache. check it */
239 if (find_buffer(dev,block))
240 goto repeat;
241 /* OK, FINALLY we know that this buffer is the only one of it's kind, */
242 /* and that it's unused (b_count=0), unlocked (b_lock=0), and clean */
243 bh->b_count=1;
244 bh->b_dirt=0;
245 bh->b_uptodate=0;
246 remove_from_queues(bh);
247 bh->b_dev=dev;
248 bh->b_blocknr=block;
249 insert_into_queues(bh);
250 return bh;
251 }
253 void brelse(struct buffer_head * buf)
254 {
255 if (!buf)
256 return;
257 wait_on_buffer(buf);
258 if (!(buf->b_count--))
259 panic("Trying to free free buffer");
260 wake_up(&buffer_wait);
261 }
关于这个函数只解释一下buffer_wait,我们知道buffer_head是有限的,如果一个任务请求磁盘操作,但此时所有的buffer_head都不是空闲的,那么该任务只有等待在buffer_wait上面。
263 /*
264 * bread() reads a specified block and returns the buffer that contains
265 * it. It returns NULL if the block was unreadable.
266 */
267 struct buffer_head * bread(int dev,int block)
268 {
269 struct buffer_head * bh;
271 if (!(bh=getblk(dev,block)))
272 panic("bread: getblk returned NULL\n");
273 if (bh->b_uptodate)
274 return bh;
275 ll_rw_block(READ,bh);
276 wait_on_buffer(bh);
277 if (bh->b_uptodate)
278 return bh;
279 brelse(bh);
280 return NULL;
281 }
这个我们在分析其他源码的过程中已经详细分析过了。
先从缓冲区中读,如果读不到就向设备驱动程序发起请求。
283 #define COPYBLK(from,to) \
284 __asm__("cld\n\t" \
285 "rep\n\t" \
286 "movsl\n\t" \
287 ::"c" (BLOCK_SIZE/4),"S" (from),"D" (to) \
288 :"cx","di","si")
这段内联汇编把一个块从from复制到to:rep movsl重复执行BLOCK_SIZE/4次,源地址由DS:SI指向,目标地址由ES:DI指向
290 /*
291 * bread_page reads four buffers into memory at the desired address. It's
292 * a function of its own, as there is some speed to be got by reading them
293 * all at the same time, not waiting for one to be read, and then another
294 * etc.
295 */
296 void bread_page(unsigned long address,int dev,int b[4])
297 {
298 struct buffer_head * bh[4];
299 int i;
300
301 for (i=0 ; i<4 ; i++)
302 if (b[i]) {
303 if (bh[i] = getblk(dev,b[i]))
304 if (!bh[i]->b_uptodate)
305 ll_rw_block(READ,bh[i]);
306 } else
307 bh[i] = NULL;
308 for (i=0 ; i<4 ; i++,address += BLOCK_SIZE)
309 if (bh[i]) {
310 wait_on_buffer(bh[i]);
311 if (bh[i]->b_uptodate)
312 COPYBLK((unsigned long) bh[i]->b_data,address);
313 brelse(bh[i]);
314 }
315 }
这里也比较简单,读取4个块,并拷贝到指定内存地址后释放buffer_head
317 /*
318 * Ok, breada can be used as bread, but additionally to mark other
319 * blocks for reading as well. End the argument list with a negative
320 * number.
321 */
322 struct buffer_head * breada(int dev,int first, ...)
323 {
324 va_list args;
325 struct buffer_head * bh, *tmp;
326
327 va_start(args,first);
328 if (!(bh=getblk(dev,first)))
329 panic("bread: getblk returned NULL\n");
330 if (!bh->b_uptodate)
331 ll_rw_block(READ,bh);
332 while ((first=va_arg(args,int))>=0) {
333 tmp=getblk(dev,first);
334 if (tmp) {
335 if (!tmp->b_uptodate)
336 ll_rw_block(READA,bh);
337 tmp->b_count--;
338 }
339 }
340 va_end(args);
341 wait_on_buffer(bh);
342 if (bh->b_uptodate)
343 return bh;
344 brelse(bh);
345 return (NULL);
346 }
这个函数可以接收可变参数,但是原理上与前面的bread一致
348 void buffer_init(long buffer_end)
349 {
350 struct buffer_head * h = start_buffer; //前面已经初始化了
351 void * b;
352 int i;
353
354 if (buffer_end == 1<<20)
355 b = (void *) (640*1024);
356 else
357 b = (void *) buffer_end;
358 while ( (b -= BLOCK_SIZE) >= ((void *) (h+1)) ) {
359 h->b_dev = 0;
360 h->b_dirt = 0;
361 h->b_count = 0;
362 h->b_lock = 0;
363 h->b_uptodate = 0;
364 h->b_wait = NULL;
365 h->b_next = NULL;
366 h->b_prev = NULL;
367 h->b_data = (char *) b;
368 h->b_prev_free = h-1;
369 h->b_next_free = h+1;
pre指向内存低地址,next指向内存高地址。
370 h++;
每个BLOCK_SIZE为1K,所以从尾部end开始为每一个buff设置buffer_head与其对应
371 NR_BUFFERS++;
372 if (b == (void *) 0x100000)
373 b = (void *) 0xA0000;
374 }
375 h--;
376 free_list = start_buffer;
377 free_list->b_prev_free = h; //双向链表
378 h->b_next_free = free_list; //处理链表最后一个和第一个
379 for (i=0;i<NR_HASH;i++)
380 hash_table[i]=NULL; //hash_table初始为空
381 }
到这里我们就介绍完了buffer.c整个源文件
总结
我们就从初始化函数开始总结,在buffer_init中,对缓冲区内存从尾部开始遍历每个块,并从缓冲区首部开始依次为其设置对应的buffer_head来描述之。缓冲区的地址就保存在buffer_head的b_data域中。初始时,hash_table内容均为空。空闲链表从尾部一直连接到首部(双向的)。后续在读取磁盘内容时,对应的内容会被读到buffer_head中,并且添加到hash_table和free_list中。hash_table中的冲突域采用b_next和b_prev属性链接成链。free_list指向的空闲链表则是通过buffer_head的b_prev_free和b_next_free链接成双向链表的。对于sync同步操作,依次遍历buffer_head,对标记为脏的buffer_head进行写操作,这里的写操作是实际调用磁盘驱动程序实现的。对于getblk操作,首先会通过hash方法寻找,如果发现为空,说明这时此磁盘内容还未被读入。由于每一个读入的磁盘内容都需要在缓冲区中使用一个buffer_head对其进行描述,因此现在就需要找到这样一个buffer_head,这是从空闲链表中进行寻找的。找到之后对该buffer_head的属性进行设置,注意此时我们并没有真正调用磁盘驱动程序来实际读取内容,只是把这个buffer_head加入相应的空闲链表和hash_table之中。真正的读取操作是在bread中进行的。在bread中首先进行getblk操作得到buffer_head后根据它的b_uptodate(内容是否是新的)标记来决定是否执行实际的读盘操作。至于释放操作brelse比较简单,首先等待buffer_head解锁,然后递减引用计数(b_count),最后唤醒等待在buffer_wait上的任务。
namei.c
1 /*
2 * linux/fs/namei.c
3 *
4 * (C) 1991 Linus Torvalds
5 */
6
7 /*
8 * Some corrections by tytso.
9 */
10
11 #include <linux/sched.h>
12 #include <linux/kernel.h>
13 #include <asm/segment.h>
14
15 #include <string.h>
16 #include <fcntl.h>
17 #include <errno.h>
18 #include <const.h>
19 #include <sys/stat.h>
20
21 #define ACC_MODE(x) ("\004\002\006\377"[(x)&O_ACCMODE])
"\004\002\006\377"看成字符数组,[(x)&O_ACCMODE]就是数组索引
23 /*
24 * comment out this line if you want names > NAME_LEN chars to be
25 * truncated. Else they will be disallowed.
26 */
27 /* #define NO_TRUNCATE */
28
29 #define MAY_EXEC 1
30 #define MAY_WRITE 2
31 #define MAY_READ 4
33 /*
34 * permission()
35 *
36 * is used to check for read/write/execute permissions on a file.
37 * I don't know if we should look at just the euid or both euid and
38 * uid, but that should be easily changed.
39 */
40 static int permission(struct m_inode * inode,int mask)
41 {
42 int mode = inode->i_mode;
43
44 /* special case: not even root can read/write a deleted file */
45 if (inode->i_dev && !inode->i_nlinks) //设备不为0,链接数为0,说明已经被删除
46 return 0; //返回
47 else if (current->euid==inode->i_uid) //当前进程的euid == inode的uid
48 mode >>= 6;
49 else if (current->egid==inode->i_gid) //当前进程的egid == inode的gid
50 mode >>= 3;
51 if (((mode & mask & 0007) == mask) || suser())//如果mode与mask的判断通过或者是超级用户,返回1
52 return 1;
53 return 0;
54 }
下面介绍的函数涉及到struct dir_entry,所以我们先来看一下:
157 struct dir_entry {
158 unsigned short inode; //目录对应的inode
159 char name[NAME_LEN]; //目录名
160 };
56 /*
57 * ok, we cannot use strncmp, as the name is not in our data space.
58 * Thus we'll have to use match. No big problem. Match also makes
59 * some sanity tests.
60 *
61 * NOTE! unlike strncmp, match returns 1 for success, 0 for failure.
62 */
63 static int match(int len,const char * name,struct dir_entry * de)
64 {
65 register int same __asm__("ax");
66
67 if (!de || !de->inode || len > NAME_LEN)
68 return 0;
69 if (len < NAME_LEN && de->name[len])
70 return 0;
这个情况下,参数中指定的name长度小于目录名实际长度,直接返回不匹配0.
71 __asm__("cld\n\t"
72 "fs ; repe ; cmpsb\n\t"
73 "setz %%al"
74 :"=a" (same)
75 :"0" (0),"S" ((long) name),"D" ((long) de->name),"c" (len)
76 :"cx","di","si");
77 return same;
78 }
80 /*
81 * find_entry()
82 *
83 * finds an entry in the specified directory with the wanted name. It
84 * returns the cache buffer in which the entry was found, and the entry
85 * itself (as a parameter - res_dir). It does NOT read the inode of the
86 * entry - you'll have to do that yourself if you want to.
87 *
88 * This also takes care of the few special cases due to '..'-traversal
89 * over a pseudo-root and a mount point.
90 */
91 static struct buffer_head * find_entry(struct m_inode ** dir,
92 const char * name, int namelen, struct dir_entry ** res_dir)
93 {
94 int entries;
95 int block,i;
96 struct buffer_head * bh;
97 struct dir_entry * de;
98 struct super_block * sb;
99
100 #ifdef NO_TRUNCATE
101 if (namelen > NAME_LEN)
102 return NULL;
103 #else //这时会截取字符串
104 if (namelen > NAME_LEN)
105 namelen = NAME_LEN;
106 #endif
107 entries = (*dir)->i_size / (sizeof (struct dir_entry));
如果inode是目录的话,那么它的内容将以目录项dir_entry的形式存放。这里是用来计算目录项个数的
108 *res_dir = NULL;
109 if (!namelen)
110 return NULL;
111 /* check for '..', as we might have to do some "magic" for it */
112 if (namelen==2 && get_fs_byte(name)=='.' && get_fs_byte(name+1)=='.') { // ".." 的情况
113 /* '..' in a pseudo-root results in a faked '.' (just change namelen) */
114 if ((*dir) == current->root) //如果指定目录是当前进程的伪根目录
115 namelen=1; //这时".."应变为"."
116 else if ((*dir)->i_num == ROOT_INO) { //如果指定目录是安装点
117 /* '..' over a mount-point results in 'dir' being exchanged for the mounted
118 directory-inode. NOTE! We set mounted, so that we can iput the new dir */
在安装点上,".."会导致目录变为安装目录
119 sb=get_super((*dir)->i_dev);
120 if (sb->s_imount) {
121 iput(*dir);
122 (*dir)=sb->s_imount;
123 (*dir)->i_count++;
124 }
125 }
126 }
127 if (!(block = (*dir)->i_zone[0]))
128 return NULL;
先取出第一个块号
129 if (!(bh = bread((*dir)->i_dev,block)))
130 return NULL;
读取该块
131 i = 0;
132 de = (struct dir_entry *) bh->b_data;
转为dir_entry
133 while (i < entries) { //遍历每个entry
134 if ((char *)de >= BLOCK_SIZE+bh->b_data) { //如果已经搜索完了整个块
135 brelse(bh); //释放该块
136 bh = NULL;
137 if (!(block = bmap(*dir,i/DIR_ENTRIES_PER_BLOCK)) ||
138 !(bh = bread((*dir)->i_dev,block))) {
137行,首先根据dir中的块号计算实际块号(在磁盘中的实际块号)
然后读取该块。如果存在块号为0或者bh为NULL,则执行下面139,140行(这一块内没有存放目录或文件)
139 i += DIR_ENTRIES_PER_BLOCK;
140 continue;
141 }
142 de = (struct dir_entry *) bh->b_data;
143 }
144 if (match(namelen,name,de)) {
145 *res_dir = de;
146 return bh;
147 }
如果匹配了,就把该dir_entry保存到res_dir中,返回该bh
148 de++;
149 i++;
如果当前目录项不匹配,继续搜索下一个
150 }//while
151 brelse(bh);
152 return NULL;
遍历完还没找到就放回bh,返回NULL
153 }
155 /*
156 * add_entry()
157 *
158 * adds a file entry to the specified directory, using the same
159 * semantics as find_entry(). It returns NULL if it failed.
160 *
161 * NOTE!! The inode part of 'de' is left at 0 - which means you
162 * may not sleep between calling this and putting something into
163 * the entry, as someone else might have used it while you slept.
164 */
165 static struct buffer_head * add_entry(struct m_inode * dir,
166 const char * name, int namelen, struct dir_entry ** res_dir)
167 {
168 int block,i;
169 struct buffer_head * bh;
170 struct dir_entry * de;
171
172 *res_dir = NULL;
173 #ifdef NO_TRUNCATE
174 if (namelen > NAME_LEN)
175 return NULL;
176 #else
177 if (namelen > NAME_LEN)
178 namelen = NAME_LEN;
179 #endif
180 if (!namelen)
181 return NULL;
182 if (!(block = dir->i_zone[0]))
183 return NULL;
184 if (!(bh = bread(dir->i_dev,block)))
185 return NULL;
186 i = 0;
187 de = (struct dir_entry *) bh->b_data;
上面与 find_entry都是一样的
188 while (1) {
189 if ((char *)de >= BLOCK_SIZE+bh->b_data) { //如果一个逻辑块遍历完成
190 brelse(bh); //把遍历完的块放回
191 bh = NULL;
192 block = create_block(dir,i/DIR_ENTRIES_PER_BLOCK); //获取下一个块号
193 if (!block)
194 return NULL;
195 if (!(bh = bread(dir->i_dev,block))) { //读入下一个块
196 i += DIR_ENTRIES_PER_BLOCK; //如果下一个块不存在,跳过,需要更新i
197 continue;
198 }
199 de = (struct dir_entry *) bh->b_data;
200 }
201 if (i*sizeof(struct dir_entry) >= dir->i_size) {
202 de->inode=0;
203 dir->i_size = (i+1)*sizeof(struct dir_entry);
204 dir->i_dirt = 1;
205 dir->i_ctime = CURRENT_TIME;
206 }
201行为true说明指定的目录没有删除的空目录项,现在要向它添加一个目录项,因此203行增加它的大小。202行暂时把目录项的inode设为0.置位脏标记,修改i_ctime。
207 if (!de->inode) {
208 dir->i_mtime = CURRENT_TIME;
209 for (i=0; i < NAME_LEN ; i++)
210 de->name[i]=(i<namelen)?get_fs_byte(name+i):0;
211 bh->b_dirt = 1;
212 *res_dir = de;
213 return bh;
214 }
207行为true,说明找到了满足条件的目录项,它或许是由于之前删除而留下的空项,或者是由于我们在201的if中为该目录新添加的。208行修改它的mtime;209-210行为其name属性赋值,211行标记bh为脏;212把目录项保存到res_dir;213返回该bh
215 de++;
216 i++;
如果当前项不符合,215,216行递增地址和计数,准备遍历下一个
217 }
218 brelse(bh);
219 return NULL;
220 }
222 /*
223 * get_dir()
224 *
225 * Getdir traverses the pathname until it hits the topmost directory.
226 * It returns NULL on failure.
227 */
228 static struct m_inode *get_dir(const char * pathname)
229 {
230 char c;
231 const char * thisname;
232 struct m_inode * inode;
233 struct buffer_head * bh;
234 int namelen,inr,idev;
235 struct dir_entry * de;
236
237 if (!current->root || !current->root->i_count)
238 panic("No root inode");
239 if (!current->pwd || !current->pwd->i_count)
240 panic("No cwd inode");
241 if ((c=get_fs_byte(pathname))=='/') {
242 inode = current->root;
243 pathname++;
244 } else if (c)
245 inode = current->pwd;
246 else
247 return NULL; /* empty name is bad */
因为是要获取the topmost directory,所以只要pathname中第一个字符为'/',inode就设为根目录
否则如果pathname不为空,inode就设为当前工作目录;如果为空,则返回NULL
248 inode->i_count++; //递增引用计数
249 while (1) {
250 thisname = pathname;
251 if (!S_ISDIR(inode->i_mode) || !permission(inode,MAY_EXEC)) { //权限检查
252 iput(inode);
253 return NULL;
254 }
255 for(namelen=0;(c=get_fs_byte(pathname++))&&(c!='/');namelen++)
256 /* nothing */ ;
243行如果pathname以/开头,则已经执行过++操作了。因此这for循环就是找pathname中的各个部分
257 if (!c)
258 return inode;
c为'\0',说明路径已经遍历到末尾,这时inode中保存的就是最后一级目录,直接返回即可
259 if (!(bh = find_entry(&inode,thisname,namelen,&de))) {
260 iput(inode);
261 return NULL;
262 }
寻找目录项,找不到的话就返回NULL
263 inr = de->inode; //记录目录项对应的inode,下次循环还会用到
264 idev = inode->i_dev;
265 brelse(bh);
266 iput(inode); //放回inode(前面已经用过了)
267 if (!(inode = iget(idev,inr))) //如果找不到对应的inode,直接返回NULL
268 return NULL;
269 }
270 }
272 /*
273 * dir_namei()
274 *
275 * dir_namei() returns the inode of the directory of the
276 * specified name, and the name within that directory.
277 */
278 static struct m_inode * dir_namei(const char * pathname,
279 int * namelen, const char ** name)
280 {
281 char c;
282 const char * basename;
283 struct m_inode * dir;
284
285 if (!(dir = get_dir(pathname)))
286 return NULL;
首先获取目录inode
287 basename = pathname;
288 while (c=get_fs_byte(pathname++))
289 if (c=='/')
290 basename=pathname;
取得最后的文件名(比如/etc/passwd,这里basename就是passwd)
291 *namelen = pathname-basename-1;
保存文件名的长度
292 *name = basename;
保存文件名
293 return dir;
294 }
296 /*
297 * namei()
298 *
299 * is used by most simple commands to get the inode of a specified name.
300 * Open, link etc use their own routines, but this is enough for things
301 * like 'chmod' etc.
302 */
303 struct m_inode * namei(const char * pathname)
304 {
305 const char * basename;
306 int inr,dev,namelen;
307 struct m_inode * dir;
308 struct buffer_head * bh;
309 struct dir_entry * de;
310
311 if (!(dir = dir_namei(pathname,&namelen,&basename)))
312 return NULL;
获取目录inode
313 if (!namelen) /* special case: '/usr/' etc */
314 return dir;
315 bh = find_entry(&dir,basename,namelen,&de);
寻找目录项inode
316 if (!bh) {
317 iput(dir);
318 return NULL;
319 }
如果bh为空,放回dir
320 inr = de->inode;
321 dev = dir->i_dev;
获取目录项的inode和设备号
322 brelse(bh);
现在可以释放bh了,因为我们已经获得了需要的inode以及设备号,bh用不到了
323 iput(dir);
324 dir=iget(dev,inr);
获取目录项对应的inode(当然,目录项可能是目录也可能是文件)
325 if (dir) {
326 dir->i_atime=CURRENT_TIME; //更新访问时间
327 dir->i_dirt=1; //脏标记
328 }
329 return dir;
330 }
332 /*
333 * open_namei()
334 *
335 * namei for open - this is in fact almost the whole open-routine.
336 */
337 int open_namei(const char * pathname, int flag, int mode,
338 struct m_inode ** res_inode)
339 {
340 const char * basename;
341 int inr,dev,namelen;
342 struct m_inode * dir, *inode;
343 struct buffer_head * bh;
344 struct dir_entry * de;
345
346 if ((flag & O_TRUNC) && !(flag & O_ACCMODE))
347 flag |= O_WRONLY;
O_ACCMODE<0003>:读写文件操作时,用于取出flag的低2位
O_RDONLY<00>:只读打开
O_WRONLY<01>:只写打开
O_RDWR<02>:读写打开
348 mode &= 0777 & ~current->umask;
349 mode |= I_REGULAR;
350 if (!(dir = dir_namei(pathname,&namelen,&basename)))
351 return -ENOENT;
获取目录inode
352 if (!namelen) { /* special case: '/usr/' etc */
353 if (!(flag & (O_ACCMODE|O_CREAT|O_TRUNC))) {
354 *res_inode=dir;
355 return 0;
356 }
357 iput(dir);
358 return -EISDIR;
359 }
360 bh = find_entry(&dir,basename,namelen,&de);
获取目录项
361 if (!bh) {
//下面是没有读到目录项的情况
362 if (!(flag & O_CREAT)) {
363 iput(dir);
364 return -ENOENT;
365 }
如果没有指定不存在时创建文件,那么放回dir,返回。
366 if (!permission(dir,MAY_WRITE)) {
367 iput(dir);
368 return -EACCES;
369 }
如果指定不存在时创建文件,但是没有权限,返回。
370 inode = new_inode(dir->i_dev);
371 if (!inode) {
372 iput(dir);
373 return -ENOSPC;
374 }
否则就创建一个inode,如果创建失败,返回
375 inode->i_uid = current->euid;
376 inode->i_mode = mode;
377 inode->i_dirt = 1;
设置它的属性
378 bh = add_entry(dir,basename,namelen,&de);
添加到目录中
379 if (!bh) { //添加失败
380 inode->i_nlinks--;
381 iput(inode);
382 iput(dir);
383 return -ENOSPC;
384 }
385 de->inode = inode->i_num;
386 bh->b_dirt = 1;
387 brelse(bh);
388 iput(dir);
389 *res_inode = inode;
390 return 0;
391 }
运行到这里说明读到了目录项
392 inr = de->inode;
393 dev = dir->i_dev;
394 brelse(bh);
395 iput(dir);
396 if (flag & O_EXCL) //独占标记
397 return -EEXIST;
398 if (!(inode=iget(dev,inr)))
399 return -EACCES;
取得inode,取不到的话,返回-EACCES
400 if ((S_ISDIR(inode->i_mode) && (flag & O_ACCMODE)) ||
401 !permission(inode,ACC_MODE(flag))) {
402 iput(inode);
403 return -EPERM;
404 }
如果取得的inode是目录并且访问权限为只写或读写或者没有访问权限,则放回inode
405 inode->i_atime = CURRENT_TIME;
406 if (flag & O_TRUNC)
407 truncate(inode);
408 *res_inode = inode;
409 return 0;
410 }
总结
目录用dir_entry描述,包括inode和目录名。
根据路径找到文件的inode节点是一个非常重要的操作(namei),这个过程比较耗时,因为需要一层层的遍历目录。我们知道inode节点的i_zone数组可以用来保存磁盘块号。对于目录来说,其inode节点中保存的就是dir_entry,因此当我们需要寻找一个目录下的文件(或目录)时,需要依次读入这些磁盘块根据文件(目录)名进行比较(match),需要注意一些特殊情况,比如路径中没有指定根目录时需要以工作目录作为参考,在安装点上,".."会导致目录变为安装目录等,另外一点就是权限问题。还有一个比较重要的操作就是向目录中添加项(add_entry),同样需要搜索inode的数据块,找到一个合适的位置插入。
truncate.c
1 /*
2 * linux/fs/truncate.c
3 *
4 * (C) 1991 Linus Torvalds
5 */
6
7 #include <linux/sched.h>
8
9 #include <sys/stat.h>
11 static void free_ind(int dev,int block)
12 {
13 struct buffer_head * bh;
14 unsigned short * p;
15 int i;
16
17 if (!block)
18 return;
19 if (bh=bread(dev,block)) {
20 p = (unsigned short *) bh->b_data;
21 for (i=0;i<512;i++,p++)
22 if (*p)
23 free_block(dev,*p);
24 brelse(bh);
25 }
26 free_block(dev,block);
27 }
我们看到p是short类型的指针,说明22行中*p是一个short数据,通过23行可以看出它代表block号。这其实是用来释放文件对应的块的。因为我们知道inode节点中有一个属性如下
100 unsigned short i_zone[9];
它代表文件所占用的块号数组。其中i_zone[0] - i_zone[6]是直接块号,i_zone[7]是一次间接块号,i_zone[8]是二次间接块号。
上面这个函数就是用来释放一次间接块号的。
29 static void free_dind(int dev,int block)
30 {
31 struct buffer_head * bh;
32 unsigned short * p;
33 int i;
34
35 if (!block)
36 return;
37 if (bh=bread(dev,block)) {
38 p = (unsigned short *) bh->b_data;
39 for (i=0;i<512;i++,p++)
40 if (*p)
41 free_ind(dev,*p);
42 brelse(bh);
43 }
44 free_block(dev,block);
45 }
释放二次间接块号,这里调用了前面的释放一次间接块号的函数。
47 void truncate(struct m_inode * inode)
48 {
49 int i;
50
51 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
52 return;
常规文件或者目录才可以调用此函数
53 for (i=0;i<7;i++)
54 if (inode->i_zone[i]) {
55 free_block(inode->i_dev,inode->i_zone[i]);
56 inode->i_zone[i]=0;
57 }
释放直接块
58 free_ind(inode->i_dev,inode->i_zone[7]);
59 free_dind(inode->i_dev,inode->i_zone[8]);
释放一次和二次间接块
60 inode->i_zone[7] = inode->i_zone[8] = 0;
61 inode->i_size = 0;
文件大小变为0
62 inode->i_dirt = 1;
标记为脏
63 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
这会改变文件的修改时间i_mtime和改变时间i_ctime。
64 }
总结
这个文件整体上是比较简单的。包括三个函数:释放一级间接块,释放二级间接块,截取inode。对于直接块来说,i_zone中相应保存的就是inode所使用的磁盘块。一级间接块,顾名思义,i_zone指明的块中存放的不是普通数据,而是块号。因此对一级间接块的释放操作就是读取一级间接块,遍历其中每一个块号调用free_block进行释放,最后释放一级间接块本身;对于二级间接块,读取它之后,其中的每一项都是一个一级间接块的块号,于是就转换为对一级间接块的释放操作。truncate操作则是对inode的所有块进行释放,最后设置其大小为0.
此外留意一下对于inode的操作,其atime,ctime,mtime是如何变化的。
open.c
1 /*
2 * linux/fs/open.c
3 *
4 * (C) 1991 Linus Torvalds
5 */
7 #include <string.h>
8 #include <errno.h>
9 #include <fcntl.h>
10 #include <sys/types.h>
11 #include <utime.h>
12 #include <sys/stat.h>
14 #include <linux/sched.h>
15 #include <linux/tty.h>
16 #include <linux/kernel.h>
17 #include <asm/segment.h>
19 int sys_ustat(int dev, struct ustat * ubuf)
20 {
21 return -ENOSYS;
22 }
23
24 int sys_utime(char * filename, struct utimbuf * times)
25 {
26 struct m_inode * inode;
27 long actime,modtime;
28
29 if (!(inode=namei(filename)))
30 return -ENOENT;
31 if (times) {
32 actime = get_fs_long((unsigned long *) &times->actime);
33 modtime = get_fs_long((unsigned long *) &times->modtime);
34 } else
35 actime = modtime = CURRENT_TIME;
36 inode->i_atime = actime;
37 inode->i_mtime = modtime;
38 inode->i_dirt = 1;
39 iput(inode);
40 return 0;
41 }
更新参数中指明的文件的时间,如果times不为空,就用times来更新;否则就用当前时间来更新
43 /*
44 * XXX should we use the real or effective uid? BSD uses the real uid,
45 * so as to make this call useful to setuid programs.
46 */
47 int sys_access(const char * filename,int mode)
48 {
49 struct m_inode * inode;
50 int res, i_mode;
52 mode &= 0007;
53 if (!(inode=namei(filename)))
54 return -EACCES;
55 i_mode = res = inode->i_mode & 0777;
取得参数中文件的i_mode属性
56 iput(inode);
57 if (current->uid == inode->i_uid)
58 res >>= 6;
59 else if (current->gid == inode->i_gid)
60 res >>= 6;
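根据当前用户与inode的关系取得相应的mode,保存到res中。注意59-60行组匹配的分支同样右移了6位,而前面namei.c中的permission()对组使用的是右移3位,这里看起来应当是>>= 3;另外56行已经iput(inode),之后57、59行仍在读取inode的字段,阅读时需要留意。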
61 if ((res & 0007 & mode) == mode)
62 return 0;
63 /*
64 * XXX we are doing this test last because we really should be
65 * swapping the effective with the real user id (temporarily),
66 * and then calling suser() routine. If we do call the
67 * suser() routine, it needs to be called last.
68 */
69 if ((!current->uid) &&
70 (!(mode & 1) || (i_mode & 0111)))
71 return 0;
72 return -EACCES;
73 }
75 int sys_chdir(const char * filename)
76 {
77 struct m_inode * inode;
78
79 if (!(inode = namei(filename)))
80 return -ENOENT;
81 if (!S_ISDIR(inode->i_mode)) {
82 iput(inode);
83 return -ENOTDIR;
84 }
85 iput(current->pwd);
86 current->pwd = inode;
87 return (0);
88 }
90 int sys_chroot(const char * filename)
91 {
92 struct m_inode * inode;
93
94 if (!(inode=namei(filename)))
95 return -ENOENT;
96 if (!S_ISDIR(inode->i_mode)) {
97 iput(inode);
98 return -ENOTDIR;
99 }
100 iput(current->root);
101 current->root = inode;
102 return (0);
103 }
105 int sys_chmod(const char * filename,int mode)
106 {
107 struct m_inode * inode;
108
109 if (!(inode=namei(filename)))
110 return -ENOENT;
111 if ((current->euid != inode->i_uid) && !suser()) {
112 iput(inode);
113 return -EACCES;
114 }
115 inode->i_mode = (mode & 07777) | (inode->i_mode & ~07777);
116 inode->i_dirt = 1;
117 iput(inode);
118 return 0;
119 }
120
121 int sys_chown(const char * filename,int uid,int gid)
122 {
123 struct m_inode * inode;
125 if (!(inode=namei(filename)))
126 return -ENOENT;
127 if (!suser()) {
128 iput(inode);
129 return -EACCES;
130 }
131 inode->i_uid=uid;
132 inode->i_gid=gid;
133 inode->i_dirt=1;
134 iput(inode);
135 return 0;
136 }
138 int sys_open(const char * filename,int flag,int mode)
139 {
140 struct m_inode * inode;
141 struct file * f;
142 int i,fd;
144 mode &= 0777 & ~current->umask;
145 for(fd=0 ; fd<NR_OPEN ; fd++)
146 if (!current->filp[fd])
147 break;
148 if (fd>=NR_OPEN)
149 return -EINVAL;
150 current->close_on_exec &= ~(1<<fd);
151 f=0+file_table;
152 for (i=0 ; i<NR_FILE ; i++,f++)
153 if (!f->f_count) break;
154 if (i>=NR_FILE)
155 return -EINVAL;
156 (current->filp[fd]=f)->f_count++;
递增引用计数
157 if ((i=open_namei(filename,flag,mode,&inode))<0) {
158 current->filp[fd]=NULL;
159 f->f_count=0;
160 return i;
161 }
162 /* ttys are somewhat special (ttyxx major==4, tty major==5) */
163 if (S_ISCHR(inode->i_mode))
164 if (MAJOR(inode->i_zone[0])==4) {
165 if (current->leader && current->tty<0) {
166 current->tty = MINOR(inode->i_zone[0]);
167 tty_table[current->tty].pgrp = current->pgrp;
对于主设备号为4的字符设备(ttyxx),如果当前进程是组长进程并且还没有控制终端(current->tty<0)
166行就设置它的tty为该设备的子设备号(MINOR)
167行把tty_table中与该tty对应的表项的进程组号(pgrp)设置为当前进程的进程组号
168 }
169 } else if (MAJOR(inode->i_zone[0])==5)
170 if (current->tty<0) { //没有终端,出错返回
171 iput(inode);
172 current->filp[fd]=NULL;
173 f->f_count=0;
174 return -EPERM;
175 }
176 /* Likewise with block-devices: check for floppy_change */
177 if (S_ISBLK(inode->i_mode))
178 check_disk_change(inode->i_zone[0]);
179 f->f_mode = inode->i_mode;
180 f->f_flags = flag;
181 f->f_count = 1;
182 f->f_inode = inode;
183 f->f_pos = 0;
184 return (fd);
185 }
187 int sys_creat(const char * pathname, int mode)
188 {
189 return sys_open(pathname, O_CREAT | O_TRUNC, mode);
190 }
192 int sys_close(unsigned int fd)
193 {
194 struct file * filp;
196 if (fd >= NR_OPEN)
197 return -EINVAL;
198 current->close_on_exec &= ~(1<<fd);
199 if (!(filp = current->filp[fd]))
200 return -EINVAL;
201 current->filp[fd] = NULL;
202 if (filp->f_count == 0)
203 panic("Close: file count is 0");
204 if (--filp->f_count)
205 return (0);
上面看到我们释放了filp[fd],还记得我们在打开文件时,从系统file_table中搜索一个引用计数为0的file,现在我们在关闭时对应的递减了它的引用计数,使得它可以重新被使用。
206 iput(filp->f_inode);
207 return (0);
208 }
另外还有一点需要说明一下,我们知道file_table是一个全局数组,用于存放系统中file,但是C语言定义一个数组时默认是不会对其进行初始化的,它是在fs/super.c文件的mount_root中初始化的
250 for(i=0;i<NR_FILE;i++)
251 file_table[i].f_count=0;
可以看到,只初始化f_count为0即可,我们在上面的sys_open中也看到了,它判断一个file空闲的标准就是根据这里的f_count是否为0
总结
大部分函数都是对inode的属性或者根据inode的属性进行一些操作或设置。看一下sys_open,它执行打开操作,操作对象可能是常规文件,也可能是设备等,打开的文件都要对应一个file,因此先从file_table中寻找空闲file。然后根据文件名执行打开操作,这个我们之前已经详细分析过了。操作成功后,如果打开的是tty设备则进行一些设置操作,最后初始化file的其他几个属性。对于sys_close,它根据指定的文件描述符清除close_on_exec中的相应的位,设current->filp[fd]为NULL,并递减相应文件的引用计数。
1 /*
2 * linux/fs/buffer.c
3 *
4 * (C) 1991 Linus Torvalds
5 */
6
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting a interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it. NOTE! As interrupts
11 * can wake up a caller, some cli-sti sequences are needed to check for
12 * sleep-on-calls. These should be extremely quick, though (I hope).
13 */
15 /*
16 * NOTE! There is one discordant note here: checking floppies for
17 * disk change. This is where it fits best, I think, as it should
18 * invalidate changed floppy-disk-caches.
19 */
20
21 #include <stdarg.h>
22
23 #include <linux/config.h>
24 #include <linux/sched.h>
25 #include <linux/kernel.h>
26 #include <asm/system.h>
27 #include <asm/io.h>
29 extern int end; //由链接器生成的,指向内核空间末端后一个字节
30 struct buffer_head * start_buffer = (struct buffer_head *) &end;
31 struct buffer_head * hash_table[NR_HASH];
32 static struct buffer_head * free_list; //空闲链表头
33 static struct task_struct * buffer_wait = NULL; //等待空闲缓冲块而睡眠的任务队列
34 int NR_BUFFERS = 0; //缓冲块个数
36 static inline void wait_on_buffer(struct buffer_head * bh)
37 {
38 cli();
39 while (bh->b_lock)
40 sleep_on(&bh->b_wait);
41 sti();
42 }
等待指定缓冲块解锁
44 int sys_sync(void)
45 {
46 int i;
47 struct buffer_head *bh;
49 sync_inodes(); /* write out inodes into buffers */
50 bh = start_buffer;
51 for (i=0 ; i<NR_BUFFERS ; i++,bh++) {
52 wait_on_buffer(bh);
53 if (bh->b_dirt)
54 ll_rw_block(WRITE,bh);
55 }
56 return 0;
57 }
49行先进行inode的同步(具体过程下面分析),随后遍历所有的buffer_head,如果标记为脏,就进行写操作将其写入到磁盘中。
现在看一下sync_inodes:
fs/inode.c
59 void sync_inodes(void)
60 {
61 int i;
62 struct m_inode * inode;
63
64 inode = 0+inode_table;
65 for(i=0 ; i<NR_INODE ; i++,inode++) {
66 wait_on_inode(inode);
67 if (inode->i_dirt && !inode->i_pipe)
68 write_inode(inode);
69 }
70 }
之前的文章中提到过,内核把所有存在于内核中的inode保存在一个数组inode_table中,现在就遍历这个数组,先等待该inode解锁(wait_on_inode),然后如果它是脏的,同时不是pipe,就执行write_inode。
write_inode(inode):
314 static void write_inode(struct m_inode * inode)
315 {
316 struct super_block * sb;
317 struct buffer_head * bh;
318 int block;
319
320 lock_inode(inode);
321 if (!inode->i_dirt || !inode->i_dev) {
322 unlock_inode(inode);
323 return;
324 }
如果inode是干净的,或者它没有对应的设备(i_dev为0),解锁后直接返回
325 if (!(sb=get_super(inode->i_dev)))
326 panic("trying to write inode without device");
获取分区超级块
327 block = 2 + sb->s_imap_blocks + sb->s_zmap_blocks +
328 (inode->i_num-1)/INODES_PER_BLOCK;
这里是计算该inode节点在磁盘上所在的块号,这是为了先把该块读入缓冲区,再用内存中的inode覆盖其中对应的磁盘inode。
我们再来看一下为什么这么计算,2在这里分别代表了引导块与超级块,然后是imap所占的块数,然后是zmap(逻辑块map)所占的块数。inode->i_num是inode的编号(代码中先减1),除以INODES_PER_BLOCK表示它在inode区中的块号偏移。
329 if (!(bh=bread(inode->i_dev,block)))
330 panic("unable to read i-node block");
读取参数inode所在块的内容。
331 ((struct d_inode *)bh->b_data)
332 [(inode->i_num-1)%INODES_PER_BLOCK] =
333 *(struct d_inode *)inode;
这里的目的是把参数中的inode写入磁盘中。但是这里还是会经过缓冲区。
334 bh->b_dirt=1;
335 inode->i_dirt=0;
这时把bh设为脏,inode设为干净就可以了。
336 brelse(bh);
唤醒其他等待bh的任务
337 unlock_inode(inode);
解锁inode
338 }
可以看到这里只是把inode放回到了缓冲区,等待写入磁盘。
我们继续回到前面,sync_inodes就是遍历inode table,把所有标记为脏并且不是pipe类型的inode写入磁盘。当然它必须先写入高速缓冲区,注意它是怎么计算对应磁盘位置的。
继续回到sys_sync:
44 int sys_sync(void)
45 {
46 int i;
47 struct buffer_head * bh;
48
49 sync_inodes(); /* write out inodes into buffers */
50 bh = start_buffer;
51 for (i=0 ; i<NR_BUFFERS ; i++,bh++) {
52 wait_on_buffer(bh);
53 if (bh->b_dirt)
54 ll_rw_block(WRITE,bh);
55 }
56 return 0;
57 }
这里50行开始会遍历所有的buffer_head,如果没有其他任务锁定这个buffer_head,并且buffer是脏的,就调用54行(驱动程序)把缓冲区内容写入到硬盘中。
59 int sync_dev(int dev)
60 {
61 int i;
62 struct buffer_head * bh;
63
64 bh = start_buffer;
65 for (i=0 ; i<NR_BUFFERS ; i++,bh++) {
66 if (bh->b_dev != dev)
67 continue;
68 wait_on_buffer(bh);
69 if (bh->b_dev == dev && bh->b_dirt)
70 ll_rw_block(WRITE,bh);
71 }
72 sync_inodes();
73 bh = start_buffer;
74 for (i=0 ; i<NR_BUFFERS ; i++,bh++) {
75 if (bh->b_dev != dev)
76 continue;
77 wait_on_buffer(bh);
78 if (bh->b_dev == dev && bh->b_dirt)
79 ll_rw_block(WRITE,bh);
80 }
81 return 0;
82 }
这里分三步执行,是从效率的角度上考虑的:第一步先把脏的buffer写入到磁盘,第二步同步inode节点,第三步把因为同步inode节点变脏的buffer再写入磁盘。
===========================================
我们平时编程时涉及到文件写操作的,如果仅仅是把数据写入到应用程序缓冲区中,这时高速缓冲区中并没有它的记录,这时如果应用程序退出,就会造成数据丢失。如果执行了flush就会把数据刷到高速缓冲区中。
===========================================
84 void inline invalidate_buffers(int dev)
85 {
86 int i;
87 struct buffer_head * bh;
88
89 bh = start_buffer;
90 for (i=0 ; i<NR_BUFFERS ; i++,bh++) {
91 if (bh->b_dev != dev)
92 continue;
93 wait_on_buffer(bh);
94 if (bh->b_dev == dev)
95 bh->b_uptodate = bh->b_dirt = 0;
96 }
97 }
很简单,93行首先等待对bh的执行权,然后只需要设置标记b_uptodate,b_dirt即可。
99 /*
100 * This routine checks whether a floppy has been changed, and
101 * invalidates all buffer-cache-entries in that case. This
102 * is a relatively slow routine, so we have to try to minimize using
103 * it. Thus it is called only upon a 'mount' or 'open'. This
104 * is the best way of combining speed and utility, I think.
105 * People changing diskettes in the middle of an operation deserve
106 * to loose :-)
107 *
108 * NOTE! Although currently this is only for floppies, the idea is
109 * that any additional removable block-device will use this routine,
110 * and that mount/open needn't know that floppies/whatever are
111 * special.
112 */
113 void check_disk_change(int dev)
114 {
115 int i;
116
117 if (MAJOR(dev) != 2)
118 return;
119 if (!floppy_change(dev & 0x03))
120 return;
121 for (i=0 ; i<NR_SUPER ; i++)
122 if (super_block[i].s_dev == dev)
123 put_super(super_block[i].s_dev);
124 invalidate_inodes(dev);
125 invalidate_buffers(dev);
126 }
128 #define _hashfn(dev,block) (((unsigned)(dev^block))%NR_HASH)
129 #define hash(dev,block) hash_table[_hashfn(dev,block)]
131 static inline void remove_from_queues(struct buffer_head * bh)
132 {
133 /* remove from hash-queue */
134 if (bh->b_next)
135 bh->b_next->b_prev = bh->b_prev;
136 if (bh->b_prev)
137 bh->b_prev->b_next = bh->b_next;
buffer_head通过b_prev和b_next来链接成双向链表
138 if (hash(bh->b_dev,bh->b_blocknr) == bh)
139 hash(bh->b_dev,bh->b_blocknr) = bh->b_next;
hash table中相应的slot指向hash值相同的链表
140 /* remove from free list */
141 if (!(bh->b_prev_free) || !(bh->b_next_free))
142 panic("Free block list corrupted");
143 bh->b_prev_free->b_next_free = bh->b_next_free;
144 bh->b_next_free->b_prev_free = bh->b_prev_free;
通过b_next_free和b_prev_free连接成空闲双向链表
145 if (free_list == bh)
146 free_list = bh->b_next_free;
free_list作为空闲链表表头
147 }
149 static inline void insert_into_queues(struct buffer_head * bh)
150 {
151 /* put at end of free list */
152 bh->b_next_free = free_list;
153 bh->b_prev_free = free_list->b_prev_free;
154 free_list->b_prev_free->b_next_free = bh;
155 free_list->b_prev_free = bh;
可见新插入的buffer_head放在free_list链表尾部,因此尾部是最近使用的,首部则是最久未使用的
156 /* put the buffer in new hash-queue if it has a device */
157 bh->b_prev = NULL;
158 bh->b_next = NULL;
159 if (!bh->b_dev)
160 return;
161 bh->b_next = hash(bh->b_dev,bh->b_blocknr);
162 hash(bh->b_dev,bh->b_blocknr) = bh;
163 bh->b_next->b_prev = bh;
添加到hash表中
164 }
166 static struct buffer_head * find_buffer(int dev, int block)
167 {
168 struct buffer_head * tmp;
169
170 for (tmp = hash(dev,block) ; tmp != NULL ; tmp = tmp->b_next)
171 if (tmp->b_dev==dev && tmp->b_blocknr==block)
172 return tmp;
173 return NULL;
174 }
非常简单。
176 /*
177 * Why like this, I hear you say... The reason is race-conditions.
178 * As we don't lock buffers (unless we are readint them, that is),
179 * something might happen to it while we sleep (ie a read-error
180 * will force it bad). This shouldn't really happen currently, but
181 * the code is ready.
182 */
183 struct buffer_head * get_hash_table(int dev, int block)
184 {
185 struct buffer_head * bh;
186
187 for (;;) {
188 if (!(bh=find_buffer(dev,block)))
189 return NULL;
190 bh->b_count++;
191 wait_on_buffer(bh);
192 if (bh->b_dev == dev && bh->b_blocknr == block)
193 return bh;
194 bh->b_count--;
195 }
196 }
返回对应设备和块号的buffer_head.192行重新判断是因为在睡眠过程中,可能整个世界都变了
205 #define BADNESS(bh) (((bh)->b_dirt<<1)+(bh)->b_lock)
206 struct buffer_head * getblk(int dev,int block)
207 {
208 struct buffer_head * tmp, * bh;
210 repeat:
211 if (bh = get_hash_table(dev,block))
212 return bh;
如果没有找到
213 tmp = free_list;
214 do {
215 if (tmp->b_count)
216 continue;
217 if (!bh || BADNESS(tmp)<BADNESS(bh)) {
218 bh = tmp;
219 if (!BADNESS(tmp))
220 break;
221 }
222 /* and repeat until we find something good */
223 } while ((tmp = tmp->b_next_free) != free_list);
224 if (!bh) {
225 sleep_on(&buffer_wait);
226 goto repeat;
227 }
228 wait_on_buffer(bh);
229 if (bh->b_count)
230 goto repeat;
231 while (bh->b_dirt) {
232 sync_dev(bh->b_dev);
233 wait_on_buffer(bh);
234 if (bh->b_count)
235 goto repeat;
236 }
237 /* NOTE!! While we slept waiting for this block, somebody else might */
238 /* already have added "this" block to the cache. check it */
239 if (find_buffer(dev,block))
240 goto repeat;
241 /* OK, FINALLY we know that this buffer is the only one of it's kind, */
242 /* and that it's unused (b_count=0), unlocked (b_lock=0), and clean */
243 bh->b_count=1;
244 bh->b_dirt=0;
245 bh->b_uptodate=0;
246 remove_from_queues(bh);
247 bh->b_dev=dev;
248 bh->b_blocknr=block;
249 insert_into_queues(bh);
250 return bh;
251 }
253 void brelse(struct buffer_head * buf)
254 {
255 if (!buf)
256 return;
257 wait_on_buffer(buf);
258 if (!(buf->b_count--))
259 panic("Trying to free free buffer");
260 wake_up(&buffer_wait);
261 }
关于这个函数只解释一下buffer_wait,我们知道buffer_head是有限的,如果一个任务请求磁盘操作,但此时所有的buffer_head都不是空闲的,那么该任务只有等待在buffer_wait上面。
263 /*
264 * bread() reads a specified block and returns the buffer that contains
265 * it. It returns NULL if the block was unreadable.
266 */
267 struct buffer_head * bread(int dev,int block)
268 {
269 struct buffer_head * bh;
271 if (!(bh=getblk(dev,block)))
272 panic("bread: getblk returned NULL\n");
273 if (bh->b_uptodate)
274 return bh;
275 ll_rw_block(READ,bh);
276 wait_on_buffer(bh);
277 if (bh->b_uptodate)
278 return bh;
279 brelse(bh);
280 return NULL;
281 }
这个我们在分析其他源码的过程中已经详细分析过了。
先从缓冲区中读,如果读不到就向设备驱动程序发起请求。
283 #define COPYBLK(from,to) \
284 __asm__("cld\n\t" \
285 "rep\n\t" \
286 "movsl\n\t" \
287 ::"c" (BLOCK_SIZE/4),"S" (from),"D" (to) \
288 :"cx","di","si")
rep movsl把DS:ESI指向的源数据(from)复制到ES:EDI指向的目标地址(to),ECX为重复次数,即BLOCK_SIZE/4个长字
290 /*
291 * bread_page reads four buffers into memory at the desired address. It's
292 * a function of its own, as there is some speed to be got by reading them
293 * all at the same time, not waiting for one to be read, and then another
294 * etc.
295 */
296 void bread_page(unsigned long address,int dev,int b[4])
297 {
298 struct buffer_head * bh[4];
299 int i;
300
301 for (i=0 ; i<4 ; i++)
302 if (b[i]) {
303 if (bh[i] = getblk(dev,b[i]))
304 if (!bh[i]->b_uptodate)
305 ll_rw_block(READ,bh[i]);
306 } else
307 bh[i] = NULL;
308 for (i=0 ; i<4 ; i++,address += BLOCK_SIZE)
309 if (bh[i]) {
310 wait_on_buffer(bh[i]);
311 if (bh[i]->b_uptodate)
312 COPYBLK((unsigned long) bh[i]->b_data,address);
313 brelse(bh[i]);
314 }
315 }
这里也比较简单,读取4个块,并拷贝到指定内存地址后释放buffer_head
317 /*
318 * Ok, breada can be used as bread, but additionally to mark other
319 * blocks for reading as well. End the argument list with a negative
320 * number.
321 */
322 struct buffer_head * breada(int dev,int first, ...)
323 {
324 va_list args;
325 struct buffer_head * bh, *tmp;
326
327 va_start(args,first);
328 if (!(bh=getblk(dev,first)))
329 panic("bread: getblk returned NULL\n");
330 if (!bh->b_uptodate)
331 ll_rw_block(READ,bh);
332 while ((first=va_arg(args,int))>=0) {
333 tmp=getblk(dev,first);
334 if (tmp) {
335 if (!tmp->b_uptodate)
336 ll_rw_block(READA,bh);
337 tmp->b_count--;
338 }
339 }
340 va_end(args);
341 wait_on_buffer(bh);
342 if (bh->b_uptodate)
343 return bh;
344 brelse(bh);
345 return (NULL);
346 }
这个函数可以接收可变参数,但是原理上与前面的bread一致
348 void buffer_init(long buffer_end)
349 {
350 struct buffer_head * h = start_buffer; //前面已经初始化了
351 void * b;
352 int i;
353
354 if (buffer_end == 1<<20)
355 b = (void *) (640*1024);
356 else
357 b = (void *) buffer_end;
358 while ( (b -= BLOCK_SIZE) >= ((void *) (h+1)) ) {
359 h->b_dev = 0;
360 h->b_dirt = 0;
361 h->b_count = 0;
362 h->b_lock = 0;
363 h->b_uptodate = 0;
364 h->b_wait = NULL;
365 h->b_next = NULL;
366 h->b_prev = NULL;
367 h->b_data = (char *) b;
368 h->b_prev_free = h-1;
369 h->b_next_free = h+1;
b_prev_free指向内存低地址方向的前一个buffer_head(h-1),b_next_free指向内存高地址方向的后一个buffer_head(h+1)。
370 h++;
每个BLOCK_SIZE为1K,所以从尾部end开始为每一个buff设置buffer_head与其对应
371 NR_BUFFERS++;
372 if (b == (void *) 0x100000)
373 b = (void *) 0xA0000;
374 }
375 h--;
376 free_list = start_buffer;
377 free_list->b_prev_free = h; //双向链表
378 h->b_next_free = free_list; //处理链表最后一个和第一个
379 for (i=0;i<NR_HASH;i++)
380 hash_table[i]=NULL; //hash_table初始为空
381 }
到这里我们就介绍完了buffer.c整个源文件
总结
我们就从初始化函数开始总结,在buffer_init中,从缓冲区内存的尾部开始向前划分出每个数据块,同时从缓冲区首部开始依次为每个数据块设置一个buffer_head来描述它。缓冲区的地址就保存在buffer_head的b_data域中。初始时,hash_table内容均为空。空闲链表从尾部一直连接到首部(双向的)。后续在读取磁盘内容时,对应的内容会被读到buffer_head中,并且添加到hash_table和free_list中。hash_table中的冲突域采用b_next和b_prev属性链接成链。free_list指向的空闲链表则是通过buffer_head的b_prev_free和b_next_free链接成双向链表的。对于sync同步操作,依次遍历buffer_head,对标记为脏的buffer_head进行写操作,这里的写操作是实际调用磁盘驱动程序实现的。对于getblk操作,首先会通过hash方法寻找,如果发现为空,说明这时此磁盘内容还未被读入。由于每一个读入的磁盘内容都需要在缓冲区中使用一个buffer_head对其进行描述,因此现在就需要找到这样一个buffer_head,这是从空闲链表中进行寻找的。找到之后对该buffer_head的属性进行设置,注意此时我们并没有真正调用磁盘驱动程序来实际读取内容,只是把这个buffer_head加入相应的空闲链表和hash_table之中。真正的读取操作是在bread中进行的。在bread中首先进行getblk操作得到buffer_head后根据它的b_uptodate(内容是否是新的)标记来决定是否执行实际的读盘操作。至于释放操作brelse比较简单,首先等待buffer_head解锁,然后递减引用计数(b_count),最后唤醒等待在buffer_wait上的任务。
namei.c
1 /*
2 * linux/fs/namei.c
3 *
4 * (C) 1991 Linus Torvalds
5 */
6
7 /*
8 * Some corrections by tytso.
9 */
10
11 #include <linux/sched.h>
12 #include <linux/kernel.h>
13 #include <asm/segment.h>
14
15 #include <string.h>
16 #include <fcntl.h>
17 #include <errno.h>
18 #include <const.h>
19 #include <sys/stat.h>
20
21 #define ACC_MODE(x) ("\004\002\006\377"[(x)&O_ACCMODE])
"\004\002\006\377"看成字符数组,[(x)&O_ACCMODE]就是数组索引
23 /*
24 * comment out this line if you want names > NAME_LEN chars to be
25 * truncated. Else they will be disallowed.
26 */
27 /* #define NO_TRUNCATE */
28
29 #define MAY_EXEC 1
30 #define MAY_WRITE 2
31 #define MAY_READ 4
33 /*
34 * permission()
35 *
36 * is used to check for read/write/execute permissions on a file.
37 * I don't know if we should look at just the euid or both euid and
38 * uid, but that should be easily changed.
39 */
40 static int permission(struct m_inode * inode,int mask)
41 {
42 int mode = inode->i_mode;
43
44 /* special case: not even root can read/write a deleted file */
45 if (inode->i_dev && !inode->i_nlinks) //设备不为0,链接数为0,说明已经被删除
46 return 0; //返回
47 else if (current->euid==inode->i_uid) //当前进程的euid == inode的uid
48 mode >>= 6;
49 else if (current->egid==inode->i_gid) //当前进程的egid == inode的gid
50 mode >>= 3;
51 if (((mode & mask & 0007) == mask) || suser())//如果mode与mask的判断通过或者是超级用户,返回1
52 return 1;
53 return 0;
54 }
下面介绍的函数涉及到struct dir_entry,所以我们先来看一下:
157 struct dir_entry {
158 unsigned short inode; //目录对应的inode
159 char name[NAME_LEN]; //目录名
160 };
56 /*
57 * ok, we cannot use strncmp, as the name is not in our data space.
58 * Thus we'll have to use match. No big problem. Match also makes
59 * some sanity tests.
60 *
61 * NOTE! unlike strncmp, match returns 1 for success, 0 for failure.
62 */
63 static int match(int len,const char * name,struct dir_entry * de)
64 {
65 register int same __asm__("ax");
66
67 if (!de || !de->inode || len > NAME_LEN)
68 return 0;
69 if (len < NAME_LEN && de->name[len])
70 return 0;
这个情况下,参数中指定的name长度小于目录名实际长度,直接返回不匹配0.
71 __asm__("cld\n\t"
72 "fs ; repe ; cmpsb\n\t"
73 "setz %%al"
74 :"=a" (same)
75 :"0" (0),"S" ((long) name),"D" ((long) de->name),"c" (len)
76 :"cx","di","si");
77 return same;
78 }
80 /*
81 * find_entry()
82 *
83 * finds an entry in the specified directory with the wanted name. It
84 * returns the cache buffer in which the entry was found, and the entry
85 * itself (as a parameter - res_dir). It does NOT read the inode of the
86 * entry - you'll have to do that yourself if you want to.
87 *
88 * This also takes care of the few special cases due to '..'-traversal
89 * over a pseudo-root and a mount point.
90 */
91 static struct buffer_head * find_entry(struct m_inode ** dir,
92 const char * name, int namelen, struct dir_entry ** res_dir)
93 {
94 int entries;
95 int block,i;
96 struct buffer_head * bh;
97 struct dir_entry * de;
98 struct super_block * sb;
99
100 #ifdef NO_TRUNCATE
101 if (namelen > NAME_LEN)
102 return NULL;
103 #else //这时会截取字符串
104 if (namelen > NAME_LEN)
105 namelen = NAME_LEN;
106 #endif
107 entries = (*dir)->i_size / (sizeof (struct dir_entry));
如果inode是目录的话,那么它的内容将以目录项dir_entry的形式存放。这里是用来计算目录项个数的
108 *res_dir = NULL;
109 if (!namelen)
110 return NULL;
111 /* check for '..', as we might have to do some "magic" for it */
112 if (namelen==2 && get_fs_byte(name)=='.' && get_fs_byte(name+1)=='.') { // ".." 的情况
113 /* '..' in a pseudo-root results in a faked '.' (just change namelen) */
114 if ((*dir) == current->root) //如果指定目录是当前进程的伪根目录
115 namelen=1; //这时".."应变为"."
116 else if ((*dir)->i_num == ROOT_INO) { //如果指定目录是安装点
117 /* '..' over a mount-point results in 'dir' being exchanged for the mounted
118 directory-inode. NOTE! We set mounted, so that we can iput the new dir */
在安装点上,".."会导致目录变为安装目录
119 sb=get_super((*dir)->i_dev);
120 if (sb->s_imount) {
121 iput(*dir);
122 (*dir)=sb->s_imount;
123 (*dir)->i_count++;
124 }
125 }
126 }
127 if (!(block = (*dir)->i_zone[0]))
128 return NULL;
先取出第一个块号
129 if (!(bh = bread((*dir)->i_dev,block)))
130 return NULL;
读取该块
131 i = 0;
132 de = (struct dir_entry *) bh->b_data;
转为dir_entry
133 while (i < entries) { //遍历每个entry
134 if ((char *)de >= BLOCK_SIZE+bh->b_data) { //如果已经搜索完了整个块
135 brelse(bh); //释放该块
136 bh = NULL;
137 if (!(block = bmap(*dir,i/DIR_ENTRIES_PER_BLOCK)) ||
138 !(bh = bread((*dir)->i_dev,block))) {
137行,首先根据dir中的块号计算实际块号(在磁盘中的实际块号)
然后读取该块。如果存在块号为0或者bh为NULL,则执行下面139,140行(这一块内没有存放目录或文件)
139 i += DIR_ENTRIES_PER_BLOCK;
140 continue;
141 }
142 de = (struct dir_entry *) bh->b_data;
143 }
144 if (match(namelen,name,de)) {
145 *res_dir = de;
146 return bh;
147 }
如果匹配了,就把该dir_entry保存到res_dir中,返回该bh
148 de++;
149 i++;
如果当前目录项de不匹配,继续搜索下一个
150 }//while
151 brelse(bh);
152 return NULL;
遍历完还没找到就放回bh,返回NULL
153 }
155 /*
156 * add_entry()
157 *
158 * adds a file entry to the specified directory, using the same
159 * semantics as find_entry(). It returns NULL if it failed.
160 *
161 * NOTE!! The inode part of 'de' is left at 0 - which means you
162 * may not sleep between calling this and putting something into
163 * the entry, as someone else might have used it while you slept.
164 */
165 static struct buffer_head * add_entry(struct m_inode * dir,
166 const char * name, int namelen, struct dir_entry ** res_dir)
167 {
168 int block,i;
169 struct buffer_head * bh;
170 struct dir_entry * de;
171
172 *res_dir = NULL;
173 #ifdef NO_TRUNCATE
174 if (namelen > NAME_LEN)
175 return NULL;
176 #else
177 if (namelen > NAME_LEN)
178 namelen = NAME_LEN;
179 #endif
180 if (!namelen)
181 return NULL;
182 if (!(block = dir->i_zone[0]))
183 return NULL;
184 if (!(bh = bread(dir->i_dev,block)))
185 return NULL;
186 i = 0;
187 de = (struct dir_entry *) bh->b_data;
上面与 find_entry都是一样的
188 while (1) {
189 if ((char *)de >= BLOCK_SIZE+bh->b_data) { //如果一个逻辑块遍历完成
190 brelse(bh); //把遍历完的块放回
191 bh = NULL;
192 block = create_block(dir,i/DIR_ENTRIES_PER_BLOCK); //获取下一个块号
193 if (!block)
194 return NULL;
195 if (!(bh = bread(dir->i_dev,block))) { //读入下一个块
196 i += DIR_ENTRIES_PER_BLOCK; //如果下一个块不存在,跳过,需要更新i
197 continue;
198 }
199 de = (struct dir_entry *) bh->b_data;
200 }
201 if (i*sizeof(struct dir_entry) >= dir->i_size) {
202 de->inode=0;
203 dir->i_size = (i+1)*sizeof(struct dir_entry);
204 dir->i_dirt = 1;
205 dir->i_ctime = CURRENT_TIME;
206 }
201行为true说明指定的目录没有删除的空目录项,现在要向它添加一个目录项,因此203行增加它的大小。202行暂时把目录项的inode设为0.置位脏标记,修改i_ctime。
207 if (!de->inode) {
208 dir->i_mtime = CURRENT_TIME;
209 for (i=0; i < NAME_LEN ; i++)
210 de->name[i]=(i<namelen)?get_fs_byte(name+i):0;
211 bh->b_dirt = 1;
212 *res_dir = de;
213 return bh;
214 }
207行为true,说明找到了满足条件的目录项,它或许是由于之前删除而留下的空项,或者是由于我们在201的if中为该目录新添加的。208行修改它的mtime;209-210行为其name属性赋值,211行标记bh为脏;212把目录项保存到res_dir;213返回该bh
215 de++;
216 i++;
如果当前项不符合,215,216行递增地址和计数,准备遍历下一个
217 }
218 brelse(bh);
219 return NULL;
220 }
222 /*
223 * get_dir()
224 *
225 * Getdir traverses the pathname until it hits the topmost directory.
226 * It returns NULL on failure.
227 */
228 static struct m_inode *get_dir(const char * pathname)
229 {
230 char c;
231 const char * thisname;
232 struct m_inode * inode;
233 struct buffer_head * bh;
234 int namelen,inr,idev;
235 struct dir_entry * de;
236
237 if (!current->root || !current->root->i_count)
238 panic("No root inode");
239 if (!current->pwd || !current->pwd->i_count)
240 panic("No cwd inode");
241 if ((c=get_fs_byte(pathname))=='/') {
242 inode = current->root;
243 pathname++;
244 } else if (c)
245 inode = current->pwd;
246 else
247 return NULL; /* empty name is bad */
因为是要获取the topmost directory,所以只要pathname中第一个字符为'/',inode就设为根目录;
否则如果pathname不为空,inode就设为当前工作目录;如果为空,则返回NULL
248 inode->i_count++; //递增引用计数
249 while (1) {
250 thisname = pathname;
251 if (!S_ISDIR(inode->i_mode) || !permission(inode,MAY_EXEC)) { //权限检查
252 iput(inode);
253 return NULL;
254 }
255 for(namelen=0;(c=get_fs_byte(pathname++))&&(c!='/');namelen++)
256 /* nothing */ ;
243行如果pathname以/开头,则已经执行过++操作了。因此这for循环就是找pathname中的各个部分
257 if (!c)
258 return inode;
c为'\0'(字符串结束符),说明已经遍历到路径末尾,这时inode就是最后一级目录的inode,直接返回即可
259 if (!(bh = find_entry(&inode,thisname,namelen,&de))) {
260 iput(inode);
261 return NULL;
262 }
寻找目录项,找不到的话就返回NULL
263 inr = de->inode; //记录目录项对应的inode,下次循环还会用到
264 idev = inode->i_dev;
265 brelse(bh);
266 iput(inode); //放回inode(前面已经用过了)
267 if (!(inode = iget(idev,inr))) //如果找不到对应的inode,直接返回NULL
268 return NULL;
269 }
270 }
272 /*
273 * dir_namei()
274 *
275 * dir_namei() returns the inode of the directory of the
276 * specified name, and the name within that directory.
277 */
278 static struct m_inode * dir_namei(const char * pathname,
279 int * namelen, const char ** name)
280 {
281 char c;
282 const char * basename;
283 struct m_inode * dir;
284
285 if (!(dir = get_dir(pathname)))
286 return NULL;
首先获取目录inode
287 basename = pathname;
288 while (c=get_fs_byte(pathname++))
289 if (c=='/')
290 basename=pathname;
取得最后的文件名(比如/etc/passwd,这里basename就是passwd)
291 *namelen = pathname-basename-1;
保存文件名的长度
292 *name = basename;
保存文件名
293 return dir;
294 }
296 /*
297 * namei()
298 *
299 * is used by most simple commands to get the inode of a specified name.
300 * Open, link etc use their own routines, but this is enough for things
301 * like 'chmod' etc.
302 */
303 struct m_inode * namei(const char * pathname)
304 {
305 const char * basename;
306 int inr,dev,namelen;
307 struct m_inode * dir;
308 struct buffer_head * bh;
309 struct dir_entry * de;
310
311 if (!(dir = dir_namei(pathname,&namelen,&basename)))
312 return NULL;
获取目录inode
313 if (!namelen) /* special case: '/usr/' etc */
314 return dir;
315 bh = find_entry(&dir,basename,namelen,&de);
寻找目录项inode
316 if (!bh) {
317 iput(dir);
318 return NULL;
319 }
如果bh为空,放回dir
320 inr = de->inode;
321 dev = dir->i_dev;
获取目录项的inode和设备号
322 brelse(bh);
现在可以释放bh了,因为我们已经获得了需要的inode以及设备号,bh用不到了
323 iput(dir);
324 dir=iget(dev,inr);
获取目录项对应的inode(当然,目录项可能是目录也可能是文件)
325 if (dir) {
326 dir->i_atime=CURRENT_TIME; //更新访问时间
327 dir->i_dirt=1; //脏标记
328 }
329 return dir;
330 }
332 /*
333 * open_namei()
334 *
335 * namei for open - this is in fact almost the whole open-routine.
336 */
337 int open_namei(const char * pathname, int flag, int mode,
338 struct m_inode ** res_inode)
339 {
340 const char * basename;
341 int inr,dev,namelen;
342 struct m_inode * dir, *inode;
343 struct buffer_head * bh;
344 struct dir_entry * de;
345
346 if ((flag & O_TRUNC) && !(flag & O_ACCMODE))
347 flag |= O_WRONLY;
O_ACCMODE<0003>:读写文件操作时,用于取出flag的低2位
O_RDONLY<00>:只读打开
O_WRONLY<01>:只写打开
O_RDWR<02>:读写打开
348 mode &= 0777 & ~current->umask;
349 mode |= I_REGULAR;
350 if (!(dir = dir_namei(pathname,&namelen,&basename)))
351 return -ENOENT;
获取目录inode
352 if (!namelen) { /* special case: '/usr/' etc */
353 if (!(flag & (O_ACCMODE|O_CREAT|O_TRUNC))) {
354 *res_inode=dir;
355 return 0;
356 }
357 iput(dir);
358 return -EISDIR;
359 }
360 bh = find_entry(&dir,basename,namelen,&de);
获取目录项
361 if (!bh) {
//下面是没有读到目录项的情况
362 if (!(flag & O_CREAT)) {
363 iput(dir);
364 return -ENOENT;
365 }
如果没有指定不存在时创建文件,那么放回dir,返回。
366 if (!permission(dir,MAY_WRITE)) {
367 iput(dir);
368 return -EACCES;
369 }
如果指定不存在时创建文件,但是没有权限,返回。
370 inode = new_inode(dir->i_dev);
371 if (!inode) {
372 iput(dir);
373 return -ENOSPC;
374 }
否则就创建一个inode,如果创建失败,返回
375 inode->i_uid = current->euid;
376 inode->i_mode = mode;
377 inode->i_dirt = 1;
设置它的属性
378 bh = add_entry(dir,basename,namelen,&de);
添加到目录中
379 if (!bh) { //添加失败
380 inode->i_nlinks--;
381 iput(inode);
382 iput(dir);
383 return -ENOSPC;
384 }
385 de->inode = inode->i_num;
386 bh->b_dirt = 1;
387 brelse(bh);
388 iput(dir);
389 *res_inode = inode;
390 return 0;
391 }
运行到这里说明读到了目录项
392 inr = de->inode;
393 dev = dir->i_dev;
394 brelse(bh);
395 iput(dir);
396 if (flag & O_EXCL) //独占标记
397 return -EEXIST;
398 if (!(inode=iget(dev,inr)))
399 return -EACCES;
取得inode,取不到的话,返回-EACCES
400 if ((S_ISDIR(inode->i_mode) && (flag & O_ACCMODE)) ||
401 !permission(inode,ACC_MODE(flag))) {
402 iput(inode);
403 return -EPERM;
404 }
如果取得的inode是目录并且访问模式为只写或读写,或者没有相应的访问权限,则放回inode,返回-EPERM
405 inode->i_atime = CURRENT_TIME;
406 if (flag & O_TRUNC)
407 truncate(inode);
408 *res_inode = inode;
409 return 0;
410 }
总结
目录用dir_entry描述,包括inode和目录名。
根据路径找到文件的inode节点是一个非常重要的操作(namei),这个过程比较耗时,因为需要一层层的遍历目录。我们知道inode节点的i_zone数组可以用来保存磁盘块号。对于目录来说,其inode节点中保存的就是dir_entry,因此当我们需要寻找一个目录下的文件(或目录)时,需要依次读入这些磁盘块根据文件(目录)名进行比较(match),需要注意一些特殊情况,比如路径中没有指定根目录时需要以工作目录作为参考,在安装点上,".."会导致目录变为安装目录等,另外一点就是权限问题。还有一个比较重要的操作就是向目录中添加项(add_entry),同样需要搜索inode的数据块,找到一个合适的位置插入。
truncate.c
1 /*
2 * linux/fs/truncate.c
3 *
4 * (C) 1991 Linus Torvalds
5 */
6
7 #include <linux/sched.h>
8
9 #include <sys/stat.h>
11 static void free_ind(int dev,int block)
12 {
13 struct buffer_head * bh;
14 unsigned short * p;
15 int i;
16
17 if (!block)
18 return;
19 if (bh=bread(dev,block)) {
20 p = (unsigned short *) bh->b_data;
21 for (i=0;i<512;i++,p++)
22 if (*p)
23 free_block(dev,*p);
24 brelse(bh);
25 }
26 free_block(dev,block);
27 }
我们看到p是short类型的指针,说明22行中*p是一个short数据,通过23行可以看出它代表block号。这其实是用来释放文件对应的块的。因为我们知道inode节点中有一个属性如下
100 unsigned short i_zone[9];
它代表文件所占用的块号数组。其中i_zone[0] - i_zone[6]是直接块号,i_zone[7]是一次间接块号,i_zone[8]是二次间接块号。
上面这个函数就是用来释放一次间接块号的。
29 static void free_dind(int dev,int block)
30 {
31 struct buffer_head * bh;
32 unsigned short * p;
33 int i;
34
35 if (!block)
36 return;
37 if (bh=bread(dev,block)) {
38 p = (unsigned short *) bh->b_data;
39 for (i=0;i<512;i++,p++)
40 if (*p)
41 free_ind(dev,*p);
42 brelse(bh);
43 }
44 free_block(dev,block);
45 }
释放二次间接块号,这里调用了前面的释放一次间接块号的函数。
47 void truncate(struct m_inode * inode)
48 {
49 int i;
50
51 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
52 return;
常规文件或者目录才可以调用此函数
53 for (i=0;i<7;i++)
54 if (inode->i_zone[i]) {
55 free_block(inode->i_dev,inode->i_zone[i]);
56 inode->i_zone[i]=0;
57 }
释放直接块
58 free_ind(inode->i_dev,inode->i_zone[7]);
59 free_dind(inode->i_dev,inode->i_zone[8]);
释放一次和二次间接块
60 inode->i_zone[7] = inode->i_zone[8] = 0;
61 inode->i_size = 0;
文件大小变为0
62 inode->i_dirt = 1;
标记为脏
63 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
这会改变文件的修改时间i_mtime和改变时间i_ctime。
64 }
总结
这个文件整体上是比较简单的。包括三个函数:释放一级间接块、释放二级间接块、截断inode。对于直接块来说,i_zone中相应保存的就是inode所使用的磁盘块。一级间接块,顾名思义,i_zone指明的块中存放的不是普通数据,而是块号。因此对一级间接块的释放操作就是读取一级间接块,遍历其中每一个块号调用free_block进行释放,最后再释放间接块本身;对于二级间接块,读入该块后其中的每一项都是一个一级间接块的块号,于是就可以转换为对一级间接块的释放操作。truncate操作则是对inode的所有块进行释放,最后设置其大小为0.
此外留意一下对于inode的操作,其atime,ctime,mtime是如何变化的。
open.c
1 /*
2 * linux/fs/open.c
3 *
4 * (C) 1991 Linus Torvalds
5 */
7 #include <string.h>
8 #include <errno.h>
9 #include <fcntl.h>
10 #include <sys/types.h>
11 #include <utime.h>
12 #include <sys/stat.h>
14 #include <linux/sched.h>
15 #include <linux/tty.h>
16 #include <linux/kernel.h>
17 #include <asm/segment.h>
19 int sys_ustat(int dev, struct ustat * ubuf)
20 {
21 return -ENOSYS;
22 }
23
24 int sys_utime(char * filename, struct utimbuf * times)
25 {
26 struct m_inode * inode;
27 long actime,modtime;
28
29 if (!(inode=namei(filename)))
30 return -ENOENT;
31 if (times) {
32 actime = get_fs_long((unsigned long *) &times->actime);
33 modtime = get_fs_long((unsigned long *) &times->modtime);
34 } else
35 actime = modtime = CURRENT_TIME;
36 inode->i_atime = actime;
37 inode->i_mtime = modtime;
38 inode->i_dirt = 1;
39 iput(inode);
40 return 0;
41 }
更新参数中指明的文件的访问时间和修改时间,如果times不为空,就用times中的值来更新;否则就用当前时间来更新
43 /*
44 * XXX should we use the real or effective uid? BSD uses the real uid,
45 * so as to make this call useful to setuid programs.
46 */
47 int sys_access(const char * filename,int mode)
48 {
49 struct m_inode * inode;
50 int res, i_mode;
52 mode &= 0007;
53 if (!(inode=namei(filename)))
54 return -EACCES;
55 i_mode = res = inode->i_mode & 0777;
取得参数中文件的i_mode属性
56 iput(inode);
57 if (current->uid == inode->i_uid)
58 res >>= 6;
59 else if (current->gid == inode->i_gid)
60 res >>= 6;
根据当前用户与inode的关系取得相应的mode,保存到res中(注意:60行对同组用户也是右移6位,取到的仍是属主权限位;对比namei.c中permission()对组的处理(mode >>= 3),这里按权限位布局应为右移3位,应属原始源码的笔误)
61 if ((res & 0007 & mode) == mode)
62 return 0;
63 /*
64 * XXX we are doing this test last because we really should be
65 * swapping the effective with the real user id (temporarily),
66 * and then calling suser() routine. If we do call the
67 * suser() routine, it needs to be called last.
68 */
69 if ((!current->uid) &&
70 (!(mode & 1) || (i_mode & 0111)))
71 return 0;
72 return -EACCES;
73 }
75 int sys_chdir(const char * filename)
76 {
77 struct m_inode * inode;
78
79 if (!(inode = namei(filename)))
80 return -ENOENT;
81 if (!S_ISDIR(inode->i_mode)) {
82 iput(inode);
83 return -ENOTDIR;
84 }
85 iput(current->pwd);
86 current->pwd = inode;
87 return (0);
88 }
90 int sys_chroot(const char * filename)
91 {
92 struct m_inode * inode;
93
94 if (!(inode=namei(filename)))
95 return -ENOENT;
96 if (!S_ISDIR(inode->i_mode)) {
97 iput(inode);
98 return -ENOTDIR;
99 }
100 iput(current->root);
101 current->root = inode;
102 return (0);
103 }
105 int sys_chmod(const char * filename,int mode)
106 {
107 struct m_inode * inode;
108
109 if (!(inode=namei(filename)))
110 return -ENOENT;
111 if ((current->euid != inode->i_uid) && !suser()) {
112 iput(inode);
113 return -EACCES;
114 }
115 inode->i_mode = (mode & 07777) | (inode->i_mode & ~07777);
116 inode->i_dirt = 1;
117 iput(inode);
118 return 0;
119 }
120
121 int sys_chown(const char * filename,int uid,int gid)
122 {
123 struct m_inode * inode;
125 if (!(inode=namei(filename)))
126 return -ENOENT;
127 if (!suser()) {
128 iput(inode);
129 return -EACCES;
130 }
131 inode->i_uid=uid;
132 inode->i_gid=gid;
133 inode->i_dirt=1;
134 iput(inode);
135 return 0;
136 }
138 int sys_open(const char * filename,int flag,int mode)
139 {
140 struct m_inode * inode;
141 struct file * f;
142 int i,fd;
144 mode &= 0777 & ~current->umask;
145 for(fd=0 ; fd<NR_OPEN ; fd++)
146 if (!current->filp[fd])
147 break;
148 if (fd>=NR_OPEN)
149 return -EINVAL;
150 current->close_on_exec &= ~(1<<fd);
151 f=0+file_table;
152 for (i=0 ; i<NR_FILE ; i++,f++)
153 if (!f->f_count) break;
154 if (i>=NR_FILE)
155 return -EINVAL;
156 (current->filp[fd]=f)->f_count++;
递增引用计数
157 if ((i=open_namei(filename,flag,mode,&inode))<0) {
158 current->filp[fd]=NULL;
159 f->f_count=0;
160 return i;
161 }
162 /* ttys are somewhat special (ttyxx major==4, tty major==5) */
163 if (S_ISCHR(inode->i_mode))
164 if (MAJOR(inode->i_zone[0])==4) {
165 if (current->leader && current->tty<0) {
166 current->tty = MINOR(inode->i_zone[0]);
167 tty_table[current->tty].pgrp = current->pgrp;
对于字符型设备,如果当前进程是组长进程并且设备没有终端
166行就设置它的tty为inode的设备号
167行设置当前进程tty表中与当前tty对应的表项的父进程组号等于当前进程组号
168 }
169 } else if (MAJOR(inode->i_zone[0])==5)
170 if (current->tty<0) { //没有终端,出错返回
171 iput(inode);
172 current->filp[fd]=NULL;
173 f->f_count=0;
174 return -EPERM;
175 }
176 /* Likewise with block-devices: check for floppy_change */
177 if (S_ISBLK(inode->i_mode))
178 check_disk_change(inode->i_zone[0]);
179 f->f_mode = inode->i_mode;
180 f->f_flags = flag;
181 f->f_count = 1;
182 f->f_inode = inode;
183 f->f_pos = 0;
184 return (fd);
185 }
187 int sys_creat(const char * pathname, int mode)
188 {
189 return sys_open(pathname, O_CREAT | O_TRUNC, mode);
190 }
192 int sys_close(unsigned int fd)
193 {
194 struct file * filp;
196 if (fd >= NR_OPEN)
197 return -EINVAL;
198 current->close_on_exec &= ~(1<<fd);
199 if (!(filp = current->filp[fd]))
200 return -EINVAL;
201 current->filp[fd] = NULL;
202 if (filp->f_count == 0)
203 panic("Close: file count is 0");
204 if (--filp->f_count)
205 return (0);
上面看到我们释放了filp[fd],还记得我们在打开文件时,从系统file_table中搜索一个引用计数为0的file,现在我们在关闭时对应的递减了它的引用计数,使得它可以重新被使用。
206 iput(filp->f_inode);
207 return (0);
208 }
另外还有一点需要说明一下:file_table是一个全局数组,用于存放系统中所有的file结构。严格来说,C语言中具有静态存储期的全局数组默认会被初始化为0,但这需要启动代码把BSS段清零来保证;内核在这里没有依赖这一点,而是在fs/super.c文件的mount_root中显式地对它进行了初始化:
250 for(i=0;i<NR_FILE;i++)
251 file_table[i].f_count=0;
可以看到,只初始化f_count为0即可,我们在上面的sys_open中也看到了,它判断一个file空闲的标准就是根据这里的f_count是否为0
总结
大部分函数都是对inode的属性进行设置,或者根据inode的属性进行一些操作。看一下sys_open,它执行打开操作,操作对象可能是常规文件,也可能是设备等。每个打开的文件都要对应一个file,因此先从file_table中寻找空闲的file,然后根据文件名执行打开操作(open_namei,这个我们之前已经详细分析过了)。操作成功后,如果打开的是tty设备则进行一些设置操作,最后初始化file的其他几个属性。对于sys_close,它根据指定的文件描述符清除close_on_exec中相应的位,将current->filp[fd]设为NULL,并递减相应file的引用计数;当引用计数减为0时,再通过iput释放对应的inode。
相关文章推荐
- The YubiKey NEO -- Smartcard features
- 初识angular.js之爱恨情仇
- 前端的几个概念
- [LeetCode][JavaScript]LRU Cache
- JS获取地址栏参数
- jQuery中trigger()的使用方法
- javascript表单验证
- css学习笔记-1
- leetcode--Populating Next Right Pointers in Each Node
- css布局之图片被拉伸问题
- CSS小知识---回到顶部
- Swap Nodes in Pairs
- 你所不知道的JavaScript(一)this 关键字
- bootstrap布局:嵌套布局和流动布局
- Yii 中使用 yii-bootstrap 扩展
- 关于css中两层div的水平垂直居中问题
- js中的闭包之我理解
- FaceBook/infer-infer捕捉的bug类型
- CSS3知识点
- 在jsp页面如何获得url参数