您的位置：首页 > 运维架构 > Linux

Linux源码解析-poll机制

2017-07-18 14:14 281 查看

1.poll函数

关于poll函数具体是干什么的，以及什么情况下使用等参考我的其他博客

2.poll机制分析

常见系统调用一般对应内核中sys_函数名，比如我们想看poll机制，具体怎么查看源码呢？

下载Source Insight软件
下载linux源码
创建项目，导入源代码

Ctrl+Shift+F快捷键打开搜索窗口
打钩ProjectWide

搜索sys_poll

点击Select.c即可进入相应源码部分
notes：

很多其他博客都说poll调用的是内核函数sys_poll,结果在现在较新linux版本源码select.c中却找不到，较新linux版本中采用宏组合的方式来表示sys_poll,在select.c中，

我们应该查看的是SYSCALL_DEFINE3

/*

下列函数主要做了三件事

调用了一个时间转换函数，根据传入的时间计算出了一个另一种格式的超时时间
调用了do_sys_poll来完成主要工作（实现实际的轮询功能）
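do_sys_poll被信号中断后的处理。如果do_sys_poll返回-EINTR，则意味着poll操作被信号打断，此时把重启所需的参数（ufds、nfds以及超时时间）保存到restart_block中，并返回-ERESTART_RESTARTBLOCK。注意：对于ERESTART_RESTARTBLOCK，只有在该信号没有触发用户注册的信号处理函数时（例如进程被SIGSTOP暂停后又被SIGCONT恢复），内核才会通过restart_syscall调用do_restart_poll重新执行poll；如果执行了用户注册的信号处理函数，则无论是否设置了SA_RESTART，poll都会以EINTR返回用户空间。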

*/

/*
 * poll(2) system-call entry point.
 *
 * @ufds:          user-space array of struct pollfd
 * @nfds:          number of entries in @ufds
 * @timeout_msecs: timeout in milliseconds; a negative value leaves the
 *                 timeout pointer NULL when calling do_sys_poll()
 *
 * Splits the millisecond timeout into seconds and nanoseconds, hands the
 * real work to do_sys_poll(), and - if that returns -EINTR - records the
 * arguments in the current thread's restart_block and returns
 * -ERESTART_RESTARTBLOCK instead. Any other result of do_sys_poll() is
 * returned unchanged.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
		long, timeout_msecs)
{
	struct timespec end_time, *to = NULL;
	int ret;

	if (timeout_msecs >= 0) {
		to = &end_time;
		/*
		 * Time-conversion helper: per the original annotation it
		 * computes the timeout from the seconds/nanoseconds passed in
		 * and stores it in the struct timespec that 'to' points at.
		 */
		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	/* do_sys_poll() does the main work (the actual polling). */
	ret = do_sys_poll(ufds, nfds, to);

	if (ret == -EINTR) {
		/* do_sys_poll() was interrupted by a signal. */
		struct restart_block *restart_block;

		restart_block = &current_thread_info()->restart_block;
		restart_block->fn = do_restart_poll;
		restart_block->poll.ufds = ufds;
		restart_block->poll.nfds = nfds;

		/* Save the timeout computed above, if there was one. */
		if (timeout_msecs >= 0) {
			restart_block->poll.tv_sec = end_time.tv_sec;
			restart_block->poll.tv_nsec = end_time.tv_nsec;
			restart_block->poll.has_timeout = 1;
		} else
			restart_block->poll.has_timeout = 0;

		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
}

linux/poll.h
/*
 * Per-call polling state. do_sys_poll() keeps one instance on its stack and
 * poll_initwait() initialises the fields commented below.
 */
struct poll_wqueues {
poll_table pt;  /* embedded poll_table; poll_initwait() installs __pollwait in it */
struct poll_table_page *table;  /* set to NULL by poll_initwait() */
struct task_struct *polling_task;  /* set to current by poll_initwait() */
int triggered;  /* reset to 0 by poll_initwait() */
int error;  /* reset to 0 by poll_initwait(); do_poll() reads it when no descriptor is ready */
int inline_index;  /* reset to 0 by poll_initwait(); presumably the next free slot in inline_entries - confirm against __pollwait */
struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};

/* Callback type held by a poll_table; poll_initwait() installs __pollwait as one. */
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);
/*
 * NOTE(review): do_pollfd() further down assigns pwait->key, a member this
 * quoted definition does not declare - the snippets presumably come from
 * different kernel versions; confirm against the tree being studied.
 */
typedef struct poll_table_struct {
poll_queue_proc qproc;
} poll_table;   // as quoted here, the only member is this function pointer

/*
 * Prepare a poll_wqueues for a polling pass: install __pollwait as the
 * callback of the embedded poll_table and reset the bookkeeping fields.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
init_poll_funcptr(&pwq->pt, __pollwait);  // install __pollwait as the callback held by the embedded poll_table
// per the original annotation this amounts to pwq->pt.qproc = __pollwait (init_poll_funcptr() itself is not shown here)

pwq->polling_task = current;       // record the calling task (current) as the one doing the poll
pwq->triggered = 0;
pwq->error = 0;
pwq->table = NULL;
pwq->inline_index = 0;
}

/*

do_sys_poll中首先把用户空间的struct pollfd拷贝到内核空间的struct poll_list类型的链表中（具体是块链接的形式），这链表的头定义在栈空间，

而其他成员则通过kmalloc在内核空间动态分配。

创建一个struct poll_wqueues类型的挑选队列，并由poll_initwait初始化，接着调用do_poll进入循环遍历poll_list的操作

*/

/*
 * Kernel-side body of poll(): copy the caller's pollfd array into kernel
 * memory, run do_poll() over it, then copy each entry's revents back out.
 *
 * @ufds:     user-space array of struct pollfd
 * @nfds:     number of entries in @ufds
 * @end_time: timeout handed on to do_poll(); may be NULL
 *
 * Returns the result of do_poll() on success, -EINVAL if @nfds exceeds
 * rlimit(RLIMIT_NOFILE), -EFAULT if copying from or to user space fails,
 * or -ENOMEM if a chunk cannot be allocated.
 */
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
		struct timespec *end_time)
{
	struct poll_wqueues table;	/* polling state for this call */
	int err = -EFAULT, fdcount, len, size;
	/* Allocate small arguments on the stack to save memory and be
	   faster - use long to make sure the buffer is aligned properly
	   on 64 bit archs to avoid unaligned access */
	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
	/*
	 * The on-stack buffer is reinterpreted as the first struct poll_list
	 * chunk, so the list head needs no allocation.
	 */
	struct poll_list *const head = (struct poll_list *)stack_pps;
	struct poll_list *walk = head;
	unsigned long todo = nfds;	/* entries still to be copied in */

	if (nfds > rlimit(RLIMIT_NOFILE))
		return -EINVAL;

	/*
	 * First chunk: the smaller of nfds and N_STACK_PPS. Per the original
	 * annotation, N_STACK_PPS is how many struct pollfd fit in the fixed
	 * stack buffer.
	 */
	len = min_t(unsigned int, nfds, N_STACK_PPS);
	for (;;) {
		walk->next = NULL;
		walk->len = len;
		if (!len)
			break;

		/*
		 * The key step: copy the next walk->len entries of the user's
		 * array into this chunk. nfds - todo is the number of entries
		 * already copied.
		 */
		if (copy_from_user(walk->entries, ufds + nfds-todo,
					sizeof(struct pollfd) * walk->len))
			goto out_fds;

		todo -= walk->len;
		if (!todo)
			break;

		/*
		 * More entries remain: allocate another chunk of at most
		 * POLLFD_PER_PAGE entries and link it through ->next. The loop
		 * keeps going until todo reaches zero, so the result is a list
		 * whose head is the stack buffer followed by kmalloc'ed
		 * chunks, together holding every pollfd that was passed in.
		 *
		 * Per the original annotation, POLLFD_PER_PAGE is the number
		 * of struct pollfd that fit in one page: with 4 KiB pages and
		 * an 8-byte struct pollfd that is a little under 512 (the
		 * definition is not shown here).
		 */
		len = min(todo, POLLFD_PER_PAGE);
		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
		walk = walk->next = kmalloc(size, GFP_KERNEL);
		if (!walk) {
			err = -ENOMEM;
			goto out_fds;
		}
	}

	/*
	 * Initialise the polling state: polling_task becomes current and the
	 * callback pointer is set to __pollwait.
	 */
	poll_initwait(&table);
	/* Walk the chunk list and examine the descriptors stored in each chunk. */
	fdcount = do_poll(nfds, head, &table, end_time);
	poll_freewait(&table);	/* release the polling state */

	/*
	 * Write the revents of every entry back to the user's array. ufds is
	 * advanced once per entry, across chunk boundaries.
	 */
	for (walk = head; walk; walk = walk->next) {
		struct pollfd *fds = walk->entries;
		int j;

		for (j = 0; j < walk->len; j++, ufds++)
			if (__put_user(fds[j].revents, &ufds->revents))
				goto out_fds;
	}

	err = fdcount;
out_fds:
	/* Free only the kmalloc'ed chunks; head itself lives on the stack. */
	walk = head->next;
	while (walk) {
		struct poll_list *pos = walk;

		walk = walk->next;
		kfree(pos);
	}

	return err;
}

/*

其中参数nfds为用户传入的整数，代表传入的pollfd的数量，而head即为拷贝后的poll_list链表，wait是挑选队列，而end_time就是超时时间。
do_poll对poll_list链表进行循环处理，对于单个fd，则调用do_pollfd进行处理。另外注意到在一次遍历之后一旦返现do_pollfd的返回值不为0，
则说明该描述符可操作，计入count，如果count不为0或者超时则直接跳出循环，并返回活跃描述符的计数。

static int do_poll(unsigned int nfds,  struct poll_list *list,
struct poll_wqueues *wait, struct timespec *end_time)
{
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
unsigned long slack = 0;

/* Optimise the no-wait case */
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
pt = NULL;
timed_out = 1;
}

if (end_time && !timed_out)
slack = estimate_accuracy(end_time);

for (;;) {
struct poll_list *walk;

for (walk = list; walk != NULL; walk = walk->next) {  //循环遍历poll_list链表
struct pollfd * pfd, * pfd_end;

pfd = walk->entries;
pfd_end = pfd + walk->len;
for (; pfd != pfd_end; pfd++) {
/*
* Fish for events. If we found one, record it
* and kill the poll_table, so we don't
* needlessly register any other waiters after
* this. They'll get immediately deregistered
* when we break out and return.
*/
if (do_pollfd(pfd, pt)) { //对于每个fd，都调用do_pollfd函数，如果返回值不为0，则说明描述符准备就绪，可操作，count加1
count++;
pt = NULL;
}
}
}
/*
* All waiters have already been registered, so don't provide
* a poll_table to them on the next loop iteration.
*/
pt = NULL;
if (!count) {
count = wait->error;
if (signal_pending(current))
count = -EINTR;
}
if (count || timed_out)
break;

/*
* If this is the first loop and we have a timeout
* given, then we convert to ktime_t and set the to
* pointer to the expiry value.
*/
if (end_time && !to) {
expire = timespec_to_ktime(*end_time);
to = &expire;
}

if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
timed_out = 1;
}
return count;
}
以下这段总结摘自： http://blog.csdn.net/zmxiangde_88/article/details/8099049

这个函数有以下几个要注意的点：
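信号处理保障。在这个函数中先将当前进程设置为可以被信号中断，就是set_current_state(TASK_INTERRUPTIBLE)这一行，后面还会检查是否有需要处理的信号signal_pending(current)。这里的意思是就算是poll调用进入到sys_poll系统调用之后，也可以接收外部信号，从而退出当前系统调用（因为我们知道一般的系统调用都不会被中断的，所以系统调用一般都尽量很快的返回）。（注：set_current_state(TASK_INTERRUPTIBLE)是旧版内核中的写法；在上面列出的版本中，对应的是poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)这一调用。）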
外部大循环退出的条件，外部大循环退出的条件只有if (count || !*timeout) break;（注：这是旧版内核中的写法，上面列出的版本中对应的是if (count || timed_out) break;）后面的条件容易理解，就是超时，前面的count是什么意思？它在每次调用do_pollfd函数之后，都有可能会加1，其实调用do_pollfd就是检查socket描述符状态的变化，如果有变化，就会使count加1，所以在结束内部遍历之后，count保存了所有的有状态变化的socket描述符数量。
这个函数会对之前以head为头结点的链表进行遍历，然后对链表上每个结点中包含的很多struct pollfd进行遍历（这些struct pollfd都被存储在struct poll_list结构的数组字段struct pollfd entries里面）。
然后对每个struct pollfd调用do_pollfd（这会调用很多次，根据你传入多少个socket描述符而定），这个函数需要两个参数，一个是struct pollfd，这没得说的，另一个是刚刚初始化的table，就是那个暂时只是包含__pollwait回调指针的结构，还记得吧。

/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 * if non-NULL.
 *
 * Result by case, as the code below shows:
 *   - pollfd->fd is negative:            0
 *   - fget_light() finds no file:        POLLNVAL
 *   - the file has no f_op->poll method: DEFAULT_POLLMASK, filtered by the
 *                                        requested events
 *   - otherwise:                         the mask returned by f_op->poll,
 *                                        filtered by the requested events
 * POLLERR and POLLHUP always pass the filter.
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
{
	unsigned int mask;
	int fd;

	mask = 0;
	fd = pollfd->fd;
	if (fd >= 0) {
		int fput_needed;
		struct file * file;

		/*
		 * Use fd as an index to fetch the corresponding file object
		 * from the current process.
		 */
		file = fget_light(fd, &fput_needed);
		mask = POLLNVAL;
		if (file != NULL) {
			mask = DEFAULT_POLLMASK;
			if (file->f_op && file->f_op->poll) {
				/*
				 * The core call of this function. It goes
				 * through the file's own operations table, so
				 * which poll implementation runs depends on the
				 * kind of file: per the original annotation, a
				 * socket's poll for a network file, or the
				 * file system's poll for a file opened on e.g.
				 * ext3.
				 */
				if (pwait)
					pwait->key = pollfd->events |
							POLLERR | POLLHUP;
				mask = file->f_op->poll(file, pwait);
			}
			/* Mask out unneeded events. */
			mask &= pollfd->events | POLLERR | POLLHUP;
			fput_light(file, fput_needed);
		}
	}
	/* revents of the (kernel copy of the) pollfd is ultimately set here. */
	pollfd->revents = mask;

	return mask;
}

//注明：以下摘自http://blog.csdn.net/zmxiangde_88/article/details/8099049

那么各种类型的驱动poll函数机制是怎么样的呢？

后续更新

3.POLL机制总结

调用poll函数。
进入sys_poll等系列内核调用。
准备数据：注册__pollwait（这是通过初始化poll_wqueues来完成的），复制数据至内核，重新组织成struct poll_list等等。
对所有的struct pollfd循环，以调用do_pollfd函数。
do_pollfd调用file->f_op->poll函数。
然后调用__pollwait创建一个struct poll_table_entry，并将其与当前进程绑定。
将当前进程挂在socket的等待队列上。
有数据就绪时唤醒进程。

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航