structepitem { /* RB tree node used to link this structure to the eventpoll RB tree */ structrb_noderbn;/*用于挂载到eventpoll管理的红黑树*/
/* List header used to link this structure to the eventpoll ready list */ structlist_headrdllink;/*挂载到eventpoll.rdlist的事件就绪队列*/
/* * Works together "struct eventpoll"->ovflist in keeping the * single linked chain of items. */ structepitem *next;/*用于主结构体中的链表*/
/* The file descriptor information this item refers to */ structepoll_filefdffd;/*该结构体对应的被监听的文件描述符信息(fd+file, 作为红黑树的key)*/
/* Number of active wait queue attached to poll operations */ int nwait; /*poll(轮询操作)的事件个数 /* List containing poll wait queues */ structlist_headpwqlist;/*双向链表,保存被监视文件的等待队列,功能类似于select/poll中的poll_table;同一个文件上可能会监视多种事件,这些事件可能从属于不同的wait_queue中,所以需要使用链表 /* The "container" of this item */ structeventpoll *ep;/*当前epitem的所有者(多个epitem从属于一个eventpoll)*/
/* List header used to link this item to the "struct file" items list */ structlist_headfllink;/*双向链表,用来链接被监视的文件描述符对应的struct file。因为file里有f_ep_link用来保存所有监视这个文件的epoll节点 /* The structure that describe the interested events and the source fd */ structepoll_eventevent;/*注册感兴趣的事件,也就是用户空间的epoll_event };
structeventpoll { /* Protect the this structure access */ spinlock_t lock; /*自旋锁,在kernel内部用自旋锁加锁,就可以同时多线(进)程对此结构体进行操作,主要是保护ready_list*/
/* * This mutex is used to ensure that files are not removed * while epoll is using them. This is held during the event * collection loop, the file cleanup path, the epoll file exit * code and the ctl operations. */ structmutexmtx;/*防止使用时被删除*/
/* Wait queue used by sys_epoll_wait() */ wait_queue_head_t wq; /*sys_epoll_wait()使用的等待队列*/
/* Wait queue used by file->poll() */ wait_queue_head_t poll_wait; /*file->epoll()使用的等待队列*/
/* List of ready file descriptors */ structlist_headrdllist;/*事件就绪链表*/
/* RB tree root used to store monitored fd structs */ structrb_rootrbr;/*用于管理当前epoll关注的文件描述符(树根)*/
/* * This is a single linked list that chains all the "struct epitem" that * happened while transfering ready events to userspace w/out * holding ->lock. */ structepitem *ovflist;/*在向用户空间传输就绪事件的时候,将同时发生事件的文件描述符链入到这个链表里面*/ };
/*
 * Excerpt from the epoll instance creation path. The enclosing function
 * header, its locals and the error_return label are not part of this
 * excerpt.
 *
 * Sanity check on the size parameter, and create the internal data
 * structure ( "struct eventpoll" ).
 */
error = -EINVAL;
/*
 * Allocate and initialise ep. A non-positive size leaves error at -EINVAL;
 * a failing ep_alloc() overwrites it. Either way the error code is handed
 * back through fd.
 */
if (size <= 0 || (error = ep_alloc(&ep)) < 0) {
	fd = error;
	goto error_return;
}

/*
 * Creates all the items needed to setup an eventpoll file. That is,
 * a file structure and a free file descriptor.
 *
 * anon_inode_getfd() creates a new struct file, so an epoll instance can be
 * viewed as a file (an anonymous one, since no filesystem backs it). Per the
 * original annotation, ep is stored in the file's private data so that
 * sys_epoll_ctl can retrieve it later.
 */
fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep);
/* Creating the file failed: release the eventpoll allocated above. */
if (fd < 0)
	ep_free(ep);
/*
 * Excerpt from the epoll control path. The function header, locals and the
 * error_* labels are not part of this excerpt, and the switch opened at the
 * end is not closed here.
 */

/* Get the "struct file *" for the eventpoll file */
error = -EBADF;
file = fget(epfd); /* file object behind the epoll fd */
if (!file)
	goto error_return;

/* Get the "struct file *" for the target file */
tfile = fget(fd); /* file object behind the target fd */
if (!tfile)
	goto error_fput;

/* The target file descriptor must support poll */
error = -EPERM;
if (!tfile->f_op || !tfile->f_op->poll)
	goto error_tgt_fput;

/* (code elided in the original excerpt) */
...

/*
 * At this point it is safe to assume that the "private_data" contains
 * our own data structure.
 */
ep = file->private_data; /* stored at create time (anon_inode_getfd), fetched here */

mutex_lock(&ep->mtx);

/*
 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
 * above, we can be sure to be able to use the item looked up by
 * ep_find() till we release the mutex.
 */
epi = ep_find(ep, tfile, fd); /* guards against adding the same fd twice */

switch (op) {
case EPOLL_CTL_ADD: /* register a new fd to monitor */
	if (!epi) {
		epds.events |= POLLERR | POLLHUP; /* POLLERR and POLLHUP are always included */
/*
 * Excerpt from the item insertion path (the comments below refer to it as
 * ep_insert()). The function header and locals are not part of this excerpt.
 */

/* Initialize the poll table using the queue callback */
epq.epi = epi;
/* Install ep_ptable_queue_proc as the poll-table callback. */
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

/*
 * Attach the item to the poll hooks and get current event bits.
 * We can safely use the file* here because its usage count has
 * been increased by the caller of this function. Note that after
 * this operation completes, the poll callback can start hitting
 * the new item.
 */
/*
 * Calling poll here fetches the current event bits, but its real purpose is
 * to get the registration callback ep_ptable_queue_proc invoked (from inside
 * poll_wait). Per the original annotation: for a socket, f_op is
 * socket_file_ops and poll is sock_poll(); for a TCP socket that leads to
 * tcp_poll(), which calls sock_poll_wait(), which in turn calls the function
 * epq.pt.qproc points at, i.e. ep_ptable_queue_proc(). The current state of
 * the descriptor is stored in revents.
 */
revents = tfile->f_op->poll(tfile, &epq.pt);

/* Add the current item to the list of active epoll hook for this file */
spin_lock(&tfile->f_ep_lock);
list_add_tail(&epi->fllink, &tfile->f_ep_links);
spin_unlock(&tfile->f_ep_lock);

/*
 * Add the current item to the RB tree. All RB tree operations are
 * protected by "mtx", and ep_insert() is called with "mtx" held.
 */
ep_rbtree_insert(ep, epi); /* insert epi into ep's red-black tree */

/* We have to drop the new item inside our item list to keep track of it */
spin_lock_irqsave(&ep->lock, flags);

/* If the file is already "ready" we drop it inside the ready list */
/*
 * Original annotation: "revents & event->events" means the bits returned by
 * the poll call above include events the caller asked for, and
 * "!ep_is_linked(&epi->rdllink)" is a check on the item's ready-list link.
 * NOTE(review): the conditional these comments describe is not present in
 * this excerpt; the lone closing brace further down presumably belongs to
 * it - confirm against the full function.
 */

/* Notify waiting tasks that events are available */
/*
 * If a task is sleeping in epoll_wait on this instance, wake one up.
 * waitqueue_active(q) returns 1 when q has waiters and 0 otherwise.
 */
if (waitqueue_active(&ep->wq))
	wake_up_locked(&ep->wq);
/*
 * If a task is waiting on the eventpoll file itself, bump pwake; a non-zero
 * pwake triggers the wakeup below, after the lock has been released.
 */
if (waitqueue_active(&ep->poll_wait))
	pwake++;
}

spin_unlock_irqrestore(&ep->lock, flags);

/* We have to call this outside the lock */
if (pwake) /* wake the tasks waiting on the eventpoll file */
	ep_poll_safewake(&psw, &ep->poll_wait);
/*
 * Excerpt from the wakeup callback path (ep_poll()'s comments name it
 * ep_poll_callback()). The function header, locals and the out_unlock label
 * are not part of this excerpt.
 */

/*
 * If the event mask does not contain any poll(2) event, we consider the
 * descriptor to be disabled. This condition is likely the effect of the
 * EPOLLONESHOT bit that disables the descriptor when an event is received,
 * until the next EPOLL_CTL_MOD will be issued.
 */
if (!(epi->event.events & ~EP_PRIVATE_BITS))
	goto out_unlock;

/* (code elided in the original excerpt) */
...

/* If this file is already in the ready list we exit soon */
if (ep_is_linked(&epi->rdllink))
	goto is_linked;
/* Not queued yet: append the item to the instance's ready list. */
list_add_tail(&epi->rdllink, &ep->rdllist);

is_linked:
/*
 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
 * wait list.
 */
/*
 * Wake a task sleeping in epoll_wait(), so the user-level call returns
 * before its timeout expires.
 */
if (waitqueue_active(&ep->wq))
	wake_up_locked(&ep->wq);
/* Only count the ->poll() waiters here; pwake is consumed outside this excerpt. */
if (waitqueue_active(&ep->poll_wait))
	pwake++;
asmlinkage longsys_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, int timeout) { int error; structfile *file; structeventpoll *ep;
/* The maximum number of event must be greater than zero */ if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) return -EINVAL;
/* Verify that the area passed by the user is writeable */ /* 检查用户空间传入的events指向的内存是否可写。参见__range_not_ok()。*/ if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) { error = -EFAULT; goto error_return; }
/* Get the "struct file *" for the eventpoll file */ /* 获取epfd对应的eventpoll文件的file实例,file结构是在epoll_create中创建。 */ error = -EBADF; file = fget(epfd); if (!file) goto error_return;
/* * We have to check that the file structure underneath the fd * the user passed to us _is_ an eventpoll file. */ /* 通过检查epfd对应的文件操作是不是eventpoll_fops 来判断epfd是否是一个eventpoll文件。如果不是则返回EINVAL错误。 */ error = -EINVAL; if (!is_file_epoll(file)) goto error_fput;
/* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ ep = file->private_data;
/* Time to fish for events ... */ error = ep_poll(ep, events, maxevents, timeout);
/*
 * ep_poll - collect ready events, sleeping until some arrive if necessary.
 *
 * @ep:        the eventpoll instance to wait on.
 * @events:    user-space buffer passed through to ep_send_events().
 * @maxevents: capacity of @events.
 * @timeout:   timeout in milliseconds; a negative value (or one at or above
 *             EP_MAX_MSTIMEO) means wait indefinitely.
 *
 * Returns the result of ep_send_events() (the number of events delivered or
 * an error code), 0 when nothing became ready before the timeout expired,
 * or -EINTR when a signal interrupted the wait.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res, eavail;
	unsigned long flags;
	long jtimeout;
	wait_queue_t wait;

	/*
	 * Calculate the timeout by checking for the "infinite" value ( -1 )
	 * and the overflow condition. The passed timeout is in milliseconds,
	 * that why (t * HZ) / 1000. Adding 999 (i.e. 1000 - 1) rounds the
	 * conversion to jiffies upwards.
	 */
	jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
		MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;

retry:
	spin_lock_irqsave(&ep->lock, flags);

	res = 0;
	if (list_empty(&ep->rdllist)) {
		/*
		 * We don't have any available event to return to the caller.
		 * We need to sleep here, and we will be wake up by
		 * ep_poll_callback() when events will become available.
		 */
		init_waitqueue_entry(&wait, current);
		wait.flags |= WQ_FLAG_EXCLUSIVE;
		/* Queue the current task on the eventpoll wait queue. */
		__add_wait_queue(&ep->wq, &wait);

		for (;;) {
			/*
			 * We don't want to sleep if the ep_poll_callback() sends us
			 * a wakeup in between. That's why we set the task state
			 * to TASK_INTERRUPTIBLE before doing the checks.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			/* Stop waiting once something is ready or the timeout ran out. */
			if (!list_empty(&ep->rdllist) || !jtimeout)
				break;
			/* A pending signal aborts the wait with -EINTR. */
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			/*
			 * Sleep with the lock dropped; schedule_timeout() hands
			 * back the time still remaining, which becomes the budget
			 * for the next round.
			 */
			spin_unlock_irqrestore(&ep->lock, flags);
			jtimeout = schedule_timeout(jtimeout);
			spin_lock_irqsave(&ep->lock, flags);
		}
		__remove_wait_queue(&ep->wq, &wait);

		set_current_state(TASK_RUNNING);
	}

	/*
	 * Is it worth to try to dig for events ?
	 * NOTE(review): the original annotation also discussed testing
	 * ep->ovflist against EP_UNACTIVE_PTR; this version only looks at the
	 * ready list.
	 */
	eavail = !list_empty(&ep->rdllist);

	spin_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck (another task may have consumed the events first).
	 */
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && jtimeout)
		goto retry;

	/* Number of events delivered, or an error code. */
	return res;
}