TL;DR
seq_file is quite handy when the kernel needs to export list-shaped data, e.g. show_fiq_list, show_cache_info. The implementation essentially gives every fd its own variable-length buffer: a read is served from the cached contents first, and the cache is refilled one item at a time, so output never gets torn mid-record and formatting stays consistent. The few downsides are the locking and the lack of write support --- by design, the data is assumed to be produced only once, up front.
Linux Kernel 6.17.1

File names and initial line numbers match the kernel source; later line numbers may drift because of the extra comments added here.
Data structures
struct seq_operations;
struct seq_file {
	// buffer
	char *buf;
	// buffer size
	size_t size;
	// remaining start
	size_t from;
	// remaining size
	size_t count;
	size_t pad_until;
	// last show index in list
	loff_t index;
	// last fd read_pos
	loff_t read_pos;
	// fd thread-safe guard
	struct mutex lock;
	const struct seq_operations *op;
	int poll_event;
	const struct file *file;
	// preserve data pointer to any data we want to store
	void *private;
};
struct seq_operations {
	// set up the seq_file at index, return the item pointer
	// e.g. take a lock against writers and find the item by index (named pos)
	void * (*start) (struct seq_file *m, loff_t *pos);
	// finish/clean up seq_file feeding with the given item
	// e.g. release the lock taken in start
	void (*stop) (struct seq_file *m, void *v);
	// advance to the next item given pos and the last item, return the new item
	// e.g. follow the next pointer and update *pos
	void * (*next) (struct seq_file *m, void *v, loff_t *pos);
	// dump the item into the buffer in seq_file
	// e.g. snprintf or memcpy and update count/size
	int (*show) (struct seq_file *m, void *v);
};
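To make the four callbacks concrete, here is a minimal sketch that walks a static array (my_items, MY_N and the my_* names are made up for illustration, not from the kernel):

#include <linux/kernel.h>
#include <linux/seq_file.h>

static int my_items[] = { 1, 2, 3 };
#define MY_N ARRAY_SIZE(my_items)

static void *my_start(struct seq_file *m, loff_t *pos)
{
	// returning NULL signals EOF
	return *pos < MY_N ? &my_items[*pos] : NULL;
}

static void *my_next(struct seq_file *m, void *v, loff_t *pos)
{
	++*pos;	// always advance the index, even at the end
	return *pos < MY_N ? &my_items[*pos] : NULL;
}

static void my_stop(struct seq_file *m, void *v)
{
	// nothing to unlock in this example
}

static int my_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%d\n", *(int *)v);
	return 0;
}

static const struct seq_operations my_ops = {
	.start = my_start,
	.next  = my_next,
	.stop  = my_stop,
	.show  = my_show,
};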
Helpers
Check whether the current buffer is already full/overflowed (meaning a larger buffer needs to be realloc'd):
/**
 * seq_has_overflowed - check if the buffer has overflowed
 * @m: the seq_file handle
 *
 * seq_files have a buffer which may overflow. When this happens a larger
 * buffer is reallocated and all the data will be printed again.
 * The overflow state is true when m->count == m->size.
 *
 * Returns true if the buffer received more than it can hold.
 */
static inline bool seq_has_overflowed(struct seq_file *m)
{
	return m->count == m->size;
}
Copy the number of bytes requested by the iov from addr:
static __always_inline __must_check
size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	if (check_copy_size(addr, bytes, true))
		return _copy_to_iter(addr, bytes, i);
	return 0;
}
Re-locate a given offset within the seq_file:
static int traverse(struct seq_file *m, loff_t offset)
{
	loff_t pos = 0;
	int error = 0;
	void *p;

	m->index = 0;
	m->count = m->from = 0;
	if (!offset)
		return 0;

	if (!m->buf) {
		// init a default-size buffer
		m->buf = seq_buf_alloc(m->size = PAGE_SIZE);
		if (!m->buf)
			return -ENOMEM;
	}
	// start with the given index
	p = m->op->start(m, &m->index);
	while (p) {
		error = PTR_ERR(p);
		if (IS_ERR(p))
			break;
		// dump the current item
		error = m->op->show(m, p);
		if (error < 0)
			break;
		if (unlikely(error)) {
			error = 0;
			m->count = 0;
		}
		if (seq_has_overflowed(m))
			goto Eoverflow;
		// go to the next item
		p = m->op->next(m, p, &m->index);
		// if the current buffer passes the required offset,
		// update from and the remaining buffer count
		if (pos + m->count > offset) {
			m->from = offset - pos;
			m->count -= m->from;
			break;
		}
		// otherwise keep iterating until it is reached
		pos += m->count;
		// reset the remaining buffer size to zero since it is all before the required offset
		m->count = 0;
		// reached exactly / a perfect match
		if (pos == offset)
			break;
	}
	// cleanup
	m->op->stop(m, p);
	return error;

Eoverflow:
	m->op->stop(m, p);
	kvfree(m->buf);
	m->count = 0;
	// on overflow, get a bigger buffer and redo the process
	m->buf = seq_buf_alloc(m->size <<= 1);
	return !m->buf ? -ENOMEM : -EAGAIN;
}
Open
/**
 * seq_open - initialize sequential file
 * @file: file we initialize
 * @op: method table describing the sequence
 *
 * seq_open() sets @file, associating it with a sequence described
 * by @op. @op->start() sets the iterator up and returns the first
 * element of sequence. @op->stop() shuts it down. @op->next()
 * returns the next element of sequence. @op->show() prints element
 * into the buffer. In case of error ->start() and ->next() return
 * ERR_PTR(error). In the end of sequence they return %NULL. ->show()
 * returns 0 in case of success and negative number in case of error.
 * Returning SEQ_SKIP means "discard this element and move on".
 * Note: seq_open() will allocate a struct seq_file and store its
 * pointer in @file->private_data. This pointer should not be modified.
 */
int seq_open(struct file *file, const struct seq_operations *op)
{
	struct seq_file *p;

	WARN_ON(file->private_data);

	p = kmem_cache_zalloc(seq_file_cache, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	// set private data to the seq_file struct
	file->private_data = p;

	mutex_init(&p->lock);
	p->op = op;

	// No refcounting: the lifetime of 'p' is constrained
	// to the lifetime of the file.
	p->file = file;

	/*
	 * seq_files support lseek() and pread(). They do not implement
	 * write() at all, but we clear FMODE_PWRITE here for historical
	 * reasons.
	 *
	 * If a client of seq_files a) implements file.write() and b) wishes to
	 * support pwrite() then that client will need to implement its own
	 * file.open() which calls seq_open() and then sets FMODE_PWRITE.
	 */
	file->f_mode &= ~FMODE_PWRITE;
	return 0;
}
EXPORT_SYMBOL(seq_open);
Worth noting here: at the very end, the file's pwrite flag (FMODE_PWRITE) is forcibly cleared.
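Hooking this into procfs is just a matter of calling seq_open() from the open handler and reusing the ready-made helpers; a minimal sketch, assuming the hypothetical my_ops table above (seq_read is dissected in the next section):

#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int my_proc_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &my_ops);
}

static const struct proc_ops my_proc_ops = {
	.proc_open    = my_proc_open,
	.proc_read    = seq_read,	// ready-made ->read()
	.proc_lseek   = seq_lseek,	// works because traverse() can re-locate any offset
	.proc_release = seq_release,	// frees the buffer and the struct seq_file
};

// registered e.g. in module init:
//	proc_create("my_list", 0444, NULL, &my_proc_ops);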
Read
/**
 * seq_read - ->read() method for sequential files.
 * @file: the file to read from
 * @buf: the buffer to read to
 * @size: the maximum number of bytes to read
 * @ppos: the current position in the file
 *
 * Ready-made ->f_op->read()
 */
ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
{
	// construct an iov request for this one-shot read
	struct iovec iov = { .iov_base = buf, .iov_len = size };
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, file);
	iov_iter_init(&iter, ITER_DEST, &iov, 1, size);

	kiocb.ki_pos = *ppos;
	// enter the real read function
	ret = seq_read_iter(&kiocb, &iter);
	*ppos = kiocb.ki_pos;
	return ret;
}
EXPORT_SYMBOL(seq_read);
/*
 * Ready-made ->f_op->read_iter()
 */
ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct seq_file *m = iocb->ki_filp->private_data;
	size_t copied = 0;
	size_t n;
	void *p;
	int err = 0;

	// a request to read 0 bytes
	if (!iov_iter_count(iter))
		return 0;

	// make sure only one thread enters at a time
	mutex_lock(&m->lock);

	/*
	 * if request is to read from zero offset, reset iterator to first
	 * record as it might have been already advanced by previous requests
	 */
	if (iocb->ki_pos == 0) {
		m->index = 0;
		m->count = 0;
	}

	/* Don't assume ki_pos is where we left it */
	if (unlikely(iocb->ki_pos != m->read_pos)) {
		// our recorded position doesn't match the requested position,
		// most likely because the user wants to read from an arbitrary
		// offset, so re-locate that position from the beginning via traverse()
		while ((err = traverse(m, iocb->ki_pos)) == -EAGAIN)
			;
		if (err) {
			/* With prejudice... */
			m->read_pos = 0;
			m->index = 0;
			m->count = 0;
			goto Done;
		} else {
			m->read_pos = iocb->ki_pos;
		}
	}

	/* grab buffer if we didn't have one */
	if (!m->buf) {
		m->buf = seq_buf_alloc(m->size = PAGE_SIZE);
		if (!m->buf)
			goto Enomem;
	}
	// something left in the buffer - copy it out first
	if (m->count) {
		n = copy_to_iter(m->buf + m->from, m->count, iter);
		m->count -= n;
		// update the start-from position for the next read (if something remains)
		m->from += n;
		copied += n;
		if (m->count)	// hadn't managed to copy everything
			goto Done;
	}
	// get a non-empty record into the buffer
	m->from = 0;
	p = m->op->start(m, &m->index);
	// then fetch one non-empty item into the buffer
	while (1) {
		err = PTR_ERR(p);
		if (!p || IS_ERR(p))	// EOF or an error
			break;
		err = m->op->show(m, p);
		if (err < 0)		// hard error
			break;
		if (unlikely(err))	// ->show() says "skip it"
			m->count = 0;
		if (unlikely(!m->count)) { // empty record
			p = m->op->next(m, p, &m->index);
			continue;
		}
		if (!seq_has_overflowed(m)) // got it
			goto Fill;
		// need a bigger buffer
		m->op->stop(m, p);
		kvfree(m->buf);
		m->count = 0;
		m->buf = seq_buf_alloc(m->size <<= 1);
		if (!m->buf)
			goto Enomem;
		p = m->op->start(m, &m->index);
	}
	// EOF or an error
	m->op->stop(m, p);
	m->count = 0;
	goto Done;
Fill:
	// one non-empty record is in the buffer; if they want more,
	// try to fit more in, but in any case we need to advance
	// the iterator once for every record shown.
	while (1) {
		size_t offs = m->count;
		loff_t pos = m->index;

		p = m->op->next(m, p, &m->index);
		if (pos == m->index) {
			pr_info_ratelimited("buggy .next function %ps did not update position index\n",
					    m->op->next);
			m->index++;
		}
		if (!p || IS_ERR(p))	// no next record for us
			break;
		if (m->count >= iov_iter_count(iter))
			break;
		err = m->op->show(m, p);
		if (err > 0) {		// ->show() says "skip it"
			m->count = offs;
		} else if (err || seq_has_overflowed(m)) {
			m->count = offs;
			break;
		}
	}
	m->op->stop(m, p);
	// feed the request from the buffer
	n = copy_to_iter(m->buf, m->count, iter);
	copied += n;
	m->count -= n;
	m->from = n;
Done:
	if (unlikely(!copied)) {
		copied = m->count ? -EFAULT : err;
	} else {
		iocb->ki_pos += copied;
		m->read_pos += copied;
	}
	mutex_unlock(&m->lock);
	return copied;
Enomem:
	err = -ENOMEM;
	goto Done;
}
EXPORT_SYMBOL(seq_read_iter);
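From userspace none of the buffering or traverse() machinery is visible; reads at arbitrary offsets just work. A hypothetical test program, assuming the /proc/my_list entry from the earlier sketch:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	ssize_t n;
	int fd = open("/proc/my_list", O_RDONLY);

	if (fd < 0)
		return 1;

	// sequential read: served straight from the seq_file buffer
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("sequential: %s", buf);
	}

	// read from an arbitrary offset: in the kernel ki_pos != m->read_pos,
	// so seq_read_iter() re-locates the offset via traverse()
	n = pread(fd, buf, sizeof(buf) - 1, 2);
	if (n > 0) {
		buf[n] = '\0';
		printf("offset 2: %s", buf);
	}

	close(fd);
	return 0;
}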
Custom implementation
Following this design, I implemented proc read/write over an RCU list myself, under a few assumptions:

- Every item renders to a fixed maximum length; here

#define LINE_BUFFER_MAX_SIZE (10 + 2 + 20 + 1 + 1)

matches the "%d: %lu\n" format: up to 10 digits for the int, 2 bytes for ": ", up to 20 digits for the unsigned long, 1 for the newline, and 1 for the NUL terminator.
- A read returns the whole list, one line per item as above.
- A write appends an entry to the list.
RCU
Some documentation:
What’s RCU? by kernel documentation
listRCU by kernel documentation
What’s RCU, Fundamentally? by lwn.net
In short, RCU is a lock-free access scheme built on three principles: publish updates atomically, wait for all pre-existing readers to finish, and keep old versions of the data alive for those readers in the meantime.
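A minimal sketch of those three principles over a kernel linked list (struct item, items and the function names are illustrative, not from the implementation below):

#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/slab.h>

struct item {
	int val;
	struct list_head node;
};

static LIST_HEAD(items);
static DEFINE_MUTEX(items_lock);	// writers still serialize among themselves

// reader: lock-free; only marks a read-side critical section
static int reader_sum(void)
{
	struct item *it;
	int sum = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(it, &items, node)
		sum += it->val;
	rcu_read_unlock();
	return sum;
}

// writer: unpublish (1), wait for pre-existing readers (2), reclaim (3)
static void remove_first(void)
{
	struct item *it;

	mutex_lock(&items_lock);
	it = list_first_entry_or_null(&items, struct item, node);
	if (it)
		list_del_rcu(&it->node);	// new readers can no longer find it
	mutex_unlock(&items_lock);

	if (it) {
		synchronize_rcu();	// wait for readers that may still hold it
		kfree(it);		// now safe to reclaim
	}
}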
Memory barrier
Memory barriers are not as prominent here as RCU, but they are still used implicitly, e.g. READ_ONCE / WRITE_ONCE, or smp_load_acquire / smp_store_release (not used in this implementation). In short, they guarantee the ordering of data updates. A CSDN article can serve as a further reference.
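A sketch of the classic message-passing pattern these primitives enable (payload and ready are illustrative names):

#include <asm/barrier.h>

static int payload;
static int ready;

// producer: smp_store_release() orders the payload write
// before the flag write becomes visible to other CPUs
static void producer(void)
{
	payload = 42;
	smp_store_release(&ready, 1);
}

// consumer: smp_load_acquire() guarantees that once ready == 1
// is observed, payload == 42 is observed as well
// (READ_ONCE/WRITE_ONCE alone would only prevent tearing and
//  compiler reordering, not CPU reordering)
static int consumer(void)
{
	if (smp_load_acquire(&ready))
		return payload;
	return -1;	// not published yet
}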
Helpers
// #define debug_output
#ifdef debug_output
#define INFO(...) pr_warn("[INF] " __VA_ARGS__)
#define ERROR(...) pr_err("[ERR] " __VA_ARGS__)
#else
#define INFO(...)
#define ERROR(...)
#endif
Data structure
// example payload
typedef struct info {
	int pid;
	unsigned long cpu_time;
	struct list_head node;
	struct rcu_head rcu_head;
} info;
typedef struct file_metadata {
	// last read end position
	loff_t read_pos;
	// last stop item index
	unsigned int last_index;
	// buffer content size
	unsigned short line_buf_size;
	// buffer read index
	unsigned short line_buf_idx;
	// enforce thread safety
	struct mutex lock;
	// revision to stay in sync with writes
	unsigned int revision;
	// fixed per-item buffer
	char line_buf[LINE_BUFFER_MAX_SIZE];
} file_metadata;
static struct kmem_cache *cache __ro_after_init;
static struct kmem_cache *info_cache __ro_after_init;
// global variable to sync writes
static unsigned int revision = 0;
// info RCU list
static LIST_HEAD(REG_PID_LIST);
static DEFINE_MUTEX(PIDS_MUX);
Custom Read
static inline int dump_buf(char buf[], info *pos)
{
	return snprintf(buf, LINE_BUFFER_MAX_SIZE, "%d: %lu\n",
			pos->pid, pos->cpu_time);
}
static ssize_t proc_read(struct file *file, char __user *buf, size_t cnt,
			 loff_t *ppos)
{
	struct file_metadata *p = file->private_data;
	info *pos = NULL;
	unsigned int idx = 0;
	unsigned long ret_s = 0, copied = 0, copy_size = 0;

	if (cnt == 0)
		return 0;

	INFO("read start, ppos=%lld, cnt=%zu\n", *ppos, cnt);

	// thread safety for changing the metadata in p
	mutex_lock(&p->lock);

	// unlikely, but if read_pos doesn't match, invalidate the line buffer
	if (p->read_pos != *ppos) {
		p->line_buf_idx = 0;
		p->line_buf_size = 0;
		// walk to that ppos position, or restart from 0?
		// if the required pos is before the previous pos, or the list has
		// been updated, find the desired item from the start;
		// otherwise just walk forward from the last item index
		idx = READ_ONCE(revision);
		if (*ppos < p->read_pos || p->revision != idx) {
			p->last_index = 0;
			p->read_pos = 0;
			p->revision = idx;
		}
	}
	// if the buffer has remaining bytes
	else if (p->line_buf_idx < p->line_buf_size) {
		INFO("copy remaining line buffer\n");
		copy_size = umin(p->line_buf_size - p->line_buf_idx, cnt);
		ret_s = copy_size - copy_to_user(buf, p->line_buf + p->line_buf_idx,
						 copy_size);
		p->line_buf_idx += ret_s;
		copied += ret_s;
		if (p->line_buf_idx < p->line_buf_size || copied == cnt) {
			// the user buffer is too small to finish this pending line buffer
			goto DONE;
		}
		p->line_buf_idx = 0;
		p->read_pos += copied;
		*ppos += copied;
		// advance the user buffer pointer as well
		buf += copied;
		copied = 0;
		p->last_index++;
	}

	INFO("re-find the desired index\n");
	idx = 0;
	ret_s = 0;
	copy_size = 0;
	rcu_read_lock();
	list_for_each_entry_rcu (pos, &REG_PID_LIST, node) {
		// first we need to get to the last_index position
		if (idx >= p->last_index) {
			ret_s = dump_buf(p->line_buf, pos);
			if (p->read_pos + copy_size + ret_s > *ppos) {
				// got the position
				p->last_index = idx;
				// ignore the unaligned part of pos
				p->read_pos += copy_size;
				break;
			} else {
				// keep searching
				copy_size += ret_s;
			}
			ret_s = 0;
		}
		idx++;
	}
	rcu_read_unlock();
	if (ret_s == 0) {
		// unlikely, but the required index is out of bounds
		INFO("required ppos out of bound\n");
		goto DONE;
	}
	INFO("idx=%d\n", p->last_index);
	// prepare the line buffer for the current index
	p->line_buf_size = ret_s;
	copy_size = umin(ret_s, cnt - copied);

	while (true) {
		INFO("feed the buffer, last idx=%d\n", p->last_index);
		if (copy_size == 0)
			break;
		// NOTE: copy_to_user is not atomic and cannot run under the RCU lock
		ret_s = copy_size - copy_to_user(buf + copied, p->line_buf, copy_size);
		p->line_buf_idx += ret_s;
		copied += ret_s;
		if (p->line_buf_idx < p->line_buf_size || copied == cnt) {
			// the user buffer is too small to finish this pending line buffer
			goto DONE;
		}
		p->line_buf_idx = 0;
		p->last_index++;
		idx = 0;
		rcu_read_lock();
		list_for_each_entry_rcu (pos, &REG_PID_LIST, node) {
			if (idx++ == p->last_index) {
				ret_s = dump_buf(p->line_buf, pos);
				p->line_buf_size = ret_s;
				copy_size = umin(ret_s, cnt - copied);
				break;
			}
		}
		rcu_read_unlock();
		if (idx <= p->last_index) {
			// reached the end of the list
			goto DONE;
		}
	}

	INFO("read finish\n");

DONE:
	p->read_pos += copied;
	mutex_unlock(&p->lock);
	*ppos += copied;
	return copied;
}
Custom Write
static ssize_t proc_write(struct file *file, const char __user *buffer,
			  size_t count, loff_t *ppos)
{
	int pid = 0, result;
	unsigned long cpu_time = 0;
	info *node = NULL;

	if (*ppos > 0) {
		INFO("ignore arbitrary write position info\n");
	}

	result = kstrtoint_from_user(buffer, count, 10, &pid);
	if (result != 0) {
		ERROR("incoming PID is not valid\n");
		return result;
	}
	if (pid <= 0 || pid > PID_MAX_LIMIT) {
		ERROR("PID %d out of valid range\n", pid);
		return -EINVAL;
	}

	node = kmem_cache_zalloc(info_cache, GFP_KERNEL);
	if (!node) {
		ERROR("failed to allocate node\n");
		return -ENOMEM;
	}
	node->pid = pid;
	if (get_cpu_use(pid, &cpu_time) == -1) {
		INFO("registered PID process does not exist\n");
		kmem_cache_free(info_cache, node);
		goto W_DONE;
	}
	node->cpu_time = cpu_time;

	mutex_lock(&PIDS_MUX);
	list_add_rcu(&node->node, &REG_PID_LIST);
	WRITE_ONCE(revision, READ_ONCE(revision) + 1);
	mutex_unlock(&PIDS_MUX);

	INFO("register PID=%d\n", pid);
W_DONE:
	INFO("write finish\n");
	*ppos = count;
	return count;
}
Custom Open, Release
int proc_open(struct inode *inode, struct file *f)
{
	struct file_metadata *p;

	// only read-mode fds need additional information
	if (f->f_mode & FMODE_READ) {
		p = kmem_cache_zalloc(cache, GFP_KERNEL);
		if (!p)
			return -ENOMEM;
		f->private_data = p;
		mutex_init(&p->lock);
		INFO("reader opened\n");
	}
	return 0;
}
int proc_release(struct inode *inode, struct file *f)
{
	if (f->f_mode & FMODE_READ) {
		// Note: mutex_destroy() is a no-op unless mutex debugging is enabled
		mutex_destroy(&((file_metadata *)(f->private_data))->lock);
		kmem_cache_free(cache, f->private_data);
	}
	return 0;
}
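For completeness, a sketch of how the caches and the proc entry might be wired up at module load (the entry name pid_list, the cache names and the error handling are my own assumptions):

#include <linux/module.h>
#include <linux/proc_fs.h>

static const struct proc_ops pid_list_proc_ops = {
	.proc_open    = proc_open,
	.proc_read    = proc_read,
	.proc_write   = proc_write,
	.proc_release = proc_release,
};

static struct proc_dir_entry *proc_entry;

static int __init pid_list_init(void)
{
	cache = kmem_cache_create("file_metadata_cache",
				  sizeof(file_metadata), 0, 0, NULL);
	info_cache = kmem_cache_create("info_cache", sizeof(info), 0, 0, NULL);
	if (!cache || !info_cache)
		goto fail;

	proc_entry = proc_create("pid_list", 0666, NULL, &pid_list_proc_ops);
	if (!proc_entry)
		goto fail;
	return 0;

fail:
	// kmem_cache_destroy() tolerates NULL
	kmem_cache_destroy(info_cache);
	kmem_cache_destroy(cache);
	return -ENOMEM;
}
module_init(pid_list_init);

MODULE_LICENSE("GPL");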
Update
If entries need to be updated or removed:
static void info_free_rcu(struct rcu_head *head)
{
	// don't shadow the info type with a variable named info here,
	// or container_of() could no longer resolve the type
	info *item = container_of(head, info, rcu_head);

	kmem_cache_free(info_cache, item);
}

static void foo(void)
{
	info *pos = NULL, *tmp = NULL;
	unsigned long cpu_time;
	int result;
	bool deleted = false;

	// the lock protects writers from other writers
	mutex_lock(&PIDS_MUX);
	list_for_each_entry_safe (pos, tmp, &REG_PID_LIST, node) {
		result = bar(&cpu_time); // some processing
		if (result == -1) {
			INFO("remove PID %d\n", pos->pid);
			list_del_rcu(&pos->node);
			call_rcu(&pos->rcu_head, info_free_rcu);
			deleted = true;
		} else {
			pos->cpu_time = cpu_time;
		}
	}
	if (deleted)
		WRITE_ONCE(revision, READ_ONCE(revision) + 1);
	mutex_unlock(&PIDS_MUX);
}
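One caveat at teardown: call_rcu() callbacks may still be in flight when the module unloads, so the exit path should flush them with rcu_barrier() before destroying the cache. A sketch, continuing the hypothetical module-init code above:

static void __exit pid_list_exit(void)
{
	info *pos, *tmp;

	proc_remove(proc_entry);

	// no new readers/writers past this point; drain the list
	mutex_lock(&PIDS_MUX);
	list_for_each_entry_safe (pos, tmp, &REG_PID_LIST, node) {
		list_del_rcu(&pos->node);
		call_rcu(&pos->rcu_head, info_free_rcu);
	}
	mutex_unlock(&PIDS_MUX);

	// wait for every queued info_free_rcu() callback to run,
	// otherwise kmem_cache_destroy() could race with them
	rcu_barrier();
	kmem_cache_destroy(info_cache);
	kmem_cache_destroy(cache);
}
module_exit(pid_list_exit);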