The Linux Kernel seq_file Interface

TL;DR

seq_file is quite handy when the kernel needs to export list-style data, e.g. show_fiq_list, show_cache_info and the like. The implementation essentially gives every fd its own variable-length buffer; each read first drains the cached content, then refills the cache at item granularity so that records are never split or corrupted on the way out. Its few drawbacks are the locking and the lack of write support --- the design assumes the data is written only once, at the very beginning.

Linux Kernel 6.17.1
The file names and initial line numbers match the kernel sources, but later line numbers may be shifted by the extra inline comments.

Data structures

include/linux/seq_file.h
struct seq_operations;
struct seq_file {
	// buffer
	char *buf;
	// buffer size
	size_t size;
	// start of the remaining (not yet copied out) data
	size_t from;
	// size of the remaining data
	size_t count;
	// pad target used by seq_setwidth()/seq_pad()
	size_t pad_until;
	// index of the last shown item in the list
	loff_t index;
	// the fd's last read position
	loff_t read_pos;
	// per-fd thread-safety guard
	struct mutex lock;
	const struct seq_operations *op;
	int poll_event;
	const struct file *file;
	// opaque pointer for any data we want to carry along
	void *private;
};
struct seq_operations {
	// set the iterator up at index *pos, return the item pointer
	// e.g. take a write lock and find the item at index *pos
	void * (*start) (struct seq_file *m, loff_t *pos);
	// finish/clean up after feeding the seq_file with the given item
	// e.g. release the write lock
	void (*stop) (struct seq_file *m, void *v);
	// advance to the next item in the list given *pos and the last item, return the new item
	// e.g. follow the next pointer and update *pos
	void * (*next) (struct seq_file *m, void *v, loff_t *pos);
	// dump the item into the seq_file's buffer
	// e.g. snprintf or memcpy, updating count
	int (*show) (struct seq_file *m, void *v);
};
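
To make the callback contract concrete, here is a minimal sketch of the four operations over a mutex-protected list. seq_list_start/seq_list_next are real helpers exported by fs/seq_file.c; struct demo_item, demo_list and demo_lock are made up for illustration.

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>

struct demo_item {
	int id;
	struct list_head node;
};

static LIST_HEAD(demo_list);
static DEFINE_MUTEX(demo_lock);

static void *demo_start(struct seq_file *m, loff_t *pos)
{
	// block list writers for the duration of this dump pass
	mutex_lock(&demo_lock);
	// return the node at index *pos, or NULL past the end
	return seq_list_start(&demo_list, *pos);
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
	// advance to the next node and increment *pos
	return seq_list_next(v, &demo_list, pos);
}

static void demo_stop(struct seq_file *m, void *v)
{
	mutex_unlock(&demo_lock);
}

static int demo_show(struct seq_file *m, void *v)
{
	struct demo_item *it = list_entry(v, struct demo_item, node);

	// appends into m->buf and bumps m->count
	seq_printf(m, "%d\n", it->id);
	return 0;
}

static const struct seq_operations demo_ops = {
	.start = demo_start,
	.next  = demo_next,
	.stop  = demo_stop,
	.show  = demo_show,
};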

Helpers

Check whether the buffer is full or has overflowed (in which case a larger buffer must be reallocated):

include/linux/seq_file.h
/**
 * seq_has_overflowed - check if the buffer has overflowed
 * @m: the seq_file handle
 *
 * seq_files have a buffer which may overflow. When this happens a larger
 * buffer is reallocated and all the data will be printed again.
 * The overflow state is true when m->count == m->size.
 *
 * Returns true if the buffer received more than it can hold.
 */
static inline bool seq_has_overflowed(struct seq_file *m)
{
	return m->count == m->size;
}
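
The overflow state is set by the formatting helpers themselves: when a print does not fit, they clamp count to size. A sketch modeled on the kernel's seq_vprintf (details may vary between versions):

static void seq_set_overflow(struct seq_file *m)
{
	m->count = m->size;
}

void seq_vprintf(struct seq_file *m, const char *f, va_list args)
{
	int len;

	if (m->count < m->size) {
		len = vsnprintf(m->buf + m->count, m->size - m->count, f, args);
		if (m->count + len < m->size) {
			m->count += len;
			return;
		}
	}
	seq_set_overflow(m);	// leaves m->count == m->size
}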

Copy the number of bytes the iov requests out of addr:

include/linux/uio.h
static __always_inline __must_check
size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	if (check_copy_size(addr, bytes, true))
		return _copy_to_iter(addr, bytes, i);
	return 0;
}

Re-locate a given offset within a seq_file:

fs/seq_file.c
static int traverse(struct seq_file *m, loff_t offset)
{
	loff_t pos = 0;
	int error = 0;
	void *p;

	m->index = 0;
	m->count = m->from = 0;
	if (!offset)
		return 0;

	if (!m->buf) {
		// allocate the default-sized buffer
		m->buf = seq_buf_alloc(m->size = PAGE_SIZE);
		if (!m->buf)
			return -ENOMEM;
	}
	// start at the given index
	p = m->op->start(m, &m->index);
	while (p) {
		error = PTR_ERR(p);
		if (IS_ERR(p))
			break;
		// dump the current item
		error = m->op->show(m, p);
		if (error < 0)
			break;
		if (unlikely(error)) {
			error = 0;
			m->count = 0;
		}
		if (seq_has_overflowed(m))
			goto Eoverflow;
		// advance to the next item
		p = m->op->next(m, p, &m->index);
		// if the buffered content now passes the required offset,
		// record where the remaining data starts (from) and how much is left (count)
		if (pos + m->count > offset) {
			m->from = offset - pos;
			m->count -= m->from;
			break;
		}
		// otherwise keep iterating until the offset is reached
		pos += m->count;
		// discard the buffered data since it lies before the required offset
		m->count = 0;
		// exact match: the offset falls on a record boundary
		if (pos == offset)
			break;
	}
	// clean up
	m->op->stop(m, p);
	return error;

Eoverflow:
	m->op->stop(m, p);
	kvfree(m->buf);
	m->count = 0;
	// on overflow, allocate a buffer twice as large and redo the whole pass
	m->buf = seq_buf_alloc(m->size <<= 1);
	return !m->buf ? -ENOMEM : -EAGAIN;
}
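
From userspace, this is exactly what a pread() at a non-zero offset exercises: read_pos no longer matches ki_pos, so the kernel re-renders records from the start until the cumulative position covers the offset. A hypothetical illustration against the /proc/demo entry wired up in the Open section below:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[128];
	ssize_t n;
	int fd = open("/proc/demo", O_RDONLY);

	if (fd < 0)
		return 1;
	// read_pos won't match ki_pos inside seq_read_iter(),
	// so the kernel walks the sequence again via traverse()
	n = pread(fd, buf, sizeof(buf) - 1, 250);
	if (n >= 0) {
		buf[n] = '\0';
		printf("%s", buf);
	}
	close(fd);
	return 0;
}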

Open

fs/seq_file.c
/**
 * seq_open - initialize sequential file
 * @file: file we initialize
 * @op: method table describing the sequence
 *
 * seq_open() sets @file, associating it with a sequence described
 * by @op. @op->start() sets the iterator up and returns the first
 * element of sequence. @op->stop() shuts it down. @op->next()
 * returns the next element of sequence. @op->show() prints element
 * into the buffer. In case of error ->start() and ->next() return
 * ERR_PTR(error). In the end of sequence they return %NULL. ->show()
 * returns 0 in case of success and negative number in case of error.
 * Returning SEQ_SKIP means "discard this element and move on".
 * Note: seq_open() will allocate a struct seq_file and store its
 * pointer in @file->private_data. This pointer should not be modified.
 */
int seq_open(struct file *file, const struct seq_operations *op)
{
	struct seq_file *p;

	WARN_ON(file->private_data);
	p = kmem_cache_zalloc(seq_file_cache, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	// stash the seq_file struct in the fd's private data
	file->private_data = p;

	mutex_init(&p->lock);
	p->op = op;

	// No refcounting: the lifetime of 'p' is constrained
	// to the lifetime of the file.
	p->file = file;

	/*
	 * seq_files support lseek() and pread(). They do not implement
	 * write() at all, but we clear FMODE_PWRITE here for historical
	 * reasons.
	 *
	 * If a client of seq_files a) implements file.write() and b) wishes to
	 * support pwrite() then that client will need to implement its own
	 * file.open() which calls seq_open() and then sets FMODE_PWRITE.
	 */
	file->f_mode &= ~FMODE_PWRITE;
	return 0;
}
EXPORT_SYMBOL(seq_open);

The notable part is that at the end it forcibly clears the file's FMODE_PWRITE flag.
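
Putting it together, a typical procfs wiring for the demo_ops sketch above might look like the following (since 5.6, procfs takes a struct proc_ops rather than file_operations); the entry name is invented:

#include <linux/proc_fs.h>

static int demo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &demo_ops);
}

static const struct proc_ops demo_proc_ops = {
	.proc_open    = demo_open,
	.proc_read    = seq_read,
	.proc_lseek   = seq_lseek,
	.proc_release = seq_release,
};

static int __init demo_init(void)
{
	return proc_create("demo", 0444, NULL, &demo_proc_ops) ? 0 : -ENOMEM;
}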

Read

fs/seq_file.c
/**
 * seq_read - ->read() method for sequential files.
 * @file: the file to read from
 * @buf: the buffer to read to
 * @size: the maximum number of bytes to read
 * @ppos: the current position in the file
 *
 * Ready-made ->f_op->read()
 */
ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
{
	// construct an iov request for this one read call
	struct iovec iov = { .iov_base = buf, .iov_len = size};
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, file);
	iov_iter_init(&iter, ITER_DEST, &iov, 1, size);

	kiocb.ki_pos = *ppos;
	// enter the real read function
	ret = seq_read_iter(&kiocb, &iter);
	*ppos = kiocb.ki_pos;
	return ret;
}
EXPORT_SYMBOL(seq_read);

/*
 * Ready-made ->f_op->read_iter()
 */
ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct seq_file *m = iocb->ki_filp->private_data;
	size_t copied = 0;
	size_t n;
	void *p;
	int err = 0;

	// the request asks for zero bytes
	if (!iov_iter_count(iter))
		return 0;

	// make sure only one thread enters at a time
	mutex_lock(&m->lock);

	/*
	 * if request is to read from zero offset, reset iterator to first
	 * record as it might have been already advanced by previous requests
	 */
	if (iocb->ki_pos == 0) {
		m->index = 0;
		m->count = 0;
	}

	/* Don't assume ki_pos is where we left it */
	if (unlikely(iocb->ki_pos != m->read_pos)) {
		// our recorded position doesn't match the requested one,
		// most likely because the user wants to read from an arbitrary offset,
		// so re-locate that position from the beginning by calling traverse
		while ((err = traverse(m, iocb->ki_pos)) == -EAGAIN)
			;
		if (err) {
			/* With prejudice... */
			m->read_pos = 0;
			m->index = 0;
			m->count = 0;
			goto Done;
		} else {
			m->read_pos = iocb->ki_pos;
		}
	}

	/* grab buffer if we didn't have one */
	if (!m->buf) {
		m->buf = seq_buf_alloc(m->size = PAGE_SIZE);
		if (!m->buf)
			goto Enomem;
	}
	// something left in the buffer - copy it out first
	if (m->count) {
		n = copy_to_iter(m->buf + m->from, m->count, iter);
		m->count -= n;
		// update the start position for the next read (if anything remains)
		m->from += n;
		copied += n;
		if (m->count)		// hadn't managed to copy everything
			goto Done;
	}
	// get a non-empty record in the buffer
	m->from = 0;
	p = m->op->start(m, &m->index);
	// fetch one non-empty item into the buffer
	while (1) {
		err = PTR_ERR(p);
		if (!p || IS_ERR(p))	// EOF or an error
			break;
		err = m->op->show(m, p);
		if (err < 0)		// hard error
			break;
		if (unlikely(err))	// ->show() says "skip it"
			m->count = 0;
		if (unlikely(!m->count)) { // empty record
			p = m->op->next(m, p, &m->index);
			continue;
		}
		if (!seq_has_overflowed(m)) // got it
			goto Fill;
		// need a bigger buffer
		m->op->stop(m, p);
		kvfree(m->buf);
		m->count = 0;
		m->buf = seq_buf_alloc(m->size <<= 1);
		if (!m->buf)
			goto Enomem;
		p = m->op->start(m, &m->index);
	}
	// EOF or an error
	m->op->stop(m, p);
	m->count = 0;
	goto Done;
Fill:
	// one non-empty record is in the buffer; if they want more,
	// try to fit more in, but in any case we need to advance
	// the iterator once for every record shown.
	while (1) {
		size_t offs = m->count;
		loff_t pos = m->index;

		p = m->op->next(m, p, &m->index);
		if (pos == m->index) {
			pr_info_ratelimited("buggy .next function %ps did not update position index\n",
					    m->op->next);
			m->index++;
		}
		if (!p || IS_ERR(p))	// no next record for us
			break;
		if (m->count >= iov_iter_count(iter))
			break;
		err = m->op->show(m, p);
		if (err > 0) {		// ->show() says "skip it"
			m->count = offs;
		} else if (err || seq_has_overflowed(m)) {
			m->count = offs;
			break;
		}
	}
	m->op->stop(m, p);
	// feed the request from the buffer
	n = copy_to_iter(m->buf, m->count, iter);
	copied += n;
	m->count -= n;
	m->from = n;
Done:
	if (unlikely(!copied)) {
		copied = m->count ? -EFAULT : err;
	} else {
		iocb->ki_pos += copied;
		m->read_pos += copied;
	}
	mutex_unlock(&m->lock);
	return copied;
Enomem:
	err = -ENOMEM;
	goto Done;
}
EXPORT_SYMBOL(seq_read_iter);
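
As an aside, when a file holds exactly one record, all of the iterator machinery above collapses: the stock single_open() fabricates trivial start/next/stop around one ->show() call. A minimal sketch (names invented):

static int hello_show(struct seq_file *m, void *v)
{
	seq_puts(m, "hello\n");
	return 0;
}

static int hello_open(struct inode *inode, struct file *file)
{
	// the last argument ends up in m->private for hello_show
	return single_open(file, hello_show, NULL);
}

static const struct proc_ops hello_proc_ops = {
	.proc_open    = hello_open,
	.proc_read    = seq_read,
	.proc_lseek   = seq_lseek,
	.proc_release = single_release,
};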

Custom implementation

Following these ideas I implemented proc read/write over an RCU list myself, under a few preconditions:

  • Every item's output has a fixed maximum length; here #define LINE_BUFFER_MAX_SIZE (10 + 2 + 20 + 1 + 1), i.e. the %d: %lu\n format (10 digits for the int, 2 for ": ", 20 for the unsigned long, plus '\n' and the NUL terminator)
  • A read dumps the whole list, one line per item as above
  • A write appends an entry to the list

RCU

Some documentation:

What’s RCU? by kernel documentation
listRCU by kernel documentation
What’s RCU, Fundamentally? by lwn.net

In short, RCU is a lockless-operation scheme built on three principles (per the LWN article above: a publish-subscribe mechanism, waiting for pre-existing readers to complete, and maintaining multiple versions of recently updated objects).
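
A minimal sketch of those three principles applied to a single pointer (struct config, cfg and cfg_lock are made up): rcu_assign_pointer publishes, rcu_dereference subscribes, and synchronize_rcu waits for pre-existing readers before the old version is freed.

#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct config {
	int threshold;
};

static struct config __rcu *cfg;
static DEFINE_MUTEX(cfg_lock);

// writer: publish a new version, then reclaim the old one
static void update_config(struct config *newc)
{
	struct config *oldc;

	mutex_lock(&cfg_lock);
	oldc = rcu_dereference_protected(cfg, lockdep_is_held(&cfg_lock));
	rcu_assign_pointer(cfg, newc);	// publish
	mutex_unlock(&cfg_lock);
	synchronize_rcu();		// wait for pre-existing readers
	kfree(oldc);			// no reader can still see oldc
}

// reader: lockless, sees either the old or the new version, never a mix
static int read_threshold(void)
{
	struct config *c;
	int val = 0;

	rcu_read_lock();
	c = rcu_dereference(cfg);	// subscribe
	if (c)
		val = c->threshold;
	rcu_read_unlock();
	return val;
}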

Memory barrier

Memory barriers are less prominent here than RCU itself, but they are still used implicitly, e.g. READ_ONCE, WRITE_ONCE, or the smp_load_acquire/smp_store_release pair (not used here). In short, they guarantee the ordering of data updates.

For reference: CSDN
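
A minimal sketch of the guarantee these primitives give, on the classic message-passing pattern (data and ready are hypothetical shared variables):

static int data;
static int ready;

// producer: make data globally visible before the flag
static void publish(int v)
{
	data = v;
	smp_store_release(&ready, 1);
}

// consumer: pairs with the release above; if the flag is seen,
// the data written before it is guaranteed to be seen as well
static int try_consume(void)
{
	if (smp_load_acquire(&ready))
		return data;
	return -1;
}

READ_ONCE/WRITE_ONCE alone only stop the compiler from tearing, caching or reordering that single access; the acquire/release pair additionally orders the surrounding accesses across CPUs.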

helpers

// #define debug_output
#ifdef debug_output
#define INFO(...) pr_warn("[INF] " __VA_ARGS__)
#define ERROR(...) pr_err("[ERR] " __VA_ARGS__)
#else
#define INFO(...)
#define ERROR(...)
#endif
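
One tweak worth considering, as a sketch: routing the disabled branch through no_printk() (a standard helper from printk.h) keeps compile-time format-string checking and avoids "unused variable" warnings while still compiling to nothing.

// #define debug_output
#ifdef debug_output
#define INFO(...) pr_warn("[INF] " __VA_ARGS__)
#define ERROR(...) pr_err("[ERR] " __VA_ARGS__)
#else
#define INFO(...) no_printk(__VA_ARGS__)
#define ERROR(...) no_printk(__VA_ARGS__)
#endif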

Data structure

// example payload
typedef struct info {
	int pid;
	unsigned long cpu_time;
	struct list_head node;
	struct rcu_head rcu_head;
} info;

typedef struct file_metadata {
	// end position of the last read
	loff_t read_pos;
	// index of the item we stopped at last time
	unsigned int last_index;
	// buffer content size
	unsigned short line_buf_size;
	// buffer read index
	unsigned short line_buf_idx;
	// enforce thread safety
	struct mutex lock;
	// revision counter to stay in sync with writers
	unsigned int revision;
	// fixed-size buffer holding one item
	char line_buf[LINE_BUFFER_MAX_SIZE];
} file_metadata;

static struct kmem_cache *cache __ro_after_init;
static struct kmem_cache *info_cache __ro_after_init;
// global counter to detect list modifications
static unsigned int revision = 0;
// the info RCU list
static LIST_HEAD(REG_PID_LIST);
static DEFINE_MUTEX(PIDS_MUX);
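
The two kmem_cache pointers imply init-time setup roughly like this sketch (error unwinding trimmed; the cache names are arbitrary):

static int __init demo_rcu_init(void)
{
	cache = kmem_cache_create("file_metadata", sizeof(file_metadata),
				  0, 0, NULL);
	info_cache = kmem_cache_create("info", sizeof(info), 0, 0, NULL);
	if (!cache || !info_cache)
		return -ENOMEM;	// real code should destroy whichever succeeded
	return 0;
}

static void __exit demo_rcu_exit(void)
{
	rcu_barrier();		// flush pending call_rcu() frees first
	kmem_cache_destroy(info_cache);
	kmem_cache_destroy(cache);
}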

Custom Read

static inline int dump_buf(char buf[], info *pos)
{
	return snprintf(buf, LINE_BUFFER_MAX_SIZE, "%d: %lu\n", pos->pid,
			pos->cpu_time);
}

static ssize_t proc_read(struct file *file, char __user *buf, size_t cnt,
			 loff_t *ppos)
{
	struct file_metadata *p = file->private_data;
	info *pos = NULL;
	unsigned int idx = 0;
	unsigned long ret_s = 0, copied = 0, copy_size = 0;

	if (cnt == 0)
		return 0;
	INFO("read start, ppos=%lld, cnt=%zu\n", *ppos, cnt);
	// thread safety for mutating the metadata in p
	mutex_lock(&p->lock);
	// unlikely, but if read_pos doesn't match, invalidate the line buffer
	if (p->read_pos != *ppos) {
		p->line_buf_idx = 0;
		p->line_buf_size = 0;
		// walk to that ppos, or restart from 0?
		// if the requested pos is before the previous pos, or the list has
		// been updated, search for the desired item from the start;
		// otherwise continue from the last item index
		idx = READ_ONCE(revision);
		if (*ppos < p->read_pos || p->revision != idx) {
			p->last_index = 0;
			p->read_pos = 0;
			p->revision = idx;
		}
	}
	// the buffer still has remaining bytes
	else if (p->line_buf_idx < p->line_buf_size) {
		INFO("copy remaining line buffer\n");
		copy_size = umin(p->line_buf_size - p->line_buf_idx, cnt);
		ret_s = copy_size - copy_to_user(buf,
						 p->line_buf + p->line_buf_idx,
						 copy_size);
		p->line_buf_idx += ret_s;
		copied += ret_s;
		if (p->line_buf_idx < p->line_buf_size || copied == cnt) {
			// the user buffer is too small to finish this pending line buffer
			goto DONE;
		}
		p->line_buf_idx = 0;
		p->read_pos += copied;
		*ppos += copied;
		// advance the user buffer pointer as well
		buf += copied;
		copied = 0;
		p->last_index++;
	}
	INFO("re-find the desired index\n");
	idx = 0;
	ret_s = 0;
	copy_size = 0;
	rcu_read_lock();
	list_for_each_entry_rcu (pos, &REG_PID_LIST, node) {
		// first, get to the last_index position
		if (idx >= p->last_index) {
			ret_s = dump_buf(p->line_buf, pos);
			if (p->read_pos + copy_size + ret_s > *ppos) {
				// found the position
				p->last_index = idx;
				// skip over the part before the (unaligned) pos
				p->read_pos += copy_size;
				break;
			} else {
				// keep searching
				copy_size += ret_s;
			}
			ret_s = 0;
		}
		idx++;
	}
	rcu_read_unlock();
	if (ret_s == 0) {
		// unlikely, but the requested index is out of bounds
		INFO("requested ppos out of bounds\n");
		goto DONE;
	}
	INFO("idx=%d\n", p->last_index);
	// prepare the line buffer for the current index
	p->line_buf_size = ret_s;
	copy_size = umin(ret_s, cnt - copied);
	while (true) {
		INFO("feed the buffer, last idx=%d\n", p->last_index);
		if (copy_size == 0)
			break;
		// NOTE: copy_to_user may sleep and must not run under the RCU read lock
		ret_s = copy_size -
			copy_to_user(buf + copied, p->line_buf, copy_size);
		p->line_buf_idx += ret_s;
		copied += ret_s;
		if (p->line_buf_idx < p->line_buf_size || copied == cnt) {
			// the user buffer is too small to finish this pending line buffer
			goto DONE;
		}
		p->line_buf_idx = 0;
		p->last_index++;
		idx = 0;
		rcu_read_lock();
		list_for_each_entry_rcu (pos, &REG_PID_LIST, node) {
			if (idx++ == p->last_index) {
				ret_s = dump_buf(p->line_buf, pos);
				p->line_buf_size = ret_s;
				copy_size = umin(ret_s, cnt - copied);
				break;
			}
		}
		rcu_read_unlock();
		if (idx <= p->last_index) {
			// reached the end of the list
			goto DONE;
		}
	}
	INFO("read finish\n");
DONE:
	p->read_pos += copied;
	mutex_unlock(&p->lock);
	*ppos += copied;
	return copied;
}

Custom Write

static ssize_t proc_write(struct file *file, const char __user *buffer,
			  size_t count, loff_t *ppos)
{
	int pid = 0, result;
	unsigned long cpu_time = 0;
	info *node = NULL;

	if (*ppos > 0) {
		INFO("ignore arbitrary write position info\n");
	}
	result = kstrtoint_from_user(buffer, count, 10, &pid);
	if (result != 0) {
		ERROR("incoming PID is not valid\n");
		return result;
	}
	if (pid <= 0 || pid > PID_MAX_LIMIT) {
		ERROR("PID %d out of valid range\n", pid);
		return -EINVAL;
	}
	node = kmem_cache_zalloc(info_cache, GFP_KERNEL);
	if (!node) {
		ERROR("failed to kmalloc node\n");
		return -ENOMEM;
	}
	node->pid = pid;
	if (get_cpu_use(pid, &cpu_time) == -1) {
		INFO("registered PID's process does not exist\n");
		kmem_cache_free(info_cache, node);
		goto W_DONE;
	}
	node->cpu_time = cpu_time;
	mutex_lock(&PIDS_MUX);
	list_add_rcu(&node->node, &REG_PID_LIST);
	WRITE_ONCE(revision, READ_ONCE(revision) + 1);
	mutex_unlock(&PIDS_MUX);
	INFO("register PID=%d\n", pid);
W_DONE:
	INFO("write finish\n");
	*ppos = count;
	return count;
}

Custom Open, Release

int proc_open(struct inode *, struct file *f)
{
	struct file_metadata *p;

	// only read-mode fds need the extra bookkeeping
	if (f->f_mode & FMODE_READ) {
		p = kmem_cache_zalloc(cache, GFP_KERNEL);
		if (!p)
			return -ENOMEM;
		f->private_data = p;
		mutex_init(&p->lock);
		INFO("reader opened\n");
	}
	return 0;
}

int proc_release(struct inode *, struct file *f)
{
	if (f->f_mode & FMODE_READ) {
		// Note: mutex_destroy is a no-op unless mutex debugging is enabled
		mutex_destroy(&((file_metadata *)(f->private_data))->lock);
		kmem_cache_free(cache, f->private_data);
	}
	return 0;
}
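
For completeness, a hypothetical wiring of the four handlers into procfs (the entry name and permissions are invented):

static const struct proc_ops demo_rcu_proc_ops = {
	.proc_open    = proc_open,
	.proc_read    = proc_read,
	.proc_write   = proc_write,
	.proc_release = proc_release,
};

// in module init:
// proc_create("demo_rcu", 0666, NULL, &demo_rcu_proc_ops);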

Update

If entries need to be updated or removed:

static void info_free_rcu(struct rcu_head *head)
{
	info *info = container_of(head, info, rcu_head);

	kmem_cache_free(info_cache, info);
}

static void foo(void)
{
	info *pos = NULL, *tmp = NULL;
	unsigned long cpu_time;
	int result;
	bool deleted = false;

	// the lock protects writers from other writers
	mutex_lock(&PIDS_MUX);
	list_for_each_entry_safe (pos, tmp, &REG_PID_LIST, node) {
		result = bar(&cpu_time); // some processing
		if (result == -1) {
			INFO("remove PID %d\n", pos->pid);
			list_del_rcu(&pos->node);
			call_rcu(&pos->rcu_head, info_free_rcu);
			deleted = true;
		} else {
			pos->cpu_time = cpu_time;
		}
	}
	if (deleted)
		WRITE_ONCE(revision, READ_ONCE(revision) + 1);
	mutex_unlock(&PIDS_MUX);
}

More references

See also: https://www.cnblogs.com/embedded-linux/p/9751995.html