Fix disallowed cr0 write protection and close_fd (#80)

Since commit 8dbec27a242cd3e2816eeb98d3237b9f57cf6232 [1]
(kernel version v5.3+ [2]), the sensitive CR0 bits on x86 are pinned,
so we need to use inline asm [3][4] to bypass the pinning.

commit 8dbec27a242cd3e2816eeb98d3237b9f57cf6232 :
> With sensitive CR4 bits pinned now, it's possible that the WP bit for
> CR0 might become a target as well.
>
> Following the same reasoning for the CR4 pinning, pin CR0's WP
> bit. Contrary to the cpu feature dependend CR4 pinning this can be done
> with a constant value.

Also, getting "sys_call_table" [8] from the symbol lookup by using the address
of "close_fd" does not work for v5.11+ [5][6]. The reason is the entry of
"sys_call_table[__NR_close]" is not the address of "close_fd", actually
it is "__x64_sys_close" in x86.

Two solutions were proposed: using "kallsyms_lookup_name" [7] or just specifying
the address into the module. The symbol "kallsyms_lookup_name"  is unexported
since v5.7; the address of "sys_call_table" can be found in
"/boot/System.map" or "/proc/kallsyms".

Since v5.7, manual symbol lookup is not guaranteed to work because
control-flow integrity (or control-flow enforcement [9][10]) was added
[11] for x86, although it has been disabled since v5.11 [12][13]. To make sure
manual symbol lookup works, it is only used up to v5.4.

Reference:
[1] 8dbec27a24
[2] https://outflux.net/blog/archives/2019/11/14/security-things-in-linux-v5-3/
[3] https://patchwork.kernel.org/project/linux-kbuild/patch/20200903203053.3411268-3-samitolvanen@google.com/
[4] https://stackoverflow.com/questions/58512430/how-to-write-to-protected-pages-in-the-linux-kernel
[5] https://lore.kernel.org/bpf/20201120231441.29911-21-ebiederm@xmission.com/
[6] https://lore.kernel.org/bpf/87blj83ysq.fsf@x220.int.ebiederm.org/
[7] 0bd476e6c6
[8] 8f27766a88
[9] https://lore.kernel.org/lkml/20200204171425.28073-1-yu-cheng.yu@intel.com/
[10] https://lore.kernel.org/linux-doc/20201110162211.9207-1-yu-cheng.yu@intel.com/T/
[11] 5790921bc1
[12] 20bf2b3787
[13] https://lore.kernel.org/bpf/20210128123842.c9e33949e62f504b84bfadf5@gmail.com/
This commit is contained in:
linD026 2021-08-31 11:07:01 +08:00 committed by GitHub
parent d3bde7daed
commit cccc98ab2c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 193 additions and 12 deletions

View File

@ -23,17 +23,44 @@
#include <linux/sched.h>
#include <linux/uaccess.h>
/* The way we access "sys_call_table" varies as kernel internal changes.
* - ver <= 5.4 : manual symbol lookup
* - 5.4 < ver < 5.7 : kallsyms_lookup_name
* - 5.7 <= ver : Kprobes or specific kernel module parameter
*/
/* The in-kernel calls to the ksys_close() syscall were removed in Linux v5.11+.
*/
#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 11, 0))
#include <linux/syscalls.h> /* ksys_close() wrapper for backward compatibility */
#define close_fd ksys_close
#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0))
#if LINUX_VERSION_CODE <= KERNEL_VERSION(5, 4, 0)
#define HAVE_KSYS_CLOSE 1
#include <linux/syscalls.h> /* For ksys_close() */
#else
#include <linux/fdtable.h> /* For close_fd */
#include <linux/kallsyms.h> /* For kallsyms_lookup_name */
#endif
#else
#if defined(CONFIG_KPROBES)
#define HAVE_KPROBES 1
#include <linux/kprobes.h>
#else
#define HAVE_PARAM 1
#include <linux/kallsyms.h> /* For sprint_symbol */
/* The address of sys_call_table, which can be obtained by looking it up in
* "/boot/System.map" or "/proc/kallsyms". When the kernel version is v5.7+
* and CONFIG_KPROBES is not set, this parameter must be supplied; if it is
* left at 0, the lookup of sys_call_table fails.
*/
static unsigned long sym = 0;
module_param(sym, ulong, 0644);
#endif
#endif
unsigned long **sys_call_table;
unsigned long original_cr0;
/* UID we want to spy on - will be filled from the command line. */
static int uid;
@ -83,19 +110,81 @@ asmlinkage int our_sys_open(const char *filename, int flags, int mode)
/*
 * Locate the kernel's system call table.
 *
 * The lookup strategy is selected at compile time:
 *  - HAVE_KSYS_CLOSE: walk memory from PAGE_OFFSET one pointer at a time
 *    until the __NR_close slot of the candidate table equals ksys_close.
 *  - HAVE_PARAM: use the address passed in the "sym" module parameter,
 *    after checking that it symbolizes to "sys_call_table".
 *  - HAVE_KPROBES: register a temporary kprobe on kallsyms_lookup_name to
 *    learn its address, then ask it for "sys_call_table".
 *  - none of the above: call kallsyms_lookup_name() directly.
 *
 * Returns the table address, or NULL on any of the failure paths below.
 */
static unsigned long **aquire_sys_call_table(void)
{
#ifdef HAVE_KSYS_CLOSE
    unsigned long int offset = PAGE_OFFSET;
    unsigned long **sct;

    while (offset < ULLONG_MAX) {
        sct = (unsigned long **) offset;

        /* A candidate matches when its close() slot points at ksys_close. */
        if (sct[__NR_close] == (unsigned long *) ksys_close)
            return sct;

        offset += sizeof(void *);
    }

    return NULL;
#endif

#ifdef HAVE_PARAM
    const char sct_name[15] = "sys_call_table";
    /* sprint_symbol() can write up to KSYM_SYMBOL_LEN bytes; "sym" is
     * user-supplied, so the buffer must be sized for the worst case.
     */
    char symbol[KSYM_SYMBOL_LEN] = {0};

    if (sym == 0) {
        pr_alert(
            "For Linux v5.7+, Kprobes is the preferable way to get "
            "symbol.\n");
        pr_info(
            "If Kprobes is absent, you have to specify the address of "
            "sys_call_table symbol\n");
        pr_info(
            "by /boot/System.map or /proc/kallsyms, which contains all the "
            "symbol addresses, into sym parameter.\n");
        return NULL;
    }

    sprint_symbol(symbol, sym);

    /* Only the leading name is compared; whatever sprint_symbol() appends
     * after it is ignored.
     */
    if (!strncmp(sct_name, symbol, sizeof(sct_name) - 1))
        return (unsigned long **) sym;

    return NULL;
#endif

#ifdef HAVE_KPROBES
    unsigned long (*kallsyms_lookup_name)(const char *name);
    struct kprobe kp = {
        .symbol_name = "kallsyms_lookup_name",
    };

    if (register_kprobe(&kp) < 0)
        return NULL;

    /* The probe is only needed for the address it resolved. */
    kallsyms_lookup_name = (unsigned long (*)(const char *name)) kp.addr;
    unregister_kprobe(&kp);
#endif

    return (unsigned long **) kallsyms_lookup_name("sys_call_table");
}
/* Write a new value into the CR0 control register.
 *
 * On kernels from v5.3 onwards the register is loaded with a raw "mov"
 * instruction instead of calling write_cr0(); on older kernels __write_cr0
 * is simply an alias for write_cr0.
 * NOTE(review): the raw write is presumably needed because write_cr0()
 * enforces the pinned WP bit on v5.3+ -- confirm against the kernel source.
 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 0)
static inline void __write_cr0(unsigned long cr0)
{
/* "memory" clobber: keep the compiler from reordering memory accesses
 * across the control-register write.
 */
asm volatile("mov %0,%%cr0" : "+r"(cr0) : : "memory");
}
#else
#define __write_cr0 write_cr0
#endif
/* Turn the write-protect (WP) flag of CR0 back on.
 *
 * Reads the current CR0 value, sets the WP bit and writes the result back
 * through __write_cr0().
 */
static void enable_write_protection(void)
{
    /* WP is bit 16 of CR0. */
    const unsigned long wp_mask = 1UL << 16;
    unsigned long cr0 = read_cr0();

    /* cr0 is a private local copy, so a plain OR is enough; an atomic
     * bit operation is not needed here.
     */
    cr0 |= wp_mask;
    __write_cr0(cr0);
}
/* Clear the write-protect (WP) flag of CR0.
 *
 * Reads the current CR0 value, clears the WP bit and writes the result back
 * through __write_cr0().
 */
static void disable_write_protection(void)
{
    /* WP is bit 16 of CR0. */
    const unsigned long wp_mask = 1UL << 16;
    unsigned long cr0 = read_cr0();

    /* cr0 is a private local copy, so a plain AND-NOT is enough; an atomic
     * bit operation is not needed here.
     */
    cr0 &= ~wp_mask;
    __write_cr0(cr0);
}
static int __init syscall_start(void)
@ -103,9 +192,7 @@ static int __init syscall_start(void)
if (!(sys_call_table = aquire_sys_call_table()))
return -1;
original_cr0 = read_cr0();
write_cr0(original_cr0 & ~0x00010000);
disable_write_protection();
/* keep track of the original open function */
original_call = (void *) sys_call_table[__NR_open];
@ -113,7 +200,7 @@ static int __init syscall_start(void)
/* use our open function instead */
sys_call_table[__NR_open] = (unsigned long *) our_sys_open;
write_cr0(original_cr0);
enable_write_protection();
pr_info("Spying on UID:%d\n", uid);
@ -133,9 +220,9 @@ static void __exit syscall_end(void)
pr_alert("an unstable state.\n");
}
write_cr0(original_cr0 & ~0x00010000);
disable_write_protection();
sys_call_table[__NR_open] = (unsigned long *) original_call;
write_cr0(original_cr0);
enable_write_protection();
msleep(2000);
}

View File

@ -1204,6 +1204,100 @@ If you want to read this code, it is at the source file \verb|arch/$(architectur
So, if we want to change the way a certain system call works, what we need to do is to write our own function to implement it (usually by adding a bit of our own code, and then calling the original function) and then change the pointer at \cpp|sys_call_table| to point to our function.
Because we might be removed later and we don't want to leave the system in an unstable state, it's important for \cpp|cleanup_module| to restore the table to its original state.
To modify the content of \cpp|sys_call_table|, we need to consider the control register.
A control register is a processor register that changes or controls the general behavior of the CPU.
For x86 architecture, the \verb|cr0| register has various control flags that modify the basic operation of the processor.
The \verb|WP| flag in \verb|cr0| stands for write protection.
Once the \verb|WP| flag is set, the processor disallows further write attempts to the read-only sections.
Therefore, we must disable the \verb|WP| flag before modifying \cpp|sys_call_table|.
Since Linux v5.3, the \cpp|write_cr0| function can no longer be used for this, because the sensitive \verb|cr0| bits are pinned for security reasons: otherwise an attacker could write to the CPU control registers to disable CPU protections such as write protection.
As a result, we have to provide a custom assembly routine to bypass the pinning.
However, the \cpp|sys_call_table| symbol is unexported to prevent misuse.
There are still a few ways to obtain the symbol, such as manual symbol lookup and \cpp|kallsyms_lookup_name|.
Here we use both, depending on the kernel version.
Manual symbol lookup is not guaranteed to work on newer kernels because of \textit{control-flow integrity}, a technique that prevents an attacker from redirecting execution by making sure that indirect calls go to the expected addresses and that return addresses are not changed.
Since Linux v5.7, the kernel has carried a series of \textit{control-flow enforcement} (CET) patches for x86, and some configurations of GCC, such as GCC versions 9 and 10 in Ubuntu, enable CET (the \verb|-fcf-protection| option) by default when building the kernel.
Using such a GCC to compile the kernel with retpoline off may result in CET being enabled in the kernel.
You can use the following command to check whether the \verb|-fcf-protection| option is enabled:
\begin{verbatim}
$ gcc -v -Q -O2 --help=target | grep protection
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/9/lto-wrapper
...
gcc version 9.3.0 (Ubuntu 9.3.0-17ubuntu1~20.04)
COLLECT_GCC_OPTIONS='-v' '-Q' '-O2' '--help=target' '-mtune=generic' '-march=x86-64'
/usr/lib/gcc/x86_64-linux-gnu/9/cc1 -v ... -fcf-protection ...
GNU C17 (Ubuntu 9.3.0-17ubuntu1~20.04) version 9.3.0 (x86_64-linux-gnu)
...
\end{verbatim}
However, CET should not be enabled in the kernel, because it may break Kprobes and bpf.
Consequently, CET has been disabled since v5.11.
To guarantee that manual symbol lookup works, we use it only up to v5.4.
Unfortunately, since Linux v5.7 \cpp|kallsyms_lookup_name| is also unexported, so a trick is needed to get the address of \cpp|kallsyms_lookup_name|.
If \cpp|CONFIG_KPROBES| is enabled, we can facilitate the retrieval of function addresses by means of Kprobes to dynamically break into the specific kernel routine.
Kprobes inserts a breakpoint at the entry of function by replacing the first bytes of the probed instruction.
When a CPU hits the breakpoint, registers are stored, and the control will pass to Kprobes.
It passes the addresses of the saved registers and the Kprobe struct to the handler you defined, then executes it.
Kprobes can be registered by symbol name or address.
When a symbol name is given, the kernel resolves the address itself.
If Kprobes is not available, pass the address of \cpp|sys_call_table|, taken from \verb|/proc/kallsyms| or \verb|/boot/System.map|, in the \cpp|sym| parameter.
Following is the sample usage for \verb|/proc/kallsyms|:
\begin{verbatim}
$ sudo grep sys_call_table /proc/kallsyms
ffffffff82000280 R x32_sys_call_table
ffffffff820013a0 R sys_call_table
ffffffff820023e0 R ia32_sys_call_table
$ sudo insmod syscall.ko sym=0xffffffff820013a0
\end{verbatim}
When using the address from \verb|/boot/System.map|, be careful about \verb|KASLR| (Kernel Address Space Layout Randomization).
\verb|KASLR| may randomize the addresses of kernel code and data at every boot, so the static address listed in \verb|/boot/System.map| will be offset by some random amount.
The purpose of \verb|KASLR| is to protect the kernel space from the attacker.
Without \verb|KASLR|, the attacker may find the target address in the fixed address easily.
The attacker can then use return-oriented programming to execute malicious code or to obtain the target data through a tampered pointer.
\verb|KASLR| mitigates these kinds of attacks because the attacker cannot immediately know the target address, but a brute-force attack can still work.
If the address of a symbol in \verb|/proc/kallsyms| is different from the address in \verb|/boot/System.map|, \verb|KASLR| is enabled in the kernel your system is running.
\begin{verbatim}
$ grep GRUB_CMDLINE_LINUX_DEFAULT /etc/default/grub
GRUB_CMDLINE_LINUX_DEFAULT="quiet splash"
$ sudo grep sys_call_table /boot/System.map-$(uname -r)
ffffffff82000300 R sys_call_table
$ sudo grep sys_call_table /proc/kallsyms
ffffffff820013a0 R sys_call_table
# Reboot
$ sudo grep sys_call_table /boot/System.map-$(uname -r)
ffffffff82000300 R sys_call_table
$ sudo grep sys_call_table /proc/kallsyms
ffffffff86400300 R sys_call_table
\end{verbatim}
If \verb|KASLR| is enabled, we have to take care of the address from \verb|/proc/kallsyms| each time we reboot the machine.
In order to use the address from \verb|/boot/System.map|, make sure that \verb|KASLR| is disabled.
You can add \verb|nokaslr| to the kernel command line to disable \verb|KASLR| at the next boot:
\begin{verbatim}
$ grep GRUB_CMDLINE_LINUX_DEFAULT /etc/default/grub
GRUB_CMDLINE_LINUX_DEFAULT="quiet splash"
$ sudo perl -i -pe 'm/quiet/ and s//quiet nokaslr/' /etc/default/grub
$ grep quiet /etc/default/grub
GRUB_CMDLINE_LINUX_DEFAULT="quiet nokaslr splash"
$ sudo update-grub
\end{verbatim}
For more information, check out the following:
\begin{itemize}
\item \href{https://lwn.net/Articles/804849/}{Cook: Security things in Linux v5.3}
\item \href{https://lwn.net/Articles/12211/}{Unexporting the system call table}
\item \href{https://lwn.net/Articles/810077/}{Control-flow integrity for the kernel}
\item \href{https://lwn.net/Articles/813350/}{Unexporting kallsyms\_lookup\_name()}
\item \href{https://www.kernel.org/doc/Documentation/kprobes.txt}{Kernel Probes (Kprobes)}
\item \href{https://lwn.net/Articles/569635/}{Kernel address space layout randomization}
\end{itemize}
The source code here is an example of such a kernel module.
We want to ``spy'' on a certain user, and to \cpp|pr_info()| a message whenever that user opens a file.
Towards this end, we replace the system call to open a file with our own function, called \cpp|our_sys_open|.