fish-shell/src/fallback.rs

//! This file only contains fallback implementations of functions which have been found to be missing
//! or broken by the configuration scripts.
//!
//! Many of these functions are more or less broken and incomplete.

use crate::widecharwidth::{WcLookupTable, WcWidth};
use crate::{common::is_console_session, wchar::prelude::*};
use errno::{errno, Errno};
use once_cell::sync::Lazy;
use std::cmp;
use std::fs::File;
use std::os::fd::FromRawFd;
use std::sync::atomic::{AtomicIsize, Ordering};
use std::{ffi::CString, mem};

/// Width of ambiguous East Asian characters and, as of TR11, all private-use characters.
/// 1 is the typical default, but we accept any non-negative override via `$fish_ambiguous_width`.
///
/// `fish_wcwidth()` returns this value for characters that the lookup table classifies as
/// ambiguous or private-use.
pub static FISH_AMBIGUOUS_WIDTH: AtomicIsize = AtomicIsize::new(1);

/// Width of emoji characters.
///
/// This must be configurable because the value changed between Unicode 8 and Unicode 9, `wcwidth()`
/// is emoji-unaware, and terminal emulators do different things.
///
/// See issues like #4539 and https://github.com/neovim/neovim/issues/4976 for how painful this is.
///
/// Valid values are 1 and 2. 1 is the typical emoji width used in Unicode 8, while some newer
/// terminals use a width of 2 since Unicode 9.
///
/// `fish_wcwidth()` returns this value for characters classified as `WcWidth::WidenedIn9`.
// For some reason, this is declared here and exposed here, but is set in `env_dispatch`.
pub static FISH_EMOJI_WIDTH: AtomicIsize = AtomicIsize::new(1);

/// Character-width classification table, constructed on first use and then shared.
/// `fish_wcwidth()` queries it via `classify()` to decide how wide a character is.
static WC_LOOKUP_TABLE: Lazy<WcLookupTable> = Lazy::new(WcLookupTable::new);

/// Safe wrapper around the C library's `wcwidth()`.
///
/// Passes `c` to the system function and returns its answer converted to `isize`.
pub fn wcwidth(c: char) -> isize {
    extern "C" {
        pub fn wcwidth(c: libc::wchar_t) -> libc::c_int;
    }

    // Compile-time check that `wchar_t` is at least as large as `char`, so the cast below cannot
    // drop any bits of the code point.
    const _: () = assert!(mem::size_of::<libc::wchar_t>() >= mem::size_of::<char>());

    let wide = c as libc::wchar_t;
    // SAFETY: the foreign function takes a plain integer by value and no pointers, so there are
    // no memory-safety preconditions for the caller to uphold.
    let reported = unsafe { wcwidth(wide) };
    isize::try_from(reported).unwrap()
}

/// Width of a single character as fish sees it.
///
/// Big hack to use our versions of wcswidth where we know them to be broken, which is
/// EVERYWHERE (https://github.com/fish-shell/fish-shell/issues/2199)
pub fn fish_wcwidth(c: char) -> isize {
    /// VS15 selects text presentation.
    const VARIATION_SELECTOR_15: char = '\u{FE0E}';
    /// VS16 selects emoji presentation.
    const VARIATION_SELECTOR_16: char = '\u{FE0F}';

    // On the physical console the system wcwidth should match what can actually be displayed, so
    // trust it outright. It knows nothing about other terminal emulators or ttys, which is why
    // everything below exists.
    if is_console_session() {
        return wcwidth(c);
    }

    match c {
        // VS16 "promotes" a character like U+2764 (width 1) to an emoji (probably width 2).
        // Counting the selector itself as width 1 makes the sums work. See #2652.
        VARIATION_SELECTOR_16 => return 1,
        VARIATION_SELECTOR_15 => return 0,
        // Emoji_Modifier property: only the Fitzpatrick modifiers (1F3FB..1F3FF) have it. This is
        // a hack, because such a modifier on its own would be drawn as width 2, but that is
        // unlikely to be useful. See #8275.
        '\u{1F3FB}'..='\u{1F3FF}' => return 0,
        _ => {}
    }

    match WC_LOOKUP_TABLE.classify(c) {
        WcWidth::One => 1,
        WcWidth::Two => 2,
        // Emoji width is user-configurable.
        WcWidth::WidenedIn9 => FISH_EMOJI_WIDTH.load(Ordering::Relaxed),
        // TR11: "All private-use characters are by default classified as Ambiguous".
        WcWidth::Ambiguous | WcWidth::PrivateUse => FISH_AMBIGUOUS_WIDTH.load(Ordering::Relaxed),
        // Nothing useful in our table for these; defer to the system wcwidth.
        WcWidth::NonCharacter | WcWidth::NonPrint | WcWidth::Combining | WcWidth::Unassigned => {
            wcwidth(c)
        }
    }
}

/// String counterpart of `fish_wcwidth()`: the sum of the widths of every character in `s`.
///
/// Returns -1 as soon as any character reports a negative width; an empty string has width 0.
pub fn fish_wcswidth(s: &wstr) -> isize {
    s.chars()
        .try_fold(0_isize, |total, c| {
            let w = fish_wcwidth(c);
            // A negative width aborts the fold; the sum is only computed for valid widths.
            (w >= 0).then(|| total + w)
        })
        .unwrap_or(-1)
}

/// Replacement for `mkostemp(str, O_CLOEXEC)`.
///
/// Everywhere except macOS this calls `mkostemp` directly; on macOS it calls `mkstemp` and then
/// sets `FD_CLOEXEC` with `fcntl`.
///
/// # Returns
///
/// On success, the opened file together with the template buffer as libc left it (the C functions
/// rewrite the template in place with the generated name).
///
/// # Errors
///
/// Returns the `errno` left by the failed `mkostemp`/`mkstemp` call. The template buffer is freed
/// in that case rather than leaked.
pub fn fish_mkstemp_cloexec(name_template: CString) -> Result<(File, CString), Errno> {
    // Hand ownership of the buffer to a raw pointer so libc can rewrite it in place. It is
    // reclaimed below on every path.
    let name = name_template.into_raw();
    #[cfg(not(target_os = "macos"))]
    let fd = {
        use libc::O_CLOEXEC;
        // SAFETY: `name` comes from `CString::into_raw`, so it is a valid, NUL-terminated buffer
        // that nothing else references.
        unsafe { libc::mkostemp(name, O_CLOEXEC) }
    };
    #[cfg(target_os = "macos")]
    let fd = {
        use libc::{FD_CLOEXEC, F_SETFD};
        // SAFETY: `name` comes from `CString::into_raw`, so it is a valid, NUL-terminated buffer
        // that nothing else references.
        let fd = unsafe { libc::mkstemp(name) };
        if fd != -1 {
            // Best effort: the result of fcntl is deliberately not checked.
            // SAFETY: `fd` was just returned by a successful `mkstemp`.
            unsafe { libc::fcntl(fd, F_SETFD, FD_CLOEXEC) };
        }
        fd
    };

    // Read errno before doing anything else (such as freeing the template) that could clobber it.
    let opened = if fd == -1 { Err(errno()) } else { Ok(fd) };

    // SAFETY: `name` was produced by `CString::into_raw` above and has not been freed. Retaking
    // ownership here, on the error path too, is what prevents the buffer from leaking when
    // creating the file fails.
    let name = unsafe { CString::from_raw(name) };

    // SAFETY: `fd` is a freshly opened descriptor that nothing else owns.
    opened.map(|fd| (unsafe { File::from_raw_fd(fd) }, name))
}

/// Case-insensitive comparison of two wide strings.
///
/// Each side is lowercased char by char with `char::to_lowercase()` and the two resulting char
/// sequences are compared lexicographically. Nothing is allocated.
pub fn wcscasecmp(lhs: &wstr, rhs: &wstr) -> cmp::Ordering {
    use std::char::ToLowercase;
    use widestring::utfstr::CharsUtf32;

    /// Flattens "lowercase of each char" into a single stream of chars.
    ///
    /// One source char may lowercase to several chars, so the last char of one expansion can end
    /// up being compared with the first char of an expansion on the other side. Presenting both
    /// sides as flat iterators makes that work.
    struct LowercaseStream<'a> {
        /// What is left of the expansion of the most recently consumed source char.
        pending: ToLowercase,
        /// Source chars that have not been expanded yet.
        rest: CharsUtf32<'a>,
    }

    impl<'a> LowercaseStream<'a> {
        fn new(w: &'a wstr) -> Self {
            // An already-exhausted `ToLowercase` is needed as the starting state; get one by
            // draining the single-char expansion of 'a'.
            let mut pending = 'a'.to_lowercase();
            let _ = pending.next();
            debug_assert!(pending.next().is_none());
            Self {
                pending,
                rest: w.chars(),
            }
        }
    }

    impl Iterator for LowercaseStream<'_> {
        type Item = char;

        fn next(&mut self) -> Option<char> {
            loop {
                if let Some(c) = self.pending.next() {
                    return Some(c);
                }
                // Current expansion is used up: move on to the next source char, or finish when
                // there is none.
                self.pending = self.rest.next()?.to_lowercase();
            }
        }
    }

    LowercaseStream::new(lhs).cmp(LowercaseStream::new(rhs))
}

#[test]
fn test_wcscasecmp() {
    use std::cmp::Ordering;

    let cases = [
        // Comparison with empty
        (L!("a"), L!(""), Ordering::Greater),
        (L!(""), L!("a"), Ordering::Less),
        (L!(""), L!(""), Ordering::Equal),
        // Basic comparison
        (L!("A"), L!("a"), Ordering::Equal),
        (L!("B"), L!("a"), Ordering::Greater),
        (L!("A"), L!("B"), Ordering::Less),
        // Multi-char lowercase: a single İ must compare equal to the two chars 'i' + U+0307,
        // and the comparison must carry on correctly across that expansion.
        (L!("İ"), L!("i\u{307}"), Ordering::Equal),
        (L!("ia"), L!("İa"), Ordering::Less),
    ];

    for &(lhs, rhs, expected) in &cases {
        assert_eq!(
            wcscasecmp(lhs, rhs),
            expected,
            "wcscasecmp({:?}, {:?})",
            lhs,
            rhs
        );
    }
}
Port most of fallback 2023-04-09 19:37:12 +08:00			`//! This file only contains fallback implementations of functions which have been found to be missing`
			`//! or broken by the configuration scripts.`
			`//!`
			`//! Many of these functions are more or less broken and incomplete.`

			`use crate::widecharwidth::{WcLookupTable, WcWidth};`
Adopt the wchar prelude 2023-08-09 06:16:04 +08:00			`use crate::{common::is_console_session, wchar::prelude::*};`
Convert fish_mkstemp_cloexec() to return an OwnedFd 2024-03-06 04:29:31 +08:00			`use errno::{errno, Errno};`
Port most of fallback 2023-04-09 19:37:12 +08:00			`use once_cell::sync::Lazy;`
Port the rest of wcstringutil 2023-04-18 17:53:48 +08:00			`use std::cmp;`
Use `File` instead of `OwnedFd` in a few places (#10355) This is a step towards converting `wopen_cloexec()` to return `File` instead of `OwnedFd`/`AutocloseFd`.¹ In addition to letting us use native standard library functions instead of unsafe libc calls, we gain additional semantic safety because `File` operations that manipulate the state of the fd (e.g. `File::seek()`) require a `&mut` reference to the `File`, whereas using `RawFd` or `OwnedFd` everywhere leaves us in a position where it's not clear whether or not other references to the same fd will manipulate its underlying state. ¹ We actually wouldn't even need `wopen_cloexec()` at all (just a widechar wrapper) as Rust's native `File::open()`/`File::create()` functionality uses `FD_CLOEXEC` internally. 2024-03-18 00:20:44 +08:00			`use std::fs::File;`
			`use std::os::fd::FromRawFd;`
Make wcwidth an isize Seems more consistent with the rest of our code. 2024-02-15 05:18:49 +08:00			`use std::sync::atomic::{AtomicIsize, Ordering};`
Convert fish_mkstemp_cloexec() to return an OwnedFd 2024-03-06 04:29:31 +08:00			`use std::{ffi::CString, mem};`
Port most of fallback 2023-04-09 19:37:12 +08:00
Clean up FISH_EMOJI_WIDTH and FISH_AMBIGUOUS_WIDTH defines Pull in the correct descriptions merged from across the various C++ header and source files and get rid of the getter function that's only used in one place but causes us to split the documentation for FISH_EMOJI_WIDTH across multiple declarations. 2023-05-17 02:55:38 +08:00			`/// Width of ambiguous East Asian characters and, as of TR11, all private-use characters.`
			/// 1 is the typical default, but we accept any non-negative override via `$fish_ambiguous_width`.
Make wcwidth an isize Seems more consistent with the rest of our code. 2024-02-15 05:18:49 +08:00			`pub static FISH_AMBIGUOUS_WIDTH: AtomicIsize = AtomicIsize::new(1);`
Port most of fallback 2023-04-09 19:37:12 +08:00
Clean up FISH_EMOJI_WIDTH and FISH_AMBIGUOUS_WIDTH defines Pull in the correct descriptions merged from across the various C++ header and source files and get rid of the getter function that's only used in one place but causes us to split the documentation for FISH_EMOJI_WIDTH across multiple declarations. 2023-05-17 02:55:38 +08:00			`/// Width of emoji characters.`
			`///`
			/// This must be configurable because the value changed between Unicode 8 and Unicode 9, `wcwidth()`
			`/// is emoji-unaware, and terminal emulators do different things.`
			`///`
			`/// See issues like #4539 and https://github.com/neovim/issues/4976 for how painful this is.`
			`///`
			`/// Valid values are 1, and 2. 1 is the typical emoji width used in Unicode 8 while some newer`
			`/// terminals use a width of 2 since Unicode 9.`
			// For some reason, this is declared here and exposed here, but is set in `env_dispatch`.
Make wcwidth an isize Seems more consistent with the rest of our code. 2024-02-15 05:18:49 +08:00			`pub static FISH_EMOJI_WIDTH: AtomicIsize = AtomicIsize::new(1);`
Port most of fallback 2023-04-09 19:37:12 +08:00
Revert rename of wcwidth() to system_wcwidth() It's not clear whether or not `system_wcwidth()` was picked solely because of the namespace conflict (which is easily remedied) but using the most obvious name for this function should be the way to go. We already have our own overload of `wcwidth()` (`fish_wcwidth()`) so it should be more obvious which is the bare system call and which isn't. (I do want to move this w/ some of the other standalone extern C wrappers to the unix module later.) 2023-05-17 07:54:10 +08:00			`static WC_LOOKUP_TABLE: Lazy<WcLookupTable> = Lazy::new(WcLookupTable::new);`

			/// A safe wrapper around the system `wcwidth()` function
Make wcwidth an isize Seems more consistent with the rest of our code. 2024-02-15 05:18:49 +08:00			`pub fn wcwidth(c: char) -> isize {`
Revert rename of wcwidth() to system_wcwidth() It's not clear whether or not `system_wcwidth()` was picked solely because of the namespace conflict (which is easily remedied) but using the most obvious name for this function should be the way to go. We already have our own overload of `wcwidth()` (`fish_wcwidth()`) so it should be more obvious which is the bare system call and which isn't. (I do want to move this w/ some of the other standalone extern C wrappers to the unix module later.) 2023-05-17 07:54:10 +08:00			`extern "C" {`
			`pub fn wcwidth(c: libc::wchar_t) -> libc::c_int;`
			`}`

Port most of fallback 2023-04-09 19:37:12 +08:00			`const _: () = assert!(mem::size_of::<libc::wchar_t>() >= mem::size_of::<char>());`
Make wcwidth an isize Seems more consistent with the rest of our code. 2024-02-15 05:18:49 +08:00			`let width = unsafe { wcwidth(c as libc::wchar_t) };`
			`isize::try_from(width).unwrap()`
Port most of fallback 2023-04-09 19:37:12 +08:00			`}`

			`// Big hack to use our versions of wcswidth where we know them to be broken, which is`
			`// EVERYWHERE (https://github.com/fish-shell/fish-shell/issues/2199)`
Make wcwidth an isize Seems more consistent with the rest of our code. 2024-02-15 05:18:49 +08:00			`pub fn fish_wcwidth(c: char) -> isize {`
Port most of fallback 2023-04-09 19:37:12 +08:00			`// The system version of wcwidth should accurately reflect the ability to represent characters`
			`// in the console session, but knows nothing about the capabilities of other terminal emulators`
			`// or ttys. Use it from the start only if we are logged in to the physical console.`
			`if is_console_session() {`
Revert rename of wcwidth() to system_wcwidth() It's not clear whether or not `system_wcwidth()` was picked solely because of the namespace conflict (which is easily remedied) but using the most obvious name for this function should be the way to go. We already have our own overload of `wcwidth()` (`fish_wcwidth()`) so it should be more obvious which is the bare system call and which isn't. (I do want to move this w/ some of the other standalone extern C wrappers to the unix module later.) 2023-05-17 07:54:10 +08:00			`return wcwidth(c);`
Port most of fallback 2023-04-09 19:37:12 +08:00			`}`

			`// Check for VS16 which selects emoji presentation. This "promotes" a character like U+2764`
			`// (width 1) to an emoji (probably width 2). So treat it as width 1 so the sums work. See #2652.`
			`// VS15 selects text presentation.`
			`let variation_selector_16 = '\u{FE0F}';`
			`let variation_selector_15 = '\u{FE0E}';`
			`if c == variation_selector_16 {`
			`return 1;`
			`} else if c == variation_selector_15 {`
			`return 0;`
			`}`

			`// Check for Emoji_Modifier property. Only the Fitzpatrick modifiers have this, in range`
			`// 1F3FB..1F3FF. This is a hack because such an emoji appearing on its own would be drawn as`
			`// width 2, but that's unlikely to be useful. See #8275.`
fix typo 2023-12-29 21:51:36 +08:00			`if ('\u{1F3FB}'..='\u{1F3FF}').contains(&c) {`
Port most of fallback 2023-04-09 19:37:12 +08:00			`return 0;`
			`}`

			`let width = WC_LOOKUP_TABLE.classify(c);`
			`match width {`
			`WcWidth::NonCharacter \| WcWidth::NonPrint \| WcWidth::Combining \| WcWidth::Unassigned => {`
			`// Fall back to system wcwidth in this case.`
Revert rename of wcwidth() to system_wcwidth() It's not clear whether or not `system_wcwidth()` was picked solely because of the namespace conflict (which is easily remedied) but using the most obvious name for this function should be the way to go. We already have our own overload of `wcwidth()` (`fish_wcwidth()`) so it should be more obvious which is the bare system call and which isn't. (I do want to move this w/ some of the other standalone extern C wrappers to the unix module later.) 2023-05-17 07:54:10 +08:00			`wcwidth(c)`
Port most of fallback 2023-04-09 19:37:12 +08:00			`}`
			`WcWidth::Ambiguous \| WcWidth::PrivateUse => {`
			`// TR11: "All private-use characters are by default classified as Ambiguous".`
Remove unnecessary use of `static mut`. Atomic don't need to be `mut` to change since they use interior mutability. 2023-05-03 02:22:39 +08:00			`FISH_AMBIGUOUS_WIDTH.load(Ordering::Relaxed)`
Port most of fallback 2023-04-09 19:37:12 +08:00			`}`
			`WcWidth::One => 1,`
			`WcWidth::Two => 2,`
Clean up FISH_EMOJI_WIDTH and FISH_AMBIGUOUS_WIDTH defines Pull in the correct descriptions merged from across the various C++ header and source files and get rid of the getter function that's only used in one place but causes us to split the documentation for FISH_EMOJI_WIDTH across multiple declarations. 2023-05-17 02:55:38 +08:00			`WcWidth::WidenedIn9 => FISH_EMOJI_WIDTH.load(Ordering::Relaxed),`
Port most of fallback 2023-04-09 19:37:12 +08:00			`}`
			`}`

			`/// fish's internal versions of wcwidth and wcswidth, which can use an internal implementation if`
			`/// the system one is busted.`
Make wcwidth an isize Seems more consistent with the rest of our code. 2024-02-15 05:18:49 +08:00			`pub fn fish_wcswidth(s: &wstr) -> isize {`
Port most of fallback 2023-04-09 19:37:12 +08:00			`let mut result = 0;`
			`for c in s.chars() {`
			`let w = fish_wcwidth(c);`
			`if w < 0 {`
			`return -1;`
			`}`
			`result += w;`
			`}`
			`result`
			`}`

			`// Replacement for mkostemp(str, O_CLOEXEC)`
			`// This uses mkostemp if available,`
			`// otherwise it uses mkstemp followed by fcntl`
Use `File` instead of `OwnedFd` in a few places (#10355) This is a step towards converting `wopen_cloexec()` to return `File` instead of `OwnedFd`/`AutocloseFd`.¹ In addition to letting us use native standard library functions instead of unsafe libc calls, we gain additional semantic safety because `File` operations that manipulate the state of the fd (e.g. `File::seek()`) require a `&mut` reference to the `File`, whereas using `RawFd` or `OwnedFd` everywhere leaves us in a position where it's not clear whether or not other references to the same fd will manipulate its underlying state. ¹ We actually wouldn't even need `wopen_cloexec()` at all (just a widechar wrapper) as Rust's native `File::open()`/`File::create()` functionality uses `FD_CLOEXEC` internally. 2024-03-18 00:20:44 +08:00			`pub fn fish_mkstemp_cloexec(name_template: CString) -> Result<(File, CString), Errno> {`
Port most of fallback 2023-04-09 19:37:12 +08:00			`let name = name_template.into_raw();`
			`#[cfg(not(target_os = "macos"))]`
			`let fd = {`
			`use libc::O_CLOEXEC;`
			`unsafe { libc::mkostemp(name, O_CLOEXEC) }`
			`};`
			`#[cfg(target_os = "macos")]`
			`let fd = {`
			`use libc::{FD_CLOEXEC, F_SETFD};`
			`let fd = unsafe { libc::mkstemp(name) };`
			`if fd != -1 {`
			`unsafe { libc::fcntl(fd, F_SETFD, FD_CLOEXEC) };`
			`}`
			`fd`
			`};`
Convert fish_mkstemp_cloexec() to return an OwnedFd 2024-03-06 04:29:31 +08:00			`if fd == -1 {`
			`Err(errno())`
			`} else {`
Use `File` instead of `OwnedFd` in a few places (#10355) This is a step towards converting `wopen_cloexec()` to return `File` instead of `OwnedFd`/`AutocloseFd`.¹ In addition to letting us use native standard library functions instead of unsafe libc calls, we gain additional semantic safety because `File` operations that manipulate the state of the fd (e.g. `File::seek()`) require a `&mut` reference to the `File`, whereas using `RawFd` or `OwnedFd` everywhere leaves us in a position where it's not clear whether or not other references to the same fd will manipulate its underlying state. ¹ We actually wouldn't even need `wopen_cloexec()` at all (just a widechar wrapper) as Rust's native `File::open()`/`File::create()` functionality uses `FD_CLOEXEC` internally. 2024-03-18 00:20:44 +08:00			`unsafe { Ok((File::from_raw_fd(fd), CString::from_raw(name))) }`
Convert fish_mkstemp_cloexec() to return an OwnedFd 2024-03-06 04:29:31 +08:00			`}`
Port most of fallback 2023-04-09 19:37:12 +08:00			`}`

Port the rest of wcstringutil 2023-04-18 17:53:48 +08:00			`pub fn wcscasecmp(lhs: &wstr, rhs: &wstr) -> cmp::Ordering {`
Fix common::wcscasecmp() for multi-byte lowercase strings 2023-05-03 03:10:12 +08:00			`use std::char::ToLowercase;`
			`use widestring::utfstr::CharsUtf32;`

			/// This struct streams the underlying lowercase chars of a `UTF32String` without allocating.
			`///`
			/// `char::to_lowercase()` returns an iterator of chars and we sometimes need to cmp the last
			/// char of one char's `to_lowercase()` with the first char of the other char's
			/// `to_lowercase()`. This makes that possible.
			`struct ToLowerBuffer<'a> {`
			`current: ToLowercase,`
			`chars: CharsUtf32<'a>,`
			`}`

			`impl<'a> Iterator for ToLowerBuffer<'a> {`
			`type Item = char;`

			`fn next(&mut self) -> Option<Self::Item> {`
			`if let Some(c) = self.current.next() {`
			`return Some(c);`
			`}`

			`self.current = self.chars.next()?.to_lowercase();`
			`self.next()`
			`}`
			`}`

			`impl<'a> ToLowerBuffer<'a> {`
			`pub fn from(w: &'a wstr) -> Self {`
			`let mut empty = 'a'.to_lowercase();`
			`let _ = empty.next();`
			`debug_assert!(empty.next().is_none());`
			`let mut chars = w.chars();`
			`Self {`
			`current: chars.next().map(\|c\| c.to_lowercase()).unwrap_or(empty),`
			`chars,`
Port the rest of wcstringutil 2023-04-18 17:53:48 +08:00			`}`
			`}`
			`}`
Fix common::wcscasecmp() for multi-byte lowercase strings 2023-05-03 03:10:12 +08:00
			`let lhs = ToLowerBuffer::from(lhs);`
			`let rhs = ToLowerBuffer::from(rhs);`
			`lhs.cmp(rhs)`
Port most of fallback 2023-04-09 19:37:12 +08:00			`}`
Add multi-byte test for wcscasecmp() The lowercase of İ is two bytes, making it a good test candidate. 2023-05-03 03:18:43 +08:00
			`#[test]`
			`fn test_wcscasecmp() {`
			`use std::cmp::Ordering;`

			`// Comparison with empty`
			`assert_eq!(wcscasecmp(L!("a"), L!("")), Ordering::Greater);`
			`assert_eq!(wcscasecmp(L!(""), L!("a")), Ordering::Less);`
			`assert_eq!(wcscasecmp(L!(""), L!("")), Ordering::Equal);`

			`// Basic comparison`
			`assert_eq!(wcscasecmp(L!("A"), L!("a")), Ordering::Equal);`
			`assert_eq!(wcscasecmp(L!("B"), L!("a")), Ordering::Greater);`
			`assert_eq!(wcscasecmp(L!("A"), L!("B")), Ordering::Less);`

			`// Multi-byte comparison`
			`assert_eq!(wcscasecmp(L!("İ"), L!("i\u{307}")), Ordering::Equal);`
			`assert_eq!(wcscasecmp(L!("ia"), L!("İa")), Ordering::Less);`
			`}`