diff --git a/Cargo.lock b/Cargo.lock index 371e39a029989..276c59259377d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -528,6 +528,19 @@ dependencies = [ "itertools 0.10.5", ] +[[package]] +name = "crossbeam" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1137cd7e7fc0fb5d3c5a8678be38ec56e819125d8d7907411fe24ccb943faca8" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-epoch", + "crossbeam-queue", + "crossbeam-utils", +] + [[package]] name = "crossbeam-channel" version = "0.5.12" @@ -556,6 +569,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-queue" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df0346b5d5e76ac2fe4e327c5fd1118d6be7c51dfb18f9b7922923f287471e35" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.19" @@ -1155,6 +1177,12 @@ version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" +[[package]] +name = "jod-thread" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b23360e99b8717f20aaa4598f5a6541efbe30630039fbc7706cf954a87947ae" + [[package]] name = "js-sys" version = "0.3.68" @@ -1327,6 +1355,31 @@ version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +[[package]] +name = "lsp-server" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "248f65b78f6db5d8e1b1604b4098a28b43d21a8eb1deeca22b1c421b276c7095" +dependencies = [ + "crossbeam-channel", + "log", + "serde", + "serde_json", +] + +[[package]] +name = "lsp-types" +version = "0.95.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"158c1911354ef73e8fe42da6b10c0484cb65c7f1007f28022e847706c1ab6984" +dependencies = [ + "bitflags 1.3.2", + "serde", + "serde_json", + "serde_repr", + "url", +] + [[package]] name = "matchers" version = "0.1.0" @@ -1982,6 +2035,7 @@ dependencies = [ "ruff_notebook", "ruff_python_ast", "ruff_python_formatter", + "ruff_server", "ruff_source_file", "ruff_text_size", "ruff_workspace", @@ -1996,6 +2050,8 @@ dependencies = [ "tikv-jemallocator", "toml", "tracing", + "tracing-subscriber", + "tracing-tree", "walkdir", "wild", ] @@ -2360,6 +2416,35 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "ruff_server" +version = "0.2.2" +dependencies = [ + "anyhow", + "crossbeam", + "insta", + "jod-thread", + "libc", + "lsp-server", + "lsp-types", + "ruff_diagnostics", + "ruff_formatter", + "ruff_linter", + "ruff_python_ast", + "ruff_python_codegen", + "ruff_python_formatter", + "ruff_python_index", + "ruff_python_parser", + "ruff_source_file", + "ruff_text_size", + "ruff_workspace", + "rustc-hash", + "serde", + "serde_json", + "similar", + "tracing", +] + [[package]] name = "ruff_shrinking" version = "0.3.2" @@ -2631,6 +2716,17 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_repr" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b2e6b945e9d3df726b65d6ee24060aff8e3533d431f677a9695db04eff9dfdb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.52", +] + [[package]] name = "serde_spanned" version = "0.6.5" @@ -3083,6 +3179,17 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "tracing-log" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f751112709b4e791d8ce53e32c4ed2d353565a795ce84da2285393f41557bdf2" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + [[package]] name = "tracing-log" version = "0.2.0" @@ -3109,7 +3216,19 @@ dependencies = [ "thread_local", "tracing", "tracing-core", - "tracing-log", + "tracing-log 0.2.0", +] + 
+[[package]] +name = "tracing-tree" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ec6adcab41b1391b08a308cc6302b79f8095d1673f6947c2dc65ffb028b0b2d" +dependencies = [ + "nu-ansi-term", + "tracing-core", + "tracing-log 0.1.4", + "tracing-subscriber", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 69cfc45e70875..d516e2f1fe9a8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,6 +32,7 @@ console_error_panic_hook = { version = "0.1.7" } console_log = { version = "1.0.0" } countme = { version = "3.0.1" } criterion = { version = "0.5.1", default-features = false } +crossbeam = { version = "0.8.4" } dirs = { version = "5.0.0" } drop_bomb = { version = "0.1.5" } env_logger = { version = "0.10.1" } @@ -52,10 +53,14 @@ is-macro = { version = "0.3.5" } is-wsl = { version = "0.4.0" } itertools = { version = "0.12.1" } js-sys = { version = "0.3.67" } +jod-thread = { version = "0.1.2" } lalrpop-util = { version = "0.20.0", default-features = false } lexical-parse-float = { version = "0.8.0", features = ["format"] } +libc = { version = "0.2.153" } libcst = { version = "1.1.0", default-features = false } log = { version = "0.4.17" } +lsp-server = { version = "0.7.6" } +lsp-types = { version = "0.95.0", features = ["proposed"] } memchr = { version = "2.7.1" } mimalloc = { version = "0.1.39" } natord = { version = "1.0.9" } @@ -97,6 +102,7 @@ toml = { version = "0.8.9" } tracing = { version = "0.1.40" } tracing-indicatif = { version = "0.3.6" } tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } +tracing-tree = { version = "0.2.4" } typed-arena = { version = "2.0.2" } unic-ucd-category = { version = "0.9" } unicode-ident = { version = "1.0.12" } diff --git a/crates/ruff/Cargo.toml b/crates/ruff/Cargo.toml index 45c3bc3c0f007..7e83ca7df1609 100644 --- a/crates/ruff/Cargo.toml +++ b/crates/ruff/Cargo.toml @@ -20,6 +20,7 @@ ruff_macros = { path = "../ruff_macros" } ruff_notebook = { path = "../ruff_notebook" } 
ruff_python_ast = { path = "../ruff_python_ast" } ruff_python_formatter = { path = "../ruff_python_formatter" } +ruff_server = { path = "../ruff_server" } ruff_source_file = { path = "../ruff_source_file" } ruff_text_size = { path = "../ruff_text_size" } ruff_workspace = { path = "../ruff_workspace" } @@ -52,6 +53,8 @@ tempfile = { workspace = true } thiserror = { workspace = true } toml = { workspace = true } tracing = { workspace = true, features = ["log"] } +tracing-subscriber = { workspace = true, features = ["registry"]} +tracing-tree = { workspace = true } walkdir = { workspace = true } wild = { workspace = true } diff --git a/crates/ruff/src/args.rs b/crates/ruff/src/args.rs index 8aea65056394a..059d2eb10f9b9 100644 --- a/crates/ruff/src/args.rs +++ b/crates/ruff/src/args.rs @@ -126,6 +126,8 @@ pub enum Command { GenerateShellCompletion { shell: clap_complete_command::Shell }, /// Run the Ruff formatter on the given files or directories. Format(FormatCommand), + /// Run the language server. 
+ Server(ServerCommand), /// Display Ruff's version Version { #[arg(long, value_enum, default_value = "text")] @@ -494,6 +496,9 @@ pub struct FormatCommand { pub range: Option, } +#[derive(Clone, Debug, clap::Parser)] +pub struct ServerCommand; + #[derive(Debug, Clone, Copy, clap::ValueEnum)] pub enum HelpFormat { Text, diff --git a/crates/ruff/src/commands/mod.rs b/crates/ruff/src/commands/mod.rs index 554a7a454add2..787a22ed43451 100644 --- a/crates/ruff/src/commands/mod.rs +++ b/crates/ruff/src/commands/mod.rs @@ -7,6 +7,7 @@ pub(crate) mod format; pub(crate) mod format_stdin; pub(crate) mod linter; pub(crate) mod rule; +pub(crate) mod server; pub(crate) mod show_files; pub(crate) mod show_settings; pub(crate) mod version; diff --git a/crates/ruff/src/commands/server.rs b/crates/ruff/src/commands/server.rs new file mode 100644 index 0000000000000..5ca37ed2b5007 --- /dev/null +++ b/crates/ruff/src/commands/server.rs @@ -0,0 +1,69 @@ +use crate::ExitStatus; +use anyhow::Result; +use ruff_linter::logging::LogLevel; +use ruff_server::Server; +use tracing::{level_filters::LevelFilter, metadata::Level, subscriber::Interest, Metadata}; +use tracing_subscriber::{ + layer::{Context, Filter, SubscriberExt}, + Layer, Registry, +}; +use tracing_tree::time::Uptime; + +pub(crate) fn run_server(log_level: LogLevel) -> Result { + let trace_level = if log_level == LogLevel::Verbose { + Level::TRACE + } else { + Level::DEBUG + }; + + let subscriber = Registry::default().with( + tracing_tree::HierarchicalLayer::default() + .with_indent_lines(true) + .with_indent_amount(2) + .with_bracketed_fields(true) + .with_targets(true) + .with_writer(|| Box::new(std::io::stderr())) + .with_timer(Uptime::default()) + .with_filter(LoggingFilter { trace_level }), + ); + + tracing::subscriber::set_global_default(subscriber)?; + + let server = Server::new()?; + + server.run().map(|()| ExitStatus::Success) +} + +struct LoggingFilter { + trace_level: Level, +} + +impl LoggingFilter { + fn 
is_enabled(&self, meta: &Metadata<'_>) -> bool { + let filter = if meta.target().starts_with("ruff") { + self.trace_level + } else { + Level::INFO + }; + + meta.level() <= &filter + } +} + +impl Filter for LoggingFilter { + fn enabled(&self, meta: &Metadata<'_>, _cx: &Context<'_, S>) -> bool { + self.is_enabled(meta) + } + + fn callsite_enabled(&self, meta: &'static Metadata<'static>) -> Interest { + if self.is_enabled(meta) { + Interest::always() + } else { + Interest::never() + } + } + + fn max_level_hint(&self) -> Option { + Some(LevelFilter::from_level(self.trace_level)) + } +} diff --git a/crates/ruff/src/lib.rs b/crates/ruff/src/lib.rs index cb5954a987e38..c8381ffc82855 100644 --- a/crates/ruff/src/lib.rs +++ b/crates/ruff/src/lib.rs @@ -7,7 +7,7 @@ use std::process::ExitCode; use std::sync::mpsc::channel; use anyhow::Result; -use args::GlobalConfigArgs; +use args::{GlobalConfigArgs, ServerCommand}; use clap::CommandFactory; use colored::Colorize; use log::warn; @@ -190,6 +190,7 @@ pub fn run( } Command::Check(args) => check(args, global_options), Command::Format(args) => format(args, global_options), + Command::Server(args) => server(args, global_options.log_level()), } } @@ -203,6 +204,12 @@ fn format(args: FormatCommand, global_options: GlobalConfigArgs) -> Result Result { + let ServerCommand {} = args; + commands::server::run_server(log_level) +} + pub fn check(args: CheckCommand, global_options: GlobalConfigArgs) -> Result { let (cli, config_arguments) = args.partition(global_options)?; diff --git a/crates/ruff_formatter/src/lib.rs b/crates/ruff_formatter/src/lib.rs index a78cfab1ebc0a..7f20543ee0736 100644 --- a/crates/ruff_formatter/src/lib.rs +++ b/crates/ruff_formatter/src/lib.rs @@ -545,6 +545,10 @@ impl PrintedRange { &self.code } + pub fn into_code(self) -> String { + self.code + } + /// The range the formatted code corresponds to in the source document. 
pub fn source_range(&self) -> TextRange { self.source_range diff --git a/crates/ruff_server/Cargo.toml b/crates/ruff_server/Cargo.toml new file mode 100644 index 0000000000000..36cacd9ca36ba --- /dev/null +++ b/crates/ruff_server/Cargo.toml @@ -0,0 +1,44 @@ +[package] +name = "ruff_server" +version = "0.2.2" +publish = false +authors = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } +repository = { workspace = true } +license = { workspace = true } + +[lib] + +[dependencies] +ruff_diagnostics = { path = "../ruff_diagnostics" } +ruff_formatter = { path = "../ruff_formatter" } +ruff_linter = { path = "../ruff_linter" } +ruff_python_ast = { path = "../ruff_python_ast" } +ruff_python_codegen = { path = "../ruff_python_codegen" } +ruff_python_formatter = { path = "../ruff_python_formatter" } +ruff_python_index = { path = "../ruff_python_index" } +ruff_python_parser = { path = "../ruff_python_parser" } +ruff_source_file = { path = "../ruff_source_file" } +ruff_text_size = { path = "../ruff_text_size" } +ruff_workspace = { path = "../ruff_workspace" } + +anyhow = { workspace = true } +crossbeam = { workspace = true } +jod-thread = { workspace = true } +libc = { workspace = true } +lsp-server = { workspace = true } +lsp-types = { workspace = true } +rustc-hash = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +similar = { workspace = true } +tracing = { workspace = true } + +[dev-dependencies] +insta = { workspace = true } + +[lints] +workspace = true diff --git a/crates/ruff_server/README.md b/crates/ruff_server/README.md new file mode 100644 index 0000000000000..4123a253af865 --- /dev/null +++ b/crates/ruff_server/README.md @@ -0,0 +1 @@ +## The Ruff Language Server diff --git a/crates/ruff_server/resources/test/fixtures/pandas_html.py b/crates/ruff_server/resources/test/fixtures/pandas_html.py new file mode 100644 index 
0000000000000..a4669fa1feff0 --- /dev/null +++ b/crates/ruff_server/resources/test/fixtures/pandas_html.py @@ -0,0 +1,1240 @@ +# +------------------------------------------------------------+ +# | Code adopted from: | +# | Repository: https://github.com/pandas-dev/pandas.git | +# | File: `io/html.py` | +# | Commit: 1f622e2b5303650fa5e497e4552d0554e51049cb | +# +------------------------------------------------------------+ +# This file should be used to test LSP functions that edit / fix a file. + +""" +:mod:`pandas.io.html` is a module containing functionality for dealing with +HTML IO. + +""" + +from __future__ import annotations + +from collections import abc +import errno +import numbers +import os +import re +from re import Pattern +from typing import ( + TYPE_CHECKING, + Literal, + cast, +) + +from pandas._libs import lib +from pandas.compat._optional import import_optional_dependency +from pandas.errors import ( + AbstractMethodError, + EmptyDataError, +) +from pandas.util._decorators import doc +from pandas.util._validators import check_dtype_backend + +from pandas.core.dtypes.common import is_list_like + +from pandas import isna +from pandas.core.indexes.base import Index +from pandas.core.indexes.multi import MultiIndex +from pandas.core.series import Series +from pandas.core.shared_docs import _shared_docs + +from pandas.io.common import ( + get_handle, + is_url, + stringify_path, + validate_header_arg, +) +from pandas.io.formats.printing import pprint_thing +from pandas.io.parsers import TextParser + +if TYPE_CHECKING: + from collections.abc import ( + Iterable, + Sequence, + ) + + from pandas._typing import ( + BaseBuffer, + DtypeBackend, + FilePath, + HTMLFlavors, + ReadBuffer, + StorageOptions, + ) + + from pandas import DataFrame + +############# +# READ HTML # +############# +_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}") + + +def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str: + """ + Replace extra whitespace inside of a string 
with a single space. + + Parameters + ---------- + s : str or unicode + The string from which to remove extra whitespace. + regex : re.Pattern + The regular expression to use to remove extra whitespace. + + Returns + ------- + subd : str or unicode + `s` with all extra whitespace replaced with a single space. + """ + return regex.sub(" ", s.strip()) + + +def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]: + """ + Get an iterator given an integer, slice or container. + + Parameters + ---------- + skiprows : int, slice, container + The iterator to use to skip rows; can also be a slice. + + Raises + ------ + TypeError + * If `skiprows` is not a slice, integer, or Container + + Returns + ------- + it : iterable + A proper iterator to use to skip rows of a DataFrame. + """ + if isinstance(skiprows, slice): + start, step = skiprows.start or 0, skiprows.step or 1 + return list(range(start, skiprows.stop, step)) + elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows): + return cast("int | Sequence[int]", skiprows) + elif skiprows is None: + return 0 + raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows") + + +def _read( + obj: FilePath | BaseBuffer, + encoding: str | None, + storage_options: StorageOptions | None, +) -> str | bytes: + """ + Try to read from a url, file or string. + + Parameters + ---------- + obj : str, unicode, path object, or file-like object + + Returns + ------- + raw_text : str + """ + try: + with get_handle( + obj, "r", encoding=encoding, storage_options=storage_options + ) as handles: + return handles.handle.read() + except OSError as err: + if not is_url(obj): + raise FileNotFoundError( + f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {obj}" + ) from err + raise + + +class _HtmlFrameParser: + """ + Base class for parsers that parse HTML into DataFrames. 
+ + Parameters + ---------- + io : str or file-like + This can be either a string path, a valid URL using the HTTP, + FTP, or FILE protocols or a file-like object. + + match : str or regex + The text to match in the document. + + attrs : dict + List of HTML element attributes to match. + + encoding : str + Encoding to be used by parser + + displayed_only : bool + Whether or not items with "display:none" should be ignored + + extract_links : {None, "all", "header", "body", "footer"} + Table elements in the specified section(s) with tags will have their + href extracted. + + .. versionadded:: 1.5.0 + + Attributes + ---------- + io : str or file-like + raw HTML, URL, or file-like object + + match : regex + The text to match in the raw HTML + + attrs : dict-like + A dictionary of valid table attributes to use to search for table + elements. + + encoding : str + Encoding to be used by parser + + displayed_only : bool + Whether or not items with "display:none" should be ignored + + extract_links : {None, "all", "header", "body", "footer"} + Table elements in the specified section(s) with tags will have their + href extracted. + + .. versionadded:: 1.5.0 + + Notes + ----- + To subclass this class effectively you must override the following methods: + * :func:`_build_doc` + * :func:`_attr_getter` + * :func:`_href_getter` + * :func:`_text_getter` + * :func:`_parse_td` + * :func:`_parse_thead_tr` + * :func:`_parse_tbody_tr` + * :func:`_parse_tfoot_tr` + * :func:`_parse_tables` + * :func:`_equals_tag` + See each method's respective documentation for details on their + functionality. 
+ """ + + def __init__( + self, + io: FilePath | ReadBuffer[str] | ReadBuffer[bytes], + match: str | Pattern, + attrs: dict[str, str] | None, + encoding: str, + displayed_only: bool, + extract_links: Literal[None, "header", "footer", "body", "all"], + storage_options: StorageOptions = None, + ) -> None: + self.io = io + self.match = match + self.attrs = attrs + self.encoding = encoding + self.displayed_only = displayed_only + self.extract_links = extract_links + self.storage_options = storage_options + + def parse_tables(self): + """ + Parse and return all tables from the DOM. + + Returns + ------- + list of parsed (header, body, footer) tuples from tables. + """ + tables = self._parse_tables(self._build_doc(), self.match, self.attrs) + return (self._parse_thead_tbody_tfoot(table) for table in tables) + + def _attr_getter(self, obj, attr): + """ + Return the attribute value of an individual DOM node. + + Parameters + ---------- + obj : node-like + A DOM node. + + attr : str or unicode + The attribute, such as "colspan" + + Returns + ------- + str or unicode + The attribute value. + """ + # Both lxml and BeautifulSoup have the same implementation: + return obj.get(attr) + + def _href_getter(self, obj) -> str | None: + """ + Return a href if the DOM node contains a child or None. + + Parameters + ---------- + obj : node-like + A DOM node. + + Returns + ------- + href : str or unicode + The href from the child of the DOM node. + """ + raise AbstractMethodError(self) + + def _text_getter(self, obj): + """ + Return the text of an individual DOM node. + + Parameters + ---------- + obj : node-like + A DOM node. + + Returns + ------- + text : str or unicode + The text from an individual DOM node. + """ + raise AbstractMethodError(self) + + def _parse_td(self, obj): + """ + Return the td elements from a row element. + + Parameters + ---------- + obj : node-like + A DOM node. + + Returns + ------- + list of node-like + These are the elements of each row, i.e., the columns. 
+ """ + raise AbstractMethodError(self) + + def _parse_thead_tr(self, table): + """ + Return the list of thead row elements from the parsed table element. + + Parameters + ---------- + table : a table element that contains zero or more thead elements. + + Returns + ------- + list of node-like + These are the row elements of a table. + """ + raise AbstractMethodError(self) + + def _parse_tbody_tr(self, table): + """ + Return the list of tbody row elements from the parsed table element. + + HTML5 table bodies consist of either 0 or more elements (which + only contain elements) or 0 or more elements. This method + checks for both structures. + + Parameters + ---------- + table : a table element that contains row elements. + + Returns + ------- + list of node-like + These are the row elements of a table. + """ + raise AbstractMethodError(self) + + def _parse_tfoot_tr(self, table): + """ + Return the list of tfoot row elements from the parsed table element. + + Parameters + ---------- + table : a table element that contains row elements. + + Returns + ------- + list of node-like + These are the row elements of a table. + """ + raise AbstractMethodError(self) + + def _parse_tables(self, document, match, attrs): + """ + Return all tables from the parsed DOM. + + Parameters + ---------- + document : the DOM from which to parse the table element. + + match : str or regular expression + The text to search for in the DOM tree. + + attrs : dict + A dictionary of table attributes that can be used to disambiguate + multiple tables on a page. + + Raises + ------ + ValueError : `match` does not match any text in the document. + + Returns + ------- + list of node-like + HTML
elements to be parsed into raw data. + """ + raise AbstractMethodError(self) + + def _equals_tag(self, obj, tag) -> bool: + """ + Return whether an individual DOM node matches a tag + + Parameters + ---------- + obj : node-like + A DOM node. + + tag : str + Tag name to be checked for equality. + + Returns + ------- + boolean + Whether `obj`'s tag name is `tag` + """ + raise AbstractMethodError(self) + + def _build_doc(self): + """ + Return a tree-like object that can be used to iterate over the DOM. + + Returns + ------- + node-like + The DOM from which to parse the table element. + """ + raise AbstractMethodError(self) + + def _parse_thead_tbody_tfoot(self, table_html): + """ + Given a table, return parsed header, body, and foot. + + Parameters + ---------- + table_html : node-like + + Returns + ------- + tuple of (header, body, footer), each a list of list-of-text rows. + + Notes + ----- + Header and body are lists-of-lists. Top level list is a list of + rows. Each row is a list of str text. + + Logic: Use , , elements to identify + header, body, and footer, otherwise: + - Put all rows into body + - Move rows from top of body to header only if + all elements inside row are . Move the top all- or + while body_rows and row_is_all_th(body_rows[0]): + header_rows.append(body_rows.pop(0)) + + header = self._expand_colspan_rowspan(header_rows, section="header") + body = self._expand_colspan_rowspan(body_rows, section="body") + footer = self._expand_colspan_rowspan(footer_rows, section="footer") + + return header, body, footer + + def _expand_colspan_rowspan( + self, rows, section: Literal["header", "footer", "body"] + ) -> list[list]: + """ + Given a list of s, return a list of text rows. + + Parameters + ---------- + rows : list of node-like + List of s + section : the section that the rows belong to (header, body or footer). + + Returns + ------- + list of list + Each returned row is a list of str text, or tuple (text, link) + if extract_links is not None. 
+ + Notes + ----- + Any cell with ``rowspan`` or ``colspan`` will have its contents copied + to subsequent cells. + """ + all_texts = [] # list of rows, each a list of str + text: str | tuple + remainder: list[ + tuple[int, str | tuple, int] + ] = [] # list of (index, text, nrows) + + for tr in rows: + texts = [] # the output for this row + next_remainder = [] + + index = 0 + tds = self._parse_td(tr) + for td in tds: + # Append texts from previous rows with rowspan>1 that come + # before this or (see _parse_thead_tr). + return row.xpath("./td|./th") + + def _parse_tables(self, document, match, kwargs): + pattern = match.pattern + + # 1. check all descendants for the given pattern and only search tables + # GH 49929 + xpath_expr = f"//table[.//text()[re:test(., {pattern!r})]]" + + # if any table attributes were given build an xpath expression to + # search for them + if kwargs: + xpath_expr += _build_xpath_expr(kwargs) + + tables = document.xpath(xpath_expr, namespaces=_re_namespace) + + tables = self._handle_hidden_tables(tables, "attrib") + if self.displayed_only: + for table in tables: + # lxml utilizes XPATH 1.0 which does not have regex + # support. As a result, we find all elements with a style + # attribute and iterate them to check for display:none + for elem in table.xpath(".//style"): + elem.drop_tree() + for elem in table.xpath(".//*[@style]"): + if "display:none" in elem.attrib.get("style", "").replace(" ", ""): + elem.drop_tree() + if not tables: + raise ValueError(f"No tables found matching regex {pattern!r}") + return tables + + def _equals_tag(self, obj, tag) -> bool: + return obj.tag == tag + + def _build_doc(self): + """ + Raises + ------ + ValueError + * If a URL that lxml cannot parse is passed. + + Exception + * Any other ``Exception`` thrown. For example, trying to parse a + URL that is syntactically correct on a machine with no internet + connection will fail. 
+ + See Also + -------- + pandas.io.html._HtmlFrameParser._build_doc + """ + from lxml.etree import XMLSyntaxError + from lxml.html import ( + HTMLParser, + parse, + ) + + parser = HTMLParser(recover=True, encoding=self.encoding) + + if is_url(self.io): + with get_handle(self.io, "r", storage_options=self.storage_options) as f: + r = parse(f.handle, parser=parser) + else: + # try to parse the input in the simplest way + try: + r = parse(self.io, parser=parser) + except OSError as err: + raise FileNotFoundError( + f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {self.io}" + ) from err + try: + r = r.getroot() + except AttributeError: + pass + else: + if not hasattr(r, "text_content"): + raise XMLSyntaxError("no text parsed from document", 0, 0, 0) + + for br in r.xpath("*//br"): + br.tail = "\n" + (br.tail or "") + + return r + + def _parse_thead_tr(self, table): + rows = [] + + for thead in table.xpath(".//thead"): + rows.extend(thead.xpath("./tr")) + + # HACK: lxml does not clean up the clearly-erroneous + # . (Missing ). Add + # the and _pretend_ it's a ; _parse_td() will find its + # children as though it's a . + # + # Better solution would be to use html5lib. 
+ elements_at_root = thead.xpath("./td|./th") + if elements_at_root: + rows.append(thead) + + return rows + + def _parse_tbody_tr(self, table): + from_tbody = table.xpath(".//tbody//tr") + from_root = table.xpath("./tr") + # HTML spec: at most one of these lists has content + return from_tbody + from_root + + def _parse_tfoot_tr(self, table): + return table.xpath(".//tfoot//tr") + + +def _expand_elements(body) -> None: + data = [len(elem) for elem in body] + lens = Series(data) + lens_max = lens.max() + not_max = lens[lens != lens_max] + + empty = [""] + for ind, length in not_max.items(): + body[ind] += empty * (lens_max - length) + + +def _data_to_frame(**kwargs): + head, body, foot = kwargs.pop("data") + header = kwargs.pop("header") + kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"]) + if head: + body = head + body + + # Infer header when there is a or top
+ - Move rows from bottom of body to footer only if + all elements inside row are + """ + header_rows = self._parse_thead_tr(table_html) + body_rows = self._parse_tbody_tr(table_html) + footer_rows = self._parse_tfoot_tr(table_html) + + def row_is_all_th(row): + return all(self._equals_tag(t, "th") for t in self._parse_td(row)) + + if not header_rows: + # The table has no
rows from + # body_rows to header_rows. (This is a common case because many + # tables in the wild have no
+ while remainder and remainder[0][0] <= index: + prev_i, prev_text, prev_rowspan = remainder.pop(0) + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) + index += 1 + + # Append the text from this , colspan times + text = _remove_whitespace(self._text_getter(td)) + if self.extract_links in ("all", section): + href = self._href_getter(td) + text = (text, href) + rowspan = int(self._attr_getter(td, "rowspan") or 1) + colspan = int(self._attr_getter(td, "colspan") or 1) + + for _ in range(colspan): + texts.append(text) + if rowspan > 1: + next_remainder.append((index, text, rowspan - 1)) + index += 1 + + # Append texts from previous rows at the final position + for prev_i, prev_text, prev_rowspan in remainder: + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) + + all_texts.append(texts) + remainder = next_remainder + + # Append rows that only appear because the previous row had non-1 + # rowspan + while remainder: + next_remainder = [] + texts = [] + for prev_i, prev_text, prev_rowspan in remainder: + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) + all_texts.append(texts) + remainder = next_remainder + + return all_texts + + def _handle_hidden_tables(self, tbl_list, attr_name: str): + """ + Return list of tables, potentially removing hidden elements + + Parameters + ---------- + tbl_list : list of node-like + Type of list elements will vary depending upon parser used + attr_name : str + Name of the accessor for retrieving HTML attributes + + Returns + ------- + list of node-like + Return type matches `tbl_list` + """ + if not self.displayed_only: + return tbl_list + + return [ + x + for x in tbl_list + if "display:none" + not in getattr(x, attr_name).get("style", "").replace(" ", "") + ] + + +class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): + """ + HTML to DataFrame parser 
that uses BeautifulSoup under the hood. + + See Also + -------- + pandas.io.html._HtmlFrameParser + pandas.io.html._LxmlFrameParser + + Notes + ----- + Documentation strings for this class are in the base class + :class:`pandas.io.html._HtmlFrameParser`. + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + from bs4 import SoupStrainer + + self._strainer = SoupStrainer("table") + + def _parse_tables(self, document, match, attrs): + element_name = self._strainer.name + tables = document.find_all(element_name, attrs=attrs) + if not tables: + raise ValueError("No tables found") + + result = [] + unique_tables = set() + tables = self._handle_hidden_tables(tables, "attrs") + + for table in tables: + if self.displayed_only: + for elem in table.find_all("style"): + elem.decompose() + + for elem in table.find_all(style=re.compile(r"display:\s*none")): + elem.decompose() + + if table not in unique_tables and table.find(string=match) is not None: + result.append(table) + unique_tables.add(table) + if not result: + raise ValueError(f"No tables found matching pattern {match.pattern!r}") + return result + + def _href_getter(self, obj) -> str | None: + a = obj.find("a", href=True) + return None if not a else a["href"] + + def _text_getter(self, obj): + return obj.text + + def _equals_tag(self, obj, tag) -> bool: + return obj.name == tag + + def _parse_td(self, row): + return row.find_all(("td", "th"), recursive=False) + + def _parse_thead_tr(self, table): + return table.select("thead tr") + + def _parse_tbody_tr(self, table): + from_tbody = table.select("tbody tr") + from_root = table.find_all("tr", recursive=False) + # HTML spec: at most one of these lists has content + return from_tbody + from_root + + def _parse_tfoot_tr(self, table): + return table.select("tfoot tr") + + def _setup_build_doc(self): + raw_text = _read(self.io, self.encoding, self.storage_options) + if not raw_text: + raise ValueError(f"No text parsed from document: 
{self.io}") + return raw_text + + def _build_doc(self): + from bs4 import BeautifulSoup + + bdoc = self._setup_build_doc() + if isinstance(bdoc, bytes) and self.encoding is not None: + udoc = bdoc.decode(self.encoding) + from_encoding = None + else: + udoc = bdoc + from_encoding = self.encoding + + soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding) + + for br in soup.find_all("br"): + br.replace_with("\n" + br.text) + + return soup + + +def _build_xpath_expr(attrs) -> str: + """ + Build an xpath expression to simulate bs4's ability to pass in kwargs to + search for attributes when using the lxml parser. + + Parameters + ---------- + attrs : dict + A dict of HTML attributes. These are NOT checked for validity. + + Returns + ------- + expr : unicode + An XPath expression that checks for the given HTML attributes. + """ + # give class attribute as class_ because class is a python keyword + if "class_" in attrs: + attrs["class"] = attrs.pop("class_") + + s = " and ".join([f"@{k}={v!r}" for k, v in attrs.items()]) + return f"[{s}]" + + +_re_namespace = {"re": "http://exslt.org/regular-expressions"} + + +class _LxmlFrameParser(_HtmlFrameParser): + """ + HTML to DataFrame parser that uses lxml under the hood. + + Warning + ------- + This parser can only handle HTTP, FTP, and FILE urls. + + See Also + -------- + _HtmlFrameParser + _BeautifulSoupLxmlFrameParser + + Notes + ----- + Documentation strings for this class are in the base class + :class:`_HtmlFrameParser`. + """ + + def _href_getter(self, obj) -> str | None: + href = obj.xpath(".//a/@href") + return None if not href else href[0] + + def _text_getter(self, obj): + return obj.text_content() + + def _parse_td(self, row): + # Look for direct children only: the "row" element here may be a + #
foobar
-only rows + if header is None: + if len(head) == 1: + header = 0 + else: + # ignore all-empty-text rows + header = [i for i, row in enumerate(head) if any(text for text in row)] + + if foot: + body += foot + + # fill out elements of body that are "ragged" + _expand_elements(body) + with TextParser(body, header=header, **kwargs) as tp: + return tp.read() + + +_valid_parsers = { + "lxml": _LxmlFrameParser, + None: _LxmlFrameParser, + "html5lib": _BeautifulSoupHtml5LibFrameParser, + "bs4": _BeautifulSoupHtml5LibFrameParser, +} + + +def _parser_dispatch(flavor: HTMLFlavors | None) -> type[_HtmlFrameParser]: + """ + Choose the parser based on the input flavor. + + Parameters + ---------- + flavor : {{"lxml", "html5lib", "bs4"}} or None + The type of parser to use. This must be a valid backend. + + Returns + ------- + cls : _HtmlFrameParser subclass + The parser class based on the requested input flavor. + + Raises + ------ + ValueError + * If `flavor` is not a valid backend. + ImportError + * If you do not have the requested `flavor` + """ + valid_parsers = list(_valid_parsers.keys()) + if flavor not in valid_parsers: + raise ValueError( + f"{flavor!r} is not a valid flavor, valid flavors are {valid_parsers}" + ) + + if flavor in ("bs4", "html5lib"): + import_optional_dependency("html5lib") + import_optional_dependency("bs4") + else: + import_optional_dependency("lxml.etree") + return _valid_parsers[flavor] + + +def _print_as_set(s) -> str: + arg = ", ".join([pprint_thing(el) for el in s]) + return f"{{{arg}}}" + + +def _validate_flavor(flavor): + if flavor is None: + flavor = "lxml", "bs4" + elif isinstance(flavor, str): + flavor = (flavor,) + elif isinstance(flavor, abc.Iterable): + if not all(isinstance(flav, str) for flav in flavor): + raise TypeError( + f"Object of type {type(flavor).__name__!r} " + f"is not an iterable of strings" + ) + else: + msg = repr(flavor) if isinstance(flavor, str) else str(flavor) + msg += " is not a valid flavor" + raise ValueError(msg) 
+ + flavor = tuple(flavor) + valid_flavors = set(_valid_parsers) + flavor_set = set(flavor) + + if not flavor_set & valid_flavors: + raise ValueError( + f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid " + f"flavors are {_print_as_set(valid_flavors)}" + ) + return flavor + + +def _parse( + flavor, + io, + match, + attrs, + encoding, + displayed_only, + extract_links, + storage_options, + **kwargs, +): + flavor = _validate_flavor(flavor) + compiled_match = re.compile(match) # you can pass a compiled regex here + + retained = None + for flav in flavor: + parser = _parser_dispatch(flav) + p = parser( + io, + compiled_match, + attrs, + encoding, + displayed_only, + extract_links, + storage_options, + ) + + try: + tables = p.parse_tables() + except ValueError as caught: + # if `io` is an io-like object, check if it's seekable + # and try to rewind it before trying the next parser + if hasattr(io, "seekable") and io.seekable(): + io.seek(0) + elif hasattr(io, "seekable") and not io.seekable(): + # if we couldn't rewind it, let the user know + raise ValueError( + f"The flavor {flav} failed to parse your input. " + "Since you passed a non-rewindable file " + "object, we can't rewind it to try " + "another parser. Try read_html() with a different flavor." + ) from caught + + retained = caught + else: + break + else: + assert retained is not None # for mypy + raise retained + + ret = [] + for table in tables: + try: + df = _data_to_frame(data=table, **kwargs) + # Cast MultiIndex header to an Index of tuples when extracting header + # links and replace nan with None (therefore can't use mi.to_flat_index()). + # This maintains consistency of selection (e.g. 
@doc(storage_options=_shared_docs["storage_options"])
def read_html(
    io: FilePath | ReadBuffer[str],
    *,
    match: str | Pattern = ".+",
    flavor: HTMLFlavors | Sequence[HTMLFlavors] | None = None,
    header: int | Sequence[int] | None = None,
    index_col: int | Sequence[int] | None = None,
    skiprows: int | Sequence[int] | slice | None = None,
    attrs: dict[str, str] | None = None,
    parse_dates: bool = False,
    thousands: str | None = ",",
    encoding: str | None = None,
    decimal: str = ".",
    converters: dict | None = None,
    na_values: Iterable[object] | None = None,
    keep_default_na: bool = True,
    displayed_only: bool = True,
    extract_links: Literal[None, "header", "footer", "body", "all"] = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    storage_options: StorageOptions = None,
) -> list[DataFrame]:
    r"""
    Read HTML tables into a ``list`` of ``DataFrame`` objects.

    Parameters
    ----------
    io : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a string ``read()`` function.
        The string can represent a URL. Note that
        lxml only accepts the http, ftp and file url protocols. If you have a
        URL that starts with ``'https'`` you might try removing the ``'s'``.

        .. deprecated:: 2.1.0
            Passing html literal strings is deprecated.
            Wrap literal string/bytes input in ``io.StringIO``/``io.BytesIO`` instead.

    match : str or compiled regular expression, optional
        The set of tables containing text matching this regex or string will be
        returned. Unless the HTML is extremely simple you will probably need to
        pass a non-empty string here. Defaults to '.+' (match any non-empty
        string). The default value will return all tables contained on a page.
        This value is converted to a regular expression so that there is
        consistent behavior between Beautiful Soup and lxml.

    flavor : {{"lxml", "html5lib", "bs4"}} or list-like, optional
        The parsing engine (or list of parsing engines) to use. 'bs4' and
        'html5lib' are synonymous with each other, they are both there for
        backwards compatibility. The default of ``None`` tries to use ``lxml``
        to parse and if that fails it falls back on ``bs4`` + ``html5lib``.

    header : int or list-like, optional
        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
        make the columns headers.

    index_col : int or list-like, optional
        The column (or list of columns) to use to create the index.

    skiprows : int, list-like or slice, optional
        Number of rows to skip after parsing the column integer. 0-based. If a
        sequence of integers or a slice is given, will skip the rows indexed by
        that sequence.  Note that a single element sequence means 'skip the nth
        row' whereas an integer means 'skip n rows'.

    attrs : dict, optional
        This is a dictionary of attributes that you can pass to use to identify
        the table in the HTML. These are not checked for validity before being
        passed to lxml or Beautiful Soup. However, these attributes must be
        valid HTML table attributes to work correctly. For example, ::

            attrs = {{"id": "table"}}

        is a valid attribute dictionary because the 'id' HTML tag attribute is
        a valid HTML attribute for *any* HTML tag as per `this document
        <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>`__. ::

            attrs = {{"asdf": "table"}}

        is *not* a valid attribute dictionary because 'asdf' is not a valid
        HTML attribute even if it is a valid XML attribute.  Valid HTML 4.01
        table attributes can be found `here
        <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
        working draft of the HTML 5 spec can be found `here
        <https://html.spec.whatwg.org/multipage/tables.html>`__. It contains the
        latest information on table attributes for the modern web.

    parse_dates : bool, optional
        See :func:`~read_csv` for more details.

    thousands : str, optional
        Separator to use to parse thousands. Defaults to ``','``.

    encoding : str, optional
        The encoding used to decode the web page. Defaults to ``None``. ``None``
        preserves the previous encoding behavior, which depends on the
        underlying parser library (e.g., the parser library will try to use
        the encoding provided by the document).

    decimal : str, default '.'
        Character to recognize as decimal point (e.g. use ',' for European
        data).

    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.

    na_values : iterable, default None
        Custom NA values.

    keep_default_na : bool, default True
        If na_values are specified and keep_default_na is False the default NaN
        values are overridden, otherwise they're appended to.

    displayed_only : bool, default True
        Whether elements with "display: none" should be parsed.

    extract_links : {{None, "all", "header", "body", "footer"}}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    {storage_options}

        .. versionadded:: 2.1.0

    Returns
    -------
    dfs
        A list of DataFrames.

    See Also
    --------
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Notes
    -----
    Before using this function you should read the :ref:`gotchas about the
    HTML parsing libraries <io.html.gotchas>`.

    Expect to do some cleanup after you call this function. For example, you
    might need to manually assign column names if the column names are
    converted to NaN when you pass the `header=0` argument. We try to assume as
    little as possible about the structure of the table and push the
    idiosyncrasies of the HTML contained in the table to the user.

    This function searches for ``<table>`` elements and only for ``<tr>``
    and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
    element in the table. ``<td>`` stands for "table data". This function
    attempts to properly handle ``colspan`` and ``rowspan`` attributes.
    If the function has a ``<thead>`` argument, it is used to construct
    the header, otherwise the function attempts to find the header within
    the body (by putting rows with only ``<th>`` elements into the header).

    Similar to :func:`~read_csv` the `header` argument is applied
    **after** `skiprows` is applied.

    This function will *always* return a list of :class:`DataFrame` *or*
    it will fail, e.g., it will *not* return an empty list.

    Examples
    --------
    See the :ref:`read_html documentation in the IO section of the docs
    <io.read_html>` for some examples of reading in HTML tables.
    """
    # Type check here. We don't want to parse only to fail because of an
    # invalid value of an integer skiprows.
    if isinstance(skiprows, numbers.Integral) and skiprows < 0:
        raise ValueError(
            "cannot skip rows starting from the end of the "
            "data (you passed a negative value)"
        )
    if extract_links not in [None, "header", "footer", "body", "all"]:
        raise ValueError(
            "`extract_links` must be one of "
            '{None, "header", "footer", "body", "all"}, got '
            f'"{extract_links}"'
        )

    validate_header_arg(header)
    check_dtype_backend(dtype_backend)

    # Turn path-like objects into plain strings; other inputs pass through.
    io = stringify_path(io)

    # Everything that is not consumed by `_parse` itself is forwarded to the
    # per-table frame construction via **kwargs.
    return _parse(
        flavor=flavor,
        io=io,
        match=match,
        header=header,
        index_col=index_col,
        skiprows=skiprows,
        parse_dates=parse_dates,
        thousands=thousands,
        attrs=attrs,
        encoding=encoding,
        decimal=decimal,
        converters=converters,
        na_values=na_values,
        keep_default_na=keep_default_na,
        displayed_only=displayed_only,
        extract_links=extract_links,
        dtype_backend=dtype_backend,
        storage_options=storage_options,
    )
+ +mod document; +mod range; + +pub use document::Document; +pub(crate) use document::DocumentVersion; +use lsp_types::PositionEncodingKind; +pub(crate) use range::{RangeExt, ToRangeExt}; + +/// A convenient enumeration for supported text encodings. Can be converted to [`lsp_types::PositionEncodingKind`]. +// Please maintain the order from least to greatest priority for the derived `Ord` impl. +#[derive(Default, Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub enum PositionEncoding { + /// UTF 16 is the encoding supported by all LSP clients. + #[default] + UTF16, + + /// Second choice because UTF32 uses a fixed 4 byte encoding for each character (makes conversion relatively easy) + UTF32, + + /// Ruff's preferred encoding + UTF8, +} + +impl From for lsp_types::PositionEncodingKind { + fn from(value: PositionEncoding) -> Self { + match value { + PositionEncoding::UTF8 => lsp_types::PositionEncodingKind::UTF8, + PositionEncoding::UTF16 => lsp_types::PositionEncodingKind::UTF16, + PositionEncoding::UTF32 => lsp_types::PositionEncodingKind::UTF32, + } + } +} + +impl TryFrom<&lsp_types::PositionEncodingKind> for PositionEncoding { + type Error = (); + + fn try_from(value: &PositionEncodingKind) -> Result { + Ok(if value == &PositionEncodingKind::UTF8 { + PositionEncoding::UTF8 + } else if value == &PositionEncodingKind::UTF16 { + PositionEncoding::UTF16 + } else if value == &PositionEncodingKind::UTF32 { + PositionEncoding::UTF32 + } else { + return Err(()); + }) + } +} diff --git a/crates/ruff_server/src/edit/document.rs b/crates/ruff_server/src/edit/document.rs new file mode 100644 index 0000000000000..bd6c49be3eece --- /dev/null +++ b/crates/ruff_server/src/edit/document.rs @@ -0,0 +1,123 @@ +use lsp_types::TextDocumentContentChangeEvent; +use ruff_source_file::LineIndex; + +use crate::PositionEncoding; + +use super::RangeExt; + +pub(crate) type DocumentVersion = i32; + +/// The state for an individual document in the server. 
Stays up-to-date +/// with changes made by the user, including unsaved changes. +#[derive(Debug, Clone)] +pub struct Document { + /// The string contents of the document. + contents: String, + /// A computed line index for the document. This should always reflect + /// the current version of `contents`. Using a function like [`Self::modify`] + /// will re-calculate the line index automatically when the `contents` value is updated. + index: LineIndex, + /// The latest version of the document, set by the LSP client. The server will panic in + /// debug mode if we attempt to update the document with an 'older' version. + version: DocumentVersion, +} + +impl Document { + pub fn new(contents: String, version: DocumentVersion) -> Self { + let index = LineIndex::from_source_text(&contents); + Self { + contents, + index, + version, + } + } + + pub fn contents(&self) -> &str { + &self.contents + } + + pub fn index(&self) -> &LineIndex { + &self.index + } + + pub fn version(&self) -> DocumentVersion { + self.version + } + + pub fn apply_changes( + &mut self, + changes: Vec, + new_version: DocumentVersion, + encoding: PositionEncoding, + ) { + if let [lsp_types::TextDocumentContentChangeEvent { + range: None, text, .. + }] = changes.as_slice() + { + tracing::debug!("Fast path - replacing entire document"); + self.modify(|contents, version| { + *contents = text.clone(); + *version = new_version; + }); + return; + } + + let old_contents = self.contents().to_string(); + let mut new_contents = self.contents().to_string(); + let mut active_index = self.index().clone(); + + for TextDocumentContentChangeEvent { + range, + text: change, + .. 
+ } in changes + { + if let Some(range) = range { + let range = range.to_text_range(&new_contents, &active_index, encoding); + + new_contents.replace_range( + usize::from(range.start())..usize::from(range.end()), + &change, + ); + } else { + new_contents = change; + } + + if new_contents != old_contents { + active_index = LineIndex::from_source_text(&new_contents); + } + } + + self.modify_with_manual_index(|contents, version, index| { + if contents != &new_contents { + *index = active_index; + } + *contents = new_contents; + *version = new_version; + }); + } + + pub fn update_version(&mut self, new_version: DocumentVersion) { + self.modify_with_manual_index(|_, version, _| { + *version = new_version; + }); + } + + // A private function for modifying the document's internal state + fn modify(&mut self, func: impl FnOnce(&mut String, &mut DocumentVersion)) { + self.modify_with_manual_index(|c, v, i| { + func(c, v); + *i = LineIndex::from_source_text(c); + }); + } + + // A private function for overriding how we update the line index by default. 
+ fn modify_with_manual_index( + &mut self, + func: impl FnOnce(&mut String, &mut DocumentVersion, &mut LineIndex), + ) { + let old_version = self.version; + func(&mut self.contents, &mut self.version, &mut self.index); + debug_assert!(self.version >= old_version); + } +} diff --git a/crates/ruff_server/src/edit/range.rs b/crates/ruff_server/src/edit/range.rs new file mode 100644 index 0000000000000..c26326e2ac74b --- /dev/null +++ b/crates/ruff_server/src/edit/range.rs @@ -0,0 +1,153 @@ +use super::PositionEncoding; +use lsp_types as types; +use ruff_source_file::OneIndexed; +use ruff_source_file::{LineIndex, SourceLocation}; +use ruff_text_size::{TextRange, TextSize}; + +pub(crate) trait RangeExt { + fn to_text_range(&self, text: &str, index: &LineIndex, encoding: PositionEncoding) + -> TextRange; +} + +pub(crate) trait ToRangeExt { + fn to_range(&self, text: &str, index: &LineIndex, encoding: PositionEncoding) -> types::Range; +} + +fn u32_index_to_usize(index: u32) -> usize { + usize::try_from(index).expect("u32 fits in usize") +} + +impl RangeExt for lsp_types::Range { + fn to_text_range( + &self, + text: &str, + index: &LineIndex, + encoding: PositionEncoding, + ) -> TextRange { + let start_line = index.line_range( + OneIndexed::from_zero_indexed(u32_index_to_usize(self.start.line)), + text, + ); + let end_line = index.line_range( + OneIndexed::from_zero_indexed(u32_index_to_usize(self.end.line)), + text, + ); + + let (start_column_offset, end_column_offset) = match encoding { + PositionEncoding::UTF8 => ( + TextSize::new(self.start.character), + TextSize::new(self.end.character), + ), + + PositionEncoding::UTF16 => { + // Fast path for ASCII only documents + if index.is_ascii() { + ( + TextSize::new(self.start.character), + TextSize::new(self.end.character), + ) + } else { + // UTF16 encodes characters either as one or two 16 bit words. 
+ // The position in `range` is the 16-bit word offset from the start of the line (and not the character offset) + // UTF-16 with a text that may use variable-length characters. + ( + utf8_column_offset(self.start.character, &text[start_line]), + utf8_column_offset(self.end.character, &text[end_line]), + ) + } + } + PositionEncoding::UTF32 => { + // UTF-32 uses 4 bytes for each character. Meaning, the position in range is a character offset. + return TextRange::new( + index.offset( + OneIndexed::from_zero_indexed(u32_index_to_usize(self.start.line)), + OneIndexed::from_zero_indexed(u32_index_to_usize(self.start.character)), + text, + ), + index.offset( + OneIndexed::from_zero_indexed(u32_index_to_usize(self.end.line)), + OneIndexed::from_zero_indexed(u32_index_to_usize(self.end.character)), + text, + ), + ); + } + }; + + TextRange::new( + start_line.start() + start_column_offset.clamp(TextSize::new(0), start_line.end()), + end_line.start() + end_column_offset.clamp(TextSize::new(0), end_line.end()), + ) + } +} + +impl ToRangeExt for TextRange { + fn to_range(&self, text: &str, index: &LineIndex, encoding: PositionEncoding) -> types::Range { + types::Range { + start: offset_to_position(self.start(), text, index, encoding), + end: offset_to_position(self.end(), text, index, encoding), + } + } +} + +/// Converts a UTF-16 code unit offset for a given line into a UTF-8 column number. +fn utf8_column_offset(utf16_code_unit_offset: u32, line: &str) -> TextSize { + let mut utf8_code_unit_offset = TextSize::new(0); + + let mut i = 0u32; + + for c in line.chars() { + if i >= utf16_code_unit_offset { + break; + } + + // Count characters encoded as two 16 bit words as 2 characters. 
+ { + utf8_code_unit_offset += + TextSize::new(u32::try_from(c.len_utf8()).expect("utf8 len always <=4")); + i += u32::try_from(c.len_utf16()).expect("utf16 len always <=2"); + } + } + + utf8_code_unit_offset +} + +fn offset_to_position( + offset: TextSize, + text: &str, + index: &LineIndex, + encoding: PositionEncoding, +) -> types::Position { + let location = match encoding { + PositionEncoding::UTF8 => { + let row = index.line_index(offset); + let column = offset - index.line_start(row, text); + + SourceLocation { + column: OneIndexed::from_zero_indexed(column.to_usize()), + row, + } + } + PositionEncoding::UTF16 => { + let row = index.line_index(offset); + + let column = if index.is_ascii() { + (offset - index.line_start(row, text)).to_usize() + } else { + let up_to_line = &text[TextRange::new(index.line_start(row, text), offset)]; + up_to_line.encode_utf16().count() + }; + + SourceLocation { + column: OneIndexed::from_zero_indexed(column), + row, + } + } + PositionEncoding::UTF32 => index.source_location(offset, text), + }; + + types::Position { + line: u32::try_from(location.row.to_zero_indexed()).expect("row usize fits in u32"), + character: u32::try_from(location.column.to_zero_indexed()) + .expect("character usize fits in u32"), + } +} diff --git a/crates/ruff_server/src/format.rs b/crates/ruff_server/src/format.rs new file mode 100644 index 0000000000000..d69c76dd2d931 --- /dev/null +++ b/crates/ruff_server/src/format.rs @@ -0,0 +1,33 @@ +use ruff_formatter::PrintedRange; +use ruff_python_formatter::format_module_source; +use ruff_text_size::TextRange; +use ruff_workspace::FormatterSettings; + +use crate::edit::Document; + +pub(crate) fn format( + document: &Document, + formatter_settings: &FormatterSettings, +) -> crate::Result { + // TODO(jane): support Jupyter Notebook + let format_options = formatter_settings + .to_format_options(ruff_python_ast::PySourceType::Python, document.contents()); + let formatted = format_module_source(document.contents(), 
format_options)?; + Ok(formatted.into_code()) +} + +pub(crate) fn format_range( + document: &Document, + formatter_settings: &FormatterSettings, + range: TextRange, +) -> crate::Result { + // TODO(jane): support Jupyter Notebook + let format_options = formatter_settings + .to_format_options(ruff_python_ast::PySourceType::Python, document.contents()); + + Ok(ruff_python_formatter::format_range( + document.contents(), + range, + format_options, + )?) +} diff --git a/crates/ruff_server/src/lib.rs b/crates/ruff_server/src/lib.rs new file mode 100644 index 0000000000000..b4d50d7523344 --- /dev/null +++ b/crates/ruff_server/src/lib.rs @@ -0,0 +1,21 @@ +//! ## The Ruff Language Server + +pub use edit::{Document, PositionEncoding}; +pub use server::Server; + +mod edit; +mod format; +mod lint; +mod server; +mod session; + +pub(crate) const SERVER_NAME: &str = "ruff"; +pub(crate) const DIAGNOSTIC_NAME: &str = "Ruff"; + +/// A common result type used in most cases where a +/// result type is needed. +pub(crate) type Result = anyhow::Result; + +pub(crate) fn version() -> &'static str { + ruff_linter::VERSION +} diff --git a/crates/ruff_server/src/lint.rs b/crates/ruff_server/src/lint.rs new file mode 100644 index 0000000000000..109e36fb285ba --- /dev/null +++ b/crates/ruff_server/src/lint.rs @@ -0,0 +1,120 @@ +//! 
Access to the Ruff linting API for the LSP + +use std::path::Path; + +use ruff_diagnostics::{Applicability, Diagnostic, DiagnosticKind, Fix}; +use ruff_linter::{ + directives::{extract_directives, Flags}, + linter::{check_path, LinterResult, TokenSource}, + registry::AsRule, + settings::{flags, LinterSettings}, + source_kind::SourceKind, +}; +use ruff_python_ast::PySourceType; +use ruff_python_codegen::Stylist; +use ruff_python_index::Indexer; +use ruff_python_parser::lexer::LexResult; +use ruff_python_parser::AsMode; +use ruff_source_file::Locator; +use serde::{Deserialize, Serialize}; + +use crate::{edit::ToRangeExt, PositionEncoding, DIAGNOSTIC_NAME}; + +#[derive(Serialize, Deserialize)] +pub(crate) struct DiagnosticFix { + pub(crate) kind: DiagnosticKind, + pub(crate) fix: Fix, +} + +pub(crate) fn check( + document: &crate::edit::Document, + linter_settings: &LinterSettings, + encoding: PositionEncoding, +) -> Vec { + let contents = document.contents(); + let index = document.index().clone(); + + let source_type = PySourceType::default(); + + // TODO(jane): Support Jupyter Notebooks + let source_kind = SourceKind::Python(contents.to_string()); + + // Tokenize once. + let tokens: Vec = ruff_python_parser::tokenize(contents, source_type.as_mode()); + + // Map row and column locations to byte slices (lazily). + let locator = Locator::with_index(contents, index); + + // Detect the current code style (lazily). + let stylist = Stylist::from_tokens(&tokens, &locator); + + // Extra indices from the code. + let indexer = Indexer::from_tokens(&tokens, &locator); + + // Extract the `# noqa` and `# isort: skip` directives from the source. + let directives = extract_directives(&tokens, Flags::empty(), &locator, &indexer); + + // Generate checks. + let LinterResult { + data: (diagnostics, _imports), + .. 
+ } = check_path( + Path::new(""), + None, + &locator, + &stylist, + &indexer, + &directives, + linter_settings, + flags::Noqa::Enabled, + &source_kind, + source_type, + TokenSource::Tokens(tokens), + ); + + diagnostics + .into_iter() + .map(|diagnostic| to_lsp_diagnostic(diagnostic, document, encoding)) + .collect() +} + +fn to_lsp_diagnostic( + diagnostic: Diagnostic, + document: &crate::edit::Document, + encoding: PositionEncoding, +) -> lsp_types::Diagnostic { + let Diagnostic { + kind, range, fix, .. + } = diagnostic; + + let rule = kind.rule(); + + let data = fix.and_then(|fix| { + fix.applies(Applicability::Unsafe) + .then(|| { + serde_json::to_value(&DiagnosticFix { + kind: kind.clone(), + fix, + }) + .ok() + }) + .flatten() + }); + lsp_types::Diagnostic { + range: range.to_range(document.contents(), document.index(), encoding), + severity: Some(lsp_types::DiagnosticSeverity::ERROR), + code: Some(lsp_types::NumberOrString::String( + rule.noqa_code().to_string(), + )), + code_description: rule.url().and_then(|url| { + Some(lsp_types::CodeDescription { + href: lsp_types::Url::parse(&url).ok()?, + }) + }), + source: Some(DIAGNOSTIC_NAME.into()), + message: kind.body, + related_information: None, + tags: None, + data, + } +} diff --git a/crates/ruff_server/src/server.rs b/crates/ruff_server/src/server.rs new file mode 100644 index 0000000000000..bf88192e457b2 --- /dev/null +++ b/crates/ruff_server/src/server.rs @@ -0,0 +1,158 @@ +//! Scheduling, I/O, and API endpoints. 
+ +use anyhow::anyhow; +use lsp::Connection; +use lsp_server as lsp; +use lsp_types as types; +use types::ClientCapabilities; +use types::CodeActionKind; +use types::CodeActionOptions; +use types::DiagnosticOptions; +use types::OneOf; +use types::TextDocumentSyncCapability; +use types::TextDocumentSyncKind; +use types::TextDocumentSyncOptions; +use types::WorkDoneProgressOptions; +use types::WorkspaceFoldersServerCapabilities; + +use self::schedule::event_loop_thread; +use crate::session::Session; +use crate::PositionEncoding; + +mod api; +mod client; +mod schedule; + +pub(crate) type Result = std::result::Result; + +pub struct Server { + conn: lsp::Connection, + threads: lsp::IoThreads, + session: Session, +} + +impl Server { + pub fn new() -> crate::Result { + let (conn, threads) = lsp::Connection::stdio(); + + let (id, params) = conn.initialize_start()?; + + let init_params: types::InitializeParams = serde_json::from_value(params)?; + + let client_capabilities = init_params.capabilities; + let server_capabilities = Self::server_capabilities(&client_capabilities); + + let workspaces = init_params + .workspace_folders + .map(|folders| folders.into_iter().map(|folder| folder.uri).collect()) + .or_else(|| init_params.root_uri.map(|u| vec![u])) + .ok_or_else(|| { + anyhow!("No workspace or root URI was given in the LSP initialization parameters. 
The server cannot start.") + })?; + + let initialize_data = serde_json::json!({ + "capabilities": server_capabilities, + "serverInfo": { + "name": crate::SERVER_NAME, + "version": crate::version() + } + }); + + conn.initialize_finish(id, initialize_data)?; + + Ok(Self { + conn, + threads, + session: Session::new(&server_capabilities, &workspaces)?, + }) + } + + pub fn run(self) -> crate::Result<()> { + let result = event_loop_thread(move || Self::event_loop(&self.conn, self.session))?.join(); + self.threads.join()?; + result + } + + fn event_loop(connection: &Connection, session: Session) -> crate::Result<()> { + // TODO(jane): Make thread count configurable + let mut scheduler = schedule::Scheduler::new(session, 4, &connection.sender); + for msg in &connection.receiver { + let task = match msg { + lsp::Message::Request(req) => { + if connection.handle_shutdown(&req)? { + return Ok(()); + } + api::request(req) + } + lsp::Message::Notification(notification) => api::notification(notification), + lsp::Message::Response(response) => { + tracing::error!( + "Expected request or notification, got response instead: {response:?}" + ); + continue; + } + }; + scheduler.dispatch(task); + } + Ok(()) + } + + fn server_capabilities(client_capabilities: &ClientCapabilities) -> types::ServerCapabilities { + let position_encoding = client_capabilities + .general + .as_ref() + .and_then(|general_capabilities| general_capabilities.position_encodings.as_ref()) + .and_then(|encodings| { + encodings + .iter() + .filter_map(|encoding| PositionEncoding::try_from(encoding).ok()) + .max() // this selects the highest priority position encoding + }) + .unwrap_or_default(); + types::ServerCapabilities { + position_encoding: Some(position_encoding.into()), + code_action_provider: Some(types::CodeActionProviderCapability::Options( + CodeActionOptions { + code_action_kinds: Some(vec![ + CodeActionKind::QUICKFIX, + CodeActionKind::SOURCE_ORGANIZE_IMPORTS, + ]), + work_done_progress_options: 
WorkDoneProgressOptions { + work_done_progress: Some(true), + }, + resolve_provider: Some(false), + }, + )), + workspace: Some(types::WorkspaceServerCapabilities { + workspace_folders: Some(WorkspaceFoldersServerCapabilities { + supported: Some(true), + change_notifications: Some(OneOf::Left(true)), + }), + file_operations: None, + }), + document_formatting_provider: Some(OneOf::Left(true)), + document_range_formatting_provider: Some(OneOf::Left(true)), + diagnostic_provider: Some(types::DiagnosticServerCapabilities::Options( + DiagnosticOptions { + identifier: Some(crate::DIAGNOSTIC_NAME.into()), + // multi-file analysis could change this + inter_file_dependencies: false, + workspace_diagnostics: false, + work_done_progress_options: WorkDoneProgressOptions { + work_done_progress: Some(true), + }, + }, + )), + text_document_sync: Some(TextDocumentSyncCapability::Options( + TextDocumentSyncOptions { + open_close: Some(true), + change: Some(TextDocumentSyncKind::INCREMENTAL), + will_save: Some(false), + will_save_wait_until: Some(false), + ..Default::default() + }, + )), + ..Default::default() + } + } +} diff --git a/crates/ruff_server/src/server/api.rs b/crates/ruff_server/src/server/api.rs new file mode 100644 index 0000000000000..dd04997898405 --- /dev/null +++ b/crates/ruff_server/src/server/api.rs @@ -0,0 +1,244 @@ +use crate::{server::schedule::Task, session::Session}; +use lsp_server as server; + +mod notifications; +mod requests; +mod traits; + +use notifications as notification; +use requests as request; + +use self::traits::{NotificationHandler, RequestHandler}; + +use super::{client::Responder, schedule::BackgroundSchedule, Result}; + +/// Defines the `document_url` method for implementors of [`traits::Notification`] and [`traits::Request`], +/// given the parameter type used by the implementor. +macro_rules! 
define_document_url { + ($params:ident: &$p:ty) => { + fn document_url($params: &$p) -> &lsp_types::Url { + &$params.text_document.uri + } + }; +} + +use define_document_url; + +pub(super) fn request<'a>(req: server::Request) -> Task<'a> { + let id = req.id.clone(); + + match req.method.as_str() { + request::CodeAction::METHOD => background_request_task::( + req, + BackgroundSchedule::LatencySensitive, + ), + request::DocumentDiagnostic::METHOD => { + background_request_task::( + req, + BackgroundSchedule::LatencySensitive, + ) + } + request::Format::METHOD => { + background_request_task::(req, BackgroundSchedule::Fmt) + } + request::FormatRange::METHOD => { + background_request_task::(req, BackgroundSchedule::Fmt) + } + method => { + tracing::warn!("Received request {method} which does not have a handler"); + return Task::nothing(); + } + } + .unwrap_or_else(|err| { + tracing::error!("Encountered error when routing request with ID {id}: {err}"); + let result: Result<()> = Err(err); + Task::immediate(id, result) + }) +} + +pub(super) fn notification<'a>(notif: server::Notification) -> Task<'a> { + match notif.method.as_str() { + notification::Cancel::METHOD => local_notification_task::(notif), + notification::DidChange::METHOD => { + local_notification_task::(notif) + } + notification::DidChangeConfiguration::METHOD => { + local_notification_task::(notif) + } + notification::DidChangeWorkspace::METHOD => { + local_notification_task::(notif) + } + notification::DidClose::METHOD => local_notification_task::(notif), + notification::DidOpen::METHOD => local_notification_task::(notif), + method => { + tracing::warn!("Received notification {method} which does not have a handler."); + return Task::nothing(); + } + } + .unwrap_or_else(|err| { + tracing::error!("Encountered error when routing notification: {err}"); + Task::nothing() + }) +} + +#[allow(dead_code)] +fn local_request_task<'a, R: traits::SyncRequestHandler>( + req: server::Request, +) -> super::Result> { + let 
(id, params) = cast_request::(req)?; + Ok(Task::local(|session, notifier, responder| { + let result = R::run(session, notifier, params); + respond::(id, result, &responder); + })) +} + +fn background_request_task<'a, R: traits::BackgroundDocumentRequestHandler>( + req: server::Request, + schedule: BackgroundSchedule, +) -> super::Result> { + let (id, params) = cast_request::(req)?; + Ok(Task::background(schedule, move |session: &Session| { + // TODO(jane): we should log an error if we can't take a snapshot. + let Some(snapshot) = session.take_snapshot(R::document_url(¶ms)) else { + return Box::new(|_, _| {}); + }; + Box::new(move |notifier, responder| { + let result = R::run_with_snapshot(snapshot, notifier, params); + respond::(id, result, &responder); + }) + })) +} + +fn local_notification_task<'a, N: traits::SyncNotificationHandler>( + notif: server::Notification, +) -> super::Result> { + let (id, params) = cast_notification::(notif)?; + Ok(Task::local(move |session, notifier, _| { + if let Err(err) = N::run(session, notifier, params) { + tracing::error!("An error occurred while running {id}: {err}"); + } + })) +} + +#[allow(dead_code)] +fn background_notification_thread<'a, N: traits::BackgroundDocumentNotificationHandler>( + req: server::Notification, + schedule: BackgroundSchedule, +) -> super::Result> { + let (id, params) = cast_notification::(req)?; + Ok(Task::background(schedule, move |session: &Session| { + // TODO(jane): we should log an error if we can't take a snapshot. + let Some(snapshot) = session.take_snapshot(N::document_url(¶ms)) else { + return Box::new(|_, _| {}); + }; + Box::new(move |notifier, _| { + if let Err(err) = N::run_with_snapshot(snapshot, notifier, params) { + tracing::error!("An error occurred while running {id}: {err}"); + } + }) + })) +} + +/// Tries to cast a serialized request from the server into +/// a parameter type for a specific request handler. 
+/// It is *highly* recommended to not override this function in your +/// implementation. +fn cast_request( + request: server::Request, +) -> super::Result<( + server::RequestId, + <::RequestType as lsp_types::request::Request>::Params, +)> +where + Req: traits::RequestHandler, +{ + request + .extract(Req::METHOD) + .map_err(|err| match err { + json_err @ server::ExtractError::JsonError { .. } => { + anyhow::anyhow!("JSON parsing failure:\n{json_err}") + } + server::ExtractError::MethodMismatch(_) => { + unreachable!("A method mismatch should not be possible here unless you've used a different handler (`Req`) \ + than the one whose method name was matched against earlier.") + } + }) + .with_failure_code(server::ErrorCode::InternalError) +} + +/// Sends back a response to the server using a [`Responder`]. +fn respond( + id: server::RequestId, + result: crate::server::Result< + <::RequestType as lsp_types::request::Request>::Result, + >, + responder: &Responder, +) where + Req: traits::RequestHandler, +{ + if let Err(err) = responder.respond(id, result) { + tracing::error!("Failed to send response: {err}"); + } +} + +/// Tries to cast a serialized request from the server into +/// a parameter type for a specific request handler. +fn cast_notification( + notification: server::Notification, +) -> super::Result< + ( + &'static str, + <::NotificationType as lsp_types::notification::Notification>::Params, +)> where N: traits::NotificationHandler{ + Ok(( + N::METHOD, + notification + .extract(N::METHOD) + .map_err(|err| match err { + json_err @ server::ExtractError::JsonError { .. 
} => { + anyhow::anyhow!("JSON parsing failure:\n{json_err}") + } + server::ExtractError::MethodMismatch(_) => { + unreachable!("A method mismatch should not be possible here unless you've used a different handler (`N`) \ + than the one whose method name was matched against earlier.") + } + }) + .with_failure_code(server::ErrorCode::InternalError)?, + )) +} + +pub(crate) struct Error { + pub(crate) code: server::ErrorCode, + pub(crate) error: anyhow::Error, +} + +/// A trait to convert result types into the server result type, [`super::Result`]. +trait LSPResult { + fn with_failure_code(self, code: server::ErrorCode) -> super::Result; +} + +impl> LSPResult for core::result::Result { + fn with_failure_code(self, code: server::ErrorCode) -> super::Result { + self.map_err(|err| Error::new(err.into(), code)) + } +} + +impl Error { + pub(crate) fn new(err: anyhow::Error, code: server::ErrorCode) -> Self { + Self { code, error: err } + } +} + +// Right now, we treat the error code as invisible data that won't +// be printed. 
+impl std::fmt::Debug for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.error.fmt(f) + } +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.error.fmt(f) + } +} diff --git a/crates/ruff_server/src/server/api/notifications.rs b/crates/ruff_server/src/server/api/notifications.rs new file mode 100644 index 0000000000000..bb7b52bc70422 --- /dev/null +++ b/crates/ruff_server/src/server/api/notifications.rs @@ -0,0 +1,14 @@ +mod cancel; +mod did_change; +mod did_change_configuration; +mod did_change_workspace; +mod did_close; +mod did_open; + +use super::traits::{NotificationHandler, SyncNotificationHandler}; +pub(super) use cancel::Cancel; +pub(super) use did_change::DidChange; +pub(super) use did_change_configuration::DidChangeConfiguration; +pub(super) use did_change_workspace::DidChangeWorkspace; +pub(super) use did_close::DidClose; +pub(super) use did_open::DidOpen; diff --git a/crates/ruff_server/src/server/api/notifications/cancel.rs b/crates/ruff_server/src/server/api/notifications/cancel.rs new file mode 100644 index 0000000000000..bd7ecd943ea86 --- /dev/null +++ b/crates/ruff_server/src/server/api/notifications/cancel.rs @@ -0,0 +1,23 @@ +use crate::server::client::Notifier; +use crate::server::Result; +use crate::session::Session; +use lsp_types as types; +use lsp_types::notification as notif; + +pub(crate) struct Cancel; + +impl super::NotificationHandler for Cancel { + type NotificationType = notif::Cancel; +} + +impl super::SyncNotificationHandler for Cancel { + #[tracing::instrument(skip_all)] + fn run( + _session: &mut Session, + _notifier: Notifier, + _params: types::CancelParams, + ) -> Result<()> { + // TODO(jane): Handle this once we have task cancellation in the scheduler. 
+ Ok(()) + } +} diff --git a/crates/ruff_server/src/server/api/notifications/did_change.rs b/crates/ruff_server/src/server/api/notifications/did_change.rs new file mode 100644 index 0000000000000..7b1fada792441 --- /dev/null +++ b/crates/ruff_server/src/server/api/notifications/did_change.rs @@ -0,0 +1,44 @@ +use crate::server::api::LSPResult; +use crate::server::client::Notifier; +use crate::server::Result; +use crate::session::Session; +use lsp_types as types; +use lsp_types::notification as notif; + +pub(crate) struct DidChange; + +impl super::NotificationHandler for DidChange { + type NotificationType = notif::DidChangeTextDocument; +} + +impl super::SyncNotificationHandler for DidChange { + #[tracing::instrument(skip_all, fields(file=%uri))] + fn run( + session: &mut Session, + _notifier: Notifier, + types::DidChangeTextDocumentParams { + text_document: + types::VersionedTextDocumentIdentifier { + uri, + version: new_version, + }, + content_changes, + }: types::DidChangeTextDocumentParams, + ) -> Result<()> { + let encoding = session.encoding(); + let document = session + .document_controller(&uri) + .with_failure_code(lsp_server::ErrorCode::InvalidParams)?; + + if content_changes.is_empty() { + document.make_mut().update_version(new_version); + return Ok(()); + } + + document + .make_mut() + .apply_changes(content_changes, new_version, encoding); + + Ok(()) + } +} diff --git a/crates/ruff_server/src/server/api/notifications/did_change_configuration.rs b/crates/ruff_server/src/server/api/notifications/did_change_configuration.rs new file mode 100644 index 0000000000000..c1998b777b32e --- /dev/null +++ b/crates/ruff_server/src/server/api/notifications/did_change_configuration.rs @@ -0,0 +1,22 @@ +use crate::server::client::Notifier; +use crate::server::Result; +use crate::session::Session; +use lsp_types as types; +use lsp_types::notification as notif; + +pub(crate) struct DidChangeConfiguration; + +impl super::NotificationHandler for DidChangeConfiguration { + 
type NotificationType = notif::DidChangeConfiguration; +} + +impl super::SyncNotificationHandler for DidChangeConfiguration { + fn run( + _session: &mut Session, + _notifier: Notifier, + _params: types::DidChangeConfigurationParams, + ) -> Result<()> { + // TODO(jane): get this wired up after the pre-release + Ok(()) + } +} diff --git a/crates/ruff_server/src/server/api/notifications/did_change_workspace.rs b/crates/ruff_server/src/server/api/notifications/did_change_workspace.rs new file mode 100644 index 0000000000000..bb5226ef5075a --- /dev/null +++ b/crates/ruff_server/src/server/api/notifications/did_change_workspace.rs @@ -0,0 +1,32 @@ +use crate::server::api::LSPResult; +use crate::server::client::Notifier; +use crate::server::Result; +use crate::session::Session; +use lsp_types as types; +use lsp_types::notification as notif; + +pub(crate) struct DidChangeWorkspace; + +impl super::NotificationHandler for DidChangeWorkspace { + type NotificationType = notif::DidChangeWorkspaceFolders; +} + +impl super::SyncNotificationHandler for DidChangeWorkspace { + fn run( + session: &mut Session, + _notifier: Notifier, + params: types::DidChangeWorkspaceFoldersParams, + ) -> Result<()> { + for new in params.event.added { + session + .open_workspace_folder(&new.uri) + .with_failure_code(lsp_server::ErrorCode::InvalidParams)?; + } + for removed in params.event.removed { + session + .close_workspace_folder(&removed.uri) + .with_failure_code(lsp_server::ErrorCode::InvalidParams)?; + } + Ok(()) + } +} diff --git a/crates/ruff_server/src/server/api/notifications/did_close.rs b/crates/ruff_server/src/server/api/notifications/did_close.rs new file mode 100644 index 0000000000000..d8a1ac29d3158 --- /dev/null +++ b/crates/ruff_server/src/server/api/notifications/did_close.rs @@ -0,0 +1,27 @@ +use crate::server::api::LSPResult; +use crate::server::client::Notifier; +use crate::server::Result; +use crate::session::Session; +use lsp_types as types; +use lsp_types::notification as 
notif; + +pub(crate) struct DidClose; + +impl super::NotificationHandler for DidClose { + type NotificationType = notif::DidCloseTextDocument; +} + +impl super::SyncNotificationHandler for DidClose { + #[tracing::instrument(skip_all, fields(file=%uri))] + fn run( + session: &mut Session, + _notifier: Notifier, + types::DidCloseTextDocumentParams { + text_document: types::TextDocumentIdentifier { uri }, + }: types::DidCloseTextDocumentParams, + ) -> Result<()> { + session + .close_document(&uri) + .with_failure_code(lsp_server::ErrorCode::InternalError) + } +} diff --git a/crates/ruff_server/src/server/api/notifications/did_open.rs b/crates/ruff_server/src/server/api/notifications/did_open.rs new file mode 100644 index 0000000000000..ff76a80875d45 --- /dev/null +++ b/crates/ruff_server/src/server/api/notifications/did_open.rs @@ -0,0 +1,31 @@ +use crate::server::client::Notifier; +use crate::server::Result; +use crate::session::Session; +use lsp_types as types; +use lsp_types::notification as notif; + +pub(crate) struct DidOpen; + +impl super::NotificationHandler for DidOpen { + type NotificationType = notif::DidOpenTextDocument; +} + +impl super::SyncNotificationHandler for DidOpen { + #[tracing::instrument(skip_all, fields(file=%url))] + fn run( + session: &mut Session, + _notifier: Notifier, + types::DidOpenTextDocumentParams { + text_document: + types::TextDocumentItem { + uri: ref url, + text, + version, + .. 
+ }, + }: types::DidOpenTextDocumentParams, + ) -> Result<()> { + session.open_document(url, text, version); + Ok(()) + } +} diff --git a/crates/ruff_server/src/server/api/requests.rs b/crates/ruff_server/src/server/api/requests.rs new file mode 100644 index 0000000000000..d29a60a660d49 --- /dev/null +++ b/crates/ruff_server/src/server/api/requests.rs @@ -0,0 +1,15 @@ +mod code_action; +mod diagnostic; +mod format; +mod format_range; + +use super::{ + define_document_url, + traits::{BackgroundDocumentRequestHandler, RequestHandler}, +}; +pub(super) use code_action::CodeAction; +pub(super) use diagnostic::DocumentDiagnostic; +pub(super) use format::Format; +pub(super) use format_range::FormatRange; + +type FormatResponse = Option>; diff --git a/crates/ruff_server/src/server/api/requests/code_action.rs b/crates/ruff_server/src/server/api/requests/code_action.rs new file mode 100644 index 0000000000000..235b651078f06 --- /dev/null +++ b/crates/ruff_server/src/server/api/requests/code_action.rs @@ -0,0 +1,81 @@ +use crate::edit::ToRangeExt; +use crate::server::api::LSPResult; +use crate::server::{client::Notifier, Result}; +use crate::session::DocumentSnapshot; +use lsp_types::{self as types, request as req}; +use ruff_text_size::Ranged; + +pub(crate) struct CodeAction; + +impl super::RequestHandler for CodeAction { + type RequestType = req::CodeActionRequest; +} + +impl super::BackgroundDocumentRequestHandler for CodeAction { + super::define_document_url!(params: &types::CodeActionParams); + fn run_with_snapshot( + snapshot: DocumentSnapshot, + _notifier: Notifier, + params: types::CodeActionParams, + ) -> Result> { + let document = snapshot.document(); + let url = snapshot.url(); + let encoding = snapshot.encoding(); + let version = document.version(); + let actions: Result> = params + .context + .diagnostics + .into_iter() + .map(|diagnostic| { + let Some(data) = diagnostic.data else { + return Ok(None); + }; + let diagnostic_fix: crate::lint::DiagnosticFix = 
serde_json::from_value(data) + .map_err(|err| anyhow::anyhow!("failed to deserialize diagnostic data: {err}")) + .with_failure_code(lsp_server::ErrorCode::ParseError)?; + let edits = diagnostic_fix + .fix + .edits() + .iter() + .map(|edit| types::TextEdit { + range: edit.range().to_range( + document.contents(), + document.index(), + encoding, + ), + new_text: edit.content().unwrap_or_default().to_string(), + }); + + let changes = vec![types::TextDocumentEdit { + text_document: types::OptionalVersionedTextDocumentIdentifier::new( + url.clone(), + version, + ), + edits: edits.map(types::OneOf::Left).collect(), + }]; + + let title = diagnostic_fix + .kind + .suggestion + .unwrap_or(diagnostic_fix.kind.name); + Ok(Some(types::CodeAction { + title, + kind: Some(types::CodeActionKind::QUICKFIX), + edit: Some(types::WorkspaceEdit { + document_changes: Some(types::DocumentChanges::Edits(changes)), + ..Default::default() + }), + ..Default::default() + })) + }) + .collect(); + + Ok(Some( + actions? 
+ .into_iter() + .flatten() + .map(types::CodeActionOrCommand::CodeAction) + .collect(), + )) + } +} diff --git a/crates/ruff_server/src/server/api/requests/diagnostic.rs b/crates/ruff_server/src/server/api/requests/diagnostic.rs new file mode 100644 index 0000000000000..634f111f86d02 --- /dev/null +++ b/crates/ruff_server/src/server/api/requests/diagnostic.rs @@ -0,0 +1,39 @@ +use crate::server::{client::Notifier, Result}; +use crate::session::DocumentSnapshot; +use lsp_types::{self as types, request as req}; +use types::{ + DocumentDiagnosticReportResult, FullDocumentDiagnosticReport, + RelatedFullDocumentDiagnosticReport, +}; + +pub(crate) struct DocumentDiagnostic; + +impl super::RequestHandler for DocumentDiagnostic { + type RequestType = req::DocumentDiagnosticRequest; +} + +impl super::BackgroundDocumentRequestHandler for DocumentDiagnostic { + super::define_document_url!(params: &types::DocumentDiagnosticParams); + fn run_with_snapshot( + snapshot: DocumentSnapshot, + _notifier: Notifier, + _params: types::DocumentDiagnosticParams, + ) -> Result { + let diagnostics = crate::lint::check( + snapshot.document(), + &snapshot.configuration().linter, + snapshot.encoding(), + ); + + Ok(DocumentDiagnosticReportResult::Report( + types::DocumentDiagnosticReport::Full(RelatedFullDocumentDiagnosticReport { + related_documents: None, + full_document_diagnostic_report: FullDocumentDiagnosticReport { + // TODO(jane): eventually this will be important for caching diagnostic information. 
+ result_id: None, + items: diagnostics, + }, + }), + )) + } +} diff --git a/crates/ruff_server/src/server/api/requests/format.rs b/crates/ruff_server/src/server/api/requests/format.rs new file mode 100644 index 0000000000000..384539ad092fa --- /dev/null +++ b/crates/ruff_server/src/server/api/requests/format.rs @@ -0,0 +1,147 @@ +use crate::edit::ToRangeExt; +use crate::server::api::LSPResult; +use crate::server::{client::Notifier, Result}; +use crate::session::DocumentSnapshot; +use lsp_types::{self as types, request as req}; +use ruff_source_file::LineIndex; +use ruff_text_size::{TextLen, TextRange, TextSize}; +use types::TextEdit; + +pub(crate) struct Format; + +impl super::RequestHandler for Format { + type RequestType = req::Formatting; +} + +impl super::BackgroundDocumentRequestHandler for Format { + super::define_document_url!(params: &types::DocumentFormattingParams); + fn run_with_snapshot( + snapshot: DocumentSnapshot, + _notifier: Notifier, + _params: types::DocumentFormattingParams, + ) -> Result { + let doc = snapshot.document(); + let source = doc.contents(); + let formatted = crate::format::format(doc, &snapshot.configuration().formatter) + .with_failure_code(lsp_server::ErrorCode::InternalError)?; + // fast path - if the code is the same, return early + if formatted == source { + return Ok(None); + } + let formatted_index: LineIndex = LineIndex::from_source_text(&formatted); + + let unformatted_index = doc.index(); + + let Replacement { + source_range: replace_range, + formatted_range: replacement_text_range, + } = Replacement::between( + source, + unformatted_index.line_starts(), + &formatted, + formatted_index.line_starts(), + ); + + Ok(Some(vec![TextEdit { + range: replace_range.to_range(source, unformatted_index, snapshot.encoding()), + new_text: formatted[replacement_text_range].to_owned(), + }])) + } +} + +struct Replacement { + source_range: TextRange, + formatted_range: TextRange, +} + +impl Replacement { + /// Creates a [`Replacement`] 
that describes the `replace_range` of `old_text` to replace + /// with `new_text` sliced by `replacement_text_range`. + fn between( + source: &str, + source_line_starts: &[TextSize], + formatted: &str, + formatted_line_starts: &[TextSize], + ) -> Self { + let mut source_start = TextSize::default(); + let mut formatted_start = TextSize::default(); + let mut source_end = source.text_len(); + let mut formatted_end = formatted.text_len(); + let mut line_iter = source_line_starts + .iter() + .copied() + .zip(formatted_line_starts.iter().copied()); + for (source_line_start, formatted_line_start) in line_iter.by_ref() { + if source_line_start != formatted_line_start + || source[TextRange::new(source_start, source_line_start)] + != formatted[TextRange::new(formatted_start, formatted_line_start)] + { + break; + } + source_start = source_line_start; + formatted_start = formatted_line_start; + } + + let mut line_iter = line_iter.rev(); + + for (old_line_start, new_line_start) in line_iter.by_ref() { + if old_line_start <= source_start + || new_line_start <= formatted_start + || source[TextRange::new(old_line_start, source_end)] + != formatted[TextRange::new(new_line_start, formatted_end)] + { + break; + } + source_end = old_line_start; + formatted_end = new_line_start; + } + + Replacement { + source_range: TextRange::new(source_start, source_end), + formatted_range: TextRange::new(formatted_start, formatted_end), + } + } +} + +#[cfg(test)] +mod tests { + use ruff_source_file::LineIndex; + + use crate::server::api::requests::format::Replacement; + + #[test] + fn find_replacement_range_works() { + let original = r#" + aaaa + bbbb + cccc + dddd + eeee + "#; + let original_index = LineIndex::from_source_text(original); + let new = r#" + bb + cccc + dd + "#; + let new_index = LineIndex::from_source_text(new); + let expected = r#" + bb + cccc + dd + "#; + let replacement = Replacement::between( + original, + original_index.line_starts(), + new, + new_index.line_starts(), + ); + let 
mut test = original.to_string(); + test.replace_range( + replacement.source_range.start().to_usize()..replacement.source_range.end().to_usize(), + &new[replacement.formatted_range], + ); + + assert_eq!(expected, &test); + } +} diff --git a/crates/ruff_server/src/server/api/requests/format_range.rs b/crates/ruff_server/src/server/api/requests/format_range.rs new file mode 100644 index 0000000000000..aef39d971be44 --- /dev/null +++ b/crates/ruff_server/src/server/api/requests/format_range.rs @@ -0,0 +1,34 @@ +use crate::edit::{RangeExt, ToRangeExt}; +use crate::server::api::LSPResult; +use crate::server::{client::Notifier, Result}; +use crate::session::DocumentSnapshot; +use lsp_types::{self as types, request as req}; + +pub(crate) struct FormatRange; + +impl super::RequestHandler for FormatRange { + type RequestType = req::RangeFormatting; +} + +impl super::BackgroundDocumentRequestHandler for FormatRange { + super::define_document_url!(params: &types::DocumentRangeFormattingParams); + fn run_with_snapshot( + snapshot: DocumentSnapshot, + _notifier: Notifier, + params: types::DocumentRangeFormattingParams, + ) -> Result { + let document = snapshot.document(); + let text = document.contents(); + let index = document.index(); + let range = params.range.to_text_range(text, index, snapshot.encoding()); + let formatted_range = + crate::format::format_range(document, &snapshot.configuration().formatter, range) + .with_failure_code(lsp_server::ErrorCode::InternalError)?; + Ok(Some(vec![types::TextEdit { + range: formatted_range + .source_range() + .to_range(text, index, snapshot.encoding()), + new_text: formatted_range.into_code(), + }])) + } +} diff --git a/crates/ruff_server/src/server/api/traits.rs b/crates/ruff_server/src/server/api/traits.rs new file mode 100644 index 0000000000000..54639546dc9fc --- /dev/null +++ b/crates/ruff_server/src/server/api/traits.rs @@ -0,0 +1,76 @@ +//! A stateful LSP implementation that calls into the Ruff API. 
+ +use crate::server::client::Notifier; +use crate::session::{DocumentSnapshot, Session}; + +use lsp_types::notification::Notification as LSPNotification; +use lsp_types::request::Request; + +/// A supertrait for any server request handler. +pub(super) trait RequestHandler { + type RequestType: Request; + const METHOD: &'static str = <::RequestType as Request>::METHOD; +} + +/// A request handler that needs mutable access to the session. +/// This will block the main message receiver loop, meaning that no +/// incoming requests or notifications will be handled while `run` is +/// executing. Try to avoid doing any I/O or long-running computations. +pub(super) trait SyncRequestHandler: RequestHandler { + fn run( + session: &mut Session, + notifier: Notifier, + params: <::RequestType as Request>::Params, + ) -> super::Result<<::RequestType as Request>::Result>; +} + +/// A request handler that can be run on a background thread. +pub(super) trait BackgroundDocumentRequestHandler: RequestHandler { + /// `document_url` can be implemented automatically with + /// `define_document_url!(params: &)` in the trait + /// implementation. + fn document_url( + params: &<::RequestType as Request>::Params, + ) -> &lsp_types::Url; + + fn run_with_snapshot( + snapshot: DocumentSnapshot, + notifier: Notifier, + params: <::RequestType as Request>::Params, + ) -> super::Result<<::RequestType as Request>::Result>; +} + +/// A supertrait for any server notification handler. +pub(super) trait NotificationHandler { + type NotificationType: LSPNotification; + const METHOD: &'static str = + <::NotificationType as LSPNotification>::METHOD; +} + +/// A notification handler that needs mutable access to the session. +/// This will block the main message receiver loop, meaning that no +/// incoming requests or notifications will be handled while `run` is +/// executing. Try to avoid doing any I/O or long-running computations. 
+pub(super) trait SyncNotificationHandler: NotificationHandler { + fn run( + session: &mut Session, + notifier: Notifier, + params: <::NotificationType as LSPNotification>::Params, + ) -> super::Result<()>; +} + +/// A notification handler that can be run on a background thread. +pub(super) trait BackgroundDocumentNotificationHandler: NotificationHandler { + /// `document_url` can be implemented automatically with + /// `define_document_url!(params: &)` in the trait + /// implementation. + fn document_url( + params: &<::NotificationType as LSPNotification>::Params, + ) -> &lsp_types::Url; + + fn run_with_snapshot( + snapshot: DocumentSnapshot, + notifier: Notifier, + params: <::NotificationType as LSPNotification>::Params, + ) -> super::Result<()>; +} diff --git a/crates/ruff_server/src/server/client.rs b/crates/ruff_server/src/server/client.rs new file mode 100644 index 0000000000000..5eafdf9b82720 --- /dev/null +++ b/crates/ruff_server/src/server/client.rs @@ -0,0 +1,76 @@ +use lsp_server::{Notification, RequestId}; +use serde_json::Value; + +pub(crate) type ClientSender = crossbeam::channel::Sender; + +pub(crate) struct Client { + notifier: Notifier, + responder: Responder, +} + +#[derive(Clone)] +pub(crate) struct Notifier(ClientSender); + +#[derive(Clone)] +pub(crate) struct Responder(ClientSender); + +impl Client { + pub(super) fn new(sender: &ClientSender) -> Self { + Self { + notifier: Notifier(sender.clone()), + responder: Responder(sender.clone()), + } + } + + pub(super) fn notifier(&self) -> Notifier { + self.notifier.clone() + } + + pub(super) fn responder(&self) -> Responder { + self.responder.clone() + } +} + +#[allow(dead_code)] // we'll need to use `Notifier` in the future +impl Notifier { + pub(crate) fn notify(&self, params: N::Params) -> crate::Result<()> + where + N: lsp_types::notification::Notification, + { + let method = N::METHOD.to_string(); + + let message = lsp_server::Message::Notification(Notification::new(method, params)); + + 
Ok(self.0.send(message)?) + } + + pub(crate) fn notify_method(&self, method: String) -> crate::Result<()> { + Ok(self + .0 + .send(lsp_server::Message::Notification(Notification::new( + method, + Value::Null, + )))?) + } +} + +impl Responder { + pub(crate) fn respond( + &self, + id: RequestId, + result: crate::server::Result, + ) -> crate::Result<()> + where + R: serde::Serialize, + { + Ok(self.0.send( + match result { + Ok(res) => lsp_server::Response::new_ok(id, res), + Err(crate::server::api::Error { code, error }) => { + lsp_server::Response::new_err(id, code as i32, format!("{error}")) + } + } + .into(), + )?) + } +} diff --git a/crates/ruff_server/src/server/schedule.rs b/crates/ruff_server/src/server/schedule.rs new file mode 100644 index 0000000000000..fd2e59582b5e1 --- /dev/null +++ b/crates/ruff_server/src/server/schedule.rs @@ -0,0 +1,89 @@ +use crossbeam::channel::Sender; + +use crate::session::Session; + +mod task; +mod thread; + +pub(super) use task::{BackgroundSchedule, Task}; + +use self::{ + task::{BackgroundTaskBuilder, SyncTask}, + thread::ThreadPriority, +}; + +use super::client::Client; + +/// The event loop thread is actually a secondary thread that we spawn from the +/// _actual_ main thread. This secondary thread has a larger stack size +/// than some OS defaults (Windows, for example) and is also designated as +/// high-priority. +pub(crate) fn event_loop_thread( + func: impl FnOnce() -> crate::Result<()> + Send + 'static, +) -> crate::Result>> { + // Override OS defaults to avoid stack overflows on platforms with low stack size defaults. 
+ const MAIN_THREAD_STACK_SIZE: usize = 2 * 1024 * 1024; + const MAIN_THREAD_NAME: &str = "ruff:main"; + Ok( + thread::Builder::new(thread::ThreadPriority::LatencySensitive) + .name(MAIN_THREAD_NAME.into()) + .stack_size(MAIN_THREAD_STACK_SIZE) + .spawn(func)?, + ) +} + +pub(crate) struct Scheduler { + session: Session, + client: Client, + fmt_pool: thread::Pool, + background_pool: thread::Pool, +} + +impl Scheduler { + pub(super) fn new( + session: Session, + thread_count: usize, + sender: &Sender, + ) -> Self { + Self { + session, + fmt_pool: thread::Pool::new(1), + background_pool: thread::Pool::new(thread_count), + client: Client::new(sender), + } + } + + /// Dispatches a `task` by either running it as a blocking function or + /// executing it on a background thread pool. + pub(super) fn dispatch<'s>(&'s mut self, task: task::Task<'s>) { + match task { + Task::Sync(SyncTask { func }) => { + func( + &mut self.session, + self.client.notifier(), + self.client.responder(), + ); + } + Task::Background(BackgroundTaskBuilder { + schedule, + builder: func, + }) => { + let static_func = func(&self.session); + let notifier = self.client.notifier(); + let responder = self.client.responder(); + let task = move || static_func(notifier, responder); + match schedule { + BackgroundSchedule::Worker => { + self.background_pool.spawn(ThreadPriority::Worker, task); + } + BackgroundSchedule::LatencySensitive => self + .background_pool + .spawn(ThreadPriority::LatencySensitive, task), + BackgroundSchedule::Fmt => { + self.fmt_pool.spawn(ThreadPriority::LatencySensitive, task); + } + } + } + } + } +} diff --git a/crates/ruff_server/src/server/schedule/task.rs b/crates/ruff_server/src/server/schedule/task.rs new file mode 100644 index 0000000000000..b4de2d8c97b0a --- /dev/null +++ b/crates/ruff_server/src/server/schedule/task.rs @@ -0,0 +1,92 @@ +use lsp_server::RequestId; +use serde::Serialize; + +use crate::{ + server::client::{Notifier, Responder}, + session::Session, +}; + +type 
LocalFn<'s> = Box; + +type BackgroundFn = Box; + +type BackgroundFnBuilder<'s> = Box BackgroundFn + 's>; + +/// Describes how the task should be run. +#[derive(Clone, Copy, Debug, Default)] +pub(in crate::server) enum BackgroundSchedule { + /// The task should be run on the background thread designated + /// for formatting actions. This is a high priority thread. + Fmt, + /// The task should be run on the general high-priority background + /// thread. + LatencySensitive, + /// The task should be run on a regular-priority background thread. + #[default] + Worker, +} + +/// A [`Task`] is a future that has not yet started, and it is the job of +/// the [`super::Scheduler`] to make that happen, via [`super::Scheduler::dispatch`]. +/// A task can either run on the main thread (in other words, the same thread as the +/// scheduler) or it can run in a background thread. The main difference between +/// the two is that background threads only have a read-only snapshot of the session, +/// while local tasks have exclusive access and can modify it as they please. Keep in mind that +/// local tasks will **block** the main event loop, so only use local tasks if you **need** +/// mutable state access or you need the absolute lowest latency possible. +pub(in crate::server) enum Task<'s> { + Background(BackgroundTaskBuilder<'s>), + Sync(SyncTask<'s>), +} + +// The reason why this isn't just a 'static background closure +// is because we need to take a snapshot of the session before sending +// this task to the background, and the inner closure can't take the session +// as an immutable reference since it's used mutably elsewhere. So instead, +// a background task is built using an outer closure that borrows the session to take a snapshot, +// that the inner closure can capture. This builder closure has a lifetime linked to the scheduler. 
+// When the task is dispatched, the scheduler runs the synchronous builder, which takes the session +// as a reference, to create the inner 'static closure. That closure is then moved to a background task pool. +pub(in crate::server) struct BackgroundTaskBuilder<'s> { + pub(super) schedule: BackgroundSchedule, + pub(super) builder: BackgroundFnBuilder<'s>, +} + +pub(in crate::server) struct SyncTask<'s> { + pub(super) func: LocalFn<'s>, +} + +impl<'s> Task<'s> { + /// Creates a new background task. + pub(crate) fn background( + schedule: BackgroundSchedule, + func: impl FnOnce(&Session) -> Box + 's, + ) -> Self { + Self::Background(BackgroundTaskBuilder { + schedule, + builder: Box::new(func), + }) + } + /// Creates a new local task. + pub(crate) fn local(func: impl FnOnce(&mut Session, Notifier, Responder) + 's) -> Self { + Self::Sync(SyncTask { + func: Box::new(func), + }) + } + /// Creates a local task that immediately + /// responds with the provided `result`. + pub(crate) fn immediate(id: RequestId, result: crate::server::Result) -> Self + where + R: Serialize + Send + 'static, + { + Self::local(move |_, _, responder| { + if let Err(err) = responder.respond(id, result) { + tracing::error!("Unable to send immediate response: {err}"); + } + }) + } + /// Creates a local task that does nothing. + pub(crate) fn nothing() -> Self { + Self::local(move |_, _, _| {}) + } +} diff --git a/crates/ruff_server/src/server/schedule/thread.rs b/crates/ruff_server/src/server/schedule/thread.rs new file mode 100644 index 0000000000000..da3ea8c2f2036 --- /dev/null +++ b/crates/ruff_server/src/server/schedule/thread.rs @@ -0,0 +1,109 @@ +// +------------------------------------------------------------+ +// | Code adopted from: | +// | Repository: https://github.com/rust-lang/rust-analyzer.git | +// | File: `crates/stdx/src/thread.rs` | +// | Commit: 03b3cb6be9f21c082f4206b35c7fe7f291c94eaa | +// +------------------------------------------------------------+ +//!
A utility module for working with threads that automatically joins threads upon drop +//! and abstracts over operating system quality of service (QoS) APIs +//! through the concept of a “thread priority”. +//! +//! The priority of a thread is frozen at thread creation time, +//! i.e. there is no API to change the priority of a thread once it has been spawned. +//! +//! As a system, rust-analyzer should have the property that +//! old manual scheduling APIs are replaced entirely by QoS. +//! To maintain this invariant, we panic when it is clear that +//! old scheduling APIs have been used. +//! +//! Moreover, we also want to ensure that every thread has a priority set explicitly +//! to force a decision about its importance to the system. +//! Thus, [`ThreadPriority`] has no default value +//! and every entry point to creating a thread requires a [`ThreadPriority`] upfront. + +// Keeps us from getting warnings about the word `QoS` +#![allow(clippy::doc_markdown)] + +use std::fmt; + +mod pool; +mod priority; + +pub(super) use pool::Pool; +pub(super) use priority::ThreadPriority; + +pub(super) struct Builder { + priority: ThreadPriority, + inner: jod_thread::Builder, +} + +impl Builder { + pub(super) fn new(priority: ThreadPriority) -> Builder { + Builder { + priority, + inner: jod_thread::Builder::new(), + } + } + + pub(super) fn name(self, name: String) -> Builder { + Builder { + inner: self.inner.name(name), + ..self + } + } + + pub(super) fn stack_size(self, size: usize) -> Builder { + Builder { + inner: self.inner.stack_size(size), + ..self + } + } + + pub(super) fn spawn(self, f: F) -> std::io::Result> + where + F: FnOnce() -> T, + F: Send + 'static, + T: Send + 'static, + { + let inner_handle = self.inner.spawn(move || { + self.priority.apply_to_current_thread(); + f() + })?; + + Ok(JoinHandle { + inner: Some(inner_handle), + allow_leak: false, + }) + } +} + +pub(crate) struct JoinHandle { + // `inner` is an `Option` so that we can + // take ownership of the
contained `JoinHandle`. + inner: Option>, + allow_leak: bool, +} + +impl JoinHandle { + pub(crate) fn join(mut self) -> T { + self.inner.take().unwrap().join() + } +} + +impl Drop for JoinHandle { + fn drop(&mut self) { + if !self.allow_leak { + return; + } + + if let Some(join_handle) = self.inner.take() { + join_handle.detach(); + } + } +} + +impl fmt::Debug for JoinHandle { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.pad("JoinHandle { .. }") + } +} diff --git a/crates/ruff_server/src/server/schedule/thread/pool.rs b/crates/ruff_server/src/server/schedule/thread/pool.rs new file mode 100644 index 0000000000000..9a69ce367ef4a --- /dev/null +++ b/crates/ruff_server/src/server/schedule/thread/pool.rs @@ -0,0 +1,107 @@ +// +------------------------------------------------------------+ +// | Code adopted from: | +// | Repository: https://github.com/rust-lang/rust-analyzer.git | +// | File: `crates/stdx/src/thread/pool.rs` | +// | Commit: 03b3cb6be9f21c082f4206b35c7fe7f291c94eaa | +// +------------------------------------------------------------+ +//! [`Pool`] implements a basic custom thread pool +//! inspired by the [`threadpool` crate](http://docs.rs/threadpool). +//! When you spawn a task you specify a thread priority +//! so the pool can schedule it to run on a thread with that priority. +//! rust-analyzer uses this to prioritize work based on latency requirements. +//! +//! The thread pool is implemented entirely using +//! the threading utilities in [`crate::server::schedule::thread`]. + +use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, +}; + +use crossbeam::channel::{Receiver, Sender}; + +use super::{Builder, JoinHandle, ThreadPriority}; + +pub(crate) struct Pool { + // `_handles` is never read: the field is present + // only for its `Drop` impl. 
+ + // The worker threads exit once the channel closes; + // make sure to keep `job_sender` above `handles` + // so that the channel is actually closed + // before we join the worker threads! + job_sender: Sender, + _handles: Vec, + extant_tasks: Arc, +} + +struct Job { + requested_priority: ThreadPriority, + f: Box, +} + +impl Pool { + pub(crate) fn new(threads: usize) -> Pool { + // Override OS defaults to avoid stack overflows on platforms with low stack size defaults. + const STACK_SIZE: usize = 2 * 1024 * 1024; + const INITIAL_PRIORITY: ThreadPriority = ThreadPriority::Worker; + + let (job_sender, job_receiver) = crossbeam::channel::bounded(threads); + let extant_tasks = Arc::new(AtomicUsize::new(0)); + + let mut handles = Vec::with_capacity(threads); + for _ in 0..threads { + let handle = Builder::new(INITIAL_PRIORITY) + .stack_size(STACK_SIZE) + .name("Worker".into()) + .spawn({ + let extant_tasks = Arc::clone(&extant_tasks); + let job_receiver: Receiver = job_receiver.clone(); + move || { + let mut current_priority = INITIAL_PRIORITY; + for job in job_receiver { + if job.requested_priority != current_priority { + job.requested_priority.apply_to_current_thread(); + current_priority = job.requested_priority; + } + extant_tasks.fetch_add(1, Ordering::SeqCst); + (job.f)(); + extant_tasks.fetch_sub(1, Ordering::SeqCst); + } + } + }) + .expect("failed to spawn thread"); + + handles.push(handle); + } + + Pool { + _handles: handles, + extant_tasks, + job_sender, + } + } + + pub(crate) fn spawn(&self, priority: ThreadPriority, f: F) + where + F: FnOnce() + Send + 'static, + { + let f = Box::new(move || { + if cfg!(debug_assertions) { + priority.assert_is_used_on_current_thread(); + } + f(); + }); + + let job = Job { + requested_priority: priority, + f, + }; + self.job_sender.send(job).unwrap(); + } + + #[allow(dead_code)] + pub(super) fn len(&self) -> usize { + self.extant_tasks.load(Ordering::SeqCst) + } +} diff --git 
a/crates/ruff_server/src/server/schedule/thread/priority.rs b/crates/ruff_server/src/server/schedule/thread/priority.rs new file mode 100644 index 0000000000000..e6a555242fcb7 --- /dev/null +++ b/crates/ruff_server/src/server/schedule/thread/priority.rs @@ -0,0 +1,297 @@ +// +------------------------------------------------------------+ +// | Code adopted from: | +// | Repository: https://github.com/rust-lang/rust-analyzer.git | +// | File: `crates/stdx/src/thread/intent.rs` | +// | Commit: 03b3cb6be9f21c082f4206b35c7fe7f291c94eaa | +// +------------------------------------------------------------+ +//! An opaque façade around platform-specific QoS APIs. + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +// Please maintain order from least to most priority for the derived `Ord` impl. +pub(crate) enum ThreadPriority { + /// Any thread which does work that isn't in a critical path. + Worker, + + /// Any thread which does work caused by the user typing, or + /// work that the editor may wait on. + LatencySensitive, +} + +impl ThreadPriority { + // These APIs must remain private; + // we only want consumers to set thread priority + // during thread creation. 
+ + pub(crate) fn apply_to_current_thread(self) { + let class = thread_priority_to_qos_class(self); + set_current_thread_qos_class(class); + } + + pub(crate) fn assert_is_used_on_current_thread(self) { + if IS_QOS_AVAILABLE { + let class = thread_priority_to_qos_class(self); + assert_eq!(get_current_thread_qos_class(), Some(class)); + } + } +} + +use imp::QoSClass; + +const IS_QOS_AVAILABLE: bool = imp::IS_QOS_AVAILABLE; + +fn set_current_thread_qos_class(class: QoSClass) { + imp::set_current_thread_qos_class(class); +} + +fn get_current_thread_qos_class() -> Option { + imp::get_current_thread_qos_class() +} + +fn thread_priority_to_qos_class(priority: ThreadPriority) -> QoSClass { + imp::thread_priority_to_qos_class(priority) +} + +// All Apple platforms use XNU as their kernel +// and thus have the concept of QoS. +#[cfg(target_vendor = "apple")] +mod imp { + use super::ThreadPriority; + + #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] + // Please maintain order from least to most priority for the derived `Ord` impl. + pub(super) enum QoSClass { + // Documentation adapted from https://github.com/apple-oss-distributions/libpthread/blob/67e155c94093be9a204b69637d198eceff2c7c46/include/sys/qos.h#L55 + // + /// TLDR: invisible maintenance tasks + /// + /// Contract: + /// + /// * **You do not care about how long it takes for work to finish.** + /// * **You do not care about work being deferred temporarily.** + /// (e.g. if the device's battery is in a critical state) + /// + /// Examples: + /// + /// * in a video editor: + /// creating periodic backups of project files + /// * in a browser: + /// cleaning up cached sites which have not been accessed in a long time + /// * in a collaborative word processor: + /// creating a searchable index of all documents + /// + /// Use this QoS class for background tasks + /// which the user did not initiate themselves + /// and which are invisible to the user. 
+ /// It is expected that this work will take significant time to complete: + /// minutes or even hours. + /// + /// This QoS class provides the most energy and thermally-efficient execution possible. + /// All other work is prioritized over background tasks. + Background, + + /// TLDR: tasks that don't block using your app + /// + /// Contract: + /// + /// * **Your app remains useful even as the task is executing.** + /// + /// Examples: + /// + /// * in a video editor: + /// exporting a video to disk - + /// the user can still work on the timeline + /// * in a browser: + /// automatically extracting a downloaded zip file - + /// the user can still switch tabs + /// * in a collaborative word processor: + /// downloading images embedded in a document - + /// the user can still make edits + /// + /// Use this QoS class for tasks which + /// may or may not be initiated by the user, + /// but whose result is visible. + /// It is expected that this work will take a few seconds to a few minutes. + /// Typically your app will include a progress bar + /// for tasks using this class. + /// + /// This QoS class provides a balance between + /// performance, responsiveness and efficiency. + Utility, + + /// TLDR: tasks that block using your app + /// + /// Contract: + /// + /// * **You need this work to complete + /// before the user can keep interacting with your app.** + /// * **Your work will not take more than a few seconds to complete.** + /// + /// Examples: + /// + /// * in a video editor: + /// opening a saved project + /// * in a browser: + /// loading a list of the user's bookmarks and top sites + /// when a new tab is created + /// * in a collaborative word processor: + /// running a search on the document's content + /// + /// Use this QoS class for tasks which were initiated by the user + /// and block the usage of your app while they are in progress. 
+ /// It is expected that this work will take a few seconds or less to complete; + /// not long enough to cause the user to switch to something else. + /// Your app will likely indicate progress on these tasks + /// through the display of placeholder content or modals. + /// + /// This QoS class is not energy-efficient. + /// Rather, it provides responsiveness + /// by prioritizing work above other tasks on the system + /// except for critical user-interactive work. + UserInitiated, + + /// TLDR: render loops and nothing else + /// + /// Contract: + /// + /// * **You absolutely need this work to complete immediately + /// or your app will appear to freeze.** + /// * **Your work will always complete virtually instantaneously.** + /// + /// Examples: + /// + /// * the main thread in a GUI application + /// * the update & render loop in a game + /// * a secondary thread which progresses an animation + /// + /// Use this QoS class for any work which, if delayed, + /// will make your user interface unresponsive. + /// It is expected that this work will be virtually instantaneous. + /// + /// This QoS class is not energy-efficient. + /// Specifying this class is a request to run with + /// nearly all available system CPU and I/O bandwidth even under contention. 
+ UserInteractive, + } + + pub(super) const IS_QOS_AVAILABLE: bool = true; + + pub(super) fn set_current_thread_qos_class(class: QoSClass) { + let c = match class { + QoSClass::UserInteractive => libc::qos_class_t::QOS_CLASS_USER_INTERACTIVE, + QoSClass::UserInitiated => libc::qos_class_t::QOS_CLASS_USER_INITIATED, + QoSClass::Utility => libc::qos_class_t::QOS_CLASS_UTILITY, + QoSClass::Background => libc::qos_class_t::QOS_CLASS_BACKGROUND, + }; + + #[allow(unsafe_code)] + let code = unsafe { libc::pthread_set_qos_class_self_np(c, 0) }; + + if code == 0 { + return; + } + + #[allow(unsafe_code)] + let errno = unsafe { *libc::__error() }; + + match errno { + libc::EPERM => { + // This thread has been excluded from the QoS system + // due to a previous call to a function such as `pthread_setschedparam` + // which is incompatible with QoS. + // + // Panic instead of returning an error + // to maintain the invariant that we only use QoS APIs. + panic!("tried to set QoS of thread which has opted out of QoS (os error {errno})") + } + + libc::EINVAL => { + // This is returned if we pass something other than a qos_class_t + // to `pthread_set_qos_class_self_np`. + // + // This is impossible, so again panic. + unreachable!( + "invalid qos_class_t value was passed to pthread_set_qos_class_self_np" + ) + } + + _ => { + // `pthread_set_qos_class_self_np`’s documentation + // does not mention any other errors. 
+ unreachable!("`pthread_set_qos_class_self_np` returned unexpected error {errno}") + } + } + } + + pub(super) fn get_current_thread_qos_class() -> Option { + #[allow(unsafe_code)] + let current_thread = unsafe { libc::pthread_self() }; + let mut qos_class_raw = libc::qos_class_t::QOS_CLASS_UNSPECIFIED; + #[allow(unsafe_code)] + let code = unsafe { + libc::pthread_get_qos_class_np(current_thread, &mut qos_class_raw, std::ptr::null_mut()) + }; + + if code != 0 { + // `pthread_get_qos_class_np`’s documentation states that + // an error value is placed into errno if the return code is not zero. + // However, it never states what errors are possible. + // Inspecting the source[0] shows that, as of this writing, it always returns zero. + // + // Whatever errors the function could report in future are likely to be + // ones which we cannot handle anyway + // + // 0: https://github.com/apple-oss-distributions/libpthread/blob/67e155c94093be9a204b69637d198eceff2c7c46/src/qos.c#L171-L177 + #[allow(unsafe_code)] + let errno = unsafe { *libc::__error() }; + unreachable!("`pthread_get_qos_class_np` failed unexpectedly (os error {errno})"); + } + + match qos_class_raw { + libc::qos_class_t::QOS_CLASS_USER_INTERACTIVE => Some(QoSClass::UserInteractive), + libc::qos_class_t::QOS_CLASS_USER_INITIATED => Some(QoSClass::UserInitiated), + libc::qos_class_t::QOS_CLASS_DEFAULT => None, // QoS has never been set + libc::qos_class_t::QOS_CLASS_UTILITY => Some(QoSClass::Utility), + libc::qos_class_t::QOS_CLASS_BACKGROUND => Some(QoSClass::Background), + + libc::qos_class_t::QOS_CLASS_UNSPECIFIED => { + // Using manual scheduling APIs causes threads to “opt out” of QoS. + // At this point they become incompatible with QoS, + // and as such have the “unspecified” QoS class. + // + // Panic instead of returning an error + // to maintain the invariant that we only use QoS APIs. 
+ panic!("tried to get QoS of thread which has opted out of QoS") + } + } + } + + pub(super) fn thread_priority_to_qos_class(priority: ThreadPriority) -> QoSClass { + match priority { + ThreadPriority::Worker => QoSClass::Utility, + ThreadPriority::LatencySensitive => QoSClass::UserInitiated, + } + } +} + +// FIXME: Windows has QoS APIs, we should use them! +#[cfg(not(target_vendor = "apple"))] +mod imp { + use super::ThreadPriority; + + #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] + pub(super) enum QoSClass { + Default, + } + + pub(super) const IS_QOS_AVAILABLE: bool = false; + + pub(super) fn set_current_thread_qos_class(_: QoSClass) {} + + pub(super) fn get_current_thread_qos_class() -> Option { + None + } + + pub(super) fn thread_priority_to_qos_class(_: ThreadPriority) -> QoSClass { + QoSClass::Default + } +} diff --git a/crates/ruff_server/src/session.rs b/crates/ruff_server/src/session.rs new file mode 100644 index 0000000000000..a64c4844ee25d --- /dev/null +++ b/crates/ruff_server/src/session.rs @@ -0,0 +1,327 @@ +//! Data model, state management, and configuration resolution. + +mod types; + +use std::collections::BTreeMap; +use std::path::{Path, PathBuf}; +use std::{ops::Deref, sync::Arc}; + +use anyhow::anyhow; +use lsp_types::{ServerCapabilities, Url}; +use ruff_workspace::resolver::{ConfigurationTransformer, Relativity}; +use rustc_hash::FxHashMap; + +use crate::edit::{Document, DocumentVersion}; +use crate::PositionEncoding; + +/// The global state for the LSP +pub(crate) struct Session { + /// Workspace folders in the current session, which contain the state of all open files. + workspaces: Workspaces, + /// The global position encoding, negotiated during LSP initialization. + position_encoding: PositionEncoding, + /// Extension-specific settings, set by the client, that apply to all workspace folders. 
+ #[allow(dead_code)] + lsp_settings: types::ExtensionSettings, +} + +/// An immutable snapshot of `Session` that references +/// a specific document. +pub(crate) struct DocumentSnapshot { + configuration: Arc, + document_ref: DocumentRef, + position_encoding: PositionEncoding, + url: Url, +} + +#[derive(Default)] +pub(crate) struct RuffConfiguration { + // settings to pass into the ruff linter + pub(crate) linter: ruff_linter::settings::LinterSettings, + // settings to pass into the ruff formatter + pub(crate) formatter: ruff_workspace::FormatterSettings, +} + +#[derive(Default)] +pub(crate) struct Workspaces(BTreeMap); + +pub(crate) struct Workspace { + open_documents: OpenDocuments, + configuration: Arc, +} + +#[derive(Default)] +pub(crate) struct OpenDocuments { + documents: FxHashMap, +} + +/// A mutable handler to an underlying document. +/// Handles copy-on-write mutation automatically when +/// calling `make_mut`. +pub(crate) struct DocumentController { + document: Arc, +} + +/// A read-only reference to a document.
+#[derive(Clone)] +pub(crate) struct DocumentRef { + document: Arc, +} + +impl Session { + pub(crate) fn new( + server_capabilities: &ServerCapabilities, + workspaces: &[Url], + ) -> crate::Result { + Ok(Self { + position_encoding: server_capabilities + .position_encoding + .as_ref() + .and_then(|encoding| encoding.try_into().ok()) + .unwrap_or_default(), + lsp_settings: types::ExtensionSettings, + workspaces: Workspaces::new(workspaces)?, + }) + } + + pub(crate) fn take_snapshot(&self, url: &Url) -> Option { + Some(DocumentSnapshot { + configuration: self.workspaces.configuration(url)?.clone(), + document_ref: self.workspaces.snapshot(url)?, + position_encoding: self.position_encoding, + url: url.clone(), + }) + } + + pub(crate) fn open_document(&mut self, url: &Url, contents: String, version: DocumentVersion) { + self.workspaces.open(url, contents, version); + } + + pub(crate) fn close_document(&mut self, url: &Url) -> crate::Result<()> { + self.workspaces.close(url)?; + Ok(()) + } + + pub(crate) fn document_controller( + &mut self, + url: &Url, + ) -> crate::Result<&mut DocumentController> { + self.workspaces + .controller(url) + .ok_or_else(|| anyhow!("Tried to open unavailable document `{url}`")) + } + + pub(crate) fn open_workspace_folder(&mut self, url: &Url) -> crate::Result<()> { + self.workspaces.open_workspace_folder(url)?; + Ok(()) + } + + pub(crate) fn close_workspace_folder(&mut self, url: &Url) -> crate::Result<()> { + self.workspaces.close_workspace_folder(url)?; + Ok(()) + } + + pub(crate) fn encoding(&self) -> PositionEncoding { + self.position_encoding + } +} + +impl OpenDocuments { + fn snapshot(&self, url: &Url) -> Option { + Some(self.documents.get(url)?.make_ref()) + } + + fn controller(&mut self, url: &Url) -> Option<&mut DocumentController> { + self.documents.get_mut(url) + } + + fn open(&mut self, url: &Url, contents: String, version: DocumentVersion) { + if self + .documents + .insert(url.clone(), DocumentController::new(contents, 
version)) + .is_some() + { + tracing::warn!("Opening document `{url}` that is already open!"); + } + } + + fn close(&mut self, url: &Url) -> crate::Result<()> { + let Some(_) = self.documents.remove(url) else { + return Err(anyhow!( + "Tried to close document `{url}`, which was not open" + )); + }; + Ok(()) + } +} + +impl DocumentController { + fn new(contents: String, version: DocumentVersion) -> Self { + Self { + document: Arc::new(Document::new(contents, version)), + } + } + + pub(crate) fn make_ref(&self) -> DocumentRef { + DocumentRef { + document: self.document.clone(), + } + } + + pub(crate) fn make_mut(&mut self) -> &mut Document { + Arc::make_mut(&mut self.document) + } +} + +impl Deref for DocumentController { + type Target = Document; + fn deref(&self) -> &Self::Target { + &self.document + } +} + +impl Deref for DocumentRef { + type Target = Document; + fn deref(&self) -> &Self::Target { + &self.document + } +} + +impl DocumentSnapshot { + pub(crate) fn configuration(&self) -> &RuffConfiguration { + &self.configuration + } + + pub(crate) fn document(&self) -> &DocumentRef { + &self.document_ref + } + + pub(crate) fn encoding(&self) -> PositionEncoding { + self.position_encoding + } + + pub(crate) fn url(&self) -> &Url { + &self.url + } +} + +impl Workspaces { + fn new(urls: &[Url]) -> crate::Result { + Ok(Self( + urls.iter() + .map(Workspace::new) + .collect::>()?, + )) + } + + fn open_workspace_folder(&mut self, folder_url: &Url) -> crate::Result<()> { + let (path, workspace) = Workspace::new(folder_url)?; + self.0.insert(path, workspace); + Ok(()) + } + + fn close_workspace_folder(&mut self, folder_url: &Url) -> crate::Result<()> { + let path = folder_url + .to_file_path() + .map_err(|()| anyhow!("Folder URI was not a proper file path"))?; + self.0 + .remove(&path) + .ok_or_else(|| anyhow!("Tried to remove non-existent folder {}", path.display()))?; + Ok(()) + } + + fn snapshot(&self, document_url: &Url) -> Option { + 
self.workspace_for_url(document_url) + .and_then(|w| w.open_documents.snapshot(document_url)) + } + + fn controller(&mut self, document_url: &Url) -> Option<&mut DocumentController> { + self.workspace_for_url_mut(document_url) + .and_then(|w| w.open_documents.controller(document_url)) + } + + fn configuration(&self, document_url: &Url) -> Option<&Arc> { + self.workspace_for_url(document_url) + .map(|w| &w.configuration) + } + + fn open(&mut self, url: &Url, contents: String, version: DocumentVersion) { + if let Some(w) = self.workspace_for_url_mut(url) { + w.open_documents.open(url, contents, version); + } + } + + fn close(&mut self, url: &Url) -> crate::Result<()> { + self.workspace_for_url_mut(url) + .ok_or_else(|| anyhow!("Workspace not found for {url}"))? + .open_documents + .close(url) + } + + fn workspace_for_url(&self, url: &Url) -> Option<&Workspace> { + let path = url.to_file_path().ok()?; + self.0 + .range(..path) + .next_back() + .map(|(_, workspace)| workspace) + } + + fn workspace_for_url_mut(&mut self, url: &Url) -> Option<&mut Workspace> { + let path = url.to_file_path().ok()?; + self.0 + .range_mut(..path) + .next_back() + .map(|(_, workspace)| workspace) + } +} + +impl Workspace { + pub(crate) fn new(root: &Url) -> crate::Result<(PathBuf, Self)> { + let path = root + .to_file_path() + .map_err(|()| anyhow!("workspace URL was not a file path!"))?; + // Fall-back to default configuration + let configuration = Self::find_configuration_or_fallback(&path); + + Ok(( + path, + Self { + open_documents: OpenDocuments::default(), + configuration: Arc::new(configuration), + }, + )) + } + + fn find_configuration_or_fallback(root: &Path) -> RuffConfiguration { + find_configuration_from_root(root).unwrap_or_else(|err| { + tracing::error!("The following error occurred when trying to find a configuration file at `{}`:\n{err}", root.display()); + tracing::error!("Falling back to default configuration for `{}`", root.display()); + RuffConfiguration::default() + }) + 
} +} + +pub(crate) fn find_configuration_from_root(root: &Path) -> crate::Result { + let pyproject = ruff_workspace::pyproject::find_settings_toml(root)? + .ok_or_else(|| anyhow!("No pyproject.toml/ruff.toml/.ruff.toml file was found"))?; + let settings = ruff_workspace::resolver::resolve_root_settings( + &pyproject, + Relativity::Parent, + &LSPConfigTransformer, + )?; + Ok(RuffConfiguration { + linter: settings.linter, + formatter: settings.formatter, + }) +} + +struct LSPConfigTransformer; + +impl ConfigurationTransformer for LSPConfigTransformer { + fn transform( + &self, + config: ruff_workspace::configuration::Configuration, + ) -> ruff_workspace::configuration::Configuration { + config + } +} diff --git a/crates/ruff_server/src/session/types.rs b/crates/ruff_server/src/session/types.rs new file mode 100644 index 0000000000000..1ed23ae69da38 --- /dev/null +++ b/crates/ruff_server/src/session/types.rs @@ -0,0 +1,3 @@ +#[allow(dead_code)] // TODO(jane): get this wired up after the pre-release +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub(crate) struct ExtensionSettings; diff --git a/crates/ruff_server/tests/document.rs b/crates/ruff_server/tests/document.rs new file mode 100644 index 0000000000000..0c4da4e053aca --- /dev/null +++ b/crates/ruff_server/tests/document.rs @@ -0,0 +1,91 @@ +const PANDAS_HTML_SRC: &str = include_str!("../resources/test/fixtures/pandas_html.py"); + +use lsp_types::{Position, Range, TextDocumentContentChangeEvent}; +use ruff_server::{Document, PositionEncoding}; + +#[test] +fn delete_lines_pandas_html() { + let mut document = Document::new(PANDAS_HTML_SRC.to_string(), 1); + + let changes = vec![ + TextDocumentContentChangeEvent { + range: Some(Range { + start: Position { + line: 79, + character: 0, + }, + end: Position { + line: 91, + character: 67, + }, + }), + range_length: Some(388), + text: String::new(), + }, + TextDocumentContentChangeEvent { + range: Some(Range { + start: Position { + line: 81, + character: 4, + }, + end: 
Position { + line: 81, + character: 36, + }, + }), + range_length: Some(32), + text: "p".into(), + }, + TextDocumentContentChangeEvent { + range: Some(Range { + start: Position { + line: 81, + character: 5, + }, + end: Position { + line: 81, + character: 5, + }, + }), + range_length: Some(0), + text: "a".into(), + }, + TextDocumentContentChangeEvent { + range: Some(Range { + start: Position { + line: 81, + character: 6, + }, + end: Position { + line: 81, + character: 6, + }, + }), + range_length: Some(0), + text: "s".into(), + }, + TextDocumentContentChangeEvent { + range: Some(Range { + start: Position { + line: 81, + character: 7, + }, + end: Position { + line: 81, + character: 7, + }, + }), + range_length: Some(0), + text: "s".into(), + }, + ]; + + let mut version = 2; + + for change in changes { + document.apply_changes(vec![change], version, PositionEncoding::UTF16); + version += 1; + } + + insta::assert_snapshot!(document.contents()); +} diff --git a/crates/ruff_server/tests/snapshots/document__delete_lines_pandas_html.snap b/crates/ruff_server/tests/snapshots/document__delete_lines_pandas_html.snap new file mode 100644 index 0000000000000..2ba81d2007ece --- /dev/null +++ b/crates/ruff_server/tests/snapshots/document__delete_lines_pandas_html.snap @@ -0,0 +1,1233 @@ +--- +source: crates/ruff_server/tests/document.rs +expression: document.contents() +--- +# +------------------------------------------------------------+ +# | Code adopted from: | +# | Repository: https://github.com/pandas-dev/pandas.git | +# | File: `io/html.py` | +# | Commit: 1f622e2b5303650fa5e497e4552d0554e51049cb | +# +------------------------------------------------------------+ +# This file should be used to test LSP functions that edit / fix a file. + +""" +:mod:`pandas.io.html` is a module containing functionality for dealing with +HTML IO. 
+ +""" + +from __future__ import annotations + +from collections import abc +import errno +import numbers +import os +import re +from re import Pattern +from typing import ( + TYPE_CHECKING, + Literal, + cast, +) + +from pandas._libs import lib +from pandas.compat._optional import import_optional_dependency +from pandas.errors import ( + AbstractMethodError, + EmptyDataError, +) +from pandas.util._decorators import doc +from pandas.util._validators import check_dtype_backend + +from pandas.core.dtypes.common import is_list_like + +from pandas import isna +from pandas.core.indexes.base import Index +from pandas.core.indexes.multi import MultiIndex +from pandas.core.series import Series +from pandas.core.shared_docs import _shared_docs + +from pandas.io.common import ( + get_handle, + is_url, + stringify_path, + validate_header_arg, +) +from pandas.io.formats.printing import pprint_thing +from pandas.io.parsers import TextParser + +if TYPE_CHECKING: + from collections.abc import ( + Iterable, + Sequence, + ) + + from pandas._typing import ( + BaseBuffer, + DtypeBackend, + FilePath, + HTMLFlavors, + ReadBuffer, + StorageOptions, + ) + + from pandas import DataFrame + +############# +# READ HTML # +############# +_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}") + + +def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str: + """ + + """ + pass + + +def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]: + """ + Get an iterator given an integer, slice or container. + + Parameters + ---------- + skiprows : int, slice, container + The iterator to use to skip rows; can also be a slice. + + Raises + ------ + TypeError + * If `skiprows` is not a slice, integer, or Container + + Returns + ------- + it : iterable + A proper iterator to use to skip rows of a DataFrame. 
+ """ + if isinstance(skiprows, slice): + start, step = skiprows.start or 0, skiprows.step or 1 + return list(range(start, skiprows.stop, step)) + elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows): + return cast("int | Sequence[int]", skiprows) + elif skiprows is None: + return 0 + raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows") + + +def _read( + obj: FilePath | BaseBuffer, + encoding: str | None, + storage_options: StorageOptions | None, +) -> str | bytes: + """ + Try to read from a url, file or string. + + Parameters + ---------- + obj : str, unicode, path object, or file-like object + + Returns + ------- + raw_text : str + """ + try: + with get_handle( + obj, "r", encoding=encoding, storage_options=storage_options + ) as handles: + return handles.handle.read() + except OSError as err: + if not is_url(obj): + raise FileNotFoundError( + f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {obj}" + ) from err + raise + + +class _HtmlFrameParser: + """ + Base class for parsers that parse HTML into DataFrames. + + Parameters + ---------- + io : str or file-like + This can be either a string path, a valid URL using the HTTP, + FTP, or FILE protocols or a file-like object. + + match : str or regex + The text to match in the document. + + attrs : dict + List of HTML element attributes to match. + + encoding : str + Encoding to be used by parser + + displayed_only : bool + Whether or not items with "display:none" should be ignored + + extract_links : {None, "all", "header", "body", "footer"} + Table elements in the specified section(s) with tags will have their + href extracted. + + .. versionadded:: 1.5.0 + + Attributes + ---------- + io : str or file-like + raw HTML, URL, or file-like object + + match : regex + The text to match in the raw HTML + + attrs : dict-like + A dictionary of valid table attributes to use to search for table + elements. 
+ + encoding : str + Encoding to be used by parser + + displayed_only : bool + Whether or not items with "display:none" should be ignored + + extract_links : {None, "all", "header", "body", "footer"} + Table elements in the specified section(s) with tags will have their + href extracted. + + .. versionadded:: 1.5.0 + + Notes + ----- + To subclass this class effectively you must override the following methods: + * :func:`_build_doc` + * :func:`_attr_getter` + * :func:`_href_getter` + * :func:`_text_getter` + * :func:`_parse_td` + * :func:`_parse_thead_tr` + * :func:`_parse_tbody_tr` + * :func:`_parse_tfoot_tr` + * :func:`_parse_tables` + * :func:`_equals_tag` + See each method's respective documentation for details on their + functionality. + """ + + def __init__( + self, + io: FilePath | ReadBuffer[str] | ReadBuffer[bytes], + match: str | Pattern, + attrs: dict[str, str] | None, + encoding: str, + displayed_only: bool, + extract_links: Literal[None, "header", "footer", "body", "all"], + storage_options: StorageOptions = None, + ) -> None: + self.io = io + self.match = match + self.attrs = attrs + self.encoding = encoding + self.displayed_only = displayed_only + self.extract_links = extract_links + self.storage_options = storage_options + + def parse_tables(self): + """ + Parse and return all tables from the DOM. + + Returns + ------- + list of parsed (header, body, footer) tuples from tables. + """ + tables = self._parse_tables(self._build_doc(), self.match, self.attrs) + return (self._parse_thead_tbody_tfoot(table) for table in tables) + + def _attr_getter(self, obj, attr): + """ + Return the attribute value of an individual DOM node. + + Parameters + ---------- + obj : node-like + A DOM node. + + attr : str or unicode + The attribute, such as "colspan" + + Returns + ------- + str or unicode + The attribute value. 
+ """ + # Both lxml and BeautifulSoup have the same implementation: + return obj.get(attr) + + def _href_getter(self, obj) -> str | None: + """ + Return a href if the DOM node contains a child or None. + + Parameters + ---------- + obj : node-like + A DOM node. + + Returns + ------- + href : str or unicode + The href from the child of the DOM node. + """ + raise AbstractMethodError(self) + + def _text_getter(self, obj): + """ + Return the text of an individual DOM node. + + Parameters + ---------- + obj : node-like + A DOM node. + + Returns + ------- + text : str or unicode + The text from an individual DOM node. + """ + raise AbstractMethodError(self) + + def _parse_td(self, obj): + """ + Return the td elements from a row element. + + Parameters + ---------- + obj : node-like + A DOM node. + + Returns + ------- + list of node-like + These are the elements of each row, i.e., the columns. + """ + raise AbstractMethodError(self) + + def _parse_thead_tr(self, table): + """ + Return the list of thead row elements from the parsed table element. + + Parameters + ---------- + table : a table element that contains zero or more thead elements. + + Returns + ------- + list of node-like + These are the row elements of a table. + """ + raise AbstractMethodError(self) + + def _parse_tbody_tr(self, table): + """ + Return the list of tbody row elements from the parsed table element. + + HTML5 table bodies consist of either 0 or more elements (which + only contain elements) or 0 or more elements. This method + checks for both structures. + + Parameters + ---------- + table : a table element that contains row elements. + + Returns + ------- + list of node-like + These are the row elements of a table. + """ + raise AbstractMethodError(self) + + def _parse_tfoot_tr(self, table): + """ + Return the list of tfoot row elements from the parsed table element. + + Parameters + ---------- + table : a table element that contains row elements. 
+ + Returns + ------- + list of node-like + These are the row elements of a table. + """ + raise AbstractMethodError(self) + + def _parse_tables(self, document, match, attrs): + """ + Return all tables from the parsed DOM. + + Parameters + ---------- + document : the DOM from which to parse the table element. + + match : str or regular expression + The text to search for in the DOM tree. + + attrs : dict + A dictionary of table attributes that can be used to disambiguate + multiple tables on a page. + + Raises + ------ + ValueError : `match` does not match any text in the document. + + Returns + ------- + list of node-like + HTML
elements to be parsed into raw data. + """ + raise AbstractMethodError(self) + + def _equals_tag(self, obj, tag) -> bool: + """ + Return whether an individual DOM node matches a tag + + Parameters + ---------- + obj : node-like + A DOM node. + + tag : str + Tag name to be checked for equality. + + Returns + ------- + boolean + Whether `obj`'s tag name is `tag` + """ + raise AbstractMethodError(self) + + def _build_doc(self): + """ + Return a tree-like object that can be used to iterate over the DOM. + + Returns + ------- + node-like + The DOM from which to parse the table element. + """ + raise AbstractMethodError(self) + + def _parse_thead_tbody_tfoot(self, table_html): + """ + Given a table, return parsed header, body, and foot. + + Parameters + ---------- + table_html : node-like + + Returns + ------- + tuple of (header, body, footer), each a list of list-of-text rows. + + Notes + ----- + Header and body are lists-of-lists. Top level list is a list of + rows. Each row is a list of str text. + + Logic: Use , , elements to identify + header, body, and footer, otherwise: + - Put all rows into body + - Move rows from top of body to header only if + all elements inside row are . Move the top all- or + while body_rows and row_is_all_th(body_rows[0]): + header_rows.append(body_rows.pop(0)) + + header = self._expand_colspan_rowspan(header_rows, section="header") + body = self._expand_colspan_rowspan(body_rows, section="body") + footer = self._expand_colspan_rowspan(footer_rows, section="footer") + + return header, body, footer + + def _expand_colspan_rowspan( + self, rows, section: Literal["header", "footer", "body"] + ) -> list[list]: + """ + Given a list of s, return a list of text rows. + + Parameters + ---------- + rows : list of node-like + List of s + section : the section that the rows belong to (header, body or footer). + + Returns + ------- + list of list + Each returned row is a list of str text, or tuple (text, link) + if extract_links is not None. 
+ + Notes + ----- + Any cell with ``rowspan`` or ``colspan`` will have its contents copied + to subsequent cells. + """ + all_texts = [] # list of rows, each a list of str + text: str | tuple + remainder: list[ + tuple[int, str | tuple, int] + ] = [] # list of (index, text, nrows) + + for tr in rows: + texts = [] # the output for this row + next_remainder = [] + + index = 0 + tds = self._parse_td(tr) + for td in tds: + # Append texts from previous rows with rowspan>1 that come + # before this or (see _parse_thead_tr). + return row.xpath("./td|./th") + + def _parse_tables(self, document, match, kwargs): + pattern = match.pattern + + # 1. check all descendants for the given pattern and only search tables + # GH 49929 + xpath_expr = f"//table[.//text()[re:test(., {pattern!r})]]" + + # if any table attributes were given build an xpath expression to + # search for them + if kwargs: + xpath_expr += _build_xpath_expr(kwargs) + + tables = document.xpath(xpath_expr, namespaces=_re_namespace) + + tables = self._handle_hidden_tables(tables, "attrib") + if self.displayed_only: + for table in tables: + # lxml utilizes XPATH 1.0 which does not have regex + # support. As a result, we find all elements with a style + # attribute and iterate them to check for display:none + for elem in table.xpath(".//style"): + elem.drop_tree() + for elem in table.xpath(".//*[@style]"): + if "display:none" in elem.attrib.get("style", "").replace(" ", ""): + elem.drop_tree() + if not tables: + raise ValueError(f"No tables found matching regex {pattern!r}") + return tables + + def _equals_tag(self, obj, tag) -> bool: + return obj.tag == tag + + def _build_doc(self): + """ + Raises + ------ + ValueError + * If a URL that lxml cannot parse is passed. + + Exception + * Any other ``Exception`` thrown. For example, trying to parse a + URL that is syntactically correct on a machine with no internet + connection will fail. 
+ + See Also + -------- + pandas.io.html._HtmlFrameParser._build_doc + """ + from lxml.etree import XMLSyntaxError + from lxml.html import ( + HTMLParser, + parse, + ) + + parser = HTMLParser(recover=True, encoding=self.encoding) + + if is_url(self.io): + with get_handle(self.io, "r", storage_options=self.storage_options) as f: + r = parse(f.handle, parser=parser) + else: + # try to parse the input in the simplest way + try: + r = parse(self.io, parser=parser) + except OSError as err: + raise FileNotFoundError( + f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {self.io}" + ) from err + try: + r = r.getroot() + except AttributeError: + pass + else: + if not hasattr(r, "text_content"): + raise XMLSyntaxError("no text parsed from document", 0, 0, 0) + + for br in r.xpath("*//br"): + br.tail = "\n" + (br.tail or "") + + return r + + def _parse_thead_tr(self, table): + rows = [] + + for thead in table.xpath(".//thead"): + rows.extend(thead.xpath("./tr")) + + # HACK: lxml does not clean up the clearly-erroneous + # . (Missing ). Add + # the and _pretend_ it's a ; _parse_td() will find its + # children as though it's a . + # + # Better solution would be to use html5lib. 
+ elements_at_root = thead.xpath("./td|./th") + if elements_at_root: + rows.append(thead) + + return rows + + def _parse_tbody_tr(self, table): + from_tbody = table.xpath(".//tbody//tr") + from_root = table.xpath("./tr") + # HTML spec: at most one of these lists has content + return from_tbody + from_root + + def _parse_tfoot_tr(self, table): + return table.xpath(".//tfoot//tr") + + +def _expand_elements(body) -> None: + data = [len(elem) for elem in body] + lens = Series(data) + lens_max = lens.max() + not_max = lens[lens != lens_max] + + empty = [""] + for ind, length in not_max.items(): + body[ind] += empty * (lens_max - length) + + +def _data_to_frame(**kwargs): + head, body, foot = kwargs.pop("data") + header = kwargs.pop("header") + kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"]) + if head: + body = head + body + + # Infer header when there is a or top
+ - Move rows from bottom of body to footer only if + all elements inside row are + """ + header_rows = self._parse_thead_tr(table_html) + body_rows = self._parse_tbody_tr(table_html) + footer_rows = self._parse_tfoot_tr(table_html) + + def row_is_all_th(row): + return all(self._equals_tag(t, "th") for t in self._parse_td(row)) + + if not header_rows: + # The table has no
rows from + # body_rows to header_rows. (This is a common case because many + # tables in the wild have no
+ while remainder and remainder[0][0] <= index: + prev_i, prev_text, prev_rowspan = remainder.pop(0) + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) + index += 1 + + # Append the text from this , colspan times + text = _remove_whitespace(self._text_getter(td)) + if self.extract_links in ("all", section): + href = self._href_getter(td) + text = (text, href) + rowspan = int(self._attr_getter(td, "rowspan") or 1) + colspan = int(self._attr_getter(td, "colspan") or 1) + + for _ in range(colspan): + texts.append(text) + if rowspan > 1: + next_remainder.append((index, text, rowspan - 1)) + index += 1 + + # Append texts from previous rows at the final position + for prev_i, prev_text, prev_rowspan in remainder: + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) + + all_texts.append(texts) + remainder = next_remainder + + # Append rows that only appear because the previous row had non-1 + # rowspan + while remainder: + next_remainder = [] + texts = [] + for prev_i, prev_text, prev_rowspan in remainder: + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) + all_texts.append(texts) + remainder = next_remainder + + return all_texts + + def _handle_hidden_tables(self, tbl_list, attr_name: str): + """ + Return list of tables, potentially removing hidden elements + + Parameters + ---------- + tbl_list : list of node-like + Type of list elements will vary depending upon parser used + attr_name : str + Name of the accessor for retrieving HTML attributes + + Returns + ------- + list of node-like + Return type matches `tbl_list` + """ + if not self.displayed_only: + return tbl_list + + return [ + x + for x in tbl_list + if "display:none" + not in getattr(x, attr_name).get("style", "").replace(" ", "") + ] + + +class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): + """ + HTML to DataFrame parser 
that uses BeautifulSoup under the hood. + + See Also + -------- + pandas.io.html._HtmlFrameParser + pandas.io.html._LxmlFrameParser + + Notes + ----- + Documentation strings for this class are in the base class + :class:`pandas.io.html._HtmlFrameParser`. + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + from bs4 import SoupStrainer + + self._strainer = SoupStrainer("table") + + def _parse_tables(self, document, match, attrs): + element_name = self._strainer.name + tables = document.find_all(element_name, attrs=attrs) + if not tables: + raise ValueError("No tables found") + + result = [] + unique_tables = set() + tables = self._handle_hidden_tables(tables, "attrs") + + for table in tables: + if self.displayed_only: + for elem in table.find_all("style"): + elem.decompose() + + for elem in table.find_all(style=re.compile(r"display:\s*none")): + elem.decompose() + + if table not in unique_tables and table.find(string=match) is not None: + result.append(table) + unique_tables.add(table) + if not result: + raise ValueError(f"No tables found matching pattern {match.pattern!r}") + return result + + def _href_getter(self, obj) -> str | None: + a = obj.find("a", href=True) + return None if not a else a["href"] + + def _text_getter(self, obj): + return obj.text + + def _equals_tag(self, obj, tag) -> bool: + return obj.name == tag + + def _parse_td(self, row): + return row.find_all(("td", "th"), recursive=False) + + def _parse_thead_tr(self, table): + return table.select("thead tr") + + def _parse_tbody_tr(self, table): + from_tbody = table.select("tbody tr") + from_root = table.find_all("tr", recursive=False) + # HTML spec: at most one of these lists has content + return from_tbody + from_root + + def _parse_tfoot_tr(self, table): + return table.select("tfoot tr") + + def _setup_build_doc(self): + raw_text = _read(self.io, self.encoding, self.storage_options) + if not raw_text: + raise ValueError(f"No text parsed from document: 
{self.io}") + return raw_text + + def _build_doc(self): + from bs4 import BeautifulSoup + + bdoc = self._setup_build_doc() + if isinstance(bdoc, bytes) and self.encoding is not None: + udoc = bdoc.decode(self.encoding) + from_encoding = None + else: + udoc = bdoc + from_encoding = self.encoding + + soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding) + + for br in soup.find_all("br"): + br.replace_with("\n" + br.text) + + return soup + + +def _build_xpath_expr(attrs) -> str: + """ + Build an xpath expression to simulate bs4's ability to pass in kwargs to + search for attributes when using the lxml parser. + + Parameters + ---------- + attrs : dict + A dict of HTML attributes. These are NOT checked for validity. + + Returns + ------- + expr : unicode + An XPath expression that checks for the given HTML attributes. + """ + # give class attribute as class_ because class is a python keyword + if "class_" in attrs: + attrs["class"] = attrs.pop("class_") + + s = " and ".join([f"@{k}={v!r}" for k, v in attrs.items()]) + return f"[{s}]" + + +_re_namespace = {"re": "http://exslt.org/regular-expressions"} + + +class _LxmlFrameParser(_HtmlFrameParser): + """ + HTML to DataFrame parser that uses lxml under the hood. + + Warning + ------- + This parser can only handle HTTP, FTP, and FILE urls. + + See Also + -------- + _HtmlFrameParser + _BeautifulSoupLxmlFrameParser + + Notes + ----- + Documentation strings for this class are in the base class + :class:`_HtmlFrameParser`. + """ + + def _href_getter(self, obj) -> str | None: + href = obj.xpath(".//a/@href") + return None if not href else href[0] + + def _text_getter(self, obj): + return obj.text_content() + + def _parse_td(self, row): + # Look for direct children only: the "row" element here may be a + #
foobar
-only rows + if header is None: + if len(head) == 1: + header = 0 + else: + # ignore all-empty-text rows + header = [i for i, row in enumerate(head) if any(text for text in row)] + + if foot: + body += foot + + # fill out elements of body that are "ragged" + _expand_elements(body) + with TextParser(body, header=header, **kwargs) as tp: + return tp.read() + + +_valid_parsers = { + "lxml": _LxmlFrameParser, + None: _LxmlFrameParser, + "html5lib": _BeautifulSoupHtml5LibFrameParser, + "bs4": _BeautifulSoupHtml5LibFrameParser, +} + + +def _parser_dispatch(flavor: HTMLFlavors | None) -> type[_HtmlFrameParser]: + """ + Choose the parser based on the input flavor. + + Parameters + ---------- + flavor : {{"lxml", "html5lib", "bs4"}} or None + The type of parser to use. This must be a valid backend. + + Returns + ------- + cls : _HtmlFrameParser subclass + The parser class based on the requested input flavor. + + Raises + ------ + ValueError + * If `flavor` is not a valid backend. + ImportError + * If you do not have the requested `flavor` + """ + valid_parsers = list(_valid_parsers.keys()) + if flavor not in valid_parsers: + raise ValueError( + f"{flavor!r} is not a valid flavor, valid flavors are {valid_parsers}" + ) + + if flavor in ("bs4", "html5lib"): + import_optional_dependency("html5lib") + import_optional_dependency("bs4") + else: + import_optional_dependency("lxml.etree") + return _valid_parsers[flavor] + + +def _print_as_set(s) -> str: + arg = ", ".join([pprint_thing(el) for el in s]) + return f"{{{arg}}}" + + +def _validate_flavor(flavor): + if flavor is None: + flavor = "lxml", "bs4" + elif isinstance(flavor, str): + flavor = (flavor,) + elif isinstance(flavor, abc.Iterable): + if not all(isinstance(flav, str) for flav in flavor): + raise TypeError( + f"Object of type {type(flavor).__name__!r} " + f"is not an iterable of strings" + ) + else: + msg = repr(flavor) if isinstance(flavor, str) else str(flavor) + msg += " is not a valid flavor" + raise ValueError(msg) 
+ + flavor = tuple(flavor) + valid_flavors = set(_valid_parsers) + flavor_set = set(flavor) + + if not flavor_set & valid_flavors: + raise ValueError( + f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid " + f"flavors are {_print_as_set(valid_flavors)}" + ) + return flavor + + +def _parse( + flavor, + io, + match, + attrs, + encoding, + displayed_only, + extract_links, + storage_options, + **kwargs, +): + flavor = _validate_flavor(flavor) + compiled_match = re.compile(match) # you can pass a compiled regex here + + retained = None + for flav in flavor: + parser = _parser_dispatch(flav) + p = parser( + io, + compiled_match, + attrs, + encoding, + displayed_only, + extract_links, + storage_options, + ) + + try: + tables = p.parse_tables() + except ValueError as caught: + # if `io` is an io-like object, check if it's seekable + # and try to rewind it before trying the next parser + if hasattr(io, "seekable") and io.seekable(): + io.seek(0) + elif hasattr(io, "seekable") and not io.seekable(): + # if we couldn't rewind it, let the user know + raise ValueError( + f"The flavor {flav} failed to parse your input. " + "Since you passed a non-rewindable file " + "object, we can't rewind it to try " + "another parser. Try read_html() with a different flavor." + ) from caught + + retained = caught + else: + break + else: + assert retained is not None # for mypy + raise retained + + ret = [] + for table in tables: + try: + df = _data_to_frame(data=table, **kwargs) + # Cast MultiIndex header to an Index of tuples when extracting header + # links and replace nan with None (therefore can't use mi.to_flat_index()). + # This maintains consistency of selection (e.g. 
df.columns.str[1]) + if extract_links in ("all", "header") and isinstance( + df.columns, MultiIndex + ): + df.columns = Index( + ((col[0], None if isna(col[1]) else col[1]) for col in df.columns), + tupleize_cols=False, + ) + + ret.append(df) + except EmptyDataError: # empty table + continue + return ret + + +@doc(storage_options=_shared_docs["storage_options"]) +def read_html( + io: FilePath | ReadBuffer[str], + *, + match: str | Pattern = ".+", + flavor: HTMLFlavors | Sequence[HTMLFlavors] | None = None, + header: int | Sequence[int] | None = None, + index_col: int | Sequence[int] | None = None, + skiprows: int | Sequence[int] | slice | None = None, + attrs: dict[str, str] | None = None, + parse_dates: bool = False, + thousands: str | None = ",", + encoding: str | None = None, + decimal: str = ".", + converters: dict | None = None, + na_values: Iterable[object] | None = None, + keep_default_na: bool = True, + displayed_only: bool = True, + extract_links: Literal[None, "header", "footer", "body", "all"] = None, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + storage_options: StorageOptions = None, +) -> list[DataFrame]: + r""" + Read HTML tables into a ``list`` of ``DataFrame`` objects. + + Parameters + ---------- + io : str, path object, or file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a string ``read()`` function. + The string can represent a URL. Note that + lxml only accepts the http, ftp and file url protocols. If you have a + URL that starts with ``'https'`` you might try removing the ``'s'``. + + .. deprecated:: 2.1.0 + Passing html literal strings is deprecated. + Wrap literal string/bytes input in ``io.StringIO``/``io.BytesIO`` instead. + + match : str or compiled regular expression, optional + The set of tables containing text matching this regex or string will be + returned. Unless the HTML is extremely simple you will probably need to + pass a non-empty string here. 
Defaults to '.+' (match any non-empty + string). The default value will return all tables contained on a page. + This value is converted to a regular expression so that there is + consistent behavior between Beautiful Soup and lxml. + + flavor : {{"lxml", "html5lib", "bs4"}} or list-like, optional + The parsing engine (or list of parsing engines) to use. 'bs4' and + 'html5lib' are synonymous with each other, they are both there for + backwards compatibility. The default of ``None`` tries to use ``lxml`` + to parse and if that fails it falls back on ``bs4`` + ``html5lib``. + + header : int or list-like, optional + The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to + make the columns headers. + + index_col : int or list-like, optional + The column (or list of columns) to use to create the index. + + skiprows : int, list-like or slice, optional + Number of rows to skip after parsing the column integer. 0-based. If a + sequence of integers or a slice is given, will skip the rows indexed by + that sequence. Note that a single element sequence means 'skip the nth + row' whereas an integer means 'skip n rows'. + + attrs : dict, optional + This is a dictionary of attributes that you can pass to use to identify + the table in the HTML. These are not checked for validity before being + passed to lxml or Beautiful Soup. However, these attributes must be + valid HTML table attributes to work correctly. For example, :: + + attrs = {{"id": "table"}} + + is a valid attribute dictionary because the 'id' HTML tag attribute is + a valid HTML attribute for *any* HTML tag as per `this document + `__. :: + + attrs = {{"asdf": "table"}} + + is *not* a valid attribute dictionary because 'asdf' is not a valid + HTML attribute even if it is a valid XML attribute. Valid HTML 4.01 + table attributes can be found `here + `__. A + working draft of the HTML 5 spec can be found `here + `__. It contains the + latest information on table attributes for the modern web. 
+ + parse_dates : bool, optional + See :func:`~read_csv` for more details. + + thousands : str, optional + Separator to use to parse thousands. Defaults to ``','``. + + encoding : str, optional + The encoding used to decode the web page. Defaults to ``None``.``None`` + preserves the previous encoding behavior, which depends on the + underlying parser library (e.g., the parser library will try to use + the encoding provided by the document). + + decimal : str, default '.' + Character to recognize as decimal point (e.g. use ',' for European + data). + + converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the cell (not column) content, and return the + transformed content. + + na_values : iterable, default None + Custom NA values. + + keep_default_na : bool, default True + If na_values are specified and keep_default_na is False the default NaN + values are overridden, otherwise they're appended to. + + displayed_only : bool, default True + Whether elements with "display: none" should be parsed. + + extract_links : {{None, "all", "header", "body", "footer"}} + Table elements in the specified section(s) with tags will have their + href extracted. + + .. versionadded:: 1.5.0 + + dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. + + .. versionadded:: 2.0 + + {storage_options} + + .. versionadded:: 2.1.0 + + Returns + ------- + dfs + A list of DataFrames. + + See Also + -------- + read_csv : Read a comma-separated values (csv) file into DataFrame. 
+ + Notes + ----- + Before using this function you should read the :ref:`gotchas about the + HTML parsing libraries `. + + Expect to do some cleanup after you call this function. For example, you + might need to manually assign column names if the column names are + converted to NaN when you pass the `header=0` argument. We try to assume as + little as possible about the structure of the table and push the + idiosyncrasies of the HTML contained in the table to the user. + + This function searches for ```` elements and only for ```` + and ```` or ```` argument, it is used to construct + the header, otherwise the function attempts to find the header within + the body (by putting rows with only ``
`` rows and ```` elements within each ``
`` + element in the table. ```` stands for "table data". This function + attempts to properly handle ``colspan`` and ``rowspan`` attributes. + If the function has a ``
`` elements into the header). + + Similar to :func:`~read_csv` the `header` argument is applied + **after** `skiprows` is applied. + + This function will *always* return a list of :class:`DataFrame` *or* + it will fail, e.g., it will *not* return an empty list. + + Examples + -------- + See the :ref:`read_html documentation in the IO section of the docs + ` for some examples of reading in HTML tables. + """ + # Type check here. We don't want to parse only to fail because of an + # invalid value of an integer skiprows. + if isinstance(skiprows, numbers.Integral) and skiprows < 0: + raise ValueError( + "cannot skip rows starting from the end of the " + "data (you passed a negative value)" + ) + if extract_links not in [None, "header", "footer", "body", "all"]: + raise ValueError( + "`extract_links` must be one of " + '{None, "header", "footer", "body", "all"}, got ' + f'"{extract_links}"' + ) + + validate_header_arg(header) + check_dtype_backend(dtype_backend) + + io = stringify_path(io) + + return _parse( + flavor=flavor, + io=io, + match=match, + header=header, + index_col=index_col, + skiprows=skiprows, + parse_dates=parse_dates, + thousands=thousands, + attrs=attrs, + encoding=encoding, + decimal=decimal, + converters=converters, + na_values=na_values, + keep_default_na=keep_default_na, + displayed_only=displayed_only, + extract_links=extract_links, + dtype_backend=dtype_backend, + storage_options=storage_options, + ) + diff --git a/crates/ruff_source_file/src/line_index.rs b/crates/ruff_source_file/src/line_index.rs index 31db33eb84f8c..7f9022fff4148 100644 --- a/crates/ruff_source_file/src/line_index.rs +++ b/crates/ruff_source_file/src/line_index.rs @@ -129,6 +129,11 @@ impl LineIndex { self.line_starts().len() } + /// Returns `true` if the text only consists of ASCII characters + pub fn is_ascii(&self) -> bool { + self.kind().is_ascii() + } + /// Returns the row number for a given offset. 
/// /// ## Examples diff --git a/crates/ruff_source_file/src/locator.rs b/crates/ruff_source_file/src/locator.rs index 792f31e247f44..30bd59d830fe4 100644 --- a/crates/ruff_source_file/src/locator.rs +++ b/crates/ruff_source_file/src/locator.rs @@ -23,6 +23,13 @@ impl<'a> Locator<'a> { } } + pub const fn with_index(contents: &'a str, index: LineIndex) -> Self { + Self { + contents, + index: OnceCell::with_value(index), + } + } + #[deprecated( note = "This is expensive, avoid using outside of the diagnostic phase. Prefer the other `Locator` methods instead." )] diff --git a/docs/configuration.md b/docs/configuration.md index 35cea26b85d34..bee5132b6c977 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -527,6 +527,7 @@ Commands: linter List all supported upstream linters clean Clear any caches in the current directory and any subdirectories format Run the Ruff formatter on the given files or directories + server Run the language server version Display Ruff's version help Print this message or the help of the given subcommand(s)