Skip to content
This repository has been archived by the owner on Jul 27, 2023. It is now read-only.

Lex Jupyter line magic with Mode::Jupyter #23

Merged
merged 18 commits into from
Jul 18, 2023
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 31 additions & 2 deletions core/src/mode.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,42 @@
//! Control in the different modes by which a source file can be parsed.

/// The mode argument specifies in what way code must be parsed.
#[derive(Clone, Copy, Hash, PartialEq, Eq)]
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)]
pub enum Mode {
/// The code consists of a sequence of statements.
Module,
/// The code consists of a sequence of interactive statements.
Interactive,
/// The code consists of a single expression.
Expression,
/// The code consists of a sequence of statements which are part of a
/// Jupyter Notebook and thus could include escape commands scoped to
/// a single line.
///
/// ## Limitations:
///
/// These escaped commands are only supported when they are the only
/// statement on a line. If they're part of a larger statement such as
/// on the right-hand side of an assignment, the lexer will not recognize
/// them as escape commands.
///
/// For [Dynamic object information], the escape characters (`?`, `??`)
/// must be used before an object. For example, `?foo` will be recognized,
/// but `foo?` will not.
///
/// ## Supported escape commands:
/// - [Magic command system] which is limited to [line magics] and can start
/// with `%` or `%%`.
/// - [Dynamic object information] which can start with `?` or `??`.
/// - [System shell access] which can start with `!` or `!!`.
/// - [Automatic parentheses and quotes] which can start with `/`, `;`, or `,`.
///
/// [Magic command system]: https://ipython.readthedocs.io/en/stable/interactive/reference.html#magic-command-system
/// [line magics]: https://ipython.readthedocs.io/en/stable/interactive/magics.html#line-magics
/// [Dynamic object information]: https://ipython.readthedocs.io/en/stable/interactive/reference.html#dynamic-object-information
/// [System shell access]: https://ipython.readthedocs.io/en/stable/interactive/reference.html#system-shell-access
/// [Automatic parentheses and quotes]: https://ipython.readthedocs.io/en/stable/interactive/reference.html#automatic-parentheses-and-quotes
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. This comment is excellent!

Jupyter,
}

impl std::str::FromStr for Mode {
Expand All @@ -17,6 +45,7 @@ impl std::str::FromStr for Mode {
match s {
"exec" | "single" => Ok(Mode::Module),
"eval" => Ok(Mode::Expression),
"jupyter" => Ok(Mode::Jupyter),
_ => Err(ModeParseError),
}
}
Expand All @@ -28,6 +57,6 @@ pub struct ModeParseError;

impl std::fmt::Display for ModeParseError {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, r#"mode must be "exec", "eval", or "single""#)
write!(f, r#"mode must be "exec", "eval", "jupyter", or "single""#)
}
}
247 changes: 244 additions & 3 deletions parser/src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ use crate::{
soft_keywords::SoftKeywordTransformer,
string::FStringErrorType,
text_size::{TextLen, TextRange, TextSize},
token::{StringKind, Tok},
token::{MagicKind, StringKind, Tok},
Mode,
};
use log::trace;
Expand Down Expand Up @@ -175,6 +175,8 @@ pub struct Lexer<T: Iterator<Item = char>> {
pending: Vec<Spanned>,
// The current location.
location: TextSize,
// Lexer mode.
mode: Mode,
}

// generated in build.rs, in gen_phf()
Expand Down Expand Up @@ -213,7 +215,7 @@ pub fn lex_starts_at(
mode: Mode,
start_offset: TextSize,
) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
SoftKeywordTransformer::new(Lexer::new(source.chars(), start_offset), mode)
SoftKeywordTransformer::new(Lexer::new(source.chars(), mode, start_offset), mode)
}

impl<T> Lexer<T>
Expand All @@ -222,7 +224,7 @@ where
{
/// Create a new lexer from T and a starting location. You probably want to use
/// [`lex`] instead.
pub fn new(input: T, start: TextSize) -> Self {
pub fn new(input: T, mode: Mode, start: TextSize) -> Self {
let mut lxr = Lexer {
at_begin_of_line: true,
nesting: 0,
Expand All @@ -231,6 +233,7 @@ where
pending: Vec::with_capacity(5),
location: start,
window: CharWindow::new(input),
mode,
};
// Fill the window.
lxr.window.slide();
Expand Down Expand Up @@ -494,6 +497,58 @@ where
Ok(())
}

/// Lex a single magic command.
///
/// Advances past the `kind.prefix_len()` escape characters, then collects
/// every following character up to (but not including) the next line break
/// or the end of the input.
///
/// A backslash that is immediately followed by a line break is treated as a
/// line continuation: both are dropped and lexing carries on with the text
/// that follows. Any other backslash is kept as part of the command.
///
/// Returns the [`Tok::MagicCommand`] token together with the range it covers,
/// which begins at the escape prefix.
fn lex_magic_command(&mut self, kind: MagicKind) -> (Tok, TextRange) {
    let start_pos = self.get_pos();
    for _ in 0..u32::from(kind.prefix_len()) {
        self.next_char();
    }
    let mut value = String::new();
    loop {
        match self.window[0] {
            Some('\\') => {
                // Only skip the backslash if it is followed by a line break;
                // otherwise it is a normal backslash which is part of the
                // magic command:
                //
                //   !pwd \                      <- skipped (line continuation)
                //     && ls -a | sed 's/^/\\ /' <- both backslashes are kept
                if matches!(self.window[1], Some('\n' | '\r')) {
                    self.next_char();
                    self.next_char();
                    // Re-examine whatever follows the continuation instead of
                    // pushing it blindly: it may be the end of the input
                    // (where `unwrap` below would panic), another line break
                    // that terminates the command, or a further continuation.
                    continue;
                }
            }
            Some('\n' | '\r') | None => {
                let end_pos = self.get_pos();
                return (
                    Tok::MagicCommand { kind, value },
                    TextRange::new(start_pos, end_pos),
                );
            }
            Some(_) => {}
        }
        // `window[0]` was `Some(_)` in every arm that reaches this point, so
        // a character is expected to be available here.
        value.push(self.next_char().unwrap());
    }
}

/// Work out which escape prefix starts at the current position and, when it
/// maps to a known [`MagicKind`], lex the command and emit the resulting token.
///
/// Does nothing if there is no character at the current position or if the
/// prefix is not recognized.
fn lex_and_emit_magic_command(&mut self) {
    let detected = match self.window[..2] {
        // Try the two-character prefix first and fall back to the first
        // character on its own.
        [Some(first), Some(second)] => {
            MagicKind::try_from([first, second]).or_else(|_| MagicKind::try_from(first))
        }
        // The escape character is the last character of the file.
        [Some(only), None] => MagicKind::try_from(only),
        _ => return,
    };
    match detected {
        Ok(kind) => {
            let spanned = self.lex_magic_command(kind);
            self.emit(spanned);
        }
        Err(_) => {}
    }
}

/// Lex a string literal.
fn lex_string(&mut self, kind: StringKind) -> LexResult {
let start_pos = self.get_pos();
Expand Down Expand Up @@ -644,6 +699,10 @@ where
spaces = 0;
tabs = 0;
}
// https://github.com/ipython/ipython/blob/635815e8f1ded5b764d66cacc80bbe25e9e2587f/IPython/core/inputtransformer2.py#L345
Some('%' | '!' | '?' | '/' | ';' | ',') if self.mode == Mode::Jupyter => {
self.lex_and_emit_magic_command();
}
Some('\x0C') => {
// Form feed character!
// Reset indentation for the Emacs user.
Expand Down Expand Up @@ -1381,6 +1440,11 @@ mod tests {
lexer.map(|x| x.unwrap().0).collect()
}

/// Lex `source` with [`Mode::Jupyter`] and return just the tokens, discarding
/// the second element of each lexed pair. Panics on the first lexer error.
pub fn lex_jupyter_source(source: &str) -> Vec<Tok> {
    let mut tokens = Vec::new();
    for result in lex(source, Mode::Jupyter) {
        let spanned = result.unwrap();
        tokens.push(spanned.0);
    }
    tokens
}

fn str_tok(s: &str) -> Tok {
Tok::String {
value: s.to_owned(),
Expand All @@ -1397,6 +1461,183 @@ mod tests {
}
}

// Generates one `#[test]` function per `name: eol` pair. Each test lexes a
// `%matplotlib` magic whose text is split by a backslash followed by the
// given end-of-line sequence, and asserts that the lexer yields a single
// `MagicCommand` token of kind `MagicKind::Magic`.
macro_rules! test_jupyter_magic_line_continuation_eol {
($($name:ident: $eol:expr,)*) => {
$(
#[test]
fn $name() {
// `\\` in the format string is one literal backslash placed right before `$eol`.
let source = format!("%matplotlib \\{} --inline", $eol);
let tokens = lex_jupyter_source(&source);
assert_eq!(
tokens,
vec![
Tok::MagicCommand { value: "matplotlib --inline".to_string(), kind: MagicKind::Magic },
]
)
}
)*
};
}
dhruvmanila marked this conversation as resolved.
Show resolved Hide resolved

test_jupyter_magic_line_continuation_eol! {
test_jupyter_magic_line_continuation_windows_eol: WINDOWS_EOL,
test_jupyter_magic_line_continuation_mac_eol: MAC_EOL,
test_jupyter_magic_line_continuation_unix_eol: UNIX_EOL,
}

#[test]
fn test_empty_jupyter_magic() {
let source = "%\n%%\n!\n!!\n?\n??\n/\n,\n;";
let tokens = lex_jupyter_source(source);
assert_eq!(
tokens,
vec![
Tok::MagicCommand {
value: "".to_string(),
kind: MagicKind::Magic,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Comment on lines +1530 to +1531
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if this is the correct way to do this. It seems the tests in CI are not run using --all-features flag.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the easiest is to remove the feature flag. We always use full-lexer.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but the CI doesn't use it:

CARGO_ARGS: --no-default-features --features stdlib,zlib,importlib,encodings,ssl,jit

Or, it uses it only for certain steps:

run: cargo clippy --all --features malachite-bigint,full-lexer,serde -- -Dwarnings

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the reasoning that this is a non logical newline. Isn't it a logical newline, because it terminates a statement?

Can we add a test where a magic command uses an invalid indent.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the reasoning that this is a non logical newline. Isn't it a logical newline, because it terminates a statement?

My main reason being that as these tokens are filtered out before parsing, they should end with a NonLogicalNewline as otherwise there'll be multiple newline tokens i.e., multiple blank lines. This is similar to the Comment token.

Although, now that I think of it, without the full-lexer feature (but with Mode::Jupyter), the NonLogicalNewline won't be filtered out while the MagicCommand token will be.

Can we add a test where a magic command uses an invalid indent.

Do you mean something like the following?

for i in range(10):
    print('hello')
   !pwd

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean something like the following?

Yes, exactly. Sorry, I should have provided an example.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We've decided to move ahead with the current implementation but will have to consider indentation once the parser is updated to account for any indentation error.

Tok::MagicCommand {
value: "".to_string(),
kind: MagicKind::Magic2,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "".to_string(),
kind: MagicKind::Shell,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "".to_string(),
kind: MagicKind::ShCap,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "".to_string(),
kind: MagicKind::Help,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "".to_string(),
kind: MagicKind::Help2,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "".to_string(),
kind: MagicKind::Paren,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "".to_string(),
kind: MagicKind::Quote,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "".to_string(),
kind: MagicKind::Quote2,
},
]
)
}

/// Lexes one sample of each supported escape-command prefix in Jupyter mode
/// and checks the resulting token stream, including line continuations and
/// backslashes that are not line continuations.
#[test]
fn test_jupyter_magic() {
let source = r"
?foo
??foo
%timeit a = b
%timeit a % 3
%matplotlib \
--inline
!pwd \
&& ls -a | sed 's/^/\\ /'
!!cd /Users/foo/Library/Application\ Support/
/foo 1 2
,foo 1 2
;foo 1 2
!ls
"
.trim();
let tokens = lex_jupyter_source(source);
// With the `full-lexer` feature enabled, a `NonLogicalNewline` token is
// expected after every magic command except the last one.
assert_eq!(
tokens,
vec![
Tok::MagicCommand {
value: "foo".to_string(),
kind: MagicKind::Help,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "foo".to_string(),
kind: MagicKind::Help2,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "timeit a = b".to_string(),
kind: MagicKind::Magic,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
// A `%` in the middle of the line stays part of the command text.
Tok::MagicCommand {
value: "timeit a % 3".to_string(),
kind: MagicKind::Magic,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
// The backslash + line break is dropped and both lines form one command.
Tok::MagicCommand {
value: "matplotlib --inline".to_string(),
kind: MagicKind::Magic,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
// Only the continuation backslash is dropped; the two inside the `sed`
// expression are kept.
Tok::MagicCommand {
value: "pwd && ls -a | sed 's/^/\\\\ /'".to_string(),
kind: MagicKind::Shell,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
// A backslash followed by a space is not a line continuation and is kept.
Tok::MagicCommand {
value: "cd /Users/foo/Library/Application\\ Support/".to_string(),
kind: MagicKind::ShCap,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "foo 1 2".to_string(),
kind: MagicKind::Paren,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "foo 1 2".to_string(),
kind: MagicKind::Quote,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "foo 1 2".to_string(),
kind: MagicKind::Quote2,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "ls".to_string(),
kind: MagicKind::Shell,
},
]
)
}

#[test]
fn test_numbers() {
let source = "0x2f 0o12 0b1101 0 123 123_45_67_890 0.2 1e+2 2.1e3 2j 2.2j";
Expand Down
Loading